In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(10, 8)})
pd.set_option('display.max_columns', 1500)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, classification_report, cohen_kappa_score
from sklearn.svm import NuSVC
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
def print_results(y_true, y_pred):
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print('F1-score:', f1_score(y_true, y_pred, average='macro'))
    print('cohen_kappa_score:', cohen_kappa_score(y_true, y_pred))
def plot_validation_curve(model_grid, param_name, params=None):
    results_df = pd.DataFrame(model_grid.cv_results_)
    
    if params == None:
        plt.plot(results_df['param_'+param_name], results_df['mean_test_score'])
    else:
        plt.plot(params, results_df['mean_test_score'])

    plt.xlabel(param_name)
    plt.ylabel('Test F1 score')
    plt.title('Validation curve')
    plt.show()

In [4]:
df = pd.read_csv('age_final_numeric_corr_2.csv')
df.shape

(28430, 60)

In [5]:
labels = df['target']
df_mod = df.drop('target', axis=1)
df_mod.shape

(28430, 59)

In [6]:
scaler = StandardScaler()
X = scaler.fit_transform(df_mod)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, labels, test_size=0.15, random_state=16)

One-vs-One (SVC, Gaus., NuSVC)

In [8]:
# kernel = 1.0 * RBF(1.0)
# gpc = GaussianProcessClassifier(kernel=kernel,
#         random_state=0).fit(X_train, y_train)
# gpc.score(X_valid, y_valid)
# gpc.predict_proba(X[:2,:])
# error - unable allocate enough memory

In [9]:
svm_clf = NuSVC(kernel='rbf', random_state=42, verbose=3)
svm_clf.fit(X_train, y_train)
y_test_pred_svm = svm_clf.predict(X_valid)
f1_score(y_valid, y_test_pred_svm, average='macro')

[LibSVM]

0.40832474609638386

In [10]:
pas_agr_classif = PassiveAggressiveClassifier(warm_start=False, random_state=1)
pas_agr_classif.fit(X_train, y_train)
y_pred = pas_agr_classif.predict(X_valid)
print_results(y_valid, y_pred)

[[277 114  64  86  46  56]
 [253 169  76  72  33  36]
 [146  72 176 135  61 108]
 [ 90  36 212 166  81 190]
 [ 68  23 131 126 111 292]
 [ 44  12  99  92 149 363]]
              precision    recall  f1-score   support

           1       0.32      0.43      0.36       643
           2       0.40      0.26      0.32       639
           3       0.23      0.25      0.24       698
           4       0.25      0.21      0.23       775
           5       0.23      0.15      0.18       751
           6       0.35      0.48      0.40       759

    accuracy                           0.30      4265
   macro avg       0.29      0.30      0.29      4265
weighted avg       0.29      0.30      0.29      4265

F1-score: 0.28910785960747765
cohen_kappa_score: 0.15433930974812538


In [11]:
bern_cl = BernoulliNB()
bern_cl.fit(X_train, y_train)
y_pred = bern_cl.predict(X_valid)
print_results(y_valid, y_pred)

[[254 177  17  24  55 116]
 [170 311  41  11  43  63]
 [ 88 166 126  61 132 125]
 [ 57  62 172 112 195 177]
 [ 24  31  87 104 183 322]
 [ 14   5  36  57 128 519]]
              precision    recall  f1-score   support

           1       0.42      0.40      0.41       643
           2       0.41      0.49      0.45       639
           3       0.26      0.18      0.21       698
           4       0.30      0.14      0.20       775
           5       0.25      0.24      0.25       751
           6       0.39      0.68      0.50       759

    accuracy                           0.35      4265
   macro avg       0.34      0.36      0.33      4265
weighted avg       0.34      0.35      0.33      4265

F1-score: 0.33473332899562624
cohen_kappa_score: 0.22264790633526055


In [12]:
bern_cl_multi = GaussianNB()
bern_cl_multi.fit(X_train, y_train)
y_pred = bern_cl_multi.predict(X_valid)
print_results(y_valid, y_pred)

[[190  43  24  39  45 302]
 [182 105  33  52  47 220]
 [ 74  35  62  92  58 377]
 [ 41  14  46 101  67 506]
 [ 12   9  19  55  53 603]
 [  5   2   8  20  20 704]]
              precision    recall  f1-score   support

           1       0.38      0.30      0.33       643
           2       0.50      0.16      0.25       639
           3       0.32      0.09      0.14       698
           4       0.28      0.13      0.18       775
           5       0.18      0.07      0.10       751
           6       0.26      0.93      0.41       759

    accuracy                           0.28      4265
   macro avg       0.32      0.28      0.23      4265
weighted avg       0.31      0.28      0.23      4265

F1-score: 0.23402687253521726
cohen_kappa_score: 0.13536582269411057


In [13]:
quad_cl = QuadraticDiscriminantAnalysis()
quad_cl.fit(X_train, y_train)
y_pred = quad_cl.predict(X_valid)
print_results(y_valid, y_pred)

[[178  44  29  34  64 294]
 [147 129  38  49  61 215]
 [ 56  52  57  71  84 378]
 [ 26  19  41  94  84 511]
 [ 13  14  15  40  58 611]
 [  5   3   7  15  26 703]]
              precision    recall  f1-score   support

           1       0.42      0.28      0.33       643
           2       0.49      0.20      0.29       639
           3       0.30      0.08      0.13       698
           4       0.31      0.12      0.17       775
           5       0.15      0.08      0.10       751
           6       0.26      0.93      0.41       759

    accuracy                           0.29      4265
   macro avg       0.32      0.28      0.24      4265
weighted avg       0.32      0.29      0.24      4265

F1-score: 0.23851967585673542
cohen_kappa_score: 0.13641269177576576
