In [51]:
import glob
from inspect import signature

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
# from skimage.transform import pyramid_gaussian
# import matplotlib.pyplot as plt

# print(glob.glob("../ILIYAN Master Thesis/Dataset/*"))

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Show floats with two decimal places when IPython pretty-prints a result.
%precision 2
# Lift pandas' column display cap so wide frames are shown in full.
pd.set_option("display.max_columns", None)

In [52]:
# Text feature tables for the "top 1" and "top 1 and 2" variants, read with
# pandas' default index.
df_t1_text, df_t1_t2_text = (
    pd.read_csv(csv_name)
    for csv_name in ('df_top_1_stem_1.csv', 'df_top_1_and_2_stem_1.csv')
)

In [53]:
# Structured feature tables for the same two variants; the first CSV column
# is used as the row index.
df_t1_structured, df_t1_t2_structured = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('df_top_1_structured.csv', 'df_top_1_and_2_structured.csv')
)

In [54]:
# Remove the structured frames' own 'Class' column before they are joined with
# the text frames (presumably the text frames supply the 'Class' column the
# modelling helpers read - confirm).
# Non-mutating drop + errors='ignore' keeps this cell idempotent: re-running it
# after 'Class' is already gone no longer raises KeyError.
df_t1_structured = df_t1_structured.drop(columns='Class', errors='ignore')
df_t1_t2_structured = df_t1_t2_structured.drop(columns='Class', errors='ignore')

In [55]:
# Place structured and text features side by side; pandas aligns the rows on
# the index when concatenating along the column axis.
df_t1 = pd.concat((df_t1_structured, df_t1_text), axis='columns')
df_t1t2 = pd.concat((df_t1_t2_structured, df_t1_t2_text), axis='columns')

In [60]:
# (rows, columns) of the combined top-1 frame.
df_t1.shape

(50, 72)

In [61]:
# (rows, columns) of the combined top-1-and-2 frame.
df_t1t2.shape

(50, 126)

In [62]:
def draw_confusionmatrix(y_test, y_hat):
    """Plot a confusion-matrix heatmap and print accuracy and a classification report.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_hat : array-like
        Predicted labels, compared against ``y_test``.

    Returns
    -------
    None
        The figure is drawn on a new 10x7 matplotlib figure; the metrics are
        written to stdout.
    """
    _, heat_ax = plt.subplots(figsize=(10, 7))
    counts = confusion_matrix(y_test, y_hat)
    # Integer annotations: every cell holds a sample count.
    sns.heatmap(counts, annot=True, fmt="d", ax=heat_ax)
    heat_ax.set_ylabel('True label')
    heat_ax.set_xlabel('Predicted label')

    print(f"Sum Axis-1 as Classification accuracy: {accuracy_score(y_test, y_hat)}")
    print('\n')
    print(classification_report(y_test, y_hat))
    print('\n')

In [66]:
def gen_train_and_test(df, test_size=0.20, random_state=42):
    """Split ``df`` into train/test features and labels.

    Every column except ``'Class'`` is treated as a feature; ``df.Class`` is
    the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Class'`` column.
    test_size : float, default 0.20
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    target = df.Class
    is_feature = df.columns != 'Class'
    features = df.loc[:, is_feature]

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

In [67]:
def split_train_and_test(X, y, test_size=0.20, random_state=42):
    """Split already-separated features ``X`` and labels ``y`` into train and test parts.

    ``test_size`` and ``random_state`` are forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    X_train, X_test, y_train, y_test = parts
    return X_train, X_test, y_train, y_test

In [68]:
def start_model(df, model):
    """Fit ``model`` on a train split of ``df`` and report on the held-out split.

    The split comes from ``gen_train_and_test`` with its default arguments;
    the report (heatmap, accuracy, classification report) is produced by
    ``draw_confusionmatrix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Class'`` column, as expected by ``gen_train_and_test``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    """
    features_fit, features_eval, labels_fit, labels_eval = gen_train_and_test(df)
    model.fit(features_fit, labels_fit)
    draw_confusionmatrix(labels_eval, model.predict(features_eval))

### GaussianNB

In [69]:
# Gaussian naive Bayes estimator with default settings; passed to
# run_cross_validation in the cells below.
model = GaussianNB()

In [70]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
def run_cross_validation(df, model, scoring='accuracy', cv=10):
    """Print the cross-validated mean accuracy, F1, precision and recall of ``model``.

    Every column of ``df`` except ``'Class'`` is used as a feature and
    ``df.Class`` as the target. All four metrics are computed in a single
    ``cross_validate`` pass, so each fold is fitted once instead of once per
    metric; the printed lines are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Class'`` column.
    model : estimator
        scikit-learn compatible estimator; it is cloned per fold by
        ``cross_validate``.
    scoring : str, default 'accuracy'
        Kept for backward compatibility only. It is not used: the four
        metrics below are always reported.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_validate``.

    Returns
    -------
    None
        Results are written to stdout.
    """
    X = df.loc[:, df.columns != 'Class']
    y = df.Class

    metric_names = ['accuracy', 'f1', 'precision', 'recall']
    # Multi-metric scoring: results come back under 'test_<metric name>'.
    cv_results = cross_validate(model, X, y, scoring=metric_names, cv=cv)
    for metric in metric_names:
        print("avg " + metric + ":" + str(np.average(cv_results['test_' + metric])))
    return None

In [71]:
# NOTE(review): no visible cell defines `df1_t1`, so this call raised NameError
# on a fresh kernel. `df_t1` (built by the concat cell above) is assumed to be
# the intended frame - confirm before trusting the stored output below.
run_cross_validation(df_t1, model)

avg accuracy:0.9400000000000001
avg f1:0.9514285714285714
avg precision:0.95
avg recall:0.9666666666666666


In [72]:
# NOTE(review): no visible cell defines `df1_t1t2`, so this call raised
# NameError on a fresh kernel. `df_t1t2` (built by the concat cell above) is
# assumed to be the intended frame - confirm before trusting the stored output.
run_cross_validation(df_t1t2, model)

avg accuracy:0.9800000000000001
avg f1:0.9857142857142858
avg precision:0.975
avg recall:1.0


### SVM

In [73]:
def start_grid_cv(df, cv=10, n_jobs=-1):
    """Grid-search SVC hyper-parameters on a train split of ``df`` and report scores.

    The train split comes from ``gen_train_and_test`` (default arguments); the
    held-out part of that split is not used here. After fitting the search,
    the search object itself is cross-validated on the training data (nested
    cross-validation) and the mean accuracy, F1, precision and recall are
    printed. The four metrics are computed in one ``cross_validate`` pass, so
    the nested grid search runs once per outer fold rather than once per
    metric per outer fold; the printed lines are unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Class'`` column, as expected by ``gen_train_and_test``.
    cv : int or cross-validation splitter, default 10
        Folds used inside ``GridSearchCV``. It is NOT applied to the outer
        scoring loop, which uses scikit-learn's default ``cv``.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the search fitted on the full training split.
    """
    param_grid = {
                  'kernel':('linear', 'rbf'),
                  'C':(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 10),
                  'gamma': (0.001, 0.01, 0.1, 1,2,3,'auto'),
                  'decision_function_shape':('ovo','ovr'),
                  'shrinking':(True,False)
                 }
    # Only the training part of the split is needed in this function.
    X_train, _, y_train, _ = gen_train_and_test(df)
    clf = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    clf.fit(X_train, y_train)

    metric_names = ['accuracy', 'f1', 'precision', 'recall']
    # cross_validate clones `clf` for each outer fold, so the search fitted
    # above (and its best_params_) is left untouched.
    nested_results = cross_validate(clf, X_train, y_train, scoring=metric_names)
    for metric in metric_names:
        print("avg " + metric + ":" + str(np.average(nested_results['test_' + metric])))

    best_params = clf.best_params_
    print(best_params)

    return best_params

In [74]:
# Duc's run: SVC grid search on the top-1 frame.
# NOTE(review): no visible cell defines `df1_t1`, so this call raised NameError
# on a fresh kernel. `df_t1` (built by the concat cell above) is assumed to be
# the intended frame - confirm before trusting the stored output below.
params = start_grid_cv(df_t1, cv=10)

avg accuracy:0.7
avg f1:0.7318181818181817
avg precision:0.6795238095238095
avg recall:0.8
{'C': 1, 'decision_function_shape': 'ovo', 'gamma': 0.001, 'kernel': 'linear', 'shrinking': True}


In [20]:
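# Pierre's results: only the output of an earlier run (execution count 20) is kept here; the call that produced it is not in this cell.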

avg accuracy:0.975
avg f1:0.9777777777777779
avg precision:0.96
avg recall:1.0
{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 2, 'kernel': 'rbf', 'shrinking': True}


In [75]:
# Duc's run: SVC grid search on the top-1-and-2 frame.
# NOTE(review): no visible cell defines `df1_t1t2`, so this call raised
# NameError on a fresh kernel. `df_t1t2` (built by the concat cell above) is
# assumed to be the intended frame - confirm before trusting the stored output.
params = start_grid_cv(df_t1t2, cv=10)

avg accuracy:0.7
avg f1:0.7415584415584415
avg precision:0.677142857142857
avg recall:0.85
{'C': 0.75, 'decision_function_shape': 'ovo', 'gamma': 0.001, 'kernel': 'linear', 'shrinking': True}


In [21]:
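# Pierre's results: only the output of an earlier run (execution count 21) is kept here; the call that produced it is not in this cell.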

avg accuracy:0.95
avg f1:0.9555555555555555
avg precision:0.96
avg recall:0.96
{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'rbf', 'shrinking': True}
