# Plotting performance

In [0]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score, log_loss, accuracy_score, confusion_matrix

def plot_cm(ax, y_true, y_pred, classes, title, th=0.5, cmap=plt.cm.Blues):
    """Draw the confusion matrix of thresholded predictions on ``ax``.

    Args:
        ax: Matplotlib axes the matrix is drawn on.
        y_true: Ground-truth labels, passed unchanged to ``confusion_matrix``.
        y_pred: Array-like of numeric predictions. Values strictly greater
            than ``th`` are mapped to 1, all others to 0.
        classes: Tick labels used for both axes, in matrix order.
        title: Title set on ``ax``.
        th: Threshold applied to ``y_pred``.
        cmap: Colormap handed to ``imshow``.
    """
    # Imported here so the function does not depend on a module-level ``np``
    # binding being present.
    import numpy as np

    # ``asarray`` lets plain Python lists be thresholded as well.
    y_pred_labels = (np.asarray(y_pred) > th).astype(int)

    cm = confusion_matrix(y_true, y_pred_labels)

    ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_title(title)

    tick_marks = np.arange(len(classes))
    ax.set_xticks(tick_marks)
    ax.set_yticks(tick_marks)
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)

    # Cells above half of the largest count get white text, the rest black,
    # so the number stays readable against the cell colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        ax.text(j, i, cm[i, j],
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

def plot_auc(ax, y_train, y_train_pred, y_test, y_test_pred, th=0.5):
    """Plot the train and test ROC curves on ``ax``.

    The legend reports, for each split, the accuracy of the predictions
    thresholded at ``th`` and the area under the ROC curve.

    Args:
        ax: Matplotlib axes to draw on.
        y_train: True labels of the training split.
        y_train_pred: Predicted scores for the training split.
        y_test: True labels of the test split.
        y_test_pred: Predicted scores for the test split.
        th: Scores strictly above this value count as class 1 when
            computing accuracy.
    """
    splits = (
        ('train', y_train, y_train_pred),
        ('test', y_test, y_test_pred),
    )

    # Compute every metric first, then draw, so nothing is plotted if a
    # metric computation fails.
    hard_labels = [(scores > th).astype(int) for _, _, scores in splits]

    curves = []
    legend_entries = []
    for (split_name, truth, scores), labels in zip(splits, hard_labels):
        fpr, tpr, _ = roc_curve(truth, scores)
        area = auc(fpr, tpr)
        accuracy = accuracy_score(truth, labels)
        curves.append((fpr, tpr))
        legend_entries.append(
            f'{split_name} acc = {accuracy:.3f}, auc = {area:.2f}'
        )

    for fpr, tpr in curves:
        ax.plot(fpr, tpr)

    # Diagonal reference line; drawn last so the two legend entries are
    # attached to the train and test curves.
    ax.plot([0, 1], [0, 1], 'k--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC curve')

    ax.legend(legend_entries)
    

# Reading File

In [0]:
import collections
import json

def readFile(name):
    """Parse the JSON file at path ``name`` and return the decoded object.

    The file is read as UTF-8 so the result does not depend on the
    platform's default text encoding.

    Args:
        name: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content (a ``dict``
        for a top-level JSON object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(name, encoding="utf-8") as f:
        return json.load(f)

# Loading Dataset

In [0]:
import pandas as pd

# Raw dataset as parsed from the JSON file.
final_dataset = readFile('name_dataset.json')

# Build one row per top-level key of the parsed data (orient='index'), name
# the columns, and discard every row that contains a missing value.
df = pd.DataFrame.from_dict(
    final_dataset,
    orient='index',
    columns=[
        'Name_Similarity',
        'Img_Similarity',
        'Country_Similarity',
        'Gender_Similarity',
        'Class',
    ],
).dropna()

# Show (rows, columns) of the cleaned frame.
df.shape


# Defining features and target, and splitting the dataset

In [0]:
# Target vector: the 'Class' column.
y = df['Class']

# Feature matrix: every column except the target. To leave out further
# columns, add their names to the list below.
X = df.drop(columns=['Class'])


In [0]:
from sklearn.model_selection import train_test_split  

# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


# SVM

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV


# Hyper-parameter grid: an RBF kernel with its gamma/C candidates, plus a
# linear kernel with its C candidates.
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# 2-fold cross-validated grid search, selecting on macro-averaged F1.
clf = GridSearchCV(SVC(class_weight='balanced'), tuned_parameters, cv=2,
                   scoring='f1_macro', verbose=5)
clf.fit(X_train, y_train)

# Hard class predictions, used for the reports and confusion matrices.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

y_train_pred_labels = y_train_pred
y_test_pred_labels = y_test_pred

# Continuous decision scores, used for the ROC curve. A ROC curve built from
# hard labels has a single operating point and its AUC is not meaningful.
y_train_score = clf.decision_function(X_train)
y_test_score = clf.decision_function(X_test)

print("Best params: ")
print(clf.best_params_)

acc_train = accuracy_score(y_train, y_train_pred_labels)

print("Train Classification Report: \n")
print(classification_report(y_train, y_train_pred_labels))
print("Accuracy Train: " + str(acc_train) + "\n")

print("Test Classification Report: \n")
print(classification_report(y_test, y_test_pred_labels))

acc_test = accuracy_score(y_test, y_test_pred_labels)

print("Accuracy Test: " + str(acc_test) + "\n")

# Threshold for the predicted labels shown in the confusion matrices.
# NOTE(review): assumes the 'Class' labels are 0/1 -- confirm against the data.
threshold = 0.5
# The SVM decision function changes sign at the decision boundary, so the
# scores are thresholded at 0 rather than 0.5.
score_threshold = 0.0

fig, ax = plt.subplots(1, 3)
fig.set_size_inches(15, 5)

plot_cm(ax[0], y_train, y_train_pred, [0, 1], 'Confusion matrix (TRAIN)', threshold)
plot_cm(ax[1], y_test, y_test_pred, [0, 1], 'Confusion matrix (TEST)', threshold)
plot_auc(ax[2], y_train, y_train_score, y_test, y_test_score, score_threshold)

plt.tight_layout()
plt.show()


'\ntuned_parameters = [{\'kernel\': [\'rbf\'], \'gamma\': [1e-3, 1e-4], \'C\': [1, 10, 100, 1000]}, {\'kernel\': [\'linear\'], \'C\': [1, 10, 100, 1000]}] # kernel rbf con parametri + kernel lineare con parametri \n\n\nclf = GridSearchCV(SVC(class_weight=\'balanced\'), tuned_parameters, cv=2,\n                       scoring=\'f1_macro\', verbose=5)\nclf.fit(X_train, y_train)\n\n\n\ny_train_pred = clf.predict(X_train)\ny_test_pred = clf.predict(X_test)\n\ny_train_pred_labels = y_train_pred\ny_test_pred_labels  = y_test_pred\n\nprint("Best params: ")\nprint(clf.best_params_)\n\n\nacc_train = accuracy_score(y_train, y_train_pred_labels)\n\nprint("Train Classification Report: \n")\nprint(classification_report(y_train, y_train_pred_labels))\nprint("Accuracy Train: " + str(acc_train) + "\n")\n\nprint("Test Classification Report: \n")\nprint(classification_report(y_test, y_test_pred_labels))\n\n\nacc_test = accuracy_score(y_test, y_test_pred_labels)\n\nprint("Accuracy Test: " + str(acc_test) 