# SVM

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

import pickle
import sys 
sys.path.append("../../")
from E4525_ML import text

In [8]:
from sklearn.svm import SVC

def SVM_subject_model(feature="set",n=-1):
    
    # load subject features
    data_dir = "subject_feature"

    set_vectorizer_filename=   data_dir+"/{}_vectorizer.p".format(feature)
    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    labels=pickle.load(open( labels_filename, "rb" ) )
    X_set_subject=pickle.load(open( set_features_filename, "rb" ) )
    
    if n!=-1:
        labels=labels[:n]
        X_set_subject=X_set_subject[:n]
    
    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set_subject,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    param_grid={"C":[1],"gamma":[0.1,0.5,1,10,20]}
    model=SVC(probability=True)
    g=GridSearchCV(model,param_grid,cv=5, scoring=["roc_auc","accuracy"],n_jobs=-1,refit="roc_auc")
    g.fit(x_train, y_train)
    
    print("{} feature: ".format(feature))
    print(g.cv_results_['mean_test_roc_auc'])
    print("The best alpha is",g.best_estimator_.get_params()["C"])
    print("The best gamma is",g.best_estimator_.get_params()["gamma"])
    print("The 5-fold auc is",g.cv_results_['mean_test_roc_auc'].max())
    print("The 5-fold acc is",g.cv_results_['mean_test_accuracy'].max())
    
    Y_pred=g.best_estimator_.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(g.best_estimator_.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test

print("----------------------------------------------------------------")
SVM_subject_model(feature="set",n=10000)
print("----------------------------------------------------------------")
SVM_subject_model(feature="count",n=10000)
print("----------------------------------------------------------------")
SVM_subject_model(feature="tfidf",n=10000)

----------------------------------------------------------------
set feature: 
[0.97643157 0.95783366 0.94494609 0.90969739 0.85912073]
The best alpha is 1
The best gamma is 0.1
The 5-fold auc is 0.9764315660633169
The 5-fold acc is 0.915764705882353
auc on test set is 0.9822184556363383
Accuracy on test set is 0.9233333333333333
----------------------------------------------------------------
count feature: 
[0.97678289 0.96158607 0.94235695 0.90664906 0.85706971]
The best alpha is 1
The best gamma is 0.1
The 5-fold auc is 0.9767828877063442
The 5-fold acc is 0.9156470588235294
auc on test set is 0.9823196862370509
Accuracy on test set is 0.924
----------------------------------------------------------------
tfidf feature: 
[0.96400107 0.96853357 0.97242232 0.95806268 0.95322456]
The best alpha is 1
The best gamma is 1
The 5-fold auc is 0.9724223219823693
The 5-fold acc is 0.9342352941176472
auc on test set is 0.9761355347960722
Accuracy on test set is 0.938


(<8500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 37516 stored elements in Compressed Sparse Row format>,
 <1500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 6438 stored elements in Compressed Sparse Row format>,
 array([1, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([1, 0, 1, ..., 1, 1, 1], dtype=int64))

# NB model

In [9]:
from sklearn.naive_bayes import MultinomialNB

def subject_model(feature="set",n=-1):
    data_dir = "subject_feature"

    set_vectorizer_filename=   data_dir+"/{}_vectorizer.p".format(feature)
    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    X_set=pickle.load(open( set_features_filename, "rb" ) )
    labels=pickle.load(open( labels_filename, "rb" ) )
    
    if n!=-1:
        labels=labels[:n]
        X_set=X_set[:n]
    
    x_train,x_test,y_train,y_test=train_test_split(X_set,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    alphalst=[1e-4,1e-3,0.1,1,10,100,1000]
    auclst=[]
    accuracylst=[]
    for a in alphalst:
        model=MultinomialNB(alpha=a)
        cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True,n_jobs=-1)
        auclst.append(cross_result['test_roc_auc'].mean())
        accuracylst.append(cross_result['test_accuracy'].mean())

    print("{} feature: ".format(feature))
    print(auclst)
    print("The best alpha is",alphalst[np.argmax(auclst)])
    
    print("The 5-fold auc of NB is",np.max(auclst))
    print("The 5-fold accuracy of NB is",accuracylst[np.argmax(auclst)])
    
    model=MultinomialNB(alpha=alphalst[np.argmax(auclst)])
    model.fit(x_train,y_train)
    Y_pred=model.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test
    
print("----------------------------------------------------------------")
subject_model(feature="set",n=10000)
print("----------------------------------------------------------------")
subject_model(feature="count",n=10000)
print("----------------------------------------------------------------")
subject_model(feature="tfidf",n=10000)

----------------------------------------------------------------
set feature: 
[0.9841144204850073, 0.9844869246536755, 0.9835653935967231, 0.9819905969870788, 0.9704602255366183, 0.9534588434005604, 0.9467589447479907]
The best alpha is 0.001
The 5-fold auc of NB is 0.9844869246536755
The 5-fold accuracy of NB is 0.9358823529411765
auc on test set is 0.9863507406705619
Accuracy on test set is 0.948
----------------------------------------------------------------
count feature: 
[0.9842224695737922, 0.9844691227791511, 0.9832119523683425, 0.9814791392441075, 0.9693363778604583, 0.9511886158436933, 0.9436412171162448]
The best alpha is 0.001
The 5-fold auc of NB is 0.9844691227791511
The 5-fold accuracy of NB is 0.9349411764705883
auc on test set is 0.9865713714669871
Accuracy on test set is 0.9433333333333334
----------------------------------------------------------------
tfidf feature: 
[0.9835362291759019, 0.983872486056024, 0.984007003358706, 0.9810784025925227, 0.9698360270387457,

(<8500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 37516 stored elements in Compressed Sparse Row format>,
 <1500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 6438 stored elements in Compressed Sparse Row format>,
 array([1, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([1, 0, 1, ..., 1, 1, 1], dtype=int64))

# Logistic

In [11]:
from sklearn.linear_model import LogisticRegression

def Logistic_subject_model(feature="set",n=-1):
    
    # load subject features
    data_dir = "subject_feature"

    set_vectorizer_filename=   data_dir+"/{}_vectorizer.p".format(feature)
    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    labels=pickle.load(open( labels_filename, "rb" ) )
    X_set_subject=pickle.load(open( set_features_filename, "rb" ) )
    
    if n!=-1:
        labels=labels[:n]
        X_set_subject=X_set_subject[:n]
    
    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set_subject,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    alphalst=[1e-4,1e-3,0.1,1,10,100,1000]
    gammalst=[]
    auclst=[]
    accuracylst=[]
    modellst=[]
    for a in alphalst:
        model=LogisticRegression(C=a)
        cross_result=cross_validate(model,x_train,y_train,scoring=["roc_auc","accuracy"],cv=5,return_train_score=True,n_jobs=-1)
        auclst.append(cross_result['test_roc_auc'].mean())
        accuracylst.append(cross_result['test_accuracy'].mean())
        modellst.append(model)

    print("{} feature: ".format(feature))
    print(auclst)
    print("The best C is",alphalst[np.argmax(auclst)])
    
    print("The 5-fold auc is",np.max(auclst))
    print("The 5-fold accuracy is",accuracylst[np.argmax(auclst)])
    
    model=LogisticRegression(C=np.max(auclst))
    model.fit(x_train,y_train)
    Y_pred=model.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(model.predict(x_test)==y_test))
    
    return x_train,x_test,y_train,y_test

print("----------------------------------------------------------------")
SVM_subject_model(feature="set",n=10000)
print("----------------------------------------------------------------")
SVM_subject_model(feature="count",n=10000)
print("----------------------------------------------------------------")
SVM_subject_model(feature="tfidf",n=10000)

----------------------------------------------------------------
set feature: 
[0.97643157 0.95783366 0.94494609 0.90969739 0.85912073]
The best alpha is 1
The best gamma is 0.1
The 5-fold auc is 0.9764315660633169
The 5-fold acc is 0.915764705882353
auc on test set is 0.9822171578081239
Accuracy on test set is 0.9233333333333333
----------------------------------------------------------------
count feature: 
[0.97678289 0.96158607 0.94235695 0.90664906 0.85706971]
The best alpha is 1
The best gamma is 0.1
The 5-fold auc is 0.9767828877063442
The 5-fold acc is 0.9156470588235294
auc on test set is 0.9823170905806224
Accuracy on test set is 0.924
----------------------------------------------------------------
tfidf feature: 
[0.96400107 0.96853357 0.97242232 0.95806268 0.95322456]
The best alpha is 1
The best gamma is 1
The 5-fold auc is 0.9724223219823693
The 5-fold acc is 0.9342352941176472
auc on test set is 0.9761433217653577
Accuracy on test set is 0.938


(<8500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 37516 stored elements in Compressed Sparse Row format>,
 <1500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 6438 stored elements in Compressed Sparse Row format>,
 array([1, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([1, 0, 1, ..., 1, 1, 1], dtype=int64))

# KNN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

def KNN_subject_model(feature="set",n=-1):
    
    # load subject features
    data_dir = "subject_feature"

    set_vectorizer_filename=   data_dir+"/{}_vectorizer.p".format(feature)
    set_features_filename=     data_dir+"/{}_features.p".format(feature)
    labels_filename=     data_dir+"/labels.p"

    labels=pickle.load(open( labels_filename, "rb" ) )
    X_set_subject=pickle.load(open( set_features_filename, "rb" ) )
    
    if n!=-1:
        labels=labels[:n]
        X_set_subject=X_set_subject[:n]
    
    # split train and test data set
    x_train,x_test,y_train,y_test=train_test_split(X_set_subject,labels,shuffle=True,random_state=42,test_size=0.15)
    
    # Select the best hyper param alpha
    param_grid={"n_neighbors":[1,3,5,7,10,20,50,100] }
    model=KNeighborsClassifier()
    g=GridSearchCV(model,param_grid,cv=5, scoring=["roc_auc","accuracy"],n_jobs=-1,refit="roc_auc")
    g.fit(x_train, y_train)
    
    print("{} feature: ".format(feature))
    print(g.cv_results_['mean_test_roc_auc'])
    print("The best n_neighbors is",g.best_estimator_.get_params()["n_neighbors"])
    print("The 5-fold auc is",g.cv_results_['mean_test_roc_auc'].max())
    print("The 5-fold acc is",g.cv_results_['mean_test_accuracy'].max())
    
    Y_pred=g.best_estimator_.predict_proba(x_test)
    print("auc on test set is",roc_auc_score(y_test,Y_pred[:,1]))
    print("Accuracy on test set is",np.mean(g.best_estimator_.predict(x_test)==y_test))

    return x_train,x_test,y_train,y_test

print("----------------------------------------------------------------")
KNN_subject_model(feature="set",n=10000)
print("----------------------------------------------------------------")
KNN_subject_model(feature="count",n=10000)
print("----------------------------------------------------------------")
KNN_subject_model(feature="tfidf",n=10000)

----------------------------------------------------------------
set feature: 
[0.7593091  0.92012413 0.87928582 0.87379267 0.8594593  0.83961441
 0.83029565 0.84899393]
The best n_neighbors is 3
The 5-fold auc is 0.9201241294250279
The 5-fold acc is 0.8828235294117647
auc on test set is 0.9382168878598554
Accuracy on test set is 0.8806666666666667
----------------------------------------------------------------
count feature: 
[0.76147424 0.92144812 0.88491777 0.87905236 0.86781636 0.84745339
 0.82665201 0.85861654]
The best n_neighbors is 3
The 5-fold auc is 0.921448118373932
The 5-fold acc is 0.884470588235294
auc on test set is 0.9386114276369922
Accuracy on test set is 0.8793333333333333
----------------------------------------------------------------
tfidf feature: 
[0.75373099 0.89472195 0.81983488 0.81012887 0.79539486 0.76370638
 0.72328621 0.71020662]
The best n_neighbors is 3
The 5-fold auc is 0.8947219468127114
The 5-fold acc is 0.884705882352941
auc on test set is 0.923971

(<8500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 37516 stored elements in Compressed Sparse Row format>,
 <1500x36750 sparse matrix of type '<class 'numpy.float64'>'
 	with 6438 stored elements in Compressed Sparse Row format>,
 array([1, 1, 0, ..., 1, 1, 1], dtype=int64),
 array([1, 0, 1, ..., 1, 1, 1], dtype=int64))