# import libraries


In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import time
import csv
import pickle

# Import Data

In [6]:
mainPath = os.getcwd()

X_train = pd.read_pickle(mainPath + '/dataset/output/dataset_train.pkl')
X_test = pd.read_pickle(mainPath + '/dataset/output/dataset_test.pkl')

y_train = X_train.loc[:,0]
y_test = X_test.loc[:,0]

X_train = X_train.drop(columns=0)
X_test = X_test.drop(columns=0)

In [7]:

#create csv object for saving result
PATH_SAVE_RESULT = mainPath + '/modelResult/modelResult.csv'
file = open(PATH_SAVE_RESULT, 'w')
csv_writer = csv.writer(file)
headers = ["algorithm", "training_time(sec)", "prediction_time(ms)", 
            "balanced_accuracy_score", "accuracy", "percision", "recall","f1-score", "confusion_matrix"]
csv_writer.writerow(headers)

#select classifiers
estimators = [DecisionTreeClassifier(),
                RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1), 
                #make_pipeline(StandardScaler(), SVC(C=0.7)),
                SGDClassifier(loss="log", penalty="l1", max_iter=25),
                GaussianNB(),
                KNeighborsClassifier(n_neighbors=8),
                #GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0),
                MLPClassifier(max_iter= 70),
                AdaBoostClassifier(n_estimators=100, random_state=0),
                QuadraticDiscriminantAnalysis()
                ]
#main loop
for i in range(0,len(estimators)):
    
    ##make_pipeline
    pipe = make_pipeline(estimators[i])
    print(pipe.steps[0])
    
    ##training
    start_time = time.time()
    pipe.fit(X=X_train, y=y_train)
    end_time = time.time()
    training_time = (end_time-start_time)
    print('trainig time (sec): ', training_time)
    
    ##predition on test data
    start_time = time.time()
    y_pred = pipe.predict(X_test)
    end_time = time.time()
    prediction_time = (end_time-start_time)*10e3/y_test.shape[0]
    print('prediction time (ms): ', prediction_time)

    ##metrics
    balanced_accuracy_score = metrics.balanced_accuracy_score(y_test, y_pred)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    percision = metrics.precision_score(y_test, y_pred, average=None)
    f1_score = metrics.f1_score(y_test, y_pred, average=None)
    recall= metrics.recall_score(y_test, y_pred, average=None)


    print('Balanced accuracy on source domain data: ', balanced_accuracy_score)
    
    ##write to csv
    data = [pipe.steps[0][0], training_time, prediction_time, balanced_accuracy_score, accuracy, percision, f1_score, recall]
    csv_writer.writerow(data)

    #save model
    with open(mainPath+'/modelResult/'+pipe.steps[0][0]+'_model.pkl','wb') as f:
        pickle.dump(pipe,f)
#close the csv file
file.close()


('decisiontreeclassifier', DecisionTreeClassifier())
trainig time (sec):  223.02738642692566
prediction time (ms):  0.02227306814176027
Balanced accuracy on source domain data:  0.7663015689191861
('randomforestclassifier', RandomForestClassifier(n_jobs=-1, random_state=0))
trainig time (sec):  18.633565664291382
prediction time (ms):  0.07562036728733104
Balanced accuracy on source domain data:  0.8297236068574168
('sgdclassifier', SGDClassifier(loss='log', max_iter=25, penalty='l1'))


  _warn_prf(average, modifier, msg_start, len(result))


trainig time (sec):  108.15455484390259
prediction time (ms):  0.012274439619712097
Balanced accuracy on source domain data:  0.1975331412020634
('gaussiannb', GaussianNB())
trainig time (sec):  1.1390864849090576
prediction time (ms):  0.43736084294030114
Balanced accuracy on source domain data:  0.3960392580396167
('kneighborsclassifier', KNeighborsClassifier(n_neighbors=8))
trainig time (sec):  9.507059574127197
prediction time (ms):  67.05828204433892
Balanced accuracy on source domain data:  0.7291199329432521
('mlpclassifier', MLPClassifier(max_iter=70))




trainig time (sec):  392.2771990299225
prediction time (ms):  0.3504735299349205
Balanced accuracy on source domain data:  0.7187727667231962
('adaboostclassifier', AdaBoostClassifier(n_estimators=100, random_state=0))
trainig time (sec):  679.3573687076569
prediction time (ms):  1.92029188105147
Balanced accuracy on source domain data:  0.43563880229083773
('quadraticdiscriminantanalysis', QuadraticDiscriminantAnalysis())




trainig time (sec):  7.754666328430176
prediction time (ms):  0.7846848410922912
Balanced accuracy on source domain data:  0.514883303646564


# Test Models

In [8]:
columns = ['classifier', 'prediction_time', 'balanced_accuracy_score', 'accuracy', 'percision', 'f1_score', 'recall', 'confusion_matrix']
df = pd.DataFrame(data= np.zeros((10,len(columns))),columns=columns)
pkl_path = mainPath + '/modelResult/'
print(df.head())
index = 0
for i in os.listdir(pkl_path):
    print(i)
    if len(i.split('.pkl'))>1:
        pipe = pickle.load(open(pkl_path+i, 'rb'))
        ##predition on test data
        start_time = time.time()
        y_pred = pipe.predict(X_test)
        end_time = time.time()
        prediction_time = (end_time-start_time)*10e3/y_test.shape[0]
        print('prediction time (ms): ', prediction_time)

        ##metrics
        balanced_accuracy_score = metrics.balanced_accuracy_score(y_test, y_pred)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        percision = metrics.precision_score(y_test, y_pred, average=None)
        f1_score = metrics.f1_score(y_test, y_pred, average=None)
        recall= metrics.recall_score(y_test, y_pred, average=None)
        confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

        df_dummy = pd.DataFrame([[pipe.steps[0][0], prediction_time, balanced_accuracy_score, accuracy, percision, f1_score, recall, confusion_matrix]], columns=columns)
        for i in columns:
            print(i)
            df.loc[index, i] = str(df_dummy.loc[0,i])
        print(df.head())
        index +=1
df.to_csv(mainPath + '/modelResult/finalResult.csv')

   classifier  presiction_time  balanced_accuracy_score  accuracy  percision  \
0         0.0              0.0                      0.0       0.0        0.0   
1         0.0              0.0                      0.0       0.0        0.0   
2         0.0              0.0                      0.0       0.0        0.0   
3         0.0              0.0                      0.0       0.0        0.0   
4         0.0              0.0                      0.0       0.0        0.0   

   f1_score  recall  confusion_matrix  
0       0.0     0.0               0.0  
1       0.0     0.0               0.0  
2       0.0     0.0               0.0  
3       0.0     0.0               0.0  
4       0.0     0.0               0.0  
adaboostclassifier_model.pkl
prediction time (ms):  1.9276768176715553
classifier
presiction_time
balanced_accuracy_score
accuracy
percision
f1_score
recall
confusion_matrix
           classifier     presiction_time balanced_accuracy_score  \
0  adaboostclassifier  1.92767681767

  _warn_prf(average, modifier, msg_start, len(result))


classifier
presiction_time
balanced_accuracy_score
accuracy
percision
f1_score
recall
confusion_matrix
                      classifier       presiction_time  \
0             adaboostclassifier    1.9276768176715553   
1           kneighborsclassifier     68.64197212635361   
2         decisiontreeclassifier  0.022358112970053228   
3  quadraticdiscriminantanalysis    0.8077496306152815   
4                     gaussiannb    0.4456477719047881   

  balanced_accuracy_score             accuracy  \
0     0.43563880229083773   0.6493215396109999   
1      0.7291199329432521   0.8777807235304041   
2      0.7663015689191861   0.8516786773035117   
3       0.514883303646564  0.45959364681497195   
4      0.3960392580396167  0.28926755199922877   

                                           percision  \
0  [0.8249177  0.42307692 0.50207814 0.52910053 0...   
1  [0.86410768 0.79141104 0.86407147 0.89553429 0...   
2  [0.90635572 0.50852713 0.72751046 0.67907501 0...   
3  [0.88266421 0.130263