# Data Preparation

In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import joblib
import pandas as pd

In [2]:
x_train = joblib.load('../output/x_train_preprocessed.pkl')
x_test = joblib.load('../output/x_test_preprocessed.pkl')
x_valid = joblib.load('../output/x_valid_preprocessed.pkl')
y_train = joblib.load('../output/y_train.pkl')
y_test = joblib.load('../output/y_test.pkl')
y_valid = joblib.load('../output/y_valid.pkl')

In [3]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=0.0001, max_iter=10000, activation='logistic'),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

f1 = []

In [4]:
q = 0
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

  classifiers[q].fit(x_train, y_train)


{'name': 'Nearest Neighbors', 'f1_score': 0.24442379182156132, 'accuracy': 0.729, 'roc': 0.5389630275157307}


In [5]:
q = 1
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

  return f(**kwargs)


{'name': 'Linear SVM', 'f1_score': 0.0, 'accuracy': 0.7788333333333334, 'roc': 0.5}


In [6]:
q = 2
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

  return f(**kwargs)


{'name': 'RBF SVM', 'f1_score': 0.004501125281320331, 'accuracy': 0.7788333333333334, 'roc': 0.5008093763158009}


In [7]:
q = 4
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

{'name': 'Decision Tree', 'f1_score': 0.4798482693219535, 'accuracy': 0.8171666666666667, 'roc': 0.6611242638570014}


In [8]:
q = 5
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

{'name': 'Random Forest', 'f1_score': 0.04647785039941902, 'accuracy': 0.7811666666666667, 'roc': 0.5101313144132683}


  classifiers[q].fit(x_train, y_train)


In [9]:
q = 6
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

  return f(**kwargs)


{'name': 'Neural Net', 'f1_score': 0.0, 'accuracy': 0.7786666666666666, 'roc': 0.49989300235394823}


In [10]:
q = 7
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

  return f(**kwargs)


{'name': 'AdaBoost', 'f1_score': 0.4230960676953709, 'accuracy': 0.8068333333333333, 'roc': 0.6326372492751656}


In [11]:
q = 8
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

{'name': 'Naive Bayes', 'f1_score': 0.34966148944643566, 'accuracy': 0.7278333333333333, 'roc': 0.5856974545203562}


  return f(**kwargs)


In [12]:
q = 9
classifiers[q].fit(x_train, y_train)
y_pred = classifiers[q].predict(x_test)
result = {"name" : names[q], "f1_score":  f1_score(y_true=y_test, y_pred=y_pred)}
result["accuracy"] = accuracy_score(y_true=y_test, y_pred=y_pred)
result["roc"] = roc_auc_score(y_true=y_test, y_score=y_pred)
f1.append(result)
print(result)

{'name': 'QDA', 'f1_score': 0.23225806451612904, 'accuracy': 0.6826666666666666, 'roc': 0.5159624845450085}


  return f(**kwargs)


In [13]:
df_f1 = pd.DataFrame(f1)
df_f1

Unnamed: 0,name,f1_score,accuracy,roc
0,Nearest Neighbors,0.244424,0.729,0.538963
1,Linear SVM,0.0,0.778833,0.5
2,RBF SVM,0.004501,0.778833,0.500809
3,Decision Tree,0.479848,0.817167,0.661124
4,Random Forest,0.046478,0.781167,0.510131
5,Neural Net,0.0,0.778667,0.499893
6,AdaBoost,0.423096,0.806833,0.632637
7,Naive Bayes,0.349661,0.727833,0.585697
8,QDA,0.232258,0.682667,0.515962


In [17]:
print('model dengan score f1 terbaik adalah:')
print(df_f1.loc[df_f1.f1_score == df_f1.f1_score.max()])
print("=====================")
print('model dengan score accuracy terbaik adalah:')
print(df_f1.loc[df_f1.accuracy == df_f1.accuracy.max()])
print("=====================")
print('model dengan score roc terbaik adalah:')
print(df_f1.loc[df_f1.roc == df_f1.roc.max()])

model dengan score f1 terbaik adalah:
            name  f1_score  accuracy       roc
3  Decision Tree  0.479848  0.817167  0.661124
model dengan score accuracy terbaik adalah:
            name  f1_score  accuracy       roc
3  Decision Tree  0.479848  0.817167  0.661124
model dengan score roc terbaik adalah:
            name  f1_score  accuracy       roc
3  Decision Tree  0.479848  0.817167  0.661124
