# Experiment 2

- try hyperparameter tuning for the best algorithms founded in experiment 1
- use `StandardScaler`
- compare the result


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,roc_auc_score
import time

In [3]:
def evaluate_model(model,model_name:str,train:tuple,test:tuple):
    start_train=time.time()
    model.fit(train[0],train[1])
    end_train=time.time()
    start_test=time.time()
    preds = model.predict(test[0])
    end_test=time.time()
    accuracy=accuracy_score(test[1],preds)
    f1_=f1_score(test[1],preds)
    auc=roc_auc_score(test[1],preds)
    cm=confusion_matrix(test[1],preds)
    return {"model":model_name,
           "accuracy":accuracy,
            "auc":auc,
            'f1_score':f1_,
            "cm":cm,
            'training_time(ms)':1000*(end_train-start_train),
            'testing_time(ms)':1000*(end_test-start_test)
           }

In [4]:
df=pd.read_csv('../data/cleaned_data.csv')

In [5]:
X_train,X_test,y_train,y_test=train_test_split(df.drop('target',axis=1),df.target,test_size=0.2,random_state=42)

In [6]:
scaler=StandardScaler()
train=(scaler.fit_transform(X_train),y_train)
test=(scaler.transform(X_test),y_test)

## SVC


In [56]:
svc_result=[]

In [110]:
svc_params={
 'C': 1,
 'class_weight': None,
 'decision_function_shape': 'ovr',
 'degree': 2,
 'gamma': 0.1,
 'kernel': 'rbf',
 'max_iter': -1,
 'random_state': 0,
 'shrinking': True,
 'tol': 0.01,
 }

In [111]:
rs=evaluate_model(SVC(**svc_params),'svc',train,test)
rs.update(svc_params)
print(rs.items())

dict_items([('model', 'svc'), ('accuracy', 0.8804347826086957), ('auc', 0.8789901687097949), ('f1_score', 0.8962264150943396), ('cm', array([[67, 10],
       [12, 95]], dtype=int64)), ('training_time(ms)', 23.152589797973633), ('testing_time(ms)', 15.793800354003906), ('C', 1), ('class_weight', None), ('decision_function_shape', 'ovr'), ('degree', 2), ('gamma', 0.1), ('kernel', 'rbf'), ('max_iter', -1), ('random_state', 0), ('shrinking', True), ('tol', 0.01)])


In [90]:
# best
print(rs.items())

dict_items([('model', 'svc'), ('accuracy', 0.8804347826086957), ('auc', 0.8789901687097949), ('f1_score', 0.8962264150943396), ('cm', array([[67, 10],
       [12, 95]], dtype=int64)), ('training_time(ms)', 26.46040916442871), ('testing_time(ms)', 14.915227890014648), ('C', 1), ('class_weight', None), ('decision_function_shape', 'ovo'), ('degree', 2), ('gamma', 0.1), ('kernel', 'rbf'), ('max_iter', -1), ('random_state', 0), ('shrinking', False), ('tol', 0.001)])


In [112]:
svc_result.append(rs)

In [114]:
svc_result_df=pd.DataFrame(svc_result)

In [233]:
svc_result_df.to_csv("../result/exp2-svc_huperparameter_tune.csv",index=False)

In [125]:
best_params=svc_result_df.iloc[svc_result_df['auc'].idxmax()].iloc[7:].to_dict()

In [149]:
best_params

{'C': 1.0,
 'class_weight': None,
 'decision_function_shape': 'ovo',
 'degree': 2,
 'gamma': 0.1,
 'kernel': 'rbf',
 'max_iter': -1,
 'random_state': 0,
 'shrinking': False,
 'tol': 0.001}

In [152]:
import json
with open('../result/svc_best_hyperparameters.json','w') as f:
    json.dump(best_params,f,indent=4)

In [12]:
import json
with open('../result/svc_best_hyperparameters.json','r') as f:
    svc_best_params=json.load(f)

In [15]:
evaluate_model(SVC(**svc_best_params),'svc',train,test)

{'model': 'svc',
 'accuracy': 0.8804347826086957,
 'auc': 0.8789901687097949,
 'f1_score': 0.8962264150943396,
 'cm': array([[67, 10],
        [12, 95]], dtype=int64),
 'training_time(ms)': 33.18476676940918,
 'testing_time(ms)': 9.97614860534668}

## RandomForsetClassier


In [157]:
rfc_result=[]

In [224]:
rfc_params={'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',#{"gini", "entropy", "log_loss"}
 'max_depth': None,
 'max_features': 'sqrt',#{"sqrt", "log2", None}or int or float
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 195,
 'random_state': 0,
}

In [225]:
rs=evaluate_model(RandomForestClassifier(**rfc_params),'rfc',train,test)
rs.update(rfc_params)
print(rs.items())

dict_items([('model', 'rfc'), ('accuracy', 0.9021739130434783), ('auc', 0.9013229760893312), ('f1_score', 0.9150943396226415), ('cm', array([[69,  8],
       [10, 97]], dtype=int64)), ('training_time(ms)', 500.78415870666504), ('testing_time(ms)', 0.0), ('bootstrap', True), ('class_weight', None), ('criterion', 'gini'), ('max_depth', None), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 195), ('random_state', 0)])


In [226]:
rfc_result.append(rs)

In [227]:
#best
print(rs.items())

dict_items([('model', 'rfc'), ('accuracy', 0.9021739130434783), ('auc', 0.9013229760893312), ('f1_score', 0.9150943396226415), ('cm', array([[69,  8],
       [10, 97]], dtype=int64)), ('training_time(ms)', 500.78415870666504), ('testing_time(ms)', 0.0), ('bootstrap', True), ('class_weight', None), ('criterion', 'gini'), ('max_depth', None), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 195), ('random_state', 0)])


In [229]:
rfc_result= pd.DataFrame(rfc_result)

In [265]:
rfc_result.max_depth=rfc_result.max_depth.replace("None",np.nan)

In [267]:
rfc_result.to_csv("../result/exp2-rfc_huperparameter_tune.csv",index=False)

In [2]:
rfc_result=pd.read_csv('../result/exp2-rfc_huperparameter_tune.csv')

In [3]:
rfc_result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),bootstrap,class_weight,criterion,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,random_state
0,rfc,0.896739,0.898471,0.909091,[[70 7]\n [12 95]],273.606539,16.768932,True,,gini,,sqrt,1,2,100,0
1,rfc,0.88587,0.887304,0.899522,[[69 8]\n [13 94]],245.991945,16.894102,False,,gini,,sqrt,1,2,100,0
2,rfc,0.88587,0.887304,0.899522,[[69 8]\n [13 94]],267.641783,3.940105,True,,entropy,,sqrt,1,2,100,0
3,rfc,0.88587,0.887304,0.899522,[[69 8]\n [13 94]],297.615528,0.0,True,,log_loss,,sqrt,1,2,100,0
4,rfc,0.896739,0.898471,0.909091,[[70 7]\n [12 95]],271.363735,0.0,True,,gini,,sqrt,1,2,100,0
5,rfc,0.88587,0.883663,0.901408,[[67 10]\n [11 96]],416.378498,5.984068,True,,gini,5.0,sqrt,1,2,100,0
6,rfc,0.88587,0.885484,0.900474,[[68 9]\n [12 95]],237.28919,17.707825,True,,gini,8.0,sqrt,1,2,100,0
7,rfc,0.880435,0.87899,0.896226,[[67 10]\n [12 95]],450.332403,16.622782,True,,gini,8.0,sqrt,1,2,200,0
8,rfc,0.891304,0.891977,0.904762,[[69 8]\n [12 95]],722.287893,19.7649,True,,gini,,sqrt,1,2,300,0
9,rfc,0.902174,0.901323,0.915094,[[69 8]\n [10 97]],476.942778,8.012056,True,,gini,,sqrt,1,2,200,0


In [268]:
rfc_result[rfc_result["auc"]==rfc_result["auc"].max()]

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),bootstrap,class_weight,criterion,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,random_state
9,rfc,0.902174,0.901323,0.915094,"[[69, 8], [10, 97]]",476.942778,8.012056,True,,gini,,sqrt,1,2,200,0
10,rfc,0.902174,0.901323,0.915094,"[[69, 8], [10, 97]]",919.444799,37.731409,True,,gini,,log2,1,2,200,0
16,rfc,0.902174,0.901323,0.915094,"[[69, 8], [10, 97]]",500.784159,0.0,True,,gini,,sqrt,1,2,195,0


In [4]:
rfc_best_params=rfc_result[rfc_result["auc"]==rfc_result["auc"].max()].loc[16].iloc[7:].to_dict()

In [6]:
rfc_best_params.update({
    'class_weight': None,
    'max_depth':None
})



In [16]:
evaluate_model(RandomForestClassifier(**rfc_best_params),'rfc',train,test)

{'model': 'rfc',
 'accuracy': 0.9021739130434783,
 'auc': 0.9013229760893312,
 'f1_score': 0.9150943396226415,
 'cm': array([[69,  8],
        [10, 97]], dtype=int64),
 'training_time(ms)': 498.248815536499,
 'testing_time(ms)': 16.50381088256836}

In [17]:
with open("../result/rfc_best_hyperparameters.json",'w') as f:
    json.dump(rfc_best_params,f,indent=4)

## GradientBoostingClassifer


In [18]:
gbc_result=[]

In [97]:
gbc_params={
    
 'criterion': 'squared_error',#{'friedman_mse', 'squared_error'}
 'learning_rate': 0.01,
 
 'loss': 'log_loss',#{'log_loss', 'exponential'}
 'max_depth': 5,
 'max_features': None,#{'sqrt', 'log2'}
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 145,
 'n_iter_no_change': 10,
 'random_state': 0,
 'subsample': 1,
 'tol': 0.0001,
 'validation_fraction': 0.1,

 }

In [98]:
rs=evaluate_model(GradientBoostingClassifier(**gbc_params),'gbc',train,test)
rs.update(gbc_params)
gbc_result.append(rs)
print(rs.items())

dict_items([('model', 'gbc'), ('accuracy', 0.8913043478260869), ('auc', 0.8883359631023182), ('f1_score', 0.9065420560747663), ('cm', array([[67, 10],
       [10, 97]], dtype=int64)), ('training_time(ms)', 593.0478572845459), ('testing_time(ms)', 0.0), ('criterion', 'squared_error'), ('learning_rate', 0.01), ('loss', 'log_loss'), ('max_depth', 5), ('max_features', None), ('max_leaf_nodes', None), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 2), ('min_weight_fraction_leaf', 0.0), ('n_estimators', 145), ('n_iter_no_change', 10), ('random_state', 0), ('subsample', 1), ('tol', 0.0001), ('validation_fraction', 0.1)])


In [40]:
#best
print(rs.items())


dict_items([('model', 'gbc'), ('accuracy', 0.8913043478260869), ('auc', 0.8883359631023182), ('f1_score', 0.9065420560747663), ('cm', array([[67, 10],
       [10, 97]], dtype=int64)), ('training_time(ms)', 848.2606410980225), ('testing_time(ms)', 0.0), ('criterion', 'squared_error'), ('learning_rate', 0.01), ('loss', 'log_loss'), ('max_depth', 5), ('max_features', None), ('max_leaf_nodes', None), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 2), ('min_weight_fraction_leaf', 0.0), ('n_estimators', 230), ('n_iter_no_change', 5), ('random_state', 0), ('subsample', 1.0), ('tol', 0.0001), ('validation_fraction', 0.1)])


In [99]:
gbc_result=pd.DataFrame(gbc_result)

In [100]:
gbc_result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),criterion,learning_rate,loss,...,min_impurity_decrease,min_samples_leaf,min_samples_split,min_weight_fraction_leaf,n_estimators,n_iter_no_change,random_state,subsample,tol,validation_fraction
0,gbc,0.869565,0.871465,0.884615,"[[68, 9], [15, 92]]",223.459959,15.623808,friedman_mse,0.1,log_loss,...,0.0,1,2,0.0,100,,0,1.0,0.0001,0.1
1,gbc,0.880435,0.87899,0.896226,"[[67, 10], [12, 95]]",90.156794,0.0,squared_error,0.1,log_loss,...,0.0,1,2,0.0,100,5.0,0,1.0,0.0001,0.1
2,gbc,0.875,0.872497,0.892019,"[[66, 11], [12, 95]]",238.483191,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,100,5.0,0,1.0,0.0001,0.1
3,gbc,0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",568.66312,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,200,5.0,0,1.0,0.0001,0.1
4,gbc,0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",530.489683,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,220,5.0,0,1.0,0.0001,0.1
5,gbc,0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",444.595814,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1
6,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",848.260641,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1
7,gbc,0.86413,0.864971,0.880383,"[[67, 10], [15, 92]]",1381.664991,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1
8,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",838.873386,1.99151,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1
9,gbc,0.875,0.876138,0.889952,"[[68, 9], [14, 93]]",776.477814,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1


In [101]:
gbc_result.to_csv("../result/exp2-gbc_huperparameter_tune.csv",index=False)

In [102]:
gbc_result[gbc_result["auc"]==gbc_result["auc"].max()]

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),criterion,learning_rate,loss,...,min_impurity_decrease,min_samples_leaf,min_samples_split,min_weight_fraction_leaf,n_estimators,n_iter_no_change,random_state,subsample,tol,validation_fraction
6,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",848.260641,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1
8,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",838.873386,1.99151,squared_error,0.01,log_loss,...,0.0,1,2,0.0,230,5.0,0,1.0,0.0001,0.1
23,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",984.389782,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,500,10.0,0,1.0,0.0001,0.1
24,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",884.473801,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,500,10.0,0,1.0,0.0001,0.1
25,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",983.389854,2.263069,squared_error,0.01,log_loss,...,0.0,1,2,0.0,200,10.0,0,1.0,0.0001,0.1
26,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",825.778484,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,195,10.0,0,1.0,0.0001,0.1
27,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",809.792757,1.995564,squared_error,0.01,log_loss,...,0.0,1,2,0.0,190,10.0,0,1.0,0.0001,0.1
28,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",775.731802,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,180,10.0,0,1.0,0.0001,0.1
29,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",1103.782654,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,150,10.0,0,1.0,0.0001,0.1
34,gbc,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",593.047857,0.0,squared_error,0.01,log_loss,...,0.0,1,2,0.0,145,10.0,0,1.0,0.0001,0.1


In [109]:
gbc_best_params=gbc_result[gbc_result["auc"]==gbc_result["auc"].max()].loc[34].iloc[7:].to_dict()

In [113]:
gbc_best_params.update({
    'n_iter_no_change': 10
})

In [115]:
with open("../result/gbc_best_hyperparameters.json",'w') as f:
    json.dump(gbc_best_params,f,indent=4)

## Compare the result


In [9]:
import json 
with open('../result/svc_best_hyperparameters.json') as f :
    svc_best_params=json.load(f)
with open('../result/rfc_best_hyperparameters.json') as f :
    rfc_best_params=json.load(f)
with open('../result/gbc_best_hyperparameters.json') as f :
    gbc_best_params=json.load(f)

In [10]:
models=[SVC(**svc_best_params),RandomForestClassifier(**rfc_best_params),GradientBoostingClassifier(**gbc_best_params) ]


In [12]:
result= pd.DataFrame([evaluate_model(model,model.__class__.__name__,train,test) for model in models] )

In [13]:
result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,SVC,0.880435,0.87899,0.896226,"[[67, 10], [12, 95]]",34.53064,15.695333
1,RandomForestClassifier,0.902174,0.901323,0.915094,"[[69, 8], [10, 97]]",501.249552,12.759209
2,GradientBoostingClassifier,0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",631.972075,0.0


In [14]:
result.to_csv("../result/exp2.csv",index=False)