# Experiment 3

- try stacking and voting approaches using the tuned models from experiment 2
- use `StandardScaler`
- compare the results


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier,StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,roc_auc_score
import time

import json

In [2]:
def evaluate_model(model,model_name:str,train:tuple,test:tuple):
    """Fit ``model`` on the training split, predict the test split and score it.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label stored under the ``"model"`` key of the returned dict.
    train : tuple
        ``(X, y)`` pair used for fitting (indexed as ``train[0]``, ``train[1]``).
    test : tuple
        ``(X, y)`` pair used for prediction and scoring.

    Returns
    -------
    dict
        Keys ``"model"``, ``"accuracy"``, ``"auc"``, ``"f1_score"``, ``"cm"``
        (confusion matrix), ``"training_time(ms)"`` and ``"testing_time(ms)"``.

    Notes
    -----
    The AUC is computed from the hard class predictions returned by
    ``model.predict``, not from probabilities or decision scores.
    NOTE(review): confirm this is intended; it keeps hard-voting and stacking
    models on the same footing but is not a ranking-based ROC AUC.
    """
    features_train, labels_train = train[0], train[1]
    features_test, labels_test = test[0], test[1]

    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not disturbed by system clock adjustments the way time.time() can be.
    fit_started = time.perf_counter()
    model.fit(features_train, labels_train)
    fit_elapsed = time.perf_counter() - fit_started

    predict_started = time.perf_counter()
    preds = model.predict(features_test)
    predict_elapsed = time.perf_counter() - predict_started

    return {
        "model": model_name,
        "accuracy": accuracy_score(labels_test, preds),
        "auc": roc_auc_score(labels_test, preds),
        'f1_score': f1_score(labels_test, preds),
        "cm": confusion_matrix(labels_test, preds),
        # Durations are reported in milliseconds.
        'training_time(ms)': 1000 * fit_elapsed,
        'testing_time(ms)': 1000 * predict_elapsed,
    }

In [3]:
# Load the cleaned dataset; the 'target' column is the label, all other
# columns are used as features.
df = pd.read_csv('../data/cleaned_data.csv')

# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.2,
    random_state=42,
)

# The scaler is fitted on the training features only and then reused,
# unchanged, to transform the test features.
scaler = StandardScaler()
train = (scaler.fit_transform(X_train), y_train)
test = (scaler.transform(X_test), y_test)

In [4]:
 
# Read the hyperparameter dictionaries stored as JSON, one file per model.
# Each dictionary is later unpacked as keyword arguments into the matching
# estimator's constructor.
with open('../result/svc_best_hyperparameters.json') as svc_file:
    svc_best_params = json.load(svc_file)

with open('../result/rfc_best_hyperparameters.json') as rfc_file:
    rfc_best_params = json.load(rfc_file)

with open('../result/gbc_best_hyperparameters.json') as gbc_file:
    gbc_best_params = json.load(gbc_file)

with open('../result/xgbc_best_hyperparameters.json') as xgbc_file:
    xgbc_best_params = json.load(xgbc_file)

## VotingClassifier


In [31]:
# Weighted hard-voting ensemble: the weights follow the estimator order, so
# the random forest counts 3, gradient boosting 2 and the SVC 1.
# NOTE(review): rfc's weight equals gbc + svc combined, so a 3-vs-3 tie is
# possible when both disagree with rfc; confirm how ties are resolved.
vc_1 = VotingClassifier(
    estimators=[
        ('rfc', RandomForestClassifier(**rfc_best_params)),
        ('gbc', GradientBoostingClassifier(**gbc_best_params)),
        ('svc', SVC(**svc_best_params)),
    ],
    voting='hard',
    weights=[3, 2, 1],
)

In [32]:
# Fit and score the weighted ensemble; the bare name on the last line makes
# the notebook display the resulting metrics dict.
rs_vc_1 = evaluate_model(
    model=vc_1,
    model_name="VotingClassifier",
    train=train,
    test=test,
)
rs_vc_1

{'model': 'VotingClassifier',
 'accuracy': 0.9021739130434783,
 'auc': 0.9013229760893312,
 'f1_score': 0.9150943396226415,
 'cm': array([[69,  8],
        [10, 97]]),
 'training_time(ms)': 669.2221164703369,
 'testing_time(ms)': 14.55378532409668}

In [33]:
# Unweighted hard-voting ensemble of the random forest and gradient boosting
# models only (no SVC).
vc_2 = VotingClassifier(
    estimators=[
        ('rfc', RandomForestClassifier(**rfc_best_params)),
        ('gbc', GradientBoostingClassifier(**gbc_best_params)),
    ],
    voting='hard',
)

In [34]:
# Fit and score the two-model ensemble; the bare name on the last line makes
# the notebook display the resulting metrics dict.
rs_vc2 = evaluate_model(
    model=vc_2,
    model_name="VotingClassifier2",
    train=train,
    test=test,
)
rs_vc2

{'model': 'VotingClassifier2',
 'accuracy': 0.9021739130434783,
 'auc': 0.903143585386576,
 'f1_score': 0.9142857142857143,
 'cm': array([[70,  7],
        [11, 96]]),
 'training_time(ms)': 664.9234294891357,
 'testing_time(ms)': 12.387275695800781}

In [35]:
# Unweighted hard-voting ensemble of XGBoost and gradient boosting.
# NOTE(review): vc_3 is not passed to evaluate_model in the visible cells; an
# ensemble with the same members is evaluated through `voting_models`.
vc_3 = VotingClassifier(
    estimators=[
        ('xgb', XGBClassifier(**xgbc_best_params)),
        ('gbc', GradientBoostingClassifier(**gbc_best_params)),
    ],
    voting='hard',
)

In [78]:
# Zero-argument builders: each call returns a fresh, unfitted estimator
# configured with its tuned hyperparameters, so no instance is shared
# between ensembles.
_voting_base_builders = {
    'xgb': lambda: XGBClassifier(**xgbc_best_params),
    'gbc': lambda: GradientBoostingClassifier(**gbc_best_params),
    'rfc': lambda: RandomForestClassifier(**rfc_best_params),
    'svc': lambda: SVC(**svc_best_params),
}

# Member names of every hard-voting ensemble to compare. The order inside a
# tuple is the estimator order (and therefore the order shown in the label).
_voting_combinations = [
    ('xgb', 'gbc'),
    ('rfc', 'gbc'),
    ('xgb', 'rfc'),
    ('xgb', 'gbc', 'rfc'),
    ('rfc', 'gbc', 'svc'),
    ('xgb', 'gbc', 'svc'),
]

voting_models = [
    VotingClassifier(
        [(name, _voting_base_builders[name]()) for name in combination],
        voting='hard',
    )
    for combination in _voting_combinations
]

In [79]:
def evaluate_votings_models(vc_models,train,test):
    """Evaluate every voting ensemble and collect the metrics in a table.

    Each model in ``vc_models`` is run through ``evaluate_model`` with the
    given ``train`` and ``test`` splits. Its row is labelled
    ``Voting (<name>+<name>+...)``, built from the names in
    ``model.named_estimators``.

    Returns a ``pandas.DataFrame`` with one row per model.
    """
    rows = []
    for ensemble in vc_models:
        metrics = evaluate_model(ensemble, "", train, test)
        # Replace the placeholder name with a label listing the members.
        member_names = "+".join(name for name in ensemble.named_estimators)
        metrics["model"] = f'Voting ({member_names})'
        rows.append(metrics)
    return pd.DataFrame(rows)
        


In [80]:
# Fit and score all voting ensembles on the shared train/test split.
vc_result = evaluate_votings_models(
    vc_models=voting_models,
    train=train,
    test=test,
)

In [91]:
# Display the voting results table (one row per evaluated ensemble).
vc_result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,Voting (xgb+gbc),0.902174,0.903144,0.914286,"[[70, 7], [11, 96]]",485.37755,3.940105
1,Voting (rfc+gbc),0.902174,0.903144,0.914286,"[[70, 7], [11, 96]]",631.735086,12.429237
2,Voting (xgb+rfc),0.907609,0.907816,0.919431,"[[70, 7], [10, 97]]",368.053675,14.972448
3,Voting (xgb+gbc+rfc),0.896739,0.894829,0.910798,"[[68, 9], [10, 97]]",793.935537,15.076399
4,Voting (rfc+gbc+svc),0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",665.289879,14.461994
5,Voting (xgb+gbc+svc),0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",434.290886,7.319689


## StackingClassifier


In [86]:
# Zero-argument builders: each call returns a fresh, unfitted estimator
# configured with its tuned hyperparameters. The SVC is created with
# probability=True in every stack.
_stacking_base_builders = {
    'rfc': lambda: RandomForestClassifier(**rfc_best_params),
    'gbc': lambda: GradientBoostingClassifier(**gbc_best_params),
    'xgb': lambda: XGBClassifier(**xgbc_best_params),
    'svc': lambda: SVC(**svc_best_params, probability=True),
}

# (member names, extra StackingClassifier keyword arguments) for each stack.
# NOTE(review): only the first two stacks pass cv=3 and stack_method='auto'
# explicitly; the remaining three rely on the library defaults. Confirm the
# difference in cv is intended, since the rows are compared side by side.
_stacking_specs = [
    (('rfc', 'gbc', 'svc'), {'cv': 3, 'stack_method': 'auto'}),
    (('rfc', 'gbc', 'xgb'), {'cv': 3, 'stack_method': 'auto'}),
    (('xgb', 'gbc', 'svc'), {}),
    (('rfc', 'xgb', 'svc'), {}),
    (('rfc', 'xgb'), {}),
]

stacking_models = [
    StackingClassifier(
        [(name, _stacking_base_builders[name]()) for name in member_names],
        **options,
    )
    for member_names, options in _stacking_specs
]

In [83]:
def evaluate_stacking_models(stc_models,train,test):
    """Evaluate every stacking ensemble and collect the metrics in a table.

    Each model in ``stc_models`` is run through ``evaluate_model`` with the
    given ``train`` and ``test`` splits. Its row is labelled
    ``stacking (<name>+<name>+...)``, built from the names in
    ``model.named_estimators``.

    Returns a ``pandas.DataFrame`` with one row per model.
    """
    rows = []
    for stack in stc_models:
        metrics = evaluate_model(stack, "", train, test)
        # Replace the placeholder name with a label listing the members.
        member_names = "+".join(name for name in stack.named_estimators)
        metrics["model"] = f'stacking ({member_names})'
        rows.append(metrics)
    return pd.DataFrame(rows)
        


In [87]:
# Fit and score all stacking ensembles on the shared train/test split.
stc_result = evaluate_stacking_models(
    stc_models=stacking_models,
    train=train,
    test=test,
)

In [90]:
# Display the stacking results table (one row per evaluated ensemble).
stc_result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,stacking (rfc+gbc+svc),0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",2494.272947,17.808199
1,stacking (rfc+gbc+xgb),0.896739,0.894829,0.910798,"[[68, 9], [10, 97]]",2750.690937,35.861969
2,stacking (xgb+gbc+svc),0.875,0.872497,0.892019,"[[66, 11], [12, 95]]",2640.911102,7.463217
3,stacking (rfc+xgb+svc),0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",2285.324812,17.988443
4,stacking (rfc+xgb),0.896739,0.89665,0.909953,"[[69, 8], [11, 96]]",2132.259369,18.867493


In [96]:
# Combine the voting and stacking tables into a single comparison table.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps
# its own 0-based index and the combined table has duplicate row labels.
result = pd.concat([vc_result, stc_result], ignore_index=True)

In [97]:
# Display the combined voting + stacking comparison table.
result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,Voting (xgb+gbc),0.902174,0.903144,0.914286,"[[70, 7], [11, 96]]",485.37755,3.940105
1,Voting (rfc+gbc),0.902174,0.903144,0.914286,"[[70, 7], [11, 96]]",631.735086,12.429237
2,Voting (xgb+rfc),0.907609,0.907816,0.919431,"[[70, 7], [10, 97]]",368.053675,14.972448
3,Voting (xgb+gbc+rfc),0.896739,0.894829,0.910798,"[[68, 9], [10, 97]]",793.935537,15.076399
4,Voting (rfc+gbc+svc),0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",665.289879,14.461994
5,Voting (xgb+gbc+svc),0.891304,0.888336,0.906542,"[[67, 10], [10, 97]]",434.290886,7.319689
0,stacking (rfc+gbc+svc),0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",2494.272947,17.808199
1,stacking (rfc+gbc+xgb),0.896739,0.894829,0.910798,"[[68, 9], [10, 97]]",2750.690937,35.861969
2,stacking (xgb+gbc+svc),0.875,0.872497,0.892019,"[[66, 11], [12, 95]]",2640.911102,7.463217
3,stacking (rfc+xgb+svc),0.880435,0.87717,0.897196,"[[66, 11], [11, 96]]",2285.324812,17.988443


In [98]:
# Persist the combined results; index=False leaves the row index out of the CSV.
# NOTE(review): the 'cm' column holds confusion-matrix arrays, which are
# written in their text form — confirm downstream readers can parse it.
result.to_csv("../result/exp3.csv",index=False)