# Experiment 1

- try caple of algirithmes with default params
- try both with/without normalisation
- try both `MinMaxScaler` and `StandardScaler`
- compare the result


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,HistGradientBoostingClassifier,BaggingClassifier,AdaBoostClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,roc_auc_score
import time
from xgboost import XGBClassifier

In [4]:
df=pd.read_csv(
    '../data/cleaned_data.csv'
)

In [5]:
X_train,X_test,y_train,y_test=train_test_split(df.drop('target',axis=1),df.target,test_size=0.2,random_state=42)

In [6]:
train=(X_train,y_train)
test=(X_test,y_test)

In [7]:
def evaluate_model(model,model_name:str,train:tuple,test:tuple):
    start_train=time.time()
    model.fit(train[0],train[1])
    end_train=time.time()
    start_test=time.time()
    preds = model.predict(test[0])
    end_test=time.time()
    accuracy=accuracy_score(test[1],preds)
    f1_=f1_score(test[1],preds)
    auc=roc_auc_score(test[1],preds)
    cm=confusion_matrix(test[1],preds)
    return {"model":model_name,
           "accuracy":accuracy,
            "auc":auc,
            'f1_score':f1_,
            "cm":cm,
            'training_time(ms)':1000*(end_train-start_train),
            'testing_time(ms)':1000*(end_test-start_test)
           }

In [52]:
def evaluate_models(models:list,train:tuple,test:tuple)->pd.DataFrame:
    result=[evaluate_model(model,model.__class__.__name__,train,test)for model in models]
    return pd.DataFrame(result)

In [63]:
models =[
    LogisticRegression(random_state=0),
    SVC(random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(random_state=0),
    GradientBoostingClassifier(random_state=0),
    HistGradientBoostingClassifier(random_state=0),
    BaggingClassifier(random_state=0),
    AdaBoostClassifier(random_state=0)
]

## Without normalisation


In [65]:
result =evaluate_models(models,train,test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [83]:
result.iloc[result['auc'].idxmax()]

model                RandomForestClassifier
accuracy                           0.896739
auc                                0.898471
f1_score                           0.909091
cm                      [[70, 7], [12, 95]]
training_time(ms)                238.950253
testing_time(ms)                   8.750916
normalisation                       without
Name: 3, dtype: object

In [82]:
result.iloc[result['auc'].idxmin()]

model                                 SVC
accuracy                         0.684783
auc                              0.687098
f1_score                         0.712871
cm                   [[54, 23], [35, 72]]
training_time(ms)               32.946348
testing_time(ms)                19.329548
normalisation                     without
Name: 1, dtype: object

In [68]:
result['normalisation']='without'

In [69]:
result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),normalisation
0,LogisticRegression,0.831522,0.836934,0.847291,"[[67, 10], [21, 86]]",53.141594,1.994848,without
1,SVC,0.684783,0.687098,0.712871,"[[54, 23], [35, 72]]",32.946348,19.329548,without
2,DecisionTreeClassifier,0.804348,0.81357,0.818182,"[[67, 10], [26, 81]]",6.979704,3.436804,without
3,RandomForestClassifier,0.896739,0.898471,0.909091,"[[70, 7], [12, 95]]",238.950253,8.750916,without
4,GradientBoostingClassifier,0.869565,0.871465,0.884615,"[[68, 9], [15, 92]]",191.018343,1.994848,without
5,HistGradientBoostingClassifier,0.853261,0.859267,0.866995,"[[69, 8], [19, 88]]",473.515511,0.0,without
6,BaggingClassifier,0.820652,0.829409,0.834171,"[[68, 9], [24, 83]]",62.511921,5.492687,without
7,AdaBoostClassifier,0.836957,0.841607,0.852941,"[[67, 10], [20, 87]]",111.458302,13.316631,without


## With normalisation


In [70]:
scaler =MinMaxScaler()
train=(scaler.fit_transform(X_train),y_train)
test=(scaler.transform(X_test),y_test)

In [71]:
result_min_max=evaluate_models(models,train,test)



In [72]:
result_min_max['normalisation']='minmax'

In [73]:
result_min_max

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),normalisation
0,LogisticRegression,0.847826,0.854594,0.861386,"[[69, 8], [20, 87]]",3.949642,0.0,minmax
1,SVC,0.836957,0.832504,0.859813,"[[62, 15], [15, 92]]",34.428358,9.973526,minmax
2,DecisionTreeClassifier,0.815217,0.822915,0.83,"[[67, 10], [24, 83]]",0.0,0.0,minmax
3,RandomForestClassifier,0.896739,0.898471,0.909091,"[[70, 7], [12, 95]]",249.080658,7.978916,minmax
4,GradientBoostingClassifier,0.869565,0.871465,0.884615,"[[68, 9], [15, 92]]",224.46537,0.0,minmax
5,HistGradientBoostingClassifier,0.853261,0.859267,0.866995,"[[69, 8], [19, 88]]",300.05002,0.0,minmax
6,BaggingClassifier,0.820652,0.829409,0.834171,"[[68, 9], [24, 83]]",75.645924,2.515316,minmax
7,AdaBoostClassifier,0.836957,0.841607,0.852941,"[[67, 10], [20, 87]]",150.208473,15.498161,minmax


In [81]:
result_min_max.iloc[result_min_max['auc'].idxmax()]

model                RandomForestClassifier
accuracy                           0.896739
auc                                0.898471
f1_score                           0.909091
cm                      [[70, 7], [12, 95]]
training_time(ms)                249.080658
testing_time(ms)                   7.978916
normalisation                        minmax
Name: 3, dtype: object

In [80]:
result_min_max.iloc[result_min_max['auc'].idxmin()]

model                DecisionTreeClassifier
accuracy                           0.815217
auc                                0.822915
f1_score                               0.83
cm                     [[67, 10], [24, 83]]
training_time(ms)                       0.0
testing_time(ms)                        0.0
normalisation                        minmax
Name: 2, dtype: object

In [101]:
scaler=  StandardScaler()
train=(scaler.fit_transform(X_train),y_train)
test=(scaler.transform(X_test),y_test)

In [102]:
result_standard_scaler=evaluate_models(models,train,test)



In [103]:
result_standard_scaler['normalisation']='standard'

In [104]:
result_standard_scaler

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),normalisation
0,LogisticRegression,0.847826,0.852773,0.862745,"[[68, 9], [19, 88]]",11.655569,0.0,standard
1,SVC,0.875,0.870676,0.893023,"[[65, 12], [11, 96]]",23.468256,6.981134,standard
2,DecisionTreeClassifier,0.815217,0.822915,0.83,"[[67, 10], [24, 83]]",4.851818,0.996351,standard
3,RandomForestClassifier,0.896739,0.898471,0.909091,"[[70, 7], [12, 95]]",255.483389,8.496761,standard
4,GradientBoostingClassifier,0.869565,0.871465,0.884615,"[[68, 9], [15, 92]]",199.975491,0.0,standard
5,HistGradientBoostingClassifier,0.853261,0.859267,0.866995,"[[69, 8], [19, 88]]",298.398018,0.0,standard
6,BaggingClassifier,0.815217,0.824736,0.828283,"[[68, 9], [25, 82]]",33.116579,0.0,standard
7,AdaBoostClassifier,0.836957,0.841607,0.852941,"[[67, 10], [20, 87]]",133.118153,16.908884,standard


In [105]:
result_standard_scaler.iloc[result_standard_scaler['auc'].idxmax()]

model                RandomForestClassifier
accuracy                           0.896739
auc                                0.898471
f1_score                           0.909091
cm                      [[70, 7], [12, 95]]
training_time(ms)                255.483389
testing_time(ms)                   8.496761
normalisation                      standard
Name: 3, dtype: object

In [106]:
result_standard_scaler.iloc[result_standard_scaler['auc'].idxmin()]

model                DecisionTreeClassifier
accuracy                           0.815217
auc                                0.822915
f1_score                               0.83
cm                     [[67, 10], [24, 83]]
training_time(ms)                  4.851818
testing_time(ms)                   0.996351
normalisation                      standard
Name: 2, dtype: object

## Compare the result


In [120]:
def interpret(result,result_min_max,result_standard_scaler,metric):
    print(f"{metric}",end='\n\n')
    for i in range(result.shape[0]):
        no_normalisation=result.iloc[i]
        mainmax_normalisation=result_min_max.iloc[i]
        standard_normalisation=result_standard_scaler.iloc[i]
        print(f'{no_normalisation['model']} >> without {no_normalisation[metric]:.3f} | min_max {mainmax_normalisation[metric]:.3f} | | standard {standard_normalisation[metric]:.3f}',end='\n\n')

In [121]:
interpret(result,result_min_max,result_standard_scaler,metric='auc')

auc

LogisticRegression >> without 0.837 | min_max 0.855 | | standard 0.853

SVC >> without 0.687 | min_max 0.833 | | standard 0.871

DecisionTreeClassifier >> without 0.814 | min_max 0.823 | | standard 0.823

RandomForestClassifier >> without 0.898 | min_max 0.898 | | standard 0.898

GradientBoostingClassifier >> without 0.871 | min_max 0.871 | | standard 0.871

HistGradientBoostingClassifier >> without 0.859 | min_max 0.859 | | standard 0.859

BaggingClassifier >> without 0.829 | min_max 0.829 | | standard 0.825

AdaBoostClassifier >> without 0.842 | min_max 0.842 | | standard 0.842



for _auc_

- **low improvement** in LogisticRegression and DecisionTreeClassifier
- **high improvement** in svc with standardscaler
- **non improvement** in tree ensembles algorithms


In [122]:
interpret(result,result_min_max,result_standard_scaler,metric='accuracy')

accuracy

LogisticRegression >> without 0.832 | min_max 0.848 | | standard 0.848

SVC >> without 0.685 | min_max 0.837 | | standard 0.875

DecisionTreeClassifier >> without 0.804 | min_max 0.815 | | standard 0.815

RandomForestClassifier >> without 0.897 | min_max 0.897 | | standard 0.897

GradientBoostingClassifier >> without 0.870 | min_max 0.870 | | standard 0.870

HistGradientBoostingClassifier >> without 0.853 | min_max 0.853 | | standard 0.853

BaggingClassifier >> without 0.821 | min_max 0.821 | | standard 0.815

AdaBoostClassifier >> without 0.837 | min_max 0.837 | | standard 0.837



for _accuracy_

- **low improvement** in LogisticRegression and DecisionTreeClassifier
- **high improvement** in svc with standardscaler
- **non improvement** in tree ensembles algorithms


In [123]:
interpret(result,result_min_max,result_standard_scaler,metric='f1_score')

f1_score

LogisticRegression >> without 0.847 | min_max 0.861 | | standard 0.863

SVC >> without 0.713 | min_max 0.860 | | standard 0.893

DecisionTreeClassifier >> without 0.818 | min_max 0.830 | | standard 0.830

RandomForestClassifier >> without 0.909 | min_max 0.909 | | standard 0.909

GradientBoostingClassifier >> without 0.885 | min_max 0.885 | | standard 0.885

HistGradientBoostingClassifier >> without 0.867 | min_max 0.867 | | standard 0.867

BaggingClassifier >> without 0.834 | min_max 0.834 | | standard 0.828

AdaBoostClassifier >> without 0.853 | min_max 0.853 | | standard 0.853



for _f1_score_

- **low improvement** in LogisticRegression and DecisionTreeClassifier
- **high improvement** in svc with standardscaler
- **non improvement** in tree ensembles algorithms


## Save the result


In [108]:
all_result = pd.concat([result,result_min_max,result_standard_scaler])

In [109]:
all_result.to_csv('../result/exp1.csv',index=False)

## try xgboost 

In [8]:
xgb=XGBClassifier(random_state=0)

### without normalisation

In [9]:
rs_no_norm=evaluate_model(xgb,'XGBClassifier',train,test)
rs_no_norm

{'model': 'XGBClassifier',
 'accuracy': 0.8804347826086957,
 'auc': 0.8862726058987741,
 'f1_score': 0.8921568627450981,
 'cm': array([[71,  6],
        [16, 91]]),
 'training_time(ms)': 225.9230613708496,
 'testing_time(ms)': 3.3571720123291016}

###  using Minmaxscaler

In [10]:
scaler =MinMaxScaler()
train=(scaler.fit_transform(X_train),y_train)
test=(scaler.transform(X_test),y_test)

In [12]:
rs_minmax=evaluate_model(xgb,'XGBClassifier',train,test)
rs_minmax

{'model': 'XGBClassifier',
 'accuracy': 0.8804347826086957,
 'auc': 0.8862726058987741,
 'f1_score': 0.8921568627450981,
 'cm': array([[71,  6],
        [16, 91]]),
 'training_time(ms)': 79.15210723876953,
 'testing_time(ms)': 1.031637191772461}

### with StandardScaler 

In [13]:
scaler =StandardScaler()
train=(scaler.fit_transform(X_train),y_train)
test=(scaler.transform(X_test),y_test)

In [14]:
rs_standard=evaluate_model(xgb,'XGBClassifier',train,test)
rs_standard

{'model': 'XGBClassifier',
 'accuracy': 0.8804347826086957,
 'auc': 0.8862726058987741,
 'f1_score': 0.8921568627450981,
 'cm': array([[71,  6],
        [16, 91]]),
 'training_time(ms)': 42.03629493713379,
 'testing_time(ms)': 1.0373592376708984}

In [15]:
rs_no_norm['normalisation']='without'
rs_minmax['normalisation']='minmax'
rs_standard['normalisation']='standard'

In [16]:
xgb_result=pd.DataFrame([rs_no_norm,rs_minmax,rs_standard])

In [17]:
xgb_result

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms),normalisation
0,XGBClassifier,0.880435,0.886273,0.892157,"[[71, 6], [16, 91]]",225.923061,3.357172,without
1,XGBClassifier,0.880435,0.886273,0.892157,"[[71, 6], [16, 91]]",79.152107,1.031637,minmax
2,XGBClassifier,0.880435,0.886273,0.892157,"[[71, 6], [16, 91]]",42.036295,1.037359,standard


In [18]:
all_result = pd.read_csv("../result/exp1.csv")

In [21]:
all_result=pd.concat((all_result,xgb_result))

In [22]:
all_result.to_csv("../result/exp1.csv",index=False)