# Experiment 7

- identify outliers with methode of `IQR`
- try to replace outliers with mean and midian
- try to delete the outliers
- compare the result


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,roc_auc_score
import time

import json

In [2]:
def evaluate_model(model,model_name:str,train:tuple,test:tuple):
    start_train=time.time()
    model.fit(train[0],train[1])
    end_train=time.time()
    start_test=time.time()
    preds = model.predict(test[0])
    end_test=time.time()
    accuracy=accuracy_score(test[1],preds)
    f1_=f1_score(test[1],preds)
    auc=roc_auc_score(test[1],preds)
    cm=confusion_matrix(test[1],preds)
    return {"model":model_name,
           "accuracy":accuracy,
            "auc":auc,
            'f1_score':f1_,
            "cm":cm,
            'training_time(ms)':1000*(end_train-start_train),
            'testing_time(ms)':1000*(end_test-start_test)
           }

In [3]:
def evaluate_models(models:list,train:tuple,test:tuple)->pd.DataFrame:
    result=[evaluate_model(model,model.__class__.__name__,train,test)for model in models]
    return pd.DataFrame(result)

In [4]:
df=pd.read_csv('../data/cleaned_data.csv')


In [5]:
def get_outlier_indexs(df:pd.DataFrame,num_features:list):
    outliers={}
    for col in num_features:
        q1=df[col].quantile(0.25)
        q3=df[col].quantile(0.75)
        iqr=q3-q1
        upper=q3+1.5*iqr
        lower=q1-1.5*iqr
        idxs= df[(df[col] > upper) | (df [col] <lower)].index.tolist()
        if idxs:
            outliers[col]= idxs
        
    return outliers
    

In [6]:
num_features = [col for col in df.columns if df[col].nunique()>25]


In [7]:
num_features

['age', 'resting bp s', 'cholesterol', 'max heart rate', 'oldpeak']

In [8]:
def get_train_test(df:pd.DataFrame)->tuple:
    X_train,X_test,y_train,y_test=train_test_split(df.drop('target',axis=1),df.target,test_size=0.2,random_state=42)
    scaler=StandardScaler()
    train=(scaler.fit_transform(X_train),y_train)
    test=(scaler.transform(X_test),y_test)
    return train,test

In [9]:
 
with open('../result/svc_best_hyperparameters.json') as f :
    svc_best_params=json.load(f)
with open('../result/rfc_best_hyperparameters.json') as f :
    rfc_best_params=json.load(f)
with open('../result/gbc_best_hyperparameters.json') as f :
    gbc_best_params=json.load(f)
with open('../result/xgbc_best_hyperparameters.json') as f :
    xgbc_best_params=json.load(f)

In [10]:
models=[
    XGBClassifier(**xgbc_best_params),
    RandomForestClassifier(**rfc_best_params),
    VotingClassifier([
        ('xgb',XGBClassifier(**xgbc_best_params)),
        ('rfc',RandomForestClassifier(**rfc_best_params))
    ],voting='soft'),
    SVC(**svc_best_params,probability=True),
    GradientBoostingClassifier(**gbc_best_params)
]

In [11]:
outliers=get_outlier_indexs(df,num_features)

In [12]:
def replace_outliers(df:pd.DataFrame,outliers:dict,strategy ):
    
    for col in outliers:
        df.loc[outliers[col],col]= df[col].mean() if strategy =='mean' else df[col].median()
    return df

## fill outliers by `mean`


In [13]:
outliers.keys()

dict_keys(['resting bp s', 'cholesterol', 'max heart rate', 'oldpeak'])

In [14]:
df = replace_outliers(df,outliers,'mean')

  df.loc[outliers[col],col]= df[col].mean() if strategy =='mean' else df[col].median()
  df.loc[outliers[col],col]= df[col].mean() if strategy =='mean' else df[col].median()
  df.loc[outliers[col],col]= df[col].mean() if strategy =='mean' else df[col].median()


In [15]:
train,test=get_train_test(df)
res_mean=evaluate_models(models,train,test)

In [16]:
res_mean

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,XGBClassifier,0.86413,0.864971,0.880383,"[[67, 10], [15, 92]]",1100.599289,3.987551
1,RandomForestClassifier,0.858696,0.858478,0.87619,"[[66, 11], [15, 92]]",880.856037,15.623093
2,VotingClassifier,0.858696,0.858478,0.87619,"[[66, 11], [15, 92]]",914.822102,31.243086
3,SVC,0.858696,0.860299,0.875,"[[67, 10], [16, 91]]",197.122574,11.17897
4,GradientBoostingClassifier,0.858696,0.858478,0.87619,"[[66, 11], [15, 92]]",916.395426,15.476227


## fill outliers by `median`


In [17]:
df=replace_outliers(df,outliers,'median')
train,test=get_train_test(df)
res_median=evaluate_models(models,train,test)

In [18]:
res_median

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,XGBClassifier,0.875,0.874317,0.890995,"[[67, 10], [13, 94]]",406.073332,0.0
1,RandomForestClassifier,0.869565,0.869644,0.885714,"[[67, 10], [14, 93]]",714.136839,36.874294
2,VotingClassifier,0.869565,0.869644,0.885714,"[[67, 10], [14, 93]]",782.511711,30.105352
3,SVC,0.858696,0.860299,0.875,"[[67, 10], [16, 91]]",114.576101,16.855717
4,GradientBoostingClassifier,0.869565,0.871465,0.884615,"[[68, 9], [15, 92]]",549.932003,0.0


## drop outliers


In [19]:
all_outliers=[]
for col in outliers:
    all_outliers.extend(outliers[col])

In [20]:
df.drop(all_outliers,inplace=True)

In [21]:
train,test=get_train_test(df)
res_=evaluate_models(models,train,test)

In [22]:
res_

Unnamed: 0,model,accuracy,auc,f1_score,cm,training_time(ms),testing_time(ms)
0,XGBClassifier,0.851064,0.851006,0.848921,"[[61, 10], [11, 59]]",214.893818,0.0
1,RandomForestClassifier,0.886525,0.886519,0.885714,"[[63, 8], [8, 62]]",547.575712,28.30863
2,VotingClassifier,0.87234,0.872233,0.869565,"[[63, 8], [10, 60]]",884.962559,32.949448
3,SVC,0.886525,0.88672,0.888889,"[[61, 10], [6, 64]]",90.142965,5.867958
4,GradientBoostingClassifier,0.836879,0.836922,0.836879,"[[59, 12], [11, 59]]",538.210392,0.0


## compare the result


In [23]:
res_mean['strategy']='mean'
res_median['strategy']='median'
res_['strategy']='drop'


In [28]:
def interpret(result_mean,result_median,result_drop,metric):
    print(f"{metric}",end='\n\n')
    for i in range(result_mean.shape[0]):
        rs_mn=result_mean.iloc[i]
        rs_md=result_median.iloc[i]
        rs_dr=result_drop.iloc[i]
        print(f"{rs_mn['model']} >> mean {rs_mn[metric]:.3f} | median {rs_md[metric]:.3f} | | drop {rs_dr[metric]:.3f}",end='\n\n ')

In [29]:
interpret(result_mean=res_mean,result_median=res_median,result_drop=res_,metric='auc')

auc

XGBClassifier >> mean 0.865 | median 0.874 | | drop 0.851

 RandomForestClassifier >> mean 0.858 | median 0.870 | | drop 0.887

 VotingClassifier >> mean 0.858 | median 0.870 | | drop 0.872

 SVC >> mean 0.860 | median 0.860 | | drop 0.887

 GradientBoostingClassifier >> mean 0.858 | median 0.871 | | drop 0.837

 

In [30]:
interpret(result_mean=res_mean,result_median=res_median,result_drop=res_,metric='f1_score')

f1_score

XGBClassifier >> mean 0.880 | median 0.891 | | drop 0.849

 RandomForestClassifier >> mean 0.876 | median 0.886 | | drop 0.886

 VotingClassifier >> mean 0.876 | median 0.886 | | drop 0.870

 SVC >> mean 0.875 | median 0.875 | | drop 0.889

 GradientBoostingClassifier >> mean 0.876 | median 0.885 | | drop 0.837

 