In [11]:
import pandas as pd
import numpy as np
import time

# Training Test
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, plot_confusion_matrix

# Algorithms
from sklearn.ensemble      import RandomForestClassifier
from sklearn.ensemble      import ExtraTreesClassifier
from sklearn.ensemble      import AdaBoostClassifier
from sklearn.ensemble      import GradientBoostingClassifier
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier
from lightgbm              import LGBMClassifier
from catboost              import CatBoostClassifier

# Load and Prepro

In [3]:
# Load data
data = pd.read_csv('data/heart.csv')

# remove duplicates
data = data.drop_duplicates()
print(data.shape)

# Train test sets
X = data.drop(columns='output')
y = data['output']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
print(X_train.shape, y_train.shape, X_test.shape, y_train.shape)

# Preprocessor
num_var = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
cat_var = [col for col in X_train.columns if col not in num_var]

num_prep = ColumnTransformer([('num_prepo', StandardScaler(), num_var)],
                             remainder='passthrough')

(302, 14)
(241, 13) (241,) (61, 13) (241,)


In [4]:
tree_classifiers = {
  "Extra Trees": ExtraTreesClassifier(random_state=0),
  "Random Forest":RandomForestClassifier(random_state=0),
  "AdaBoost": AdaBoostClassifier(random_state=0),
  "Skl GBM": GradientBoostingClassifier(random_state=0),
  "Skl HistGBM": HistGradientBoostingClassifier(random_state=0),
  "XGBoost": XGBClassifier(),
  "LightGBM": LGBMClassifier(random_state=0),
  "CatBoost": CatBoostClassifier(random_state=0)}

tree_classifiers = {name: make_pipeline(num_prep, model) for name, model in tree_classifiers.items()}

# Benchmark accuracy

In [10]:
results = pd.DataFrame({'Model': [], 'Accuracy': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # Training
    model.fit(X_train, y_train)
    
    # Prediction
    pred = model.predict(X_test)

    total_time = time.time() - start_time # Time taken to fit and predict

    model_results = pd.DataFrame({"Model":    [model_name],
                              "Accuracy": [accuracy_score(y_test, pred)*100],
                              "Time":     [total_time]})
    results = pd.concat([results, model_results])
    
    

results_ord = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord



Learning rate set to 0.005611
0:	learn: 0.6889865	total: 1.82ms	remaining: 1.82s
1:	learn: 0.6848531	total: 4.41ms	remaining: 2.2s
2:	learn: 0.6809871	total: 6.52ms	remaining: 2.17s
3:	learn: 0.6763709	total: 8.2ms	remaining: 2.04s
4:	learn: 0.6724458	total: 10.7ms	remaining: 2.13s
5:	learn: 0.6680178	total: 12.4ms	remaining: 2.06s
6:	learn: 0.6639458	total: 13.9ms	remaining: 1.97s
7:	learn: 0.6593551	total: 15.4ms	remaining: 1.91s
8:	learn: 0.6556964	total: 17ms	remaining: 1.87s
9:	learn: 0.6532886	total: 17.7ms	remaining: 1.76s
10:	learn: 0.6508652	total: 19.2ms	remaining: 1.73s
11:	learn: 0.6470071	total: 20.9ms	remaining: 1.72s
12:	learn: 0.6428190	total: 22.7ms	remaining: 1.73s
13:	learn: 0.6392243	total: 24.7ms	remaining: 1.74s
14:	learn: 0.6353196	total: 26.3ms	remaining: 1.73s
15:	learn: 0.6311971	total: 27.9ms	remaining: 1.72s
16:	learn: 0.6273486	total: 29.8ms	remaining: 1.72s
17:	learn: 0.6246184	total: 31.7ms	remaining: 1.73s
18:	learn: 0.6218933	total: 33.1ms	remaining: 1.

Unnamed: 0,Model,Accuracy,Time
0,AdaBoost,91.803279,0.104275
1,Extra Trees,88.52459,0.234892
2,Random Forest,86.885246,0.231001
3,CatBoost,86.885246,2.502807
4,Skl GBM,81.967213,0.107057
5,XGBoost,81.967213,0.13
6,LightGBM,80.327869,0.062
7,Skl HistGBM,78.688525,0.356996


# Data augmentation

In [14]:
def data_enhancement(data):
    np.random.seed(0)
    gen_data = data
    sep_on = 'sex'
    divide_std_by = 10
    
    for value in data[sep_on].unique():
        sub_data =  gen_data[gen_data[sep_on] == value]

        age_std = sub_data['age'].std()
        
        trtbps_std = sub_data['trtbps'].std()
       
        chol_std = sub_data['chol'].std()
      
        thalachh_std = sub_data['thalachh'].std()
    
        oldpeak_std = sub_data['oldpeak'].std()
     
        
        for i in gen_data[gen_data[sep_on] == value].index:
            if np.random.randint(2) == 1:
                gen_data.loc[i,'age'] += age_std/divide_std_by
            else:
                gen_data.loc[i,'age'] -= age_std/divide_std_by
                
            if np.random.randint(2) == 1:
                gen_data.loc[i,'trtbps'] += trtbps_std/divide_std_by
            else:
                gen_data.loc[i,'trtbps'] -= trtbps_std/divide_std_by
                
            if np.random.randint(2) == 1:
                gen_data.loc[i,'chol'] += chol_std/divide_std_by
            else:
                gen_data.loc[i,'chol'] -= chol_std/divide_std_by
                
            if np.random.randint(2) == 1:
                gen_data.loc[i,'thalachh'] += thalachh_std/divide_std_by
            else:
                gen_data.loc[i,'thalachh'] -= thalachh_std/divide_std_by

            if np.random.randint(2) == 1:
                gen_data.loc[i,'oldpeak'] += oldpeak_std/divide_std_by
            else:
                gen_data.loc[i,'oldpeak'] -= oldpeak_std/divide_std_by

    return gen_data




In [20]:
gen = data_enhancement(data)
np.random.seed(0)
extra_data = gen.sample(gen.shape[0] // 5)

X_train_aug = pd.concat([X_train, extra_data.drop(['output'], axis=1 ) ])
y_train_aug = pd.concat([y_train, extra_data['output'] ])

print(f'Augmented X_train by {((len(X_train_aug) - len(X_train)) / len(X_train)) * 100 }%')

Augmented X_train by 24.896265560165975%


In [21]:
results_aug = pd.DataFrame({'Model': [], 'Accuracy': [], 'Time': []})

for model_name, model in tree_classifiers.items():
    start_time = time.time()
    
    # Training
    model.fit(X_train_aug, y_train_aug)
    
    # Prediction
    pred_aug = model.predict(X_test)

    total_time = time.time() - start_time # Time taken to fit and predict

    model_results_aug = pd.DataFrame({"Model":    [model_name],
                              "Accuracy": [accuracy_score(y_test, pred_aug)*100],
                              "Time":     [total_time]})


    results_aug = pd.concat([results_aug, model_results_aug])
    results_ord_aug = results.sort_values(by=['Accuracy'], ascending=False, ignore_index=True)
results_ord_aug



Learning rate set to 0.00617
0:	learn: 0.6879724	total: 1.81ms	remaining: 1.81s
1:	learn: 0.6829164	total: 3.55ms	remaining: 1.77s
2:	learn: 0.6793905	total: 6.58ms	remaining: 2.19s
3:	learn: 0.6742403	total: 8.23ms	remaining: 2.05s
4:	learn: 0.6690829	total: 10ms	remaining: 2s
5:	learn: 0.6649636	total: 11.8ms	remaining: 1.96s
6:	learn: 0.6617309	total: 13.2ms	remaining: 1.87s
7:	learn: 0.6568757	total: 14.9ms	remaining: 1.85s
8:	learn: 0.6514359	total: 16.8ms	remaining: 1.85s
9:	learn: 0.6469713	total: 18.5ms	remaining: 1.83s
10:	learn: 0.6420637	total: 22.5ms	remaining: 2.02s
11:	learn: 0.6374783	total: 25.1ms	remaining: 2.07s
12:	learn: 0.6332609	total: 27ms	remaining: 2.05s
13:	learn: 0.6290902	total: 28.8ms	remaining: 2.03s
14:	learn: 0.6252074	total: 30.6ms	remaining: 2.01s
15:	learn: 0.6211623	total: 32.5ms	remaining: 2s
16:	learn: 0.6173588	total: 35.8ms	remaining: 2.07s
17:	learn: 0.6133948	total: 38.6ms	remaining: 2.1s
18:	learn: 0.6097454	total: 40.2ms	remaining: 2.08s
19:	

Unnamed: 0,Model,Accuracy,Time
0,AdaBoost,91.803279,0.104275
1,Extra Trees,88.52459,0.234892
2,Random Forest,86.885246,0.231001
3,CatBoost,86.885246,2.502807
4,Skl GBM,81.967213,0.107057
5,XGBoost,81.967213,0.13
6,LightGBM,80.327869,0.062
7,Skl HistGBM,78.688525,0.356996
