In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
# from lazypredict.Supervised import LazyClassifier
warnings.filterwarnings('ignore')

In [2]:
# Load dataset
df = pd.read_csv("train.csv", sep=",")

In [3]:
df.head()

Unnamed: 0,id,age,education,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,64,2.0,F,YES,3.0,0.0,0,0,0,221.0,148.0,85.0,,90.0,80.0,1
1,1,36,4.0,M,NO,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,2,46,1.0,F,YES,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,3,50,1.0,M,YES,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,4,64,1.0,F,YES,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0


In [4]:
# Preprocessing
df = df.drop('id', axis=1)
# df = df.drop_duplicates()
# df = df.dropna()
df_columns = list(df.columns)

In [5]:
le=LabelEncoder()
df['sex']=le.fit_transform(df['sex'])
df['is_smoking']=le.fit_transform(df['is_smoking'])

In [6]:
df.head()

Unnamed: 0,age,education,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,64,2.0,0,1,3.0,0.0,0,0,0,221.0,148.0,85.0,,90.0,80.0,1
1,36,4.0,1,0,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,46,1.0,0,1,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,50,1.0,1,1,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,64,1.0,0,1,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0


In [7]:
# Data preprocessing
# Impute missing values using the mean
imp_mean = SimpleImputer(strategy='mean')
df = imp_mean.fit_transform(df)

In [8]:
df = pd.DataFrame(df, columns=df_columns)

In [9]:
X = df.drop(columns=["TenYearCHD"])
y = df["TenYearCHD"]

In [10]:
# Outlier detection
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]

In [11]:
# Normalization
scaler = MinMaxScaler()
df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

In [12]:
X.shape

(3390, 15)

In [13]:
y.shape

(3390,)

In [14]:
# Resampling
# X = df.iloc[:, :-1]
# y = df.iloc[:, -1]
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split( X_resampled,y_resampled , test_size = 0.2, random_state = 0) 
print(X_train.shape)
print(X_test.shape)

(4606, 15)
(1152, 15)


In [16]:
# Model selection
models = [XGBClassifier(), RandomForestClassifier(), SVC(), KNeighborsClassifier(), LogisticRegression(), DecisionTreeClassifier()]

In [17]:
# Grid search parameters
Cs = []
for i in range(-5, 6):
    Cs.append(2**i)
logistic_regression_params = {"C": np.logspace(-3,3,7), "penalty": ["l1", "l2"]}
decision_tree_params = {'criterion':['gini','entropy'],'max_depth': np.arange(3, 5), 'min_samples_leaf': np.arange(1, 40)}
svc_params = {"C": Cs, "gamma":Cs, "kernel": ["rbf"]}
knn_params = {
        'n_neighbors': range(3,32),
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
rf_params = { 
        'n_estimators': [10, 50, 200, 300],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : np.arange(3, 32),
        'criterion' :['gini', 'entropy']
    }

xgb_params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [5, 10],
        'learning_rate': [0.01, 0.1, 0.5]
    }

grid_params = [xgb_params, rf_params, svc_params, knn_params, logistic_regression_params, decision_tree_params]

In [18]:
# Model analysis
for model, params in zip(models, grid_params):
    grid_search = GridSearchCV(model, params, cv=5)
    grid_search.fit(X_train, y_train)
    predictions = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    print(f"{grid_search.best_estimator_}")
    print(f"{grid_search.best_estimator_.__class__.__name__}: Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}")

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.5, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
XGBClassifier: Accuracy: 0.96, Precision: 0.93, Recall: 0.99, F1 Score: 0.96
RandomForestClassifier(max_depth=31, n_estimators=300)
RandomForestClassifier: Accuracy: 0.97, Precision: 0.96, Recall: 0.99, F1 Score: 0.97
SVC(C=1, gamma=0.0625)
SVC: Accur

In [19]:
# Feature selection
selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X_resampled, y_resampled)

In [20]:
# Model selection with grid search
pipeline = Pipeline(steps=[('selector', selector),
                           ('classifier', RandomForestClassifier())])

parameters = {'classifier__n_estimators': [100, 200, 300],
              'classifier__max_depth': [10, 20, 30],
              'classifier__min_samples_split': [2, 4, 6]}

grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_resampled, y_resampled)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters:  {'classifier__max_depth': 20, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}
Best score:  0.969780957380056
