In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [None]:
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.model_selection import StratifiedKFold

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC

In [None]:
train_df = pd.read_csv("train_df.csv")

In [None]:
train_df_copy = train_df.copy()

In [None]:
train_df.shape

(306854, 14)

In [None]:
# Build the modelling frame from the untouched copy of the raw load rather than from
# `train_df` itself, so re-running this cell does not fail on already-dropped columns.
train_df = train_df_copy.drop(
    columns=[
        "Height_(cm)",
        "Weight_(kg)",
        "BMI",
        "Alcohol_Consumption_Category",
        "Fruit_Consumption_Category",
        "Green_Vegetables_Consumption_Category",
        "FriedPotato_Consumption_Category",
    ]
)

In [None]:
train_df.shape

(306854, 7)

In [None]:
# Load the held-out test set and discard the same seven columns that were removed
# from the training frame, then display the result.
test_df = pd.read_csv("test_df.csv").drop(
    columns=[
        "Height_(cm)",
        "Weight_(kg)",
        "BMI",
        "Alcohol_Consumption_Category",
        "Fruit_Consumption_Category",
        "Green_Vegetables_Consumption_Category",
        "FriedPotato_Consumption_Category",
    ]
)
test_df

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Sex,Age_Category,Smoking_History
0,Very Good,Within the past 2 years,Yes,No,Male,18-24,Yes
1,Good,Within the past year,No,Yes,Female,80+,No
2,Good,Within the past year,Yes,No,Male,45-49,No
3,Very Good,Within the past 2 years,Yes,Yes,Female,80+,Yes
4,Excellent,Within the past year,Yes,No,Male,25-29,Yes
...,...,...,...,...,...,...,...
1995,Good,Within the past year,No,No,Female,60-64,Yes
1996,Very Good,Within the past year,No,No,Male,80+,Yes
1997,Good,Within the past 2 years,No,Yes,Male,40-44,Yes
1998,Excellent,Within the past year,Yes,No,Male,60-64,No


In [None]:
# Renumber both frames from zero, discarding whatever index they carried before.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Separate the target column from the predictors.
y = train_df["Heart_Disease"]
X = train_df.drop(columns=["Heart_Disease"])

In [None]:
X

Unnamed: 0,General_Health,Checkup,Exercise,Sex,Age_Category,Smoking_History
0,Poor,Within the past 2 years,No,Female,70-74,Yes
1,Very Good,Within the past year,No,Female,70-74,No
2,Very Good,Within the past year,Yes,Female,60-64,No
3,Poor,Within the past year,Yes,Male,75-79,No
4,Good,Within the past year,No,Male,80+,Yes
...,...,...,...,...,...,...
306849,Very Good,Within the past year,Yes,Male,25-29,No
306850,Fair,Within the past 5 years,Yes,Male,65-69,No
306851,Very Good,5 or more years ago,Yes,Female,30-34,Yes
306852,Very Good,Within the past year,Yes,Male,65-69,No


In [None]:
# Fit the label encoder on the training target, then replace `y` with its integer codes.
le = LabelEncoder().fit(y)
y = le.transform(y)
y

array([0, 1, 0, ..., 0, 0, 0])

In [None]:
# Encode the test target with the encoder that was fitted on the training target.
test_df = test_df.assign(Heart_Disease=le.transform(test_df["Heart_Disease"]))

In [None]:
# Columns routed to OrdinalEncoder in the later preprocessors.
ordinal_cols = ["General_Health", "Checkup", "Age_Category"]
# Columns routed to OneHotEncoder / TargetEncoder in the later preprocessors.
# NOTE(review): these look like two-valued columns in the displayed rows — confirm.
label_cols = ["Exercise", "Sex", "Smoking_History"]

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal", OrdinalEncoder(), X.columns)
    ],
    remainder = "passthrough"
)

In [None]:
# Baseline model: the current preprocessor followed by a class-weighted logistic regression.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(class_weight="balanced")),
    ]
)

In [None]:
# 5-fold cross-validated accuracy of the baseline pipeline on the training data.
scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring="accuracy", cv=5)
print("Cross-validation scores:", scores)
print("Mean accuracy:", np.mean(scores))
print("Standard Deviation", np.std(scores))

Cross-validation scores: [0.67962067 0.67944143 0.66313079 0.67497678 0.69191788]
Mean accuracy: 0.6778175096311783
Standard Deviation 0.009257193344089663


In [None]:
pipeline.fit(X, y)

In [None]:
# Predict on the test features (everything except the target) and print per-class metrics.
y_pred = pipeline.predict(test_df.drop("Heart_Disease", axis=1))
print(classification_report(y_true=test_df["Heart_Disease"], y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.74      0.68      0.71      1000
           1       0.70      0.76      0.73      1000

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000



In [None]:
def scorer(model_name, model):
  """Cross-validate `model` behind the current `preprocessor` and score it on `test_df`.

  Reads the notebook globals `preprocessor`, `X`, `y` and `test_df`.

  Class imbalance is handled in exactly one place per model: if the estimator has a
  `class_weight` parameter that is actually set, that setting is trusted; otherwise
  balanced per-row sample weights are computed from `y` and passed to `fit`.

  Args:
    model_name: Label stored as the first element of the result and echoed when done.
    model: Classifier used as the final step of the pipeline.

  Returns:
    A list `[model_name, mean 5-fold CV accuracy, accuracy on test_df]`.
  """
  import inspect

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
  ])

  # Checking only that `class_weight` exists would leave an estimator with
  # `class_weight=None` completely unbalanced, so look at the configured value.
  if model.get_params().get('class_weight') is None:
    sample_weights = compute_sample_weight(class_weight='balanced', y=y)
    fit_kwargs = {'classifier__sample_weight': sample_weights}
  else:
    fit_kwargs = {}

  cv_kwargs = {}
  if fit_kwargs:
    # scikit-learn 1.4 introduced `params` and deprecated `fit_params`;
    # use whichever keyword the installed version accepts.
    accepted = inspect.signature(cross_val_score).parameters
    cv_kwargs['params' if 'params' in accepted else 'fit_params'] = fit_kwargs

  scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy", **cv_kwargs)

  # Refit on the full training data before scoring the held-out test frame.
  pipeline.fit(X, y, **fit_kwargs)
  y_pred = pipeline.predict(test_df.drop(columns = ["Heart_Disease"]))

  output = [model_name, scores.mean(), accuracy_score(test_df["Heart_Disease"], y_pred)]
  print("Models Done: ", model_name)

  return output

In [None]:
# Candidate classifiers for the encoder comparison, keyed by display name.
# Every estimator that uses randomness gets a fixed seed so that repeated benchmark runs
# (one per preprocessor) differ only because of the preprocessing, not because of resampling.
model_dict = {
    "Logistic Regression": LogisticRegression(class_weight = "balanced"),
    "Random Forest": RandomForestClassifier(class_weight = "balanced_subsample", random_state = 42),
    "Gradient Boosting": GradientBoostingClassifier(random_state = 42),
    "XGBoost": XGBClassifier(random_state = 42),
    "extra trees": ExtraTreesClassifier(class_weight = "balanced_subsample", random_state = 42),
    "decision tree": DecisionTreeClassifier(class_weight = "balanced", random_state = 42),
    "adaboost": AdaBoostClassifier(random_state = 42),
}


In [None]:
# Score every candidate model with the currently defined preprocessor.
model_output = [scorer(name, estimator) for name, estimator in model_dict.items()]

Models Done:  Logistic Regression
Models Done:  Random Forest




Models Done:  Gradient Boosting




Models Done:  XGBoost
Models Done:  extra trees
Models Done:  decision tree




Models Done:  adaboost


In [None]:
model_output

[['Logistic Regression', 0.6778175096311783, 0.7195],
 ['Random Forest', 0.716643768431347, 0.7495],
 ['Gradient Boosting', 0.7100477946182991, 0.7525],
 ['XGBoost', 0.7101260139865239, 0.7515],
 ['extra trees', 0.7106343977966695, 0.7495],
 ['decision tree', 0.7105692202203601, 0.7495],
 ['adaboost', 0.7250418897185249, 0.751]]

In [None]:
# Second encoding variant: ordinal codes for `ordinal_cols`, one-hot (first level dropped)
# for `label_cols`.
# NOTE(review): OrdinalEncoder is given no explicit `categories`, so the code order is
# inferred — presumably lexicographic; confirm it matches the intended ranking.
preprocessor = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(), ordinal_cols),
        ("one-hot", OneHotEncoder(drop="first"), label_cols),
    ],
    remainder="passthrough",
)

In [None]:
def scorer(model_name, model):
  """Cross-validate `model` behind the current `preprocessor` and score it on `test_df`.

  Reads the notebook globals `preprocessor`, `X`, `y` and `test_df`.

  Class imbalance is handled in exactly one place per model: if the estimator has a
  `class_weight` parameter that is actually set, that setting is trusted; otherwise
  balanced per-row sample weights are computed from `y` and passed to `fit`.

  Args:
    model_name: Label stored as the first element of the result and echoed when done.
    model: Classifier used as the final step of the pipeline.

  Returns:
    A list `[model_name, mean 5-fold CV accuracy, accuracy on test_df]`.
  """
  import inspect

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
  ])

  # Checking only that `class_weight` exists would leave an estimator with
  # `class_weight=None` completely unbalanced, so look at the configured value.
  if model.get_params().get('class_weight') is None:
    sample_weights = compute_sample_weight(class_weight='balanced', y=y)
    fit_kwargs = {'classifier__sample_weight': sample_weights}
  else:
    fit_kwargs = {}

  cv_kwargs = {}
  if fit_kwargs:
    # scikit-learn 1.4 introduced `params` and deprecated `fit_params`;
    # use whichever keyword the installed version accepts.
    accepted = inspect.signature(cross_val_score).parameters
    cv_kwargs['params' if 'params' in accepted else 'fit_params'] = fit_kwargs

  scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy", **cv_kwargs)

  # Refit on the full training data before scoring the held-out test frame.
  pipeline.fit(X, y, **fit_kwargs)
  y_pred = pipeline.predict(test_df.drop(columns = ["Heart_Disease"]))

  output = [model_name, scores.mean(), accuracy_score(test_df["Heart_Disease"], y_pred)]
  print("Models Done: ", model_name)

  return output

In [None]:
# Score every candidate model with the currently defined preprocessor.
model_output = [scorer(name, estimator) for name, estimator in model_dict.items()]

Models Done:  Logistic Regression
Models Done:  Random Forest




Models Done:  Gradient Boosting




Models Done:  XGBoost
Models Done:  extra trees
Models Done:  decision tree




Models Done:  adaboost


In [None]:
model_output

[['Logistic Regression', 0.6778175096311783, 0.7195],
 ['Random Forest', 0.7159463533263569, 0.7495],
 ['Gradient Boosting', 0.7100477946182991, 0.7525],
 ['XGBoost', 0.7101260139865239, 0.7515],
 ['extra trees', 0.7106343978497713, 0.7495],
 ['decision tree', 0.7105692202203601, 0.7495],
 ['adaboost', 0.7250418897185249, 0.751]]

In [None]:
# Third encoding variant: ordinal codes for `ordinal_cols`, target encoding
# (category_encoders.TargetEncoder) for `label_cols`.
# NOTE(review): OrdinalEncoder is given no explicit `categories`, so the code order is
# inferred — presumably lexicographic; confirm it matches the intended ranking.
preprocessor = ColumnTransformer(
    [
        ("ordinal", OrdinalEncoder(), ordinal_cols),
        ("target", ce.TargetEncoder(), label_cols),
    ],
    remainder="passthrough",
)

In [None]:
def scorer(model_name, model):
  """Cross-validate `model` behind the current `preprocessor` and score it on `test_df`.

  Reads the notebook globals `preprocessor`, `X`, `y` and `test_df`.

  Class imbalance is handled in exactly one place per model: if the estimator has a
  `class_weight` parameter that is actually set, that setting is trusted; otherwise
  balanced per-row sample weights are computed from `y` and passed to `fit`.

  Args:
    model_name: Label stored as the first element of the result and echoed when done.
    model: Classifier used as the final step of the pipeline.

  Returns:
    A list `[model_name, mean 5-fold CV accuracy, accuracy on test_df]`.
  """
  import inspect

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
  ])

  # Checking only that `class_weight` exists would leave an estimator with
  # `class_weight=None` completely unbalanced, so look at the configured value.
  if model.get_params().get('class_weight') is None:
    sample_weights = compute_sample_weight(class_weight='balanced', y=y)
    fit_kwargs = {'classifier__sample_weight': sample_weights}
  else:
    fit_kwargs = {}

  cv_kwargs = {}
  if fit_kwargs:
    # scikit-learn 1.4 introduced `params` and deprecated `fit_params`;
    # use whichever keyword the installed version accepts.
    accepted = inspect.signature(cross_val_score).parameters
    cv_kwargs['params' if 'params' in accepted else 'fit_params'] = fit_kwargs

  scores = cross_val_score(pipeline, X, y, cv=5, scoring="accuracy", **cv_kwargs)

  # Refit on the full training data before scoring the held-out test frame.
  pipeline.fit(X, y, **fit_kwargs)
  y_pred = pipeline.predict(test_df.drop(columns = ["Heart_Disease"]))

  output = [model_name, scores.mean(), accuracy_score(test_df["Heart_Disease"], y_pred)]
  print("Models Done: ", model_name)

  return output

In [None]:
# Score every candidate model with the currently defined preprocessor.
model_output = [scorer(name, estimator) for name, estimator in model_dict.items()]

Models Done:  Logistic Regression
Models Done:  Random Forest




Models Done:  Gradient Boosting




Models Done:  XGBoost
Models Done:  extra trees
Models Done:  decision tree




Models Done:  adaboost


In [None]:
model_output

[['Logistic Regression', 0.6770321238830203, 0.722],
 ['Random Forest', 0.7167741252832289, 0.75],
 ['Gradient Boosting', 0.7100477946182991, 0.7525],
 ['XGBoost', 0.7101260139865239, 0.7515],
 ['extra trees', 0.7106246212451862, 0.7495],
 ['decision tree', 0.7105692202203601, 0.7495],
 ['adaboost', 0.7250418897185249, 0.751]]

In [None]:
X

Unnamed: 0,General_Health,Checkup,Exercise,Sex,Age_Category,Smoking_History
0,Poor,Within the past 2 years,No,Female,70-74,Yes
1,Very Good,Within the past year,No,Female,70-74,No
2,Very Good,Within the past year,Yes,Female,60-64,No
3,Poor,Within the past year,Yes,Male,75-79,No
4,Good,Within the past year,No,Male,80+,Yes
...,...,...,...,...,...,...
69535,Very Good,Within the past year,Yes,Male,18-24,No
69536,Good,Within the past year,Yes,Female,30-34,Yes
69537,Fair,Within the past year,No,Male,70-74,Yes
69538,Poor,Within the past year,No,Female,70-74,No


In [None]:
# Per-row weights computed from the encoded target with class_weight='balanced';
# the Optuna objective slices them per fold and the final fit uses them in full.
sample_weights = compute_sample_weight(class_weight='balanced', y=y)

In [None]:
# Preprocessor used for hyperparameter tuning: every feature column is ordinal-encoded.
# NOTE(review): no explicit `categories` are supplied, so the code order is inferred —
# presumably lexicographic; confirm.
preprocessor = ColumnTransformer(
    [("ordinal", OrdinalEncoder(), X.columns)],
    remainder="passthrough",
)

# Hyperparameter Tuning Using Optuna

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.3-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.2/233.2 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: mean stratified 5-fold accuracy of an XGBoost pipeline.

    Reads the notebook globals `preprocessor`, `X`, `y` and `sample_weights`.

    Args:
        trial: Optuna trial used to sample one hyperparameter configuration.

    Returns:
        Mean of the per-fold `pipeline.score` values (accuracy) on the validation folds.
    """
    # Hyperparameter search space.
    # `scale_pos_weight` is intentionally not searched: class imbalance is already corrected
    # by the balanced `sample_weights` passed to `fit`, and stacking a second positive-class
    # multiplier on top of that only skews predictions and wastes trials.
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 10.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
    }

    # Fixed seed: row/column subsampling is random, and trial scores should be
    # comparable with each other rather than differ by sampling noise.
    model = XGBClassifier(**param, eval_metric="logloss", random_state=42)

    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Same shuffled stratified folds for every trial.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    for train_index, val_index in skf.split(X, y):
        X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_fold_val = y[train_index], y[val_index]
        # Only the training rows' weights are needed; validation is scored unweighted.
        sample_weight_fold = sample_weights[train_index]

        pipeline.fit(X_fold_train, y_fold_train, classifier__sample_weight=sample_weight_fold)
        scores.append(pipeline.score(X_fold_val, y_fold_val))

    return np.mean(scores)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2024-10-04 12:09:37,513] A new study created in memory with name: no-name-fbe8daf7-d50e-4c10-a11c-73344526b782
[I 2024-10-04 12:10:20,767] Trial 0 finished with value: 0.3949630706431475 and parameters: {'n_estimators': 570, 'learning_rate': 0.11914415881203656, 'max_depth': 5, 'min_child_weight': 10, 'gamma': 1.2183707466793459, 'subsample': 0.9643219591589657, 'colsample_bytree': 0.7868418140507965, 'reg_alpha': 6.832848306054623, 'reg_lambda': 5.1177795599289535, 'scale_pos_weight': 6.674921545235186}. Best is trial 0 with value: 0.3949630706431475.
[I 2024-10-04 12:13:20,082] Trial 1 finished with value: 0.5502779608269227 and parameters: {'n_estimators': 931, 'learning_rate': 0.12707899736340594, 'max_depth': 11, 'min_child_weight': 6, 'gamma': 0.14793889996103093, 'subsample': 0.6649766444311676, 'colsample_bytree': 0.9862677896436871, 'reg_alpha': 2.5093838552473304, 'reg_lambda': 6.3068572220216925, 'scale_pos_weight': 2.631596312469978}. Best is trial 1 with value: 0.550277

In [None]:
# Report the best objective value found by the study and the parameters that produced it.
print("Best trial accuracy: ", study.best_trial.value)
print("Best hyperparameters: ", study.best_trial.params)

Best trail accuracy:  0.7167219687890647
Best hyperparameters:  {'n_estimators': 468, 'learning_rate': 0.2668166626325435, 'max_depth': 15, 'min_child_weight': 3, 'gamma': 6.479045905238172, 'subsample': 0.9336730118143985, 'colsample_bytree': 0.5558097631052673, 'reg_alpha': 8.822955565702205, 'reg_lambda': 5.897575179998551, 'scale_pos_weight': 1.004249325118252}


In [None]:
# Rebuild XGBoost with the best parameters found by the study, refit on the full
# training data with the balanced sample weights, and report metrics on the test frame.
best_model = XGBClassifier(eval_metric="logloss", **study.best_trial.params)

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", best_model),
    ]
)
pipeline.fit(X, y, classifier__sample_weight=sample_weights)

y_pred = pipeline.predict(test_df.drop("Heart_Disease", axis=1))
print(classification_report(test_df["Heart_Disease"], y_pred))

              precision    recall  f1-score   support

           0       0.79      0.70      0.74      1000
           1       0.73      0.81      0.77      1000

    accuracy                           0.75      2000
   macro avg       0.76      0.75      0.75      2000
weighted avg       0.76      0.75      0.75      2000



In [None]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [None]:
plot_optimization_history(study).show()

In [None]:
plot_parallel_coordinate(study).show()

In [None]:
plot_slice(study).show()