In [197]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [198]:
# Read the heart_cleveland_upload.csv file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where the file exists at exactly this location.
csv_path = "/Users/mdnaif/Desktop/Winter_Arc_ML/Day06_XGBoost/heart_cleveland_upload.csv"
df = pd.read_csv(csv_path)

In [199]:
from sklearn.model_selection import train_test_split

# Columns routed to the one-hot encoder in the preprocessing step.
cat_coln = [
    "sex",
    "cp",
    "fbs",
    "restecg",
    "exang",
    "slope",
    "ca",
    "thal",
]
# Columns passed through to the model unchanged.
num_coln = [
    "age",
    "trestbps",
    "chol",
    "thalach",
    "oldpeak",
]

# Feature matrix keeps the categorical columns first, then the numeric ones.
feature_columns = cat_coln + num_coln
X = df[feature_columns]
y = df["condition"]

# 80/20 hold-out split, stratified on the target so both parts keep the
# same class proportions; seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [200]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# One-hot encode the categorical columns; categories unseen during fit are
# ignored at transform time instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessing = ColumnTransformer(
    transformers=[
        ("cat_col", categorical_encoder, cat_coln),
        ("num_col", "passthrough", num_coln),
    ]
)

# Gradient-boosted tree classifier with a fixed seed.
xgb_classifier = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=42,
)

# make_pipeline derives step names from the class names
# ("columntransformer", "xgbclassifier").
pipeline = make_pipeline(preprocessing, xgb_classifier)

In [201]:
# Display the auto-generated step names; the "xgbclassifier" key is the prefix
# used for the "xgbclassifier__<param>" names in the hyper-parameter search.
pipeline.named_steps

{'columntransformer': ColumnTransformer(transformers=[('cat_col',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  ['sex', 'cp', 'fbs', 'restecg', 'exang',
                                   'slope', 'ca', 'thal']),
                                 ('num_col', 'passthrough',
                                  ['age', 'trestbps', 'chol', 'thalach',
                                   'oldpeak'])]),
 'xgbclassifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='logloss',
               feature_types=None, feature_weights=None, gamma=None,
               grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_oneho

In [202]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation with shuffling, seeded so the fold
# assignment is the same on every run.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [203]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the XGBoost step; keys use the "<step>__<param>"
# convention so the search can set parameters inside the pipeline.
param_distribution = {
    "xgbclassifier__learning_rate": [0.1, 0.05, 0.01],
    "xgbclassifier__max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
    "xgbclassifier__n_estimators": [200, 300, 400, 500, 600, 700, 800, 900, 1000],
    "xgbclassifier__colsample_bytree": [0.6, 0.8, 1.0],
    "xgbclassifier__subsample": [0.6, 0.7, 0.8, 0.9],
    "xgbclassifier__min_child_weight": [1, 2, 3, 4, 5],
}

# Randomized search: 50 parameter combinations sampled from the lists above,
# each scored with F1 over the stratified folds in `cv`.
# random_state fixes which 50 candidates are drawn; without it every run
# samples a different set, so best_params_ / best_model are not reproducible
# even though the split, the folds and the classifier are all seeded.
grid = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distribution,
    scoring="f1",
    cv=cv,
    n_iter=50,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

grid.fit(X_train, y_train)

# Pipeline with the best-scoring parameters found by the search.
best_model = grid.best_estimator_
print(grid.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'xgbclassifier__subsample': 0.6, 'xgbclassifier__n_estimators': 400, 'xgbclassifier__min_child_weight': 5, 'xgbclassifier__max_depth': 9, 'xgbclassifier__learning_rate': 0.05, 'xgbclassifier__colsample_bytree': 0.6}


In [204]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

# Label text (including its spelling) is kept exactly as in the recorded output.
print(f"Acccuracy : \n{test_accuracy}\n")
print(f"Confusion_matrix : \n{test_confusion}\n")
print(f"Classification_report : \n{test_report}\n")

Acccuracy : 
0.8833333333333333

Confusion_matrix : 
[[30  2]
 [ 5 23]]

Classification_report : 
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        32
           1       0.92      0.82      0.87        28

    accuracy                           0.88        60
   macro avg       0.89      0.88      0.88        60
weighted avg       0.89      0.88      0.88        60




In [205]:
# Compare the score on the training split with the score on the test split
# as a quick over-/under-fitting check.
for split_label, split_features, split_target in (
    ("Train acc:", X_train, y_train),
    ("Test acc :", X_test, y_test),
):
    print(split_label, best_model.score(split_features, split_target))

Train acc: 0.8734177215189873
Test acc : 0.8833333333333333
