In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures

In [2]:
# Location of the cleaned CVD dataset. NOTE: this is an absolute path on the
# author's machine; anyone else must edit it before running the notebook.
DATA_PATH = "c:/Users/Userpc/Desktop/Projects/ML Shortcuts/Classification/CVD_cleaned.csv"
heart_data = pd.read_csv(DATA_PATH)

In [3]:
# Show the dtype of every column: the object-dtype columns are the ones that
# get one-hot encoded later, the float64 columns are used as numeric features.
heart_data.dtypes

General_Health                   object
Checkup                          object
Exercise                         object
Heart_Disease                    object
Skin_Cancer                      object
Other_Cancer                     object
Depression                       object
Diabetes                         object
Arthritis                        object
Sex                              object
Age_Category                     object
Height_(cm)                     float64
Weight_(kg)                     float64
BMI                             float64
Smoking_History                  object
Alcohol_Consumption             float64
Fruit_Consumption               float64
Green_Vegetables_Consumption    float64
FriedPotato_Consumption         float64
dtype: object

In [4]:
# Features: every column except the target and the raw height/weight columns
# (BMI stays in the feature set).
X = heart_data.drop(columns=["Height_(cm)", "Weight_(kg)", "Heart_Disease"])
# Target: the Heart_Disease column, kept in its original (object) form.
y = heart_data.loc[:, "Heart_Disease"]

In [5]:
# Number of distinct values in each object-dtype feature, i.e. how many
# indicator columns one-hot encoding will create per feature.
X.select_dtypes(include='object').nunique()

General_Health      5
Checkup             5
Exercise            2
Skin_Cancer         2
Other_Cancer        2
Depression          2
Diabetes            4
Arthritis           2
Sex                 2
Age_Category       13
Smoking_History     2
dtype: int64

In [6]:
# Features that are one-hot encoded by the preprocessing steps below.
categorical_columns = [
    "General_Health",
    "Checkup",
    "Exercise",
    "Skin_Cancer",
    "Other_Cancer",
    "Depression",
    "Diabetes",
    "Arthritis",
    "Sex",
    "Age_Category",
    "Smoking_History",
]
# Every remaining column of X is treated as numeric. Index.difference returns
# the remaining names as a sorted pandas Index.
numerical_columns = X.columns.difference(categorical_columns)

In [7]:
def get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test):
    """Fit a decision-tree pipeline and return its predictions for ``X_test``.

    Despite the name, no mean absolute error is computed here: the function
    returns the predicted labels and leaves scoring to the caller.

    Parameters
    ----------
    max_leaf_nodes : int
        Passed to ``DecisionTreeClassifier(max_leaf_nodes=...)``.
    X_train, X_test : pandas.DataFrame
        Feature frames. They must contain the columns named in the
        notebook-level ``categorical_columns`` and ``numerical_columns``,
        which this function reads as globals.
    y_train : array-like
        Training labels.
    y_test : array-like
        Unused; kept only so existing call sites keep working.

    Returns
    -------
    array-like
        Labels predicted by the fitted pipeline for ``X_test``.
    """
    # Standardise only the numeric columns, inside the ColumnTransformer.
    # Running a default StandardScaler (with_mean=True) over the whole
    # transformer output is fragile: OneHotEncoder emits a sparse matrix, and
    # whenever the stacked result is sparse enough to stay sparse the scaler
    # raises. It would also rescale the 0/1 indicator columns for no benefit.
    preprocessor = ColumnTransformer(transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('num', StandardScaler(), numerical_columns)
    ])

    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=0))
    ])

    pipe.fit(X_train, y_train)
    return pipe.predict(X_test)

# Hold out 10% of the rows for evaluation; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, random_state=0
)

# Try several tree sizes and report the held-out accuracy of each, in order.
# Note that `my_mae` holds the predicted labels returned by get_mae, not an error value.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(max_leaf_nodes, X_train, X_test, y_train, y_test)
    accuracy = accuracy_score(y_test, my_mae)
    print(f"Accuracy: {accuracy:.6f}")

Accuracy: 0.919316
Accuracy: 0.918992
Accuracy: 0.916532
Accuracy: 0.907175


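It seems that 5 leaf nodes gives the best test accuracy among the values tried (5, 50, 500, 5000), although the gap to 50 leaf nodes is small (0.9193 vs. 0.9190).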

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Preprocessing for the ensemble models: numeric columns pass through
# unchanged and categorical columns are one-hot encoded. Categories not seen
# during fit are ignored at predict time (handle_unknown='ignore').
# `preprocessor` is reused by later cells, so keep the name.
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numerical_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
])

# Random forest with default hyper-parameters; the seed is fixed for reproducibility.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=0))
])

y_pred = pipeline.fit(X_train, y_train).predict(X_test)

rf_accuracy = accuracy_score(y_test, y_pred)
# Label typo ("Fores") fixed, and the stray space removed from the format spec
# (": .6f" is the sign option and printed an extra leading space), so the line
# now matches the other accuracy printouts in the notebook.
print(f"Random Forest accuracy: {rf_accuracy:.6f}")

Random Fores accuracy:  0.915949


In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting on the same train/test split, reusing the `preprocessor`
# ColumnTransformer defined earlier in the notebook.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        random_state=0,
    )),
])

# Fit on the training rows, then predict the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

gb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Gradient Boosting accuracy: {gb_accuracy:.6f}")

Gradient Boosting accuracy: 0.918636
