In [75]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import pickle

### Preprocessing

In [2]:
df = pd.read_csv('dataset.csv')

In [3]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
df.drop('education', axis=1, inplace=True)

In [5]:
df.describe()

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4240.0,4240.0,4240.0,4211.0,4187.0,4240.0,4240.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,0.494104,9.005937,0.029615,0.005896,0.310613,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.151887
std,0.495027,8.572942,0.500024,11.922462,0.169544,0.076569,0.462799,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.358953
min,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,0.0
max,1.0,70.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [6]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [7]:
# binary columns
bin_cols = ['male', 'currentSmoker', 'prevalentStroke', 'prevalentHyp', 'diabetes']

# fill missing values with mode
for col in bin_cols:
    mode_val = df[col].mode()[0]
    df[col].fillna(mode_val, inplace=True)

In [8]:
# fill missing values with median for numeric columns
num_cols = ['cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'heartRate', 'glucose']
for col in num_cols:
    median_val = df[col].median()
    df[col].fillna(median_val, inplace=True)


In [9]:
df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [10]:
df['TenYearCHD'].value_counts()

TenYearCHD
0    3596
1     644
Name: count, dtype: int64

In [11]:
# resample data to balance classes

df_majority = df[df['TenYearCHD'] == 0]
df_minority = df[df['TenYearCHD'] == 1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=123) 

# combine majority and upsampled minority
df_balanced = pd.concat([df_minority_upsampled, df_majority])

In [12]:
df_balanced['TenYearCHD'].value_counts()

TenYearCHD
1    3596
0    3596
Name: count, dtype: int64

In [13]:
# split

x = df_balanced.drop(columns = ['TenYearCHD'])
y = df_balanced['TenYearCHD']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [14]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((5753, 14), (1439, 14), (5753,), (1439,))

In [15]:
# scale data
scaler = StandardScaler()

# fit scaler
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [16]:
x_train

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
3135,0,49,0,0.0,0.0,0,0,0,286.0,144.0,91.0,29.35,65.0,67.0
1278,1,53,1,30.0,0.0,0,0,0,253.0,121.0,85.5,28.52,80.0,68.0
259,0,60,0,0.0,0.0,0,1,0,352.0,197.5,105.0,36.29,75.0,95.0
2362,1,58,0,0.0,0.0,0,0,0,200.0,128.0,83.0,29.63,68.0,80.0
2873,0,59,0,0.0,0.0,0,1,0,240.0,149.0,88.0,27.48,80.0,82.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
219,0,48,0,0.0,0.0,0,0,0,196.0,96.0,70.0,22.72,60.0,68.0
1881,0,55,0,0.0,0.0,0,0,0,310.0,135.0,76.5,26.31,110.0,74.0
1920,1,51,1,20.0,0.0,0,0,0,256.0,130.0,75.0,28.76,60.0,83.0
2116,0,39,1,9.0,0.0,0,0,0,180.0,113.0,73.0,17.65,70.0,73.0


In [17]:
x_train_scaled

array([[-0.9666601 , -0.28580492, -1.00540309, ...,  0.76561046,
        -0.92978078, -0.58792113],
       [ 1.03448979,  0.17674117,  0.99462595, ...,  0.57120618,
         0.31971983, -0.55318302],
       [-0.9666601 ,  0.98619683, -1.00540309, ...,  2.39111134,
        -0.09678037,  0.38474579],
       ...,
       [ 1.03448979, -0.05453188,  0.99462595, ...,  0.62741946,
        -1.34628098, -0.03211146],
       [-0.9666601 , -1.44217015,  0.99462595, ..., -1.97478728,
        -0.51328057, -0.3794925 ],
       [-0.9666601 ,  1.44874292, -1.00540309, ..., -0.07290441,
        -0.76318069, -0.62265923]])

### Training

In [49]:
classifiers = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoostingClassifier(n_estimators=100, random_state=42),
    AdaBoostClassifier(n_estimators=100, random_state=42),
    LogisticRegression(random_state=42),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    KNeighborsClassifier(),
    GaussianNB(),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
]

In [50]:
results = {}

In [51]:
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(x_train_scaled, y_train)
    pred = clf.predict(x_test_scaled)
    
    # accuracy
    accuracy = accuracy_score(y_test, pred)
    cm = confusion_matrix(y_test, pred)
    cr = classification_report(y_test, pred, output_dict=True)
    f1_score = cr['weighted avg']['f1-score']
    precision = cr['weighted avg']['precision']
    recall = cr['weighted avg']['recall']
    
    results[clf_name] = {
        'accuracy': accuracy,
        'confusion_matrix': cm,
        'f1_score': f1_score,
        'precision': precision,
        'recall': recall
    }  
    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [52]:
# Print results for each classifier
for clf_name, result in results.items():
    print(f"Classifier: {clf_name}")
    print(f"Accuracy: {result['accuracy']}")
    print(f"Confusion Matrix:\n{result['confusion_matrix']}")
    print(f"F1 Score: {result['f1_score']}")
    print(f"Precision: {result['precision']}")
    print(f"Recall: {result['recall']}")
    print()

Classifier: RandomForestClassifier
Accuracy: 0.9784572619874913
Confusion Matrix:
[[677  27]
 [  4 731]]
F1 Score: 0.9784443226239578
Precision: 0.9789327072730019
Recall: 0.9784572619874913

Classifier: GradientBoostingClassifier
Accuracy: 0.7456567060458652
Confusion Matrix:
[[499 205]
 [161 574]]
F1 Score: 0.7452502683451245
Precision: 0.7462443460980457
Recall: 0.7456567060458652

Classifier: AdaBoostClassifier
Accuracy: 0.7067407922168172
Confusion Matrix:
[[493 211]
 [211 524]]
F1 Score: 0.7067407922168172
Precision: 0.7067407922168172
Recall: 0.7067407922168172

Classifier: LogisticRegression
Accuracy: 0.6706045865184156
Confusion Matrix:
[[473 231]
 [243 492]]
F1 Score: 0.6706408614436816
Precision: 0.6707703966657808
Recall: 0.6706045865184156

Classifier: SVC
Accuracy: 0.7303683113273106
Confusion Matrix:
[[504 200]
 [188 547]]
F1 Score: 0.7303010621444926
Precision: 0.7303354661561404
Recall: 0.7303683113273106

Classifier: DecisionTreeClassifier
Accuracy: 0.9284225156358582

In [53]:
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by='accuracy', ascending=False)

In [54]:
results_df

Unnamed: 0,accuracy,confusion_matrix,f1_score,precision,recall
RandomForestClassifier,0.978457,"[[677, 27], [4, 731]]",0.978444,0.978933,0.978457
DecisionTreeClassifier,0.928423,"[[604, 100], [3, 732]]",0.92799,0.936191,0.928423
XGBClassifier,0.926338,"[[615, 89], [17, 718]]",0.926073,0.93051,0.926338
KNeighborsClassifier,0.808895,"[[485, 219], [56, 679]]",0.805923,0.824794,0.808895
GradientBoostingClassifier,0.745657,"[[499, 205], [161, 574]]",0.74525,0.746244,0.745657
SVC,0.730368,"[[504, 200], [188, 547]]",0.730301,0.730335,0.730368
AdaBoostClassifier,0.706741,"[[493, 211], [211, 524]]",0.706741,0.706741,0.706741
LogisticRegression,0.670605,"[[473, 231], [243, 492]]",0.670641,0.67077,0.670605
GaussianNB,0.601807,"[[627, 77], [496, 239]]",0.568094,0.65946,0.601807


##### Best Model

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train_scaled, y_train)

In [55]:
pred = rf.predict(x_test_scaled)
pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [56]:
accuracy = accuracy_score(y_test, pred)
cm = confusion_matrix(y_test, pred)
cr = classification_report(y_test, pred)

In [57]:
accuracy

0.9784572619874913

In [58]:
cm

array([[677,  27],
       [  4, 731]], dtype=int64)

In [60]:
print(cr)

              precision    recall  f1-score   support

           0       0.99      0.96      0.98       704
           1       0.96      0.99      0.98       735

    accuracy                           0.98      1439
   macro avg       0.98      0.98      0.98      1439
weighted avg       0.98      0.98      0.98      1439



In [68]:
print("predicted class", rf.predict(x_test_scaled[10].reshape(1, -1))[0])
print("actual class", y_test.iloc[10])

predicted class 1
actual class 1


In [70]:
print("predicted class", rf.predict(x_test_scaled[200].reshape(1, -1))[0])
print("actual class", y_test.iloc[200])

predicted class 0
actual class 0


In [73]:
pickle.dump(rf, open('Models/random_forest_model.pkl', 'wb'))
pickle.dump(scaler, open('Models/scaler.pkl', 'wb'))

In [None]:
# Load
with open('Models/random_forest_model.pkl', 'rb') as f:
    rf_loaded = pickle.load(f)
    
with open('Models/scaler.pkl', 'rb') as f:
    scaler_loaded = pickle.load(f)

In [93]:
def predict(rf, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
    male_encoded = 1 if male.lower() == 'male' else 0
    currentSmoker_encoded = 1 if currentSmoker.lower() == 'yes' else 0
    prevalentStroke_encoded = 1 if prevalentStroke.lower() == 'yes' else 0
    prevalentHyp_encoded = 1 if prevalentHyp.lower() == 'yes' else 0
    diabetes_encoded = 1 if diabetes.lower() == 'yes' else 0
    BPMeds_encoded = 1 if BPMeds.lower() == 'yes' else 0
    
    # feature array
    features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])
    
    # scaling
    scaled_features = scaler.transform(features)
    
    # prediction
    result = rf.predict(scaled_features)
    
    return result[0]

In [94]:
# testing
male = 'female'
age = 50
currentSmoker = 'yes'
cigsPerDay = 3.00
BPMeds = 'no'
prevalentStroke = 'no'
prevalentHyp = 'yes'
diabetes = 'no'
totChol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 85.00

In [98]:
result = predict(rf, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)
if result == 1:
    print("The model predicts that the patient is likely to have a heart disease in 10 years.")
else:
    print("The model predicts that the patient is unlikely to have a heart disease in 10 years.")

The model predicts that the patient is unlikely to have a heart disease in 10 years.




In [1]:
import sklearn
sklearn.__version__

'1.5.0'