In [3]:
import pandas as pd
df = pd.read_csv('/content/datasets_4123_6408_framingham.csv')
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
df.shape

(4240, 16)

In [5]:
df.isnull().sum()

Unnamed: 0,0
male,0
age,0
education,105
currentSmoker,0
cigsPerDay,29
BPMeds,53
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,50


In [6]:
df.drop('education',axis=1,inplace=True)
df.isnull().sum()

Unnamed: 0,0
male,0
age,0
currentSmoker,0
cigsPerDay,29
BPMeds,53
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,50
sysBP,0


In [7]:
from sklearn.impute import SimpleImputer

bin_cols = ["male","currentSmoker","prevalentStroke","prevalentHyp","diabetes"]

# Create a SimpleImputer instance with 'most_frequent' strategy
imputer = SimpleImputer(strategy='most_frequent')

# Fit the imputer on the binary columns and transform them
df[bin_cols] = imputer.fit_transform(df[bin_cols])
df.isnull().sum()

Unnamed: 0,0
male,0
age,0
currentSmoker,0
cigsPerDay,29
BPMeds,53
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,50
sysBP,0


In [8]:
from sklearn.impute import SimpleImputer

numeric_cols = ["cigsPerDay","totChol","BMI","heartRate","glucose","BPMeds"]

# Create a SimpleImputer instance with 'median' strategy
imputer = SimpleImputer(strategy='median')

# Fit the imputer on the numeric columns and transform them
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
df.isnull().sum()

Unnamed: 0,0
male,0
age,0
currentSmoker,0
cigsPerDay,0
BPMeds,0
prevalentStroke,0
prevalentHyp,0
diabetes,0
totChol,0
sysBP,0


In [9]:
df['TenYearCHD'].value_counts()

Unnamed: 0_level_0,count
TenYearCHD,Unnamed: 1_level_1
0,3596
1,644


In [10]:
from sklearn.utils import resample

#Separating majority and minority classes
df_majority = df[df['TenYearCHD']==0]
df_minority = df[df['TenYearCHD']==1]

#Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

# Combine majority class with upsampled minority class
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_balanced['TenYearCHD'].value_counts()

Unnamed: 0_level_0,count
TenYearCHD,Unnamed: 1_level_1
0,3596
1,3596


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_balanced.drop('TenYearCHD', axis=1)
y = df_balanced['TenYearCHD']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# initialize standardscalar
scaler = StandardScaler()

# fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# transform the testing data
X_test_scaled = scaler.transform(X_test)

In [14]:
X_train_scaled

array([[-0.94172615,  1.4582083 , -1.02624378, ...,  0.840088  ,
        -0.50731448, -0.34072887],
       [-0.94172615, -1.66694628,  0.97442734, ..., -1.050363  ,
        -0.0898974 ,  0.25133934],
       [-0.94172615,  1.34246184,  0.97442734, ...,  0.42574258,
        -1.34214864, -0.34072887],
       ...,
       [ 1.06187982,  1.92119417, -1.02624378, ..., -0.6972277 ,
        -1.34214864, -0.18492145],
       [ 1.06187982,  1.68970124,  0.97442734, ...,  0.59289329,
        -0.0898974 ,  4.08420194],
       [ 1.06187982, -0.74097455,  0.97442734, ...,  1.0660946 ,
         0.32751968,  1.84057505]])

In [15]:
X_train

Unnamed: 0,male,age,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
1905,0,64,0,0.0,0.0,0,1,0,229.0,145.0,85.0,29.67,70.0,74.0
2075,0,37,1,20.0,0.0,0,0,0,166.0,112.0,73.5,21.64,75.0,93.0
1128,0,63,1,10.0,0.0,0,1,0,236.0,189.0,103.0,27.91,60.0,74.0
1782,0,65,0,0.0,0.0,0,1,0,245.0,171.0,89.0,23.07,82.0,93.0
241,1,65,1,15.0,0.0,0,1,0,219.0,148.0,90.0,29.35,77.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,1,63,1,20.0,0.0,0,1,0,269.0,180.0,101.0,24.42,72.0,84.0
485,1,54,1,40.0,0.0,0,0,0,230.0,145.0,90.0,25.72,75.0,85.0
4232,1,68,0,0.0,0.0,0,1,0,176.0,168.0,97.0,23.14,60.0,79.0
952,1,66,1,30.0,0.0,0,0,1,234.0,114.5,62.5,28.62,75.0,216.0


In [20]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
# Define a list of classifiers
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    XGBClassifier()
]

In [24]:
# Create a dictionary to store the results
results = {}

#create a dataframe to store the results
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])

# Train and evaluate each classifier
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy}")

    # Classification report
    print(f"Classification Report for {clf_name}:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix
    print(f"Confusion Matrix for {clf_name}:")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)

    #calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']

    #append results to dataframe
    results_df = pd.concat([results_df, pd.DataFrame({'Model': [clf_name], 'Accuracy': [accuracy], 'F1-Score': [f1_score], 'Precision': [precision], 'Recall': [recall]})], ignore_index=True)


RandomForestClassifier Accuracy: 0.970813064628214
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for RandomForestClassifier:
[[698  37]
 [  5 699]]


  results_df = pd.concat([results_df, pd.DataFrame({'Model': [clf_name], 'Accuracy': [accuracy], 'F1-Score': [f1_score], 'Precision': [precision], 'Recall': [recall]})], ignore_index=True)


AdaBoostClassifier Accuracy: 0.6719944405837387
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.69      0.66      0.67       735
           1       0.66      0.68      0.67       704

    accuracy                           0.67      1439
   macro avg       0.67      0.67      0.67      1439
weighted avg       0.67      0.67      0.67      1439

Confusion Matrix for AdaBoostClassifier:
[[486 249]
 [223 481]]
GradientBoostingClassifier Accuracy: 0.7289784572619875
Classification Report for GradientBoostingClassifier:
              precision    recall  f1-score   support

           0       0.76      0.69      0.72       735
           1       0.70      0.77      0.74       704

    accuracy                           0.73      1439
   macro avg       0.73      0.73      0.73      1439
weighted avg       0.73      0.73      0.73      1439

Confusion Matrix for GradientBoostingClassifier:
[[508 227]
 [163 541]]
Logist

In [25]:
results_df

Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,0.970813,0.970813,0.971773,0.970813
1,AdaBoostClassifier,0.671994,0.672015,0.672474,0.671994
2,GradientBoostingClassifier,0.728978,0.728702,0.73132,0.728978
3,LogisticRegression,0.658791,0.65883,0.659053,0.658791
4,SVC,0.683113,0.683126,0.683656,0.683113
5,KNeighborsClassifier,0.787352,0.783833,0.812481,0.787352
6,DecisionTreeClassifier,0.913134,0.912693,0.92499,0.913134
7,GaussianNB,0.583044,0.530092,0.635597,0.583044
8,XGBClassifier,0.906185,0.905977,0.912148,0.906185


In [26]:
# run the best model
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

confusion_mat = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_mat)

classification_rep = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_rep)

Accuracy: 0.9735927727588604
Confusion Matrix:
[[702  33]
 [  5 699]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439



In [27]:
 # test1
 print("predicted class: ",rf_model.predict(X_test_scaled[10].reshape(1,-1))[0])
 print("actual class: ",y_test.iloc[10])

predicted class:  0
actual class:  0


In [28]:
 # test2
 print("predicted class: ",rf_model.predict(X_test_scaled[200].reshape(1,-1))[0])
 print("actual class: ",y_test.iloc[200])

predicted class:  1
actual class:  1


In [29]:
import pickle
pickle.dump(rf_model,open('rf_classfier.pkl','wb'))
pickle.dump(scaler,open('scaler.pkl','wb'))

In [30]:
# Load the RandomForestClassifier model
with open('rf_classfier.pkl', 'rb') as model_file:
    rf_model = pickle.load(model_file)

# Load the scaler
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [31]:
import numpy as np

def predict(rf_model, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
  # encode categorical variables
  male_encoded = 1 if male.lower() == 'male' else 0
  currentSmoker_encoded = 1 if currentSmoker.lower() == 'yes' else 0
  BPMeds_encoded = 1 if BPMeds.lower() == 'yes' else 0
  prevalentStroke_encoded = 1 if prevalentStroke.lower() == 'yes' else 0
  prevalentHyp_encoded = 1 if prevalentHyp.lower() == 'yes' else 0
  diabetes_encoded = 1 if diabetes.lower() == 'yes' else 0

  # prepare features array
  features = np.array([[male_encoded, age, currentSmoker_encoded, cigsPerDay, BPMeds_encoded, prevalentStroke_encoded, prevalentHyp_encoded, diabetes_encoded, totChol, sysBP, diaBP, BMI, heartRate, glucose]])

  # scaling
  features_scaled = scaler.transform(features)

  # predict
  prediction = rf_model.predict(features_scaled)

  # return prediction
  return prediction[0]

In [33]:
#test1
male = 'female'
age = 56.00
currentSmoker = "yes"
cigsPerDay = 3.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = "no"
totChol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 86.00

result = predict(rf_model, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)

if(result==1):
  print("The patient has heart disease")
else:
  print("The patient does not have heart disease")

The patient does not have heart disease


