In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('datasets_4123_6408_framingham.csv')

In [3]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
df.drop('education',axis=1,inplace=True)

In [5]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [6]:
bin_cols=['male','currentSmoker','prevalentStroke','prevalentHyp','diabetes']
for col in bin_cols:
    mode_value=df[col].mode()[0]
    df.fillna({col:mode_value},inplace=True)

In [7]:
df.isnull().sum()

male                 0
age                  0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [8]:
numeric_cols=['cigsPerDay','BPMeds','totChol','BMI','heartRate','glucose']
for col in numeric_cols:
    median_val=df[col].median()
    df.fillna({col:median_val},inplace=True)

In [9]:
df.isnull().sum()

male               0
age                0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [10]:
df['TenYearCHD'].value_counts()

TenYearCHD
0    3596
1     644
Name: count, dtype: int64

In [11]:
from sklearn.utils import resample

In [12]:
df_majority=df[df['TenYearCHD']==0]
df_minority=df[df['TenYearCHD']==1]
df_minority_upsamples=resample(df_minority,replace=True,n_samples=len(df_majority),random_state=42)
df_balanced=pd.concat([df_majority,df_minority_upsamples])

In [13]:
df_balanced['TenYearCHD'].value_counts()

TenYearCHD
0    3596
1    3596
Name: count, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X=df_balanced.drop('TenYearCHD',axis=1)
y=df_balanced['TenYearCHD']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [15]:
scaler=StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.fit_transform(X_test)

In [16]:
X_train_scaled

array([[-0.94172615,  1.4582083 , -1.02624378, ...,  0.840088  ,
        -0.50731448, -0.34072887],
       [-0.94172615, -1.66694628,  0.97442734, ..., -1.050363  ,
        -0.0898974 ,  0.25133934],
       [-0.94172615,  1.34246184,  0.97442734, ...,  0.42574258,
        -1.34214864, -0.34072887],
       ...,
       [ 1.06187982,  1.92119417, -1.02624378, ..., -0.6972277 ,
        -1.34214864, -0.18492145],
       [ 1.06187982,  1.68970124,  0.97442734, ...,  0.59289329,
        -0.0898974 ,  4.08420194],
       [ 1.06187982, -0.74097455,  0.97442734, ...,  1.0660946 ,
         0.32751968,  1.84057505]])

In [17]:
from sklearn.ensemble import RandomForestClassifier
rf_model=RandomForestClassifier()

In [18]:
rf_model.fit(X_train_scaled,y_train)

In [19]:
y_pred=rf_model.predict(X_test_scaled)

In [20]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [21]:
accuracy_score(y_test,y_pred)

0.965948575399583

In [22]:
confusion_matrix(y_test,y_pred)

array([[696,  39],
       [ 10, 694]], dtype=int64)

In [23]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439



In [24]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [25]:
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    XGBClassifier()
]

# Create a dictionary to store the results
results = {}


# Train and evaluate each classifier
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} Accuracy: {accuracy}")
    
    # Classification report
    print(f"Classification Report for {clf_name}:")
    print(classification_report(y_test, y_pred))
    
    # Confusion matrix
    print(f"Confusion Matrix for {clf_name}:")
    print(confusion_matrix(y_test, y_pred))
    print("="*50)


RandomForestClassifier Accuracy: 0.9673384294649062
Classification Report for RandomForestClassifier:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       735
           1       0.95      0.99      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for RandomForestClassifier:
[[697  38]
 [  9 695]]
AdaBoostClassifier Accuracy: 0.6518415566365532
Classification Report for AdaBoostClassifier:
              precision    recall  f1-score   support

           0       0.68      0.60      0.64       735
           1       0.63      0.71      0.66       704

    accuracy                           0.65      1439
   macro avg       0.65      0.65      0.65      1439
weighted avg       0.65      0.65      0.65      1439

Confusion Matrix for AdaBoostClassifier:
[[441 294]
 [207 497]]
GradientBoostingCl

In [28]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define a list of classifiers
classifiers = [
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    GaussianNB(),
    XGBClassifier()
]

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])

# Train and evaluate each classifier
for clf in classifiers:
    clf_name = clf.__class__.__name__
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    precision = report['weighted avg']['precision']
    recall = report['weighted avg']['recall']
    
    # Append results to DataFrame
    new_row = pd.DataFrame([{
        'Model': clf_name,
        'Accuracy': accuracy,
        'F1-Score': f1_score,
        'Precision': precision,
        'Recall': recall
    }])

    results_df = pd.concat([results_df, new_row], ignore_index=True)


results_df

  results_df = pd.concat([results_df, new_row], ignore_index=True)


Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall
0,RandomForestClassifier,0.969423,0.969425,0.97006,0.969423
1,AdaBoostClassifier,0.651842,0.651021,0.655,0.651842
2,GradientBoostingClassifier,0.718555,0.718396,0.720192,0.718555
3,LogisticRegression,0.653231,0.653241,0.653782,0.653231
4,SVC,0.690063,0.689977,0.691183,0.690063
5,KNeighborsClassifier,0.785268,0.781514,0.811568,0.785268
6,DecisionTreeClassifier,0.838082,0.8381,0.838255,0.838082
7,GaussianNB,0.585129,0.536072,0.63317,0.585129
8,XGBClassifier,0.8082,0.806292,0.817593,0.8082


#### Best Code

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix

rf=RandomForestClassifier()
rf.fit(X_train_scaled,y_train)
y_pred_rf=rf.predict(X_test_scaled)

accuracy_rf=accuracy_score(y_test,y_pred_rf)
print("Random Forest Classifier Accuracy:", accuracy_rf)

print("Classification Report for Random Forest Classifier:")
print(classification_report(y_test, y_pred_rf))

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Classifier Accuracy: 0.9687282835302293
Classification Report for Random Forest Classifier:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       735
           1       0.95      0.98      0.97       704

    accuracy                           0.97      1439
   macro avg       0.97      0.97      0.97      1439
weighted avg       0.97      0.97      0.97      1439

Confusion Matrix for Random Forest Classifier:
[[702  33]
 [ 12 692]]


In [27]:
print('predicted',rf.predict(X_test_scaled[10].reshape(1,-1))[0])
print('actual',y_test.iloc[10])

predicted 0
actual 0


In [28]:
print('predicted',rf.predict(X_test_scaled[200].reshape(1,-1))[0])
print('actual',y_test.iloc[200])

predicted 1
actual 1


In [29]:
import pickle
pickle.dump(rf,open('rf.pkl','wb'))
pickle.dump(scaler,open('scaler.pkl','wb'))

In [30]:
with open('rf.pkl','rb') as file:
    rf=pickle.load(file)

In [31]:
with open('scaler.pkl','rb') as file:
    scaler=pickle.load(file)

In [32]:
from sklearn.preprocessing import OneHotEncoder
def predict(model, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose):
    encoder=OneHotEncoder()
    cat_features = np.array([[male.lower(), currentSmoker.lower(), BPMeds.lower(),
                              prevalentStroke.lower(), prevalentHyp.lower(), diabetes.lower()]])
    
    encoded_cat = encoder.fit_transform(cat_features).toarray()

    # Prepare numerical features
    num_features = np.array([[age, cigsPerDay, totChol, sysBP, diaBP, BMI, heartRate, glucose]])

    # Scale numerical features
    scaled_num = scaler.fit_transform(num_features)

    # Concatenate both
    final_input = np.hstack((encoded_cat, scaled_num))

    # Predict
    return model.predict(final_input)

In [33]:
# test 1:
male = "female"
age = 56.00
currentSmoker = "yes"
cigsPerDay = 3.00
BPMeds = "no"
prevalentStroke = "no"
prevalentHyp = "yes"
diabetes = 'no'
totChol = 285.00
sysBP = 145.00
diaBP = 100.00
BMI = 30.14
heartRate = 80.00
glucose = 86.00


result = predict(rf, scaler, male, age, currentSmoker, cigsPerDay, BPMeds, prevalentStroke, prevalentHyp, diabetes, totChol, sysBP, diaBP, BMI, heartRate, glucose)

if result == 1:
    print("The Patient has Heart Disease")
else: 
    print("The Patient has No Heart Disease")

The Patient has No Heart Disease
