HeartDisease : Indicates whether someone has heart disease or not.

BMI : Stands for Body Mass Index, which measures the proportion of weight to height for an individual.

Smoking : Indicates whether someone smokes or not.

AlcoholDrinking : Information about whether someone consumes alcoholic beverages or not.

Stroke : Indicates whether someone has had a stroke or not.

PhysicalHealth : Number of days in the past 30 days on which someone's physical health was not good (0–30).

MentalHealth : Number of days in the past 30 days on which someone's mental health was not good (0–30).

DiffWalking : Indicates whether someone has difficulty walking or not.

Sex : Indicates someone's gender.

AgeCategory : Groups someone's age into specific categories.

Race : Stores information about someone's race or ethnicity.

Diabetic : Information about whether someone has diabetes or not.

PhysicalActivity : Indicates whether someone engages in physical activity or not.

GenHealth : Describes someone's overall health condition, on a scale from Poor to Excellent.

SleepTime : Average number of hours someone sleeps in a 24-hour period, which could be an indicator of sleep health.

Asthma : Information about whether someone has asthma or not.

KidneyDisease : Indicates whether someone has kidney disease or not.

SkinCancer : Information about whether someone has skin cancer or not.

In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("heart_2020_cleaned.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [3]:
# Remove rows that are exact duplicates of an earlier row, keeping the first
# occurrence; the original index labels of the surviving rows are preserved.
df = df.drop_duplicates()


In [4]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(301717, 18)

In [5]:
# Column names, non-null counts and dtypes — a quick check for missing values and text columns.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 301717 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      301717 non-null  object 
 1   BMI               301717 non-null  float64
 2   Smoking           301717 non-null  object 
 3   AlcoholDrinking   301717 non-null  object 
 4   Stroke            301717 non-null  object 
 5   PhysicalHealth    301717 non-null  float64
 6   MentalHealth      301717 non-null  float64
 7   DiffWalking       301717 non-null  object 
 8   Sex               301717 non-null  object 
 9   AgeCategory       301717 non-null  object 
 10  Race              301717 non-null  object 
 11  Diabetic          301717 non-null  object 
 12  PhysicalActivity  301717 non-null  object 
 13  GenHealth         301717 non-null  object 
 14  SleepTime         301717 non-null  float64
 15  Asthma            301717 non-null  object 
 16  KidneyDisease     301717 

In [6]:
# Work on a copy so the raw frame `df` is not modified by the encoding steps that follow.
df1 = df.copy()


In [7]:
# Encode the strictly binary Yes/No columns as 1/0. The replacement is limited to
# these columns so that a column holding 'Yes'/'No' alongside other categories
# (Diabetic) is left intact for its own dedicated mapping instead of ending up
# with a mix of integers and strings.
yes_no_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
                  'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']
# The explicit cast pins the dtype rather than relying on replace() to downcast
# object columns, and raises if an unexpected label is still present.
df1[yes_no_columns] = df1[yes_no_columns].replace({'No': 0, 'Yes': 1}).astype(int)

df1["Sex"] = df1["Sex"].replace({'Female': 0, 'Male': 1}).astype(int)

In [8]:
# Collapse the four Diabetic answers into a binary flag: borderline diabetes is
# counted as non-diabetic (0) and diabetes during pregnancy as diabetic (1).
diabetic_mapping = {
    'No': 0,
    'No, borderline diabetes': 0,
    'Yes': 1,
    'Yes (during pregnancy)': 1
}

# replace() leaves values that are not keys of the mapping unchanged; astype(int)
# then fixes the column dtype (and fails if any text label is left over).
df1['Diabetic'] = df1['Diabetic'].replace(diabetic_mapping).astype(int)

In [9]:
# Ordinal encoding of GenHealth: labels are listed from worst to best and each
# one receives its position in that list (Poor -> 0 ... Excellent -> 4).
gen_health_levels = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
gen_health_mapping = {level: rank for rank, level in enumerate(gen_health_levels)}

df1['GenHealth'] = df1['GenHealth'].replace(gen_health_mapping).astype(int)

In [10]:
# Ordinal encoding of AgeCategory: brackets are listed youngest to oldest and
# each one receives its position in that list ('18-24' -> 0 ... '80 or older' -> 12).
age_brackets = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older',
]
age_mapping = {bracket: code for code, bracket in enumerate(age_brackets)}

# map() turns any label missing from the mapping into NaN, which makes the
# integer cast fail instead of silently keeping an unknown category.
df1['AgeCategory'] = df1['AgeCategory'].map(age_mapping).astype(int)

In [11]:
# Integer codes for Race, assigned in alphabetical order of the label.
# NOTE(review): Race has no natural order, so these codes impose an arbitrary
# ranking on the models; one-hot encoding may be more appropriate — confirm.
race_labels = [
    'American Indian/Alaskan Native',
    'Asian',
    'Black',
    'Hispanic',
    'Other',
    'White',
]
race_mapping = {label: code for code, label in enumerate(race_labels)}

df1['Race'] = df1['Race'].map(race_mapping).astype(int)

In [12]:
# Display the frame after the categorical encoding steps.
df1

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,0,7,5,1,1,3,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,5,0,1,3,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,5,1,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,5,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,5,0,1,3,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,1,8,3,1,0,1,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,1,3,3,0,1,3,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,0,5,3,0,1,2,6.0,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,0,1,3,0,0,2,12.0,0,0,0


In [13]:
# Continuous columns are rescaled with RobustScaler (median / inter-quartile
# range by default), which is less sensitive to outliers than mean/std scaling.
# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed later, so test rows influence its statistics.
features_to_scale_robust = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

robust_scaler = RobustScaler().fit(df1[features_to_scale_robust])
df1[features_to_scale_robust] = robust_scaler.transform(df1[features_to_scale_robust])


In [14]:
# Display the frame again to inspect the rescaled numeric columns.
df1

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,-1.418635,1,0,0,1.5,7.5,0,0,7,5,1,1,3,-1.0,1,0,1
1,0,-0.927822,0,0,1,0.0,0.0,0,0,12,5,0,1,3,0.0,0,0,0
2,0,-0.108924,1,0,0,10.0,7.5,0,1,9,5,1,1,1,0.5,1,0,0
3,0,-0.419948,0,0,0,0.0,0.0,0,0,11,5,0,0,2,-0.5,0,0,1
4,0,-0.485564,0,0,0,14.0,0.0,1,0,4,5,0,1,3,0.5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,0.000000,1,0,0,3.5,0.0,1,1,8,3,1,0,1,-0.5,1,0,0
319791,0,0.318898,1,0,0,0.0,0.0,0,1,3,3,0,1,3,-1.0,1,0,0
319792,0,-0.416010,0,0,0,0.0,0.0,0,0,5,3,0,1,2,-0.5,0,0,0
319793,0,0.708661,0,0,0,0.0,0.0,0,0,1,3,0,0,2,2.5,0,0,0


In [15]:
# Snapshot of the fully encoded and scaled frame used for modelling.
df2 = df1.copy()


In [16]:
# The HeartDisease flag is the prediction target; every other column is a model input.
y = df2['HeartDisease']
X = df2.drop(columns=['HeartDisease'])

In [17]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on y — if HeartDisease is imbalanced,
# consider passing stratify=y (TODO confirm the class balance).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [18]:
# A float n_components asks PCA to keep as many components as needed to explain
# that fraction (95%) of the variance.
# The projection is fitted on the training split only and then applied to the test split.
# NOTE: X_train / X_test are overwritten here, so re-running this cell on its own
# would fit a new PCA on already-projected data.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [19]:
# Rescale every principal component using the min/max observed in the training
# split; the test split is transformed with those same training statistics.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [21]:
# Random forest on the PCA-projected, min-max-scaled training data.
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap samples and feature choices — and therefore the reported accuracy —
# are reproducible on a fresh run.
rcf = RandomForestClassifier(random_state=42)
rcf.fit(X_train, y_train)

In [22]:
# Predict labels for the held-out test split with the random forest.
y_pred=rcf.predict(X_test)


In [23]:
# Accuracy of the predictions currently held in y_pred on the test split, as a percentage.
accuracy_score(y_test, y_pred)*100


89.96254805780194

In [20]:
# Logistic-regression model: fit on the training split, predict the test split,
# and show the test accuracy as a percentage.
log = LogisticRegression().fit(X_train, y_train)
y_pred = log.predict(X_test)

100 * accuracy_score(y_test, y_pred)


91.0049052101286

In [22]:
# Gradient-boosted trees (XGBoost, default settings): fit on the training split,
# predict the test split, and show the test accuracy as a percentage.
xgb = XGBClassifier().fit(X_train, y_train)
y_pred = xgb.predict(X_test)

100 * accuracy_score(y_test, y_pred)

90.87564629457775

In [24]:
# Single decision tree: fit on the training split, predict the test split, and
# show the test accuracy as a percentage.
# random_state is fixed (same seed as the train/test split) because the tree
# breaks ties between equally good splits at random, which otherwise makes the
# reported accuracy vary from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

accuracy_score(y_test, y_pred) * 100

85.08550974413363

In [53]:
# Show the raw (un-encoded) frame again, for reference when building the test case below.
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [60]:
# test case
# row_data=[BMI, Smoking, AlcoholDrinking, Stroke, PhysicalHealth, MentalHealth, DiffWalking, Sex, AgeCategory, Race, Diabetic, PhysicalActivity, GenHealth, SleepTime, Asthma, KidneyDisease, SkinCancer]
row_data=[16.60,'Yes','No','No',3.0,30.0,'No','Female','55-59','White','Yes','Yes','Very good',5.0,'Yes','No','Yes']

# encoding of test case
# Label the values with the training feature names so that every field is
# encoded by name rather than by list position, and so that the fitted
# transformers receive the same columns they were fitted on.
# row_data itself is left unmodified.
sample = pd.DataFrame([row_data], columns=X.columns)

# One encoder per categorical column, mirroring the encoding applied to the
# training frame. Numeric columns have no entry and are therefore left untouched.
categorical_encoders = {
    'Sex': {'Female': 0, 'Male': 1},
    'AgeCategory': age_mapping,
    'Race': race_mapping,
    'Diabetic': diabetic_mapping,
    'GenHealth': gen_health_mapping,
}
for column in ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
               'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']:
    categorical_encoders[column] = {'No': 0, 'Yes': 1}

for column, encoder in categorical_encoders.items():
    sample[column] = sample[column].map(encoder)

# map() yields NaN for a label that is missing from its encoder; stop here
# rather than predicting on a corrupted record.
if sample.isna().any().any():
    invalid_columns = sample.columns[sample.isna().any()].tolist()
    raise ValueError(f"Missing or unrecognised value in test case for: {invalid_columns}")
sample = sample.astype(float)

# scaling of test case
# Apply the already-fitted transformers in the same order as for the training
# data: robust scaling of the continuous columns, then PCA, then min-max scaling.
sample[features_to_scale_robust] = robust_scaler.transform(sample[features_to_scale_robust])
sample_pca = pca.transform(sample)
sample_scaled = scaler.transform(sample_pca)

y_pred = log.predict(sample_scaled)

# predict() returns one label per input row; the test case is a single row.
if y_pred[0] == 1:
    print('Your heart seems to be sick')
else:
    print('Your heart seems to be okay')

Your heart seems to be okay


In [33]:
# Show the label -> code table used for AgeCategory.
print(age_mapping)


{'18-24': 0, '25-29': 1, '30-34': 2, '35-39': 3, '40-44': 4, '45-49': 5, '50-54': 6, '55-59': 7, '60-64': 8, '65-69': 9, '70-74': 10, '75-79': 11, '80 or older': 12}


In [57]:
# AgeCategory in the raw frame `df` still holds the original text labels.
df['AgeCategory']

0               55-59
1         80 or older
2               65-69
3               75-79
4               40-44
             ...     
319790          60-64
319791          35-39
319792          45-49
319793          25-29
319794    80 or older
Name: AgeCategory, Length: 301717, dtype: object

In [50]:
# Debug check of a single test-case field (position 13 is SleepTime in the row_data layout).
# NOTE(review): this indexing only works while row_data is still the flat
# 17-item list, i.e. before any reshape of row_data to a 2-D array.
print(row_data[13])


5.0


91.0049052101286