In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Heart Disease.csv')

In [3]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3,30,No,Female,55-59,White,Yes,Yes,Very good,5,Yes,No,Yes
1,No,20.34,No,No,Yes,0,0,No,Female,80 or older,White,No,Yes,Very good,7,No,No,No
2,No,26.58,Yes,No,No,20,30,No,Male,65-69,White,Yes,Yes,Fair,8,Yes,No,No
3,No,24.21,No,No,No,0,0,No,Female,75-79,White,No,No,Good,6,No,No,Yes
4,No,23.71,No,No,No,28,0,Yes,Female,40-44,White,No,Yes,Very good,8,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7,0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6,Yes,No,No
319791,No,29.84,Yes,No,No,0,0,No,Male,35-39,Hispanic,No,Yes,Very good,5,Yes,No,No
319792,No,24.24,No,No,No,0,0,No,Female,45-49,Hispanic,No,Yes,Good,6,No,No,No
319793,No,32.81,No,No,No,0,0,No,Female,25-29,Hispanic,No,No,Good,12,No,No,No


In [4]:
df.shape

(319795, 18)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  int64  
 6   MentalHealth      319795 non-null  int64  
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  int64  
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [6]:
df.nunique()

HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

In [7]:
df.duplicated().sum()

18078

In [8]:
df.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

# Preprocessing

In [9]:
Diabetic_unique = df.Diabetic.unique()
Diabetic_unique

array(['Yes', 'No', 'No, borderline diabetes', 'Yes (during pregnancy)'],
      dtype=object)

In [10]:
GenHealth_unique = df.GenHealth.unique()
GenHealth_unique

array(['Very good', 'Fair', 'Good', 'Poor', 'Excellent'], dtype=object)

In [11]:
df['HeartDisease'] = df['HeartDisease'].replace(['Yes','No'],[1,0])
df['Smoking'] = df['Smoking'].replace(['Yes','No'],[1,0])
df['AlcoholDrinking'] = df['AlcoholDrinking'].replace(['Yes','No'],[1,0])
df['Stroke'] = df['Stroke'].replace(['Yes','No'],[1,0])
df['DiffWalking'] = df['DiffWalking'].replace(['Yes','No'],[1,0])
df['Sex'] = df['Sex'].replace(['Yes','No'],[1,0])
df['Diabetic'] = df['Diabetic'].replace(['Yes','No','No, borderline diabetes','Yes (during pregnancy)'],[1,0,2,3])
df['PhysicalActivity'] = df['PhysicalActivity'].replace(['Yes','No'],[1,0])
df['GenHealth'] = df['GenHealth'].replace(['Very good','Good','Fair','Poor','Excellent'],[0,1,2,3,4])
df['Asthma'] = df['Asthma'].replace(['Yes','No'],[1,0])
df['KidneyDisease'] = df['KidneyDisease'].replace(['Yes','No'],[1,0])
df['SkinCancer'] = df['SkinCancer'].replace(['Yes','No'],[1,0])
df['Sex'] = df['Sex'].replace(['Male','Female'],[1,0])

In [12]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3,30,0,0,55-59,White,1,1,0,5,1,0,1
1,0,20.34,0,0,1,0,0,0,0,80 or older,White,0,1,0,7,0,0,0
2,0,26.58,1,0,0,20,30,0,1,65-69,White,1,1,2,8,1,0,0
3,0,24.21,0,0,0,0,0,0,0,75-79,White,0,0,1,6,0,0,1
4,0,23.71,0,0,0,28,0,1,0,40-44,White,0,1,0,8,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7,0,1,1,60-64,Hispanic,1,0,2,6,1,0,0
319791,0,29.84,1,0,0,0,0,0,1,35-39,Hispanic,0,1,0,5,1,0,0
319792,0,24.24,0,0,0,0,0,0,0,45-49,Hispanic,0,1,1,6,0,0,0
319793,0,32.81,0,0,0,0,0,0,0,25-29,Hispanic,0,0,1,12,0,0,0


In [13]:
df['Age'] = df['AgeCategory'].str.extract('(\d\d)', expand=True)

In [14]:
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Age
0,0,16.60,1,0,0,3,30,0,0,55-59,White,1,1,0,5,1,0,1,55
1,0,20.34,0,0,1,0,0,0,0,80 or older,White,0,1,0,7,0,0,0,80
2,0,26.58,1,0,0,20,30,0,1,65-69,White,1,1,2,8,1,0,0,65
3,0,24.21,0,0,0,0,0,0,0,75-79,White,0,0,1,6,0,0,1,75
4,0,23.71,0,0,0,28,0,1,0,40-44,White,0,1,0,8,0,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7,0,1,1,60-64,Hispanic,1,0,2,6,1,0,0,60
319791,0,29.84,1,0,0,0,0,0,1,35-39,Hispanic,0,1,0,5,1,0,0,35
319792,0,24.24,0,0,0,0,0,0,0,45-49,Hispanic,0,1,1,6,0,0,0,45
319793,0,32.81,0,0,0,0,0,0,0,25-29,Hispanic,0,0,1,12,0,0,0,25


In [15]:
df.drop(columns=['AgeCategory','Race'],inplace=True)

In [16]:
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Age
0,0,16.6,1,0,0,3,30,0,0,1,1,0,5,1,0,1,55
1,0,20.34,0,0,1,0,0,0,0,0,1,0,7,0,0,0,80
2,0,26.58,1,0,0,20,30,0,1,1,1,2,8,1,0,0,65
3,0,24.21,0,0,0,0,0,0,0,0,0,1,6,0,0,1,75
4,0,23.71,0,0,0,28,0,1,0,0,1,0,8,0,0,0,40


In [17]:
df.corr()['HeartDisease']

HeartDisease        1.000000
BMI                 0.051803
Smoking             0.107764
AlcoholDrinking    -0.032080
Stroke              0.196835
PhysicalHealth      0.170721
MentalHealth        0.028591
DiffWalking         0.201258
Sex                 0.070040
Diabetic            0.124840
PhysicalActivity   -0.100030
GenHealth           0.011713
SleepTime           0.008327
Asthma              0.041444
KidneyDisease       0.145197
SkinCancer          0.093317
Name: HeartDisease, dtype: float64

# Label Encoding

In [18]:
df1 = df.copy()

In [19]:
df1

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Age
0,0,16.60,1,0,0,3,30,0,0,1,1,0,5,1,0,1,55
1,0,20.34,0,0,1,0,0,0,0,0,1,0,7,0,0,0,80
2,0,26.58,1,0,0,20,30,0,1,1,1,2,8,1,0,0,65
3,0,24.21,0,0,0,0,0,0,0,0,0,1,6,0,0,1,75
4,0,23.71,0,0,0,28,0,1,0,0,1,0,8,0,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7,0,1,1,1,0,2,6,1,0,0,60
319791,0,29.84,1,0,0,0,0,0,1,0,1,0,5,1,0,0,35
319792,0,24.24,0,0,0,0,0,0,0,0,1,1,6,0,0,0,45
319793,0,32.81,0,0,0,0,0,0,0,0,0,1,12,0,0,0,25


# Label Encoding

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
label_encoder = LabelEncoder()
label_encoder

LabelEncoder()

In [22]:
df1.BMI = label_encoder.fit_transform(df1.BMI)
df1.Smoking = label_encoder.fit_transform(df1.Smoking)
df1.AlcoholDrinking = label_encoder.fit_transform(df1.AlcoholDrinking)

df1.Stroke = label_encoder.fit_transform(df1.Stroke)
df1.PhysicalHealth = label_encoder.fit_transform(df1.PhysicalHealth)
df1.MentalHealth = label_encoder.fit_transform(df1.MentalHealth)

df1.DiffWalking = label_encoder.fit_transform(df1.DiffWalking)
df1.Sex = label_encoder.fit_transform(df1.Sex)
df1.Diabetic = label_encoder.fit_transform(df1.Diabetic)

df1.PhysicalActivity = label_encoder.fit_transform(df1.PhysicalActivity)
df1.GenHealth = label_encoder.fit_transform(df1.GenHealth)
df1.SleepTime = label_encoder.fit_transform(df1.SleepTime)

df1.Asthma = label_encoder.fit_transform(df1.Asthma)
df1.KidneyDisease = label_encoder.fit_transform(df1.KidneyDisease)
df1.SkinCancer = label_encoder.fit_transform(df1.SkinCancer)
df1.Age = label_encoder.fit_transform(df1.Age)

In [23]:
df1.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Age
0,0,223,1,0,0,3,30,0,0,1,1,0,4,1,0,1,7
1,0,524,0,0,1,0,0,0,0,0,1,0,6,0,0,0,12
2,0,1103,1,0,0,20,30,0,1,1,1,2,7,1,0,0,9
3,0,883,0,0,0,0,0,0,0,0,0,1,5,0,0,1,11
4,0,837,0,0,0,28,0,1,0,0,1,0,7,0,0,0,4


In [24]:
x = df1.drop('HeartDisease', axis=1)
x.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,Age
0,223,1,0,0,3,30,0,0,1,1,0,4,1,0,1,7
1,524,0,0,1,0,0,0,0,0,1,0,6,0,0,0,12
2,1103,1,0,0,20,30,0,1,1,1,2,7,1,0,0,9
3,883,0,0,0,0,0,0,0,0,0,1,5,0,0,1,11
4,837,0,0,0,28,0,1,0,0,1,0,7,0,0,0,4


In [25]:
y = df1['HeartDisease']
y.head()

0    0
1    0
2    0
3    0
4    0
Name: HeartDisease, dtype: int64

# Split DataSet into Training and Testing

In [26]:
x_train,x_test,y_train,y_test = tts(x,y,test_size=.30,random_state=23)

In [27]:
x_train.shape

(223856, 16)

In [28]:
x_test.shape

(95939, 16)

In [29]:
y_train.shape

(223856,)

In [30]:
y_test.shape

(95939,)

# Appling LogisticRegression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
reg = LogisticRegression()

In [33]:
reg.fit(x_train,y_train)

LogisticRegression()

In [34]:
reg.coef_

array([[-5.47485442e-04,  2.69230046e-01, -1.55967716e-01,
         4.92420509e-01,  9.52974038e-03, -1.03291488e-02,
         5.25586916e-01,  3.70174792e-01,  6.07247539e-01,
        -9.13870216e-01,  1.72935888e-02, -3.59575342e-01,
         9.03366754e-02,  3.24436721e-01,  1.78491029e-01,
         1.65826054e-01]])

In [35]:
reg.intercept_

array([-0.78713257])

In [36]:
reg.predict_proba(x_test)

array([[0.96806386, 0.03193614],
       [0.99163797, 0.00836203],
       [0.96450923, 0.03549077],
       ...,
       [0.93691364, 0.06308636],
       [0.95634167, 0.04365833],
       [0.89622254, 0.10377746]])

In [37]:
reg.predict(x_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
y_test

212245    0
159145    0
78976     0
19336     0
240769    0
         ..
25433     0
105118    0
67103     0
142874    0
83336     0
Name: HeartDisease, Length: 95939, dtype: int64

In [39]:
logistic = reg.score(x_test, y_test)
logistic

0.9122046300253286