In [282]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Loading the dataset

In [283]:
# Loading the dataset
df = pd.read_csv('C:/Users/DELL/OneDrive/Desktop/heart1.csv')

# Explore dataset

In [284]:
# Explore dataset
df.shape

(340, 17)

In [285]:
df.columns

Index(['pid', 'age', 'sex', 'cp', 'trestbps', 'chol', 'famhist', 'fbs',
       'restecg', 'smoke', 'thalach', 'exang', 'oldpeak', 'slope', 'ca',
       'thal', 'target'],
      dtype='object')

In [286]:
df.dtypes

pid           int64
age           int64
sex          object
cp           object
trestbps      int64
chol          int64
famhist      object
fbs            bool
restecg      object
smoke        object
thalach       int64
exang        object
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [287]:
df.head()

Unnamed: 0,pid,age,sex,cp,trestbps,chol,famhist,fbs,restecg,smoke,thalach,exang,oldpeak,slope,ca,thal,target
0,1,63,M,Asymtomatic,145,233,no,True,normal,no,150,no,2.3,0,0,1,1
1,2,37,M,Non-Angina pain,130,250,no,False,ST-T wave abnormality,yes,187,no,3.5,0,0,2,1
2,3,41,F,Atypical Angina,130,204,no,False,normal,no,172,no,1.4,2,0,2,1
3,4,56,M,Atypical Angina,120,236,no,False,ST-T wave abnormality,no,178,no,0.8,2,0,2,1
4,5,51,F,Non-Angina pain,140,308,yes,False,,yes,142,no,1.5,2,1,2,1


In [288]:
df.tail()

Unnamed: 0,pid,age,sex,cp,trestbps,chol,famhist,fbs,restecg,smoke,thalach,exang,oldpeak,slope,ca,thal,target
335,336,47,M,,130,253,no,False,ST-T wave abnormality,no,179,no,0.0,2,0,2,1
336,337,51,M,,125,213,no,False,normal,yes,125,yes,1.4,2,1,2,1
337,338,46,F,Non-Angina pain,142,177,yes,False,,yes,160,yes,1.4,0,0,2,1
338,339,54,F,,135,304,no,True,ST-T wave abnormality,yes,170,no,0.0,2,0,2,1
339,340,56,M,,120,240,no,False,ST-T wave abnormality,yes,169,no,0.0,0,0,2,1


In [289]:
df.isnull().any()

pid         False
age         False
sex         False
cp           True
trestbps    False
chol        False
famhist     False
fbs         False
restecg      True
smoke       False
thalach     False
exang       False
oldpeak     False
slope       False
ca          False
thal        False
target      False
dtype: bool

In [290]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 17 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pid       340 non-null    int64  
 1   age       340 non-null    int64  
 2   sex       340 non-null    object 
 3   cp        322 non-null    object 
 4   trestbps  340 non-null    int64  
 5   chol      340 non-null    int64  
 6   famhist   340 non-null    object 
 7   fbs       340 non-null    bool   
 8   restecg   318 non-null    object 
 9   smoke     340 non-null    object 
 10  thalach   340 non-null    int64  
 11  exang     340 non-null    object 
 12  oldpeak   340 non-null    float64
 13  slope     340 non-null    int64  
 14  ca        340 non-null    int64  
 15  thal      340 non-null    int64  
 16  target    340 non-null    int64  
dtypes: bool(1), float64(1), int64(9), object(6)
memory usage: 43.0+ KB


# 1. Data Preprocessing

# 1.1 Data Cleaning (Handling Missing Values)

In [291]:
# Data Cleaning (Handling Missing Values)
df.dropna(inplace= True)

# 1.2 Data Reduction

In [292]:
# Data Reduction
df.drop(['pid','smoke','famhist'],axis='columns', inplace=True)

In [293]:
df.shape

(303, 14)

In [294]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,M,Asymtomatic,145,233,True,normal,150,no,2.3,0,0,1,1
1,37,M,Non-Angina pain,130,250,False,ST-T wave abnormality,187,no,3.5,0,0,2,1
2,41,F,Atypical Angina,130,204,False,normal,172,no,1.4,2,0,2,1
3,56,M,Atypical Angina,120,236,False,ST-T wave abnormality,178,no,0.8,2,0,2,1
8,57,F,Typical Angina,120,354,False,ST-T wave abnormality,163,yes,0.6,2,0,2,1


# 1.3 Data Encoding

In [295]:
# Data Encoding
df.loc[df["sex"] == "M", "sex"] = 1
df.loc[df["sex"] == "F", "sex"] = 0

In [296]:
df.loc[df["cp"] == "Typical Angina", "cp"] = 1
df.loc[df["cp"] == "Atypical Angina", "cp"] =2 
df.loc[df["cp"] == "Non-Angina pain", "cp"] =3 
df.loc[df["cp"] == "Asymtomatic", "cp"] =4

In [297]:
df.loc[df["restecg"] == "normal", "restecg"] =0
df.loc[df["restecg"] == "ST-T wave abnormality", "restecg"] =1
df.loc[df["restecg"] == "left ventricular hypertrophy", "restecg"] =2

In [298]:
df.fbs = df.fbs.astype('int') 
df.dtypes

age           int64
sex          object
cp           object
trestbps      int64
chol          int64
fbs           int32
restecg      object
thalach       int64
exang        object
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [299]:
df.loc[df["exang"] == "no", "exang"] =0
df.loc[df["exang"] == "yes", "exang"] =1

# Again exploring the dataset

In [300]:
# Again exploring the dataset
df.shape

(303, 14)

In [301]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,4,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,3,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,2,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,2,120,236,0,1,178,0,0.8,2,0,2,1
8,57,0,1,120,354,0,1,163,1,0.6,2,0,2,1


In [302]:
df.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
329,59,1,1,164,176,1,0,90,0,1.0,1,2,1,0
330,57,0,1,140,241,0,1,123,1,0.2,1,0,3,0
331,45,1,4,110,264,0,1,132,0,1.2,1,0,3,0
332,68,1,1,144,193,1,1,141,0,3.4,1,2,3,0
333,57,1,1,130,131,0,1,115,1,1.2,1,1,3,0


In [303]:
df.isnull().any()

age         False
sex         False
cp          False
trestbps    False
chol        False
fbs         False
restecg     False
thalach     False
exang       False
oldpeak     False
slope       False
ca          False
thal        False
target      False
dtype: bool

In [304]:
df.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
329,59,1,1,164,176,1,0,90,0,1.0,1,2,1,0
330,57,0,1,140,241,0,1,123,1,0.2,1,0,3,0
331,45,1,4,110,264,0,1,132,0,1.2,1,0,3,0
332,68,1,1,144,193,1,1,141,0,3.4,1,2,3,0
333,57,1,1,130,131,0,1,115,1,1.2,1,1,3,0


# Dummy columns creation

In [305]:
dataset = pd.get_dummies(df, columns=['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal'])

In [306]:
dataset.columns

Index(['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target', 'sex_0',
       'sex_1', 'cp_1', 'cp_2', 'cp_3', 'cp_4', 'fbs_0', 'fbs_1', 'restecg_0',
       'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'slope_0', 'slope_1',
       'slope_2', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'ca_4', 'thal_0', 'thal_1',
       'thal_2', 'thal_3'],
      dtype='object')

# 1.4 Normalization

In [307]:
from sklearn.preprocessing import StandardScaler
standScaler = StandardScaler()
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
dataset[columns_to_scale] = standScaler.fit_transform(dataset[columns_to_scale])

In [308]:
dataset.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.952197,0.763956,-0.256334,0.015443,1.087338,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,-1.915313,-0.092738,0.072199,1.633471,2.122573,1,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
2,-1.474158,-0.092738,-0.816773,0.977514,0.310912,1,1,0,0,1,...,1,1,0,0,0,0,0,0,1,0
3,0.180175,-0.663867,-0.198357,1.239897,-0.206705,1,0,1,0,1,...,1,1,0,0,0,0,0,0,1,0
8,0.290464,-0.663867,2.08205,0.583939,-0.379244,1,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0


In [309]:
dataset.tail()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
329,0.511041,1.849101,-1.357886,-2.608388,-0.034166,0,0,1,1,0,...,0,0,0,1,0,0,0,1,0,0
330,0.290464,0.478391,-0.10173,-1.165281,-0.724323,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
331,-1.033002,-1.234996,0.342756,-0.771706,0.138373,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
332,1.503641,0.706843,-1.029353,-0.378132,2.036303,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,1
333,0.290464,-0.092738,-2.227533,-1.515125,0.138373,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1


# Again explore the dataset

In [310]:
dataset.shape

(303, 31)

In [311]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 303 entries, 0 to 333
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        303 non-null    float64
 1   trestbps   303 non-null    float64
 2   chol       303 non-null    float64
 3   thalach    303 non-null    float64
 4   oldpeak    303 non-null    float64
 5   target     303 non-null    int64  
 6   sex_0      303 non-null    uint8  
 7   sex_1      303 non-null    uint8  
 8   cp_1       303 non-null    uint8  
 9   cp_2       303 non-null    uint8  
 10  cp_3       303 non-null    uint8  
 11  cp_4       303 non-null    uint8  
 12  fbs_0      303 non-null    uint8  
 13  fbs_1      303 non-null    uint8  
 14  restecg_0  303 non-null    uint8  
 15  restecg_1  303 non-null    uint8  
 16  restecg_2  303 non-null    uint8  
 17  exang_0    303 non-null    uint8  
 18  exang_1    303 non-null    uint8  
 19  slope_0    303 non-null    uint8  
 20  slope_1   

In [312]:
dataset.isnull().sum()

age          0
trestbps     0
chol         0
thalach      0
oldpeak      0
target       0
sex_0        0
sex_1        0
cp_1         0
cp_2         0
cp_3         0
cp_4         0
fbs_0        0
fbs_1        0
restecg_0    0
restecg_1    0
restecg_2    0
exang_0      0
exang_1      0
slope_0      0
slope_1      0
slope_2      0
ca_0         0
ca_1         0
ca_2         0
ca_3         0
ca_4         0
thal_0       0
thal_1       0
thal_2       0
thal_3       0
dtype: int64

In [313]:
dataset.describe()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,...,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,9.728192000000001e-17,-7.199045e-16,-1.069918e-16,-4.506846e-16,-3.181357e-16,0.544554,0.316832,0.683168,0.471947,0.165017,...,0.468647,0.577558,0.214521,0.125413,0.066007,0.016502,0.006601,0.059406,0.547855,0.386139
std,1.001654,1.001654,1.001654,1.001654,1.001654,0.498835,0.466011,0.466011,0.500038,0.371809,...,0.499842,0.494765,0.411169,0.331734,0.248704,0.127605,0.08111,0.236774,0.498528,0.487668
min,-2.797624,-2.148802,-2.32416,-3.439267,-0.8968617,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7572802,-0.6638668,-0.6814943,-0.7061105,-0.8968617,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.06988599,-0.09273778,-0.1210553,0.1466343,-0.2067053,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,0.7316189,0.4783913,0.5456738,0.7151309,0.4834512,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2.49624,3.905165,6.140401,2.289429,4.451851,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [314]:
dataset['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

# Separate X and Y values

In [315]:
X= dataset.drop(columns='target', axis=1)
Y= dataset['target']

In [316]:
print(X)

          age  trestbps      chol   thalach   oldpeak  sex_0  sex_1  cp_1  \
0    0.952197  0.763956 -0.256334  0.015443  1.087338      0      1     0   
1   -1.915313 -0.092738  0.072199  1.633471  2.122573      0      1     0   
2   -1.474158 -0.092738 -0.816773  0.977514  0.310912      1      0     0   
3    0.180175 -0.663867 -0.198357  1.239897 -0.206705      0      1     0   
8    0.290464 -0.663867  2.082050  0.583939 -0.379244      1      0     1   
..        ...       ...       ...       ...       ...    ...    ...   ...   
329  0.511041  1.849101 -1.357886 -2.608388 -0.034166      0      1     1   
330  0.290464  0.478391 -0.101730 -1.165281 -0.724323      1      0     1   
331 -1.033002 -1.234996  0.342756 -0.771706  0.138373      0      1     0   
332  1.503641  0.706843 -1.029353 -0.378132  2.036303      0      1     1   
333  0.290464 -0.092738 -2.227533 -1.515125  0.138373      0      1     1   

     cp_2  cp_3  ...  slope_2  ca_0  ca_1  ca_2  ca_3  ca_4  thal_0  thal_1

In [317]:
print(Y)

0      1
1      1
2      1
3      1
8      1
      ..
329    0
330    0
331    0
332    0
333    0
Name: target, Length: 303, dtype: int64


# Splitting of data

In [318]:
X_train, X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [319]:
print(X.shape,X_train.shape,X_test.shape)

(303, 30) (242, 30) (61, 30)


# Apply Logistic Regression

In [320]:
model = LogisticRegression()

In [321]:
 model.fit(X_train,Y_train)

LogisticRegression()

In [322]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [323]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

# Check Accuracy

In [324]:
print("Accuracy on Training data: ", training_data_accuracy)

Accuracy on Training data:  0.8801652892561983


In [325]:
print("Accuracy on Test data: ", test_data_accuracy)

Accuracy on Test data:  0.8360655737704918


In [326]:
print(dataset)

          age  trestbps      chol   thalach   oldpeak  target  sex_0  sex_1  \
0    0.952197  0.763956 -0.256334  0.015443  1.087338       1      0      1   
1   -1.915313 -0.092738  0.072199  1.633471  2.122573       1      0      1   
2   -1.474158 -0.092738 -0.816773  0.977514  0.310912       1      1      0   
3    0.180175 -0.663867 -0.198357  1.239897 -0.206705       1      0      1   
8    0.290464 -0.663867  2.082050  0.583939 -0.379244       1      1      0   
..        ...       ...       ...       ...       ...     ...    ...    ...   
329  0.511041  1.849101 -1.357886 -2.608388 -0.034166       0      0      1   
330  0.290464  0.478391 -0.101730 -1.165281 -0.724323       0      1      0   
331 -1.033002 -1.234996  0.342756 -0.771706  0.138373       0      0      1   
332  1.503641  0.706843 -1.029353 -0.378132  2.036303       0      0      1   
333  0.290464 -0.092738 -2.227533 -1.515125  0.138373       0      0      1   

     cp_1  cp_2  ...  slope_2  ca_0  ca_1  ca_2  ca

# Predict the target values

In [327]:
input_data = (-1.474158,-0.092738,-0.816773,0.977514,0.310912,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0)
input_data_as_numpy_array= np.asarray(input_data) 
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)
if(prediction[0]==0):
    print("The person does not have heart disease")
else:
    print("The person has heart disease")
    

[1]
The person has heart disease
