In [1]:
import numpy as np # linear algebra
import pandas as pd 

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Importing Dataset

In [3]:
dataset=pd.read_csv("/content/Heart-Attack-Dataset.csv")
dataset

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


Dividing the dataset into x and y, as in variables that are dependent and independent

In [7]:
x=dataset.iloc[:,:-1].values
y=dataset.iloc[:,-1].values

Using the dependent feature column to determine whether the dataset is balanced or not.

In [8]:
dataset["output"].value_counts()

1    165
0    138
Name: output, dtype: int64

We now balance the dataset using the balancing technique.

In [10]:
from imblearn.over_sampling import SMOTE 
s=SMOTE()                                
x_data,y_data=s.fit_resample(x,y)

We standardise the balanced data in order to obtain higher precision.

In [11]:
from sklearn.preprocessing import StandardScaler
ssd=StandardScaler()
x_scaled=ssd.fit_transform(x_data)
x_scaled

array([[ 0.96131996,  0.66482639,  2.04704265, ..., -2.23833351,
        -0.75811559, -2.13965737],
       [-1.99206316,  0.66482639,  1.06416216, ..., -2.23833351,
        -0.75811559, -0.52398395],
       [-1.53769652, -1.52178118,  0.08128167, ...,  1.02698005,
        -0.75811559, -0.52398395],
       ...,
       [-0.32554552, -0.57376359, -0.90159883, ...,  0.10217198,
         0.54964128,  1.09168948],
       [ 0.23474895,  0.66482639, -0.80420981, ..., -0.52479058,
         0.34695905,  1.01164473],
       [-0.14755353,  0.66482639, -0.90159883, ...,  1.02698005,
         0.24733379, -0.19059704]])

In [12]:
from collections import Counter  
print(Counter(y_data))           

Counter({1: 165, 0: 165})


Create training and test sets from the dataset.

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x_data,y_data,test_size=0.3,random_state=20)

In [14]:
print(Counter(y_train))
print(Counter(y_test))

Counter({1: 118, 0: 113})
Counter({0: 52, 1: 47})


Fit the logistic Regression model on training set

In [17]:
from sklearn.linear_model import LogisticRegression 
l1=LogisticRegression(solver="liblinear")
l1.fit(x_train,y_train)

LogisticRegression(solver='liblinear')

Cross Validation - Model Balancing

In [16]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=5,random_state=20,shuffle=True)
kf.split(x_scaled)
print(kf)

KFold(n_splits=5, random_state=20, shuffle=True)


Without Iteration based cross validation

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
score=cross_val_score(l1,x_train,y_train,cv=kf)
y_pred=cross_val_predict(l1,x_test,y_test,cv=kf)
print(score)
print(y_pred)

[0.82978723 0.76086957 0.80434783 0.84782609 0.76086957]
[0 1 0 1 1 1 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0
 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 1
 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 1]


In [19]:
for train_data,test_data in kf.split(x_scaled):
    x_train,x_test=x_data[train_data],x_data[test_data]
    y_train,y_test=y_data[train_data],y_data[test_data]
    scores=cross_val_score(l1,x_train,y_train,cv=kf)
    y_pred=cross_val_predict(l1,x_test,y_test,cv=kf)
    print("Checking Test Balancing.......:",Counter(y_test))
    print(y_pred)
    print(scores)

Checking Test Balancing.......: Counter({0: 40, 1: 26})
[0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0]
[0.90566038 0.81132075 0.79245283 0.86792453 0.76923077]
Checking Test Balancing.......: Counter({1: 41, 0: 25})
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 0 1 0 1 0 1 1 1
 1 1 1 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0]
[0.73584906 0.96226415 0.86792453 0.9245283  0.82692308]
Checking Test Balancing.......: Counter({1: 38, 0: 28})
[1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0]
[0.79245283 0.8490566  0.83018868 0.81132075 0.90384615]
Checking Test Balancing.......: Counter({0: 34, 1: 32})
[1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 0 0 1
 0 1 0 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0.8490566  0.81132075 0.86792453 0.81132075 0.76923077]
Checking

Accuracy of the model

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)*100

80.3030303030303