In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [9]:
df=pd.read_csv('processed_cleveland.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [10]:
df.isnull().sum()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  num       303 non-null    int64  
dtypes: float64(1), int64(11), object(2)
memory usage: 33.3+ KB


In [11]:
df.describe()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,num
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


In [12]:

df=df.rename(columns={'num':'target'})
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [14]:
df.replace('?', np.nan, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')
df.dropna(inplace=True)

In [15]:
X=df.drop('target',axis=1)
y=df['target']

In [26]:
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

In [27]:
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=42)

In [28]:
undersampler=RandomUnderSampler(sampling_strategy='majority')
undersampler=RandomUnderSampler(sampling_strategy='majority')
X_under,y_under=undersampler.fit_resample(X_train,y_train)
print("Before Undersampling:",Counter(y_train))
print("After Undersampling:",Counter(y_under))

Before Undersampling: Counter({0: 124, 1: 113})
After Undersampling: Counter({0: 113, 1: 113})


In [29]:
model=LogisticRegression()

In [30]:
param_grid={'C':[0.001,0.01,0.1,1,10,100,1000]}
grid_search=GridSearchCV(model,param_grid,cv=5)
grid_search.fit(X_under,y_under)

In [31]:
print("Best Parameters:",grid_search.best_params_)
print("Best Score:",grid_search.best_score_)

Best Parameters: {'C': 0.001}
Best Score: 0.8320772946859902


In [32]:
y_pred=grid_search.predict(X_test)

In [33]:
print("Accuracy Score is",accuracy_score(y_test,y_pred))
print("Confusion Matrix is")
print(confusion_matrix(y_test,y_pred))

print("Classification Report is")
print(classification_report(y_test,y_pred))

Accuracy Score is 0.8833333333333333
Confusion Matrix is
[[33  3]
 [ 4 20]]
Classification Report is
              precision    recall  f1-score   support

           0       0.89      0.92      0.90        36
           1       0.87      0.83      0.85        24

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60



In [34]:
import pickle
pickle.dump(grid_search,open('model.pkl','wb'))

In [35]:
pickled_model=pickle.load(open('model.pkl','rb'))

In [36]:
example_features = [[55, 1, 2, 130, 250, 0, 1, 170, 0, 1.5, 2, 0, 2]]
data_array = np.array(example_features)
data_array

array([[ 55. ,   1. ,   2. , 130. , 250. ,   0. ,   1. , 170. ,   0. ,
          1.5,   2. ,   0. ,   2. ]])

In [37]:
def pred(features):
  features=np.array(features).reshape(1,-1)
  prediction=pickled_model.predict(features)
  return prediction[0]


In [38]:
pickle.dump(scaler,open('Scaling.pkl','wb'))

In [39]:

new_sample = [[60, 1, 2, 150, 280, 0, 1, 130, 1, 2.5, 2, 1, 2]]
new_sample_scaled=scaler.transform(new_sample)

print("Prediction : ",pred(new_sample))

Prediction :  1


# New Section