In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn import metrics

# Loading and Preprocessing the data

In [2]:
data = pd.read_csv('heart.csv')
data.head()
# data.dtypes

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


> Rescaling the data 

In [3]:
x_col = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal']
y_col = ['target']

x = data[x_col]
y = data[y_col]


x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 5)

scalar = StandardScaler()
rescaled_X = scalar.fit_transform(x_train)
rescaled_X[:5]

array([[ 0.16561372,  0.6355891 , -0.02240666, -0.64906971, -0.1590049 ,
        -0.42163702,  0.860252  ,  1.22761792, -0.62828086, -0.15964428,
         0.96963517, -0.67894322, -0.54221579],
       [-0.27881803,  0.6355891 , -0.02240666,  0.15127881, -0.8266995 ,
        -0.42163702,  0.860252  ,  0.32222568, -0.62828086, -0.15964428,
         0.96963517,  0.32061208, -0.54221579],
       [ 1.72112484,  0.6355891 , -0.97244907,  0.78012408, -1.34177819,
        -0.42163702,  0.860252  , -1.17167151,  1.59164485,  1.43096295,
        -2.26757201, -0.67894322,  1.14822168],
       [-0.05660216,  0.6355891 , -0.97244907, -1.22074723, -0.10177393,
        -0.42163702,  0.860252  , -1.1264019 ,  1.59164485,  1.60769708,
        -0.64896842,  0.32061208,  1.14822168],
       [ 0.05450578, -1.57334352, -0.02240666,  0.20844657,  0.10807294,
        -0.42163702, -1.07989081,  0.45803452, -0.62828086,  0.37055813,
        -0.64896842, -0.67894322, -0.54221579]])

### Using RFE to eliminate not important features

In [4]:
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=6)
fit = rfe.fit(rescaled_X, y_train.values.ravel())


transformed_X = fit.transform(rescaled_X)

# print(data.columns)
# print('Num features : ', fit.n_features_)
# print('Selected features : ', fit.support_)
# print('Features ranking : ', fit.ranking_)

data1=  data[x_col].loc[:,fit.support_]
data1.head()

Unnamed: 0,sex,cp,exang,oldpeak,ca,thal
0,1,3,0,2.3,0,1
1,1,2,0,3.5,0,2
2,0,1,0,1.4,0,2
3,1,1,0,0.8,0,2
4,0,0,1,0.6,0,2


### *Now we use data1 for our regressors*

## Not scaling the regressors

In [5]:
model = LogisticRegression()

x = data1.values
y = data[y_col]


x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 5)
model.fit(x_train,y_train.values.ravel())

print("accuracy of the model is : %.3f \n" %model.score(x_test,y_test))
pred = pd.Series(model.predict(x_test))
y_test = y_test.reset_index(drop=True)
pred_df = pd.concat([y_test, pred], axis=1)
pred_df.columns = ['True', 'Prediction']
pred_df.head()
# model.score(x_test,y_test)

accuracy of the model is : 0.890 



Unnamed: 0,True,Prediction
0,0,1
1,0,0
2,0,0
3,0,0
4,1,0


> Creating confusion Matrix

In [6]:
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

labels = [0, 1]
fig, ax = plt.subplots(figsize=(14,8))
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('True',size = 25)
plt.xlabel('Predicted',size = 25)
plt.show()

NameError: name 'y_pred' is not defined

## Scaling the Regressors

In [None]:
model = LogisticRegression()

x = data1.values
y = data[y_col]




x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 5)

scalar = StandardScaler()
x_train = scalar.fit_transform(x_train)
rescaled_X[:5]

scalar = StandardScaler()
x_test = scalar.fit_transform(x_test)

model.fit(x_train,y_train.values.ravel())

pred = pd.Series(model.predict(x_test))
y_test = y_test.reset_index(drop=True)

print("accuracy of the model is : %.3f \n" %model.score(x_test,y_test))
pred_df = pd.concat([y_test, pred], axis=1)
pred_df.columns = ['True', 'Prediction']
pred_df.head()

In [None]:
# Confusion matrix
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)

labels = [0, 1]
# plt.figure(figsize=[8,8]) 
fig, ax = plt.subplots(figsize=(14,8))

tick_marks = np.arange(len(labels))

plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('True',size = 25)
plt.xlabel('Predicted',size = 25)
plt.show()