In [2]:
import pandas as pd #To read CSV file
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split #To split data into trainning and testing
from sklearn.preprocessing import StandardScaler # To scale numerical values from -1 to 1 range

from sklearn.linear_model import LogisticRegression # import Logistic regression
from sklearn.tree import DecisionTreeClassifier # import Decision tree
from sklearn.ensemble import RandomForestClassifier # import Random Forest Classifier
from sklearn.svm import SVC # import SVM
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score # accuracy used for accuracy of model
from sklearn.metrics import confusion_matrix

In [3]:
df = pd.read_csv("HeartAttack.csv",na_values = '?')

In [4]:
df.head(10) #Top 10 row


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [5]:
df = df.drop(columns = ["slope","ca","thal"],axis = 1)

In [6]:
df.head(10) #Top 10 row

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
0,63,1,3,145,233,1,0,150,0,2.3,1
1,37,1,2,130,250,0,1,187,0,3.5,1
2,41,0,1,130,204,0,0,172,0,1.4,1
3,56,1,1,120,236,0,1,178,0,0.8,1
4,57,0,0,120,354,0,1,163,1,0.6,1
5,57,1,0,140,192,0,1,148,0,0.4,1
6,56,0,1,140,294,0,0,153,0,1.3,1
7,44,1,1,120,263,0,1,173,0,0.0,1
8,52,1,2,172,199,1,1,162,0,0.5,1
9,57,1,2,150,168,0,1,174,0,1.6,1


In [7]:
df = df.dropna()

In [8]:
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
0,63,1,3,145,233,1,0,150,0,2.3,1
1,37,1,2,130,250,0,1,187,0,3.5,1
2,41,0,1,130,204,0,0,172,0,1.4,1
3,56,1,1,120,236,0,1,178,0,0.8,1
4,57,0,0,120,354,0,1,163,1,0.6,1
5,57,1,0,140,192,0,1,148,0,0.4,1
6,56,0,1,140,294,0,0,153,0,1.3,1
7,44,1,1,120,263,0,1,173,0,0.0,1
8,52,1,2,172,199,1,1,162,0,0.5,1
9,57,1,2,150,168,0,1,174,0,1.6,1


In [9]:
numerical_cols = ["age","trestbps","chol","thalach","oldpeak"]  # We have to standardized this numerical columns
cat_cols = list(set(df.columns)-set(numerical_cols)-{"target"})

In [10]:
numerical_cols

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [11]:
cat_cols

['sex', 'restecg', 'fbs', 'cp', 'exang']

In [12]:
df_train , df_test = train_test_split(df,test_size = 0.2, random_state = 42)

In [13]:
len(df_train) , len(df_test)

(242, 61)

In [14]:
scaler = StandardScaler()  # object of Scaler

In [15]:
def get_features_and_target_arrays(df,numerical_cols,cat_cols,scaler):
  x_numeric_scaled = scaler.fit_transform(df[numerical_cols])  # Numeric Columns scaled and converted to numpy array
  #x_categorical = df[cat_cols].tonumpy() # categorical Columns converted to numpy array
  x_categorical = df[cat_cols].values  # same as above statement
  x = np.hstack((x_categorical,x_numeric_scaled))  # Merge x_categorical and x_numeric to x
  y = df["target"]  # on y there will be target
  return x,y

In [16]:
x_train,y_train = get_features_and_target_arrays(df_train,numerical_cols,cat_cols,scaler)

In [17]:
x_train  # Basically gives x_train array and vice versa for y_train by using "y_train"


array([[ 1.        ,  1.        ,  0.        , ...,  0.91403366,
         0.53278078, -0.92086403],
       [ 1.        ,  0.        ,  0.        , ...,  0.43952674,
        -1.75358236, -0.19378705],
       [ 1.        ,  1.        ,  0.        , ..., -0.30070405,
        -0.13967897,  2.3509824 ],
       ...,
       [ 1.        ,  0.        ,  1.        , ..., -0.24376322,
        -0.85696936, -0.82997941],
       [ 1.        ,  0.        ,  0.        , ...,  0.04094093,
        -0.27417092, -0.19378705],
       [ 0.        ,  1.        ,  0.        , ..., -0.98399402,
         1.29490183, -0.92086403]])

In [18]:
### Train using Logistic Regression
clf = LogisticRegression() # Object of logistic regression
clf.fit(x_train,y_train)  # fit method will map x_train with corresponding y_train

In [19]:
x_test,y_test = get_features_and_target_arrays(df_test,numerical_cols,cat_cols,scaler) 


In [20]:
test_pred = clf.predict(x_test)


In [21]:
mean_squared_error(y_test,test_pred)


0.14754098360655737

In [22]:
accuracy_score(y_test,test_pred)  # Accuracy of Logistic Regression


0.8524590163934426

In [23]:
confusion_matrix(y_test,test_pred) # Confusion matrix of result


array([[25,  4],
       [ 5, 27]], dtype=int64)

In [24]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,target
0,63,1,3,145,233,1,0,150,0,2.3,1
1,37,1,2,130,250,0,1,187,0,3.5,1
2,41,0,1,130,204,0,0,172,0,1.4,1
3,56,1,1,120,236,0,1,178,0,0.8,1
4,57,0,0,120,354,0,1,163,1,0.6,1


In [25]:
input_data = (54,1,2,120,258,0,0,147,0,0.4)
numpy_aray = np.asarray(input_data)
reshaped = numpy_aray.reshape(1,-1)
prediction = clf.predict(reshaped)
print("Prediction by Sandesh is")
print(prediction)

Prediction by Sandesh is
[0]
