1. Import The Libraries 

In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [6]:
heart = pd.read_csv("heart.csv")

In [7]:
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:
heart.info

<bound method DataFrame.info of      age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  target  
0        0   0     1    

In [9]:
heart.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [10]:
heart.shape

(303, 14)

In [11]:
heart.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [12]:
heart.duplicated().sum()

1

In [13]:
heart[heart.duplicated()] #prints the duplicate value

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
164,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1


In [14]:
heart.drop_duplicates(inplace=True) #drop all the duplicates from the original data

In [15]:
heart.duplicated().sum()

0

In [16]:
heart.describe() # it is required to check the outliers

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.602649,246.5,0.149007,0.52649,149.569536,0.327815,1.043046,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,17.563394,51.753489,0.356686,0.526027,22.903527,0.470196,1.161452,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


**Note: ** 
The Linear Algorithm like Linear regression, Logistic Regression, Linear SVM etc are affected by the outliers.
But the other set of algorithms which are Tree Based Algorithms, they are not affected by the outliers

In [18]:
heart['target'].value_counts() ## to check whether the data is balanced or imbalanced

target
1    164
0    138
Name: count, dtype: int64

In [19]:
print(165/302)
print(138/302)

0.5463576158940397
0.45695364238410596


In [20]:
## This is also have imbalance but the difference is very little

Machine Learning algorithm Execution

In [22]:
X = heart.drop(columns='target') # storing all the column information except target column in X
y = heart['target'] # storing target column in Y

In [39]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state = 42) # stratify value y is the target column which is extracted in the above line

### standardization of the data

In [25]:
from sklearn.preprocessing import StandardScaler

In [27]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train) #
X_test = scaler.transform(X_test)

In [30]:
X_train  ## Converts the data frame into 2D array

array([[ 0.28906186, -1.47344923, -0.93569147, ...,  0.96461102,
         0.26403574, -0.5751421 ],
       [ 1.26718225,  0.67867965,  0.06210342, ..., -0.63863902,
         2.22196228, -2.24513302],
       [ 0.07170177, -1.47344923, -0.93569147, ..., -0.63863902,
         0.26403574,  1.09484882],
       ...,
       [ 1.04982216,  0.67867965, -0.93569147, ..., -0.63863902,
         0.26403574,  1.09484882],
       [-1.77585897,  0.67867965,  1.0598983 , ...,  0.96461102,
         3.20092555, -0.5751421 ],
       [-1.3411388 ,  0.67867965,  1.0598983 , ...,  0.96461102,
        -0.71492753, -0.5751421 ]])

### Apply Logistic Regression

In [31]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)  ## Learning starts here

In [32]:
y_pred = log_reg.predict(X_test)

In [33]:
y_pred

array([0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1], dtype=int64)

In [34]:
y_test

179    0
197    0
285    0
194    0
188    0
      ..
14     1
4      1
224    0
202    0
12     1
Name: target, Length: 61, dtype: int64

In [37]:
accuracy_score(y_test, y_pred)

0.7868852459016393

### Performance Metrics used in Classification Models

1. Accuracy Score - Used commonly for balanced data
2. Confusion Matrix
3. Precision Score
4. Recall Score
5. ROC_AUC Score - Commonly used for Imbalanced data
6. Specificity
7. Sensitivity
