# Import Data And Clean , Normalize them

In [185]:
import pandas as pd
import numpy as np
df=pd.read_csv('Heart_data.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,cholestoral,blood sugar,electrocardiographic,heart rate,exercise induced,depression,slope,ca,thal,target
0,63,1,1,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,37,1,3,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
2,41,0,2,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
3,56,1,2,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
4,57,0,4,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0


In [186]:
#check to have some spaces and the end of the column names
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'cholestoral', 'blood sugar',
       'electrocardiographic', 'heart rate', 'exercise induced', 'depression',
       'slope', 'ca', 'thal', 'target'],
      dtype='object')

In [187]:
#check to have objects
df.dtypes

age                       int64
sex                       int64
cp                        int64
trestbps                float64
cholestoral             float64
blood sugar             float64
electrocardiographic    float64
heart rate              float64
exercise induced        float64
depression              float64
slope                   float64
ca                      float64
thal                    float64
target                    int64
dtype: object

In [188]:
#check how many cells are null
df.isnull().sum()

age                       0
sex                       0
cp                        0
trestbps                  1
cholestoral              23
blood sugar               8
electrocardiographic      1
heart rate                1
exercise induced          1
depression                0
slope                   190
ca                      294
thal                    268
target                    0
dtype: int64

In [189]:
#fill null by medians
df_median = df.fillna(df.median())
df_median.isnull().sum()

age                     0
sex                     0
cp                      0
trestbps                0
cholestoral             0
blood sugar             0
electrocardiographic    0
heart rate              0
exercise induced        0
depression              0
slope                   0
ca                      0
thal                    0
target                  0
dtype: int64

In [190]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalized_data_minmax = scaler.fit_transform(df_median)
normalized_heart_disease_median = pd.DataFrame(normalized_data_minmax, columns = df_median.columns)
normalized_heart_disease_median

Unnamed: 0,age,sex,cp,trestbps,cholestoral,blood sugar,electrocardiographic,heart rate,exercise induced,depression,slope,ca,thal,target
0,0.714286,1.0,0.000000,0.490741,0.285714,1.0,1.0,0.603053,0.0,0.370968,1.0,0.0,0.75,0.0
1,0.183673,1.0,0.666667,0.351852,0.318533,0.0,0.0,0.885496,0.0,0.564516,1.0,0.0,0.00,0.0
2,0.265306,0.0,0.333333,0.351852,0.229730,0.0,1.0,0.770992,0.0,0.225806,0.0,0.0,0.00,0.0
3,0.571429,1.0,0.333333,0.259259,0.291506,0.0,0.0,0.816794,0.0,0.129032,0.0,0.0,0.00,0.0
4,0.591837,0.0,1.000000,0.259259,0.519305,0.0,0.0,0.702290,1.0,0.096774,0.0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592,0.489796,1.0,1.000000,0.444444,0.349421,0.0,0.0,0.480916,1.0,0.322581,0.5,0.0,0.00,1.0
593,0.306122,1.0,1.000000,0.444444,0.391892,0.0,0.0,0.488550,1.0,0.322581,0.5,0.0,0.00,1.0
594,0.265306,1.0,1.000000,0.259259,0.484556,0.0,0.0,0.358779,1.0,0.483871,0.5,0.0,0.00,1.0
595,0.326531,1.0,1.000000,0.398148,0.783784,0.0,0.0,0.488550,0.0,0.000000,0.5,0.0,0.00,1.0


# Define X & Y

In [191]:
df=normalized_heart_disease_median
x = df.drop(['target'],axis=1)
y = df.target

# Train- Test Sets

In [192]:
from sklearn.model_selection import train_test_split

In [193]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(x,y,test_size=0.3,random_state=1)
print(Xtrain.shape)
print(Ytrain.shape)
print(Xtest.shape)
print(Ytest.shape)

#70 percent for train and 30 percent for test

(417, 13)
(417,)
(180, 13)
(180,)


# Random Decision Tree

In [194]:
from sklearn.tree import DecisionTreeClassifier

In [195]:
DT=DecisionTreeClassifier()

DT.fit(Xtrain , Ytrain)


In [196]:
pred=DT.predict(Xtest)

In [197]:
#what percent of predection from not tuned DecisionTreeClassifier is accurate?
(pred==Ytest).mean()


0.7888888888888889

# Tunning DecisionTreeClassifier

In [198]:
from sklearn.model_selection import GridSearchCV
param={'criterion':['gini','entropy'],
                   'max_depth':[3,4,5],
                   'min_samples_split':[3,4,5,6],
                   'min_samples_leaf':[2,3,4]}
clf=DecisionTreeClassifier()

GS=GridSearchCV(clf,param,cv=10,scoring="accuracy")
GS.fit(x , y)


In [199]:
GS.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_leaf': 4,
 'min_samples_split': 4}

In [200]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(criterion="entropy",max_depth=4,min_samples_split=3, min_samples_leaf=2)
DT.fit(Xtrain,Ytrain)

In [201]:
ypred = DT.predict(Xtest)
print(ypred.shape , Ytest.shape)
#what percent of predection from tuned DecisionTreeClassifier is accurate?
(ypred==Ytest).mean()

(180,) (180,)


0.7833333333333333

# Cross Validation

In [202]:
from sklearn.model_selection import cross_val_predict, cross_val_score

DT=DecisionTreeClassifier(criterion="gini",max_depth=3,min_samples_split=3, min_samples_leaf=2,)
                
scores=cross_val_score(DT,x,y,cv=10, scoring='accuracy')

# percent of accurate predection after Cross validation
print(scores.max())

0.9166666666666666
