In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, precision_score, classification_report, average_precision_score
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import pickle

### Functions :D

In [2]:
def EvaluateModelPerformance (Y_Test, ModelPrediction):
        print("Accuracy score: {}".format(accuracy_score(Y_Test, ModelPrediction) * 100))
        print("Precision score: {}".format(precision_score(Y_Test, ModelPrediction) * 100))
        print("Average precision score: {}".format(average_precision_score(Y_Test, ModelPrediction) * 100))
        print("Classification report: \n\n{}".format(classification_report(Y_Test, ModelPrediction)))

### Importing the dataset

In [5]:
MainDataset = fetch_ucirepo(id=519)
MainDataset

{'data': {'ids': None,
  'features':       age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
  0    75.0        0                       582         0                 20   
  1    55.0        0                      7861         0                 38   
  2    65.0        0                       146         0                 20   
  3    50.0        1                       111         0                 20   
  4    65.0        1                       160         1                 20   
  ..    ...      ...                       ...       ...                ...   
  294  62.0        0                        61         1                 38   
  295  55.0        0                      1820         0                 38   
  296  45.0        0                      2060         1                 60   
  297  45.0        0                      2413         0                 38   
  298  50.0        0                       196         0                 45   
  
       high_bl

In [4]:
PD_Dataframe = pd.DataFrame(MainDataset.data.original)
PD_Dataframe

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death_event
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


### Splitting into Data and Target matrix

In [5]:
DataMatrix = PD_Dataframe.iloc[:, :-1]
TargetMatrix = PD_Dataframe.iloc[:, -1]

In [6]:
DataMatrix

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8
...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280


In [7]:
TargetMatrix

0      1
1      1
2      1
3      1
4      1
      ..
294    0
295    0
296    0
297    0
298    0
Name: death_event, Length: 299, dtype: int64

### Splitting into training and testing datasets

In [8]:
X_Train, X_Test, Y_Train, Y_Test = train_test_split(DataMatrix, TargetMatrix, 
                                                    train_size=0.7, test_size=0.3, 
                                                    random_state=100, shuffle=True)

print("XT: {} \nXTT: {} \nYT: {} \nYTT: {}".format(np.array(X_Train).shape, np.array(X_Test).shape, 
                                                   np.array(Y_Train).shape, np.array(Y_Test).shape))

XT: (209, 12) 
XTT: (90, 12) 
YT: (209,) 
YTT: (90,)


### Grid Search and KFold

In [9]:
# Parameter grid
DTC_Hyperparameter_Grid = {
        "criterion": ["entropy", "gini"],
        "max_depth": [5, 10, 14, 20],
        "min_samples_split": [2, 4, 6, 8, 12],
        "max_leaf_nodes": [2, 3, 4],
        "max_features": [6, 12]
}

# KFold cross validation and grid search
KFold_Shuffle = KFold(n_splits=10, shuffle=True, random_state=100)
Grid_Search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=DTC_Hyperparameter_Grid, cv=KFold_Shuffle)


In [10]:
Grid_Search.fit(X_Train, Y_Train)
OptimalParams = Grid_Search.best_params_

### Training and evaluating the DecisionTree model and making it's first prediction

In [11]:
DTC_Model = DecisionTreeClassifier(**OptimalParams)
DTC_Model.fit(X_Train, Y_Train)

In [12]:
ModelPredictionY = DTC_Model.predict(X_Test)
print(ModelPredictionY)

[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1]


In [None]:
TestSeries = []
TestPrediction = DTC_Model.predict()

### Evaluating model prediction and performance

In [13]:
EvaluateModelPerformance(Y_Test, ModelPredictionY)

Accuracy score: 74.44444444444444
Precision score: 83.33333333333334
Average precision score: 50.215053763440864
Classification report: 

              precision    recall  f1-score   support

           0       0.73      0.97      0.83        59
           1       0.83      0.32      0.47        31

    accuracy                           0.74        90
   macro avg       0.78      0.64      0.65        90
weighted avg       0.77      0.74      0.71        90



### Saving the model using JSON

In [14]:
with open("/home/ai/Desktop/Machine Learning/Legit ML Projects/Heart Failure/Model/DTC_Model.pkl", "wb") as FileDump:
        pickle.dump(DTC_Model, FileDump)