In [1]:
import numpy as np
import pandas as pd
from joblib import dump

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
dataSet = pd.read_csv('dataset.csv')

In [3]:
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB


In [4]:
#droping unwanted columns
dataSet = dataSet.drop(['id','dataset'],axis=1)

In [5]:
#selecting numerical columns
numerical_cols = dataSet.select_dtypes(include=['int64','float64']).columns.tolist()

numerical_cols

['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca', 'num']

In [6]:
#selecting categorical columns
categorical_cols = dataSet.select_dtypes(include=['object']).columns.tolist()

categorical_cols

['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']

In [7]:
#filling the empty values of numerical cols

for column in numerical_cols:
    if dataSet[column].isnull().any():
        dataSet[column] = dataSet[column].fillna(dataSet[column].mean())

In [8]:
#filling the empty values of categorical cols

for column in categorical_cols:
    if dataSet[column].isnull().any():
        mode_value = dataSet[column].mode()[0]
        dataSet[column] = dataSet[column].fillna(mode_value)

  dataSet[column] = dataSet[column].fillna(mode_value)


In [9]:
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    int64  
 1   sex       920 non-null    object 
 2   cp        920 non-null    object 
 3   trestbps  920 non-null    float64
 4   chol      920 non-null    float64
 5   fbs       920 non-null    bool   
 6   restecg   920 non-null    object 
 7   thalch    920 non-null    float64
 8   exang     920 non-null    bool   
 9   oldpeak   920 non-null    float64
 10  slope     920 non-null    object 
 11  ca        920 non-null    float64
 12  thal      920 non-null    object 
 13  num       920 non-null    int64  
dtypes: bool(2), float64(5), int64(2), object(5)
memory usage: 88.2+ KB


In [10]:
#checking the distribution of target variable
dataSet['num'].value_counts()

num
0    411
1    265
2    109
3    107
4     28
Name: count, dtype: int64

In [11]:
#target data is imbalance
#Combine classes into a single 'has heart disease' class
dataSet['num'] = dataSet['num'].apply(lambda x: 1 if x > 0 else 0)

# Check the new distribution
print(dataSet['num'].value_counts())


num
1    509
0    411
Name: count, dtype: int64


In [12]:
#encoding the dataset
encorder = LabelEncoder()

In [13]:
encoded_dataset = dataSet.iloc[:,0:14]

In [14]:
##encoding all the columns iterating through the dataset

for i in encoded_dataset:
    encoded_dataset[i] = encorder.fit_transform(encoded_dataset[i])

In [15]:
dataSet.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,63,Male,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,67,Male,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,1
2,67,Male,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,37,Male,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,41,Female,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0
5,56,Male,atypical angina,120.0,236.0,False,normal,178.0,False,0.8,upsloping,0.0,normal,0
6,62,Female,asymptomatic,140.0,268.0,False,lv hypertrophy,160.0,False,3.6,downsloping,2.0,normal,1
7,57,Female,asymptomatic,120.0,354.0,False,normal,163.0,True,0.6,upsloping,0.0,normal,0
8,63,Male,asymptomatic,130.0,254.0,False,lv hypertrophy,147.0,False,1.4,flat,1.0,reversable defect,1
9,53,Male,asymptomatic,140.0,203.0,True,lv hypertrophy,155.0,True,3.1,downsloping,0.0,reversable defect,1


In [16]:
encoded_dataset.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,35,1,3,41,87,1,0,77,0,34,0,0,0,0
1,39,1,0,50,140,0,0,34,1,26,1,4,1,1
2,39,1,0,22,83,0,0,55,1,37,1,3,2,1
3,9,1,2,31,104,0,1,113,0,44,0,0,1,0
4,13,0,1,31,58,0,0,99,0,25,2,0,1,0
5,28,1,1,22,90,0,1,105,0,18,2,0,1,0
6,34,0,0,38,122,0,0,87,0,45,0,3,1,1
7,29,0,0,22,193,0,1,90,1,16,2,0,1,0
8,35,1,0,31,108,0,0,74,0,25,1,2,2,1
9,25,1,0,38,57,1,0,82,1,41,0,0,2,1


In [17]:
#split the dataset
X = encoded_dataset.iloc[:,0:13]
y = encoded_dataset.iloc[:,13]

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [19]:
#train the model
model = DecisionTreeClassifier(criterion='gini',max_depth=6)
model.fit(X_train,y_train)

In [20]:
#Accuracy Score

#Accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [21]:
print("Accuracy on training data :",training_data_accuracy)

Accuracy on training data : 0.8804347826086957


In [22]:
#Accuracy on test data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, y_test)

In [23]:
print("Accuracy on testing data :", testing_data_accuracy)

Accuracy on testing data : 0.7445652173913043


In [24]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [25]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("Classification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.7445652173913043
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.73      0.74        92
           1       0.74      0.76      0.75        92

    accuracy                           0.74       184
   macro avg       0.74      0.74      0.74       184
weighted avg       0.74      0.74      0.74       184



In [26]:
prediction = model.predict([[25,1,0,38,57,1,0,82,1,41,0,0,2]])
print(prediction)

[1]




In [29]:
#Save the model
dump(model,'.//TrainedModels//DecisionTreeModel.joblib')

['.//TrainedModels//DecisionTreeModel.joblib']