In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
df_heart = pd.read_csv('../input/cardiovascular-disease-dataset/cardio_train.csv',sep=';')

In [3]:
df_heart.shape

(70000, 13)

In [4]:
df_heart.head()


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [5]:
df_heart.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [6]:
df_heart.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [7]:
df_heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [8]:
df_heart['cardio'].value_counts()/df_heart.shape[0]*100

0    50.03
1    49.97
Name: cardio, dtype: float64

Data is distribute almost equally a balanced data set

In [9]:
y = df_heart['cardio']

In [10]:
X = df_heart.drop('cardio',axis=1)

### Training a simple Decision Tree Model

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=100)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(56000, 12)
(56000,)
(14000, 12)
(14000,)


In [12]:
dt_basic = DecisionTreeClassifier()

In [13]:
dt_basic.fit(X_train,y_train)

DecisionTreeClassifier()

In [14]:
#Making predictions on train and test
y_pred_train = dt_basic.predict(X_train)
y_pred_test = dt_basic.predict(X_test)

Create Baseline accuracy using Dummy Classifier

In [15]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy ='most_frequent')
dummy_clf.fit(X_train,y_train)
dummy_clf.predict(X_train)
dummy_clf.score(X_train,y_train)

0.5009285714285714

If I guess at random I would get an accuracy of 50% on the data set

In [16]:
train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)
print(f'Training Accuracy is :{train_accuracy}')
print(f'Training Accuracy is :{test_accuracy}')

Training Accuracy is :1.0
Training Accuracy is :0.6265


Confusion Matrix

In [17]:
confusion_matrix(y_test,y_pred_test)

array([[4406, 2563],
       [2666, 4365]])

In [18]:
confusion_matrix(y_train,y_pred_train)

array([[28052,     0],
       [    0, 27948]])

In [19]:
print('Classification Report for Test')
print(classification_report(y_test,y_pred_test))

Classification Report for Test
              precision    recall  f1-score   support

           0       0.62      0.63      0.63      6969
           1       0.63      0.62      0.63      7031

    accuracy                           0.63     14000
   macro avg       0.63      0.63      0.63     14000
weighted avg       0.63      0.63      0.63     14000



In [20]:
print('Classification Report for Train')
print(classification_report(y_train,y_pred_train))

Classification Report for Train
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28052
           1       1.00      1.00      1.00     27948

    accuracy                           1.00     56000
   macro avg       1.00      1.00      1.00     56000
weighted avg       1.00      1.00      1.00     56000

