In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

In [3]:
data= pd.read_csv('heart_failure_clinical_records_dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.4 KB


In [4]:
data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [68]:
inf=data.sample(5,random_state=0)

In [69]:
data_train=data.drop(inf.index)

In [79]:
data_train.reset_index(drop=True,inplace=True)
inf.reset_index(drop=True,inplace=True)

In [80]:
X=data_train.drop('DEATH_EVENT',axis=1)
y=data_train['DEATH_EVENT']

In [18]:
data.high_blood_pressure.value_counts()

0    194
1    105
Name: high_blood_pressure, dtype: int64

In [81]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=212)

In [20]:
num=['age','creatinine_phosphokinase','platelets','serum_creatinine', 'serum_sodium']
cat=['anaemia','diabetes','high_blood_pressure','sex', 'smoking', 'time','ejection_fraction']

In [28]:
pre=ColumnTransformer([('num',MinMaxScaler(),num),
                    ('cat',OneHotEncoder(handle_unknown='ignore'),cat)

])

In [82]:
pipe_dt=Pipeline([('preprocessing',pre),
                ('model_dt',DecisionTreeClassifier()
                 )])

In [None]:
from sklearn.ensemble import BaggingClassifier


BaggingClassifier()

In [57]:
pipe_rf=Pipeline([('preprocessing',pre),
                ('model_dt',RandomForestClassifier()
                 )])

In [58]:
pipe_ab=Pipeline([('preprocessing',pre),
                ('model_dt',AdaBoostClassifier()
                 )])

In [49]:
pipe_dt.get_params()

{'memory': None,
 'steps': [('preprocessing',
   ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                    ['age', 'creatinine_phosphokinase',
                                     'platelets', 'serum_creatinine',
                                     'serum_sodium']),
                                   ('cat', OneHotEncoder(handle_unknown='ignore'),
                                    ['anaemia', 'diabetes', 'high_blood_pressure',
                                     'sex', 'smoking', 'time',
                                     'ejection_fraction'])])),
  ('model_dt', DecisionTreeClassifier())],
 'verbose': False,
 'preprocessing': ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                  ['age', 'creatinine_phosphokinase',
                                   'platelets', 'serum_creatinine',
                                   'serum_sodium']),
                                 ('cat', OneHotEncoder(handle_unknown='ignore')

In [83]:
pipe_dt.fit(X_train,y_train)

In [45]:
y_pred_train=pipe_dt.predict(X_train)
y_pred_test=pipe_dt.predict(X_test)

In [47]:
from sklearn.metrics import classification_report


print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       166
           1       1.00      1.00      1.00        73

    accuracy                           1.00       239
   macro avg       1.00      1.00      1.00       239
weighted avg       1.00      1.00      1.00       239



In [88]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

nfgkaslfgnasklgnakl={'model_dt__max_depth':[1,2,3,4,5,6,7],
        'model_dt__max_leaf_nodes':[1,2,3],
        'preprocessing':[pre,None,StandardScaler()]}
grid=GridSearchCV(pipe_dt,nfgkaslfgnasklgnakl,cv=5)

In [66]:
import warnings
warnings.filterwarnings('ignore')

In [94]:
X_train.shape

(235, 12)

In [95]:
inf.shape

(5, 13)

In [89]:
grid.fit(X_train,y_train)

In [91]:
grid.best_estimator_

In [56]:
from sklearn.model_selection import cross_val_predict, cross_val_score


print(cross_val_score(grid.best_estimator_,X_train,y_train).mean())
print(cross_val_score(pipe_dt,X_train,y_train).mean())

0.82854609929078
0.724290780141844


In [96]:
y_pred_inf=pipe_dt.predict(inf)

In [97]:
pred=pd.DataFrame({'pred':y_pred_inf})

In [98]:
pd.concat([inf,pred],axis=1)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT,pred
0,40.0,1,101,0,40,0,226000.0,0.8,141,0,0,187,0,0
1,60.667,1,151,1,40,1,201000.0,1.0,136,0,0,172,0,1
2,45.0,1,981,0,30,0,136000.0,1.1,137,1,0,11,1,0
3,55.0,0,582,1,35,1,371000.0,0.7,140,0,0,197,0,0
4,70.0,0,232,0,30,0,173000.0,1.2,132,1,0,210,0,0
