In [None]:
import numpy
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## Data preprocessing

In [None]:
# Load the heart-failure clinical records. The last column is used as the
# target; every preceding column (including 'time', which is deliberately
# not dropped) is used as a feature.
data = pd.read_csv("datasets_727551_1263738_heart_failure_clinical_records_dataset.csv")
X, y = data.iloc[:, :-1], data.iloc[:, -1]  # independent columns, dependent column
X, y

(      age  anaemia  creatinine_phosphokinase  ...  sex  smoking  time
 0    75.0        0                       582  ...    1        0     4
 1    55.0        0                      7861  ...    1        0     6
 2    65.0        0                       146  ...    1        1     7
 3    50.0        1                       111  ...    1        0     7
 4    65.0        1                       160  ...    0        0     8
 ..    ...      ...                       ...  ...  ...      ...   ...
 294  62.0        0                        61  ...    1        1   270
 295  55.0        0                      1820  ...    0        0   271
 296  45.0        0                      2060  ...    0        0   278
 297  45.0        0                      2413  ...    1        1   280
 298  50.0        0                       196  ...    1        1   285
 
 [299 rows x 12 columns], 0      1
 1      1
 2      1
 3      1
 4      1
       ..
 294    0
 295    0
 296    0
 297    0
 298    0
 Name: DEAT

# Train test split

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed: the chi2 ranking further down is computed from this split, so an
# unseeded split would make the feature order change from run to run.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=49)

In [None]:
# Inspect the training features produced by the split (still unscaled here).
x_train

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
257,58.0,0,132,1,38,1,253000.0,1.0,139,1,0,230
281,70.0,0,582,0,40,0,51000.0,2.7,136,1,1,250
249,53.0,0,207,1,40,0,223000.0,1.2,130,0,0,214
279,55.0,0,84,1,38,0,451000.0,1.3,136,0,0,246
43,72.0,0,127,1,50,1,218000.0,1.0,134,1,0,33
...,...,...,...,...,...,...,...,...,...,...,...,...
73,65.0,0,224,1,50,0,149000.0,1.3,137,1,1,72
116,60.0,1,96,1,60,1,271000.0,0.7,136,0,0,94
143,61.0,1,84,0,40,1,229000.0,0.9,141,0,0,110
246,55.0,0,2017,0,25,0,314000.0,1.1,138,1,0,214


# Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so the test set never influences the scaling.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# chi2 rejects negative inputs, so shift the standardized features upward.
# Keep the original offset of 6 whenever it is sufficient, but enlarge it if any
# value lies more than 6 below zero, so the result is always non-negative.
shift = max(6.0, -float(numpy.min(numpy.asarray(x_train))))
x_train1 = x_train + shift

In [None]:
# Show the shifted training matrix (intended to contain no negative values)
# that is passed to the chi2 scorer.
x_train1

array([[5.74095356, 5.13714106, 5.54455835, ..., 6.7093269 , 5.3368567 ,
        7.26622297],
       [6.76806577, 5.13714106, 6.00379602, ..., 6.7093269 , 7.5079697 ,
        7.52513326],
       [5.31299015, 5.13714106, 5.62109796, ..., 4.59021278, 5.3368567 ,
        7.05909474],
       ...,
       [5.99773161, 7.15893799, 5.49557299, ..., 4.59021278, 5.3368567 ,
        5.71276124],
       [5.48417551, 5.13714106, 7.46825395, ..., 6.7093269 , 5.3368567 ,
        7.05909474],
       [6.68247308, 5.13714106, 6.00379602, ..., 6.7093269 , 7.5079697 ,
        5.23377721]])

# Feature selection

For feature selection, we use a combination of recursive feature elimination and the chi2 method. First, we use the chi2 scores to rank the importance of each feature; then we eliminate the least important features one by one until we reach the highest accuracy.

## Chi2 test

In [None]:
# Score every feature against the target with the chi-squared statistic.
# SelectKBest.fit returns the fitted selector itself, so `fit` and
# `bestfeatures` refer to the same object.
bestfeatures = SelectKBest(chi2, k=10)
bestfeatures.fit(x_train1, y_train)
fit = bestfeatures

In [None]:
# Two one-column frames that share row order: the feature names taken from X,
# and the chi2 score computed for each of them.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)

In [None]:
# Put names and scores side by side, label the two columns, and rank the
# features from the highest chi2 score to the lowest.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']
featureScores = featureScores.sort_values(by='Score', ascending=False)
featureScores

In [None]:
import numpy as np
# Feature names as a NumPy array, in the ranked order of featureScores
# (first column, 'Specs').
feat_order = np.array(featureScores['Specs'])
feat_order

## Recursive feature elimination 

In [None]:
from sklearn.model_selection import train_test_split
# Fresh 80/20 split of the raw data, seeded so it can be repeated exactly.
(x_train, x_test, y_train, y_test) = train_test_split(
    X, y,
    test_size=0.2,
    random_state=49,
)


Converting all the train and test data into dataframes for easier handling of data:

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features, then wrap every partition in a
# DataFrame. The scaled arrays carry no labels, so the feature frames get
# default integer column names and a fresh 0..n-1 index.
sc = StandardScaler().fit(x_train)
x_train, x_test = (pd.DataFrame(sc.transform(part)) for part in (x_train, x_test))
y_train, y_test = pd.DataFrame(y_train), pd.DataFrame(y_test)

In [None]:
# Inspect the standardized training features (columns are now positional integers).
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.714243,-0.877707,-0.331011,-0.819346,-0.712143,-0.736163,-0.974610,-0.205392,-1.150833,0.709327,-0.696040,1.104880
1,1.124656,1.139332,0.015961,-0.819346,-0.712143,-0.736163,-0.404019,0.382441,-0.649376,0.709327,-0.696040,-0.187342
2,-1.748231,-0.877707,-0.471783,-0.819346,-0.286212,-0.736163,-0.074832,-0.298699,-0.147919,0.709327,1.436698,1.131524
3,0.714243,-0.877707,0.015961,1.220485,-0.030653,-0.736163,-2.597503,-0.298699,0.854994,0.709327,-0.696040,1.584467
4,1.370903,-0.877707,-0.338942,-0.819346,0.991583,-0.736163,2.405045,-0.018779,0.353537,0.709327,1.436698,0.865086
...,...,...,...,...,...,...,...,...,...,...,...,...
234,-1.009489,1.139332,-0.492601,-0.819346,0.991583,-0.736163,-1.424499,-0.392006,0.854994,-1.409787,-0.696040,0.265601
235,-1.255736,-0.877707,-0.394458,1.220485,-1.819565,1.358395,0.100735,0.634370,-3.156659,-1.409787,-0.696040,-0.360527
236,-0.599076,-0.877707,0.015961,1.220485,-0.030653,-0.736163,0.023924,0.354449,-0.649376,0.709327,-0.696040,1.144845
237,0.303831,-0.877707,-0.169421,1.220485,-1.138075,-0.736163,0.034897,-0.205392,-0.147919,0.709327,1.436698,0.358855


Constructing a classifier with all the features. We will then start eliminating the features one by one, beginning in the next cell.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline model trained on every feature.
# - random_state makes the forest reproducible between runs.
# - y_train may be a single-column DataFrame at this point; flattening it to 1-D
#   avoids scikit-learn's column-vector DataConversionWarning during fit.
clf=RandomForestClassifier(n_estimators=50,random_state=49)
y_pred=clf.fit(x_train,y_train.values.ravel()).predict(x_test)
# Entries 10 and 11 of the chi2 ranking, i.e. the lowest-scored of the 12
# features and therefore the first elimination candidates.
print(feat_order[10:12])

['sex' 'smoking']


  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
from sklearn.metrics import accuracy_score
# Accuracy of the model that was trained with all the features.
acc = accuracy_score(y_test, y_pred)
# Running "previous accuracy" slot, initialised to zero.
curr_acc = 0.0

Performing recursive feature elimination to find the optimal number of features:

In [None]:
# Backward elimination driven by the chi2 ranking in feat_order.
# Starting from all features, the lowest-ranked remaining feature is dropped on
# a trial copy; the drop is committed only if test accuracy does not decrease.
# The first drop that hurts accuracy is discarded, so x_train/x_test end up
# holding exactly the best feature set found (not one feature too few).
# NOTE(review): the stopping rule is evaluated on the test split, so the final
# accuracy is optimistically biased; a validation split or CV would avoid that.
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=140)
clf=RandomForestClassifier(n_estimators=50,random_state=140)
y_pred=clf.fit(x_train,y_train).predict(x_test)
acc=accuracy_score(y_test,y_pred)
print('initial Accuracy='+str(acc))
curr_acc=acc  # best accuracy among the feature sets accepted so far
i=0  # number of features actually removed from x_train/x_test
n_features=len(feat_order)
while i<n_features-1:  # always keep at least one feature
  candidate=feat_order[n_features-1-i]  # lowest-ranked feature still present
  # drop() without inplace returns new frames, which avoids the
  # SettingWithCopyWarning raised when mutating the split slices in place.
  x_train_trial=x_train.drop(columns=candidate)
  x_test_trial=x_test.drop(columns=candidate)
  clf=RandomForestClassifier(n_estimators=50,random_state=140)
  y_pred=clf.fit(x_train_trial,y_train).predict(x_test_trial)
  acc=accuracy_score(y_test,y_pred)
  if acc<curr_acc:
    break  # this removal hurts accuracy: discard the trial and stop
  x_train,x_test=x_train_trial,x_test_trial
  curr_acc=acc
  i+=1
# acc: accuracy of the last evaluated trial, curr_acc: best retained accuracy,
# i: the number of features which have been removed
print(acc,curr_acc,i)

initial Accuracy=0.85


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


0.8666666666666667 0.8833333333333333 3


In [None]:
# Training features that remain after the elimination loop.
x_train #features after feature selection

Unnamed: 0,age,anaemia,creatinine_phosphokinase,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,time
237,70.0,0,232,30,0,173000.0,1.20,132,210
148,75.0,1,582,30,0,225000.0,1.83,134,113
242,40.0,0,90,35,0,255000.0,1.10,136,212
277,70.0,0,582,38,0,25100.0,1.10,140,246
212,78.0,0,224,50,0,481000.0,1.40,138,192
...,...,...,...,...,...,...,...,...,...
177,49.0,1,69,50,0,132000.0,1.00,140,147
126,46.0,0,168,17,1,271000.0,2.10,124,100
244,54.0,0,582,38,0,264000.0,1.80,134,213
182,65.0,0,395,25,0,265000.0,1.20,136,154


## Saving the test and train datasets as four separate files


In [None]:
# Refit the scaler on the selected training features and reuse it for the test
# set. Wrapping the scaled arrays in DataFrames replaces the original column
# names and row index with default integer labels.
sc = StandardScaler().fit(x_train)
x_train = pd.DataFrame(sc.transform(x_train))
x_test = pd.DataFrame(sc.transform(x_test))

In [None]:
# Write each partition to its own CSV file (index included, pandas defaults).
for frame, target_path in (
    (x_train, "Pre-processed_x_train.csv"),
    (x_test, "Pre-processed_x_test.csv"),
    (y_train, "Pre-processed_y_train.csv"),
    (y_test, "Pre-processed_y_test.csv"),
):
    pd.DataFrame(frame).to_csv(target_path)