# Survival Classification Using SMOTE

## Importing libraries and packages

In [None]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
import os

## Importing the dataset

In [None]:
%matplotlib inline
sns.set_style('whitegrid')
df = pd.read_csv(r'heart_failure_clinical_records_dataset.csv')
df.head(5)


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [None]:
# Selecting only 3 highest correlated features (from random forest ranking and pearson correlation)
# plus the target. Columns are picked by name rather than by position so that
# the cell can be re-run safely (df is overwritten here) and does not depend
# on the column order of the CSV.
SELECTED_COLUMNS = ['age', 'ejection_fraction', 'serum_creatinine', 'DEATH_EVENT']
df = df.loc[:, SELECTED_COLUMNS].copy()

In [None]:
# Display the reduced frame (three features + DEATH_EVENT); pandas elides the middle rows.
df

Unnamed: 0,age,ejection_fraction,serum_creatinine,DEATH_EVENT
0,75.0,20,1.9,1
1,55.0,38,1.1,1
2,65.0,20,1.3,1
3,50.0,20,1.9,1
4,65.0,20,2.7,1
...,...,...,...,...
294,62.0,38,1.1,0
295,55.0,38,1.2,0
296,45.0,60,0.8,0
297,45.0,38,1.4,0


In [None]:
# Remember the column names so the DataFrames can be rebuilt with the same
# headers after the scaler returns plain arrays.
lengthOfList = len(df)  # number of rows in df (despite the name)
ListAttr = list(df.columns)
print(*ListAttr, sep="\n")
   


age
ejection_fraction
serum_creatinine
DEATH_EVENT


## Splitting the dataframe in train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

## Scaling the features (scaler fitted on the train set, applied to train and test)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# ListAttr holds the column names with the class label last. Only the
# predictors go through the scaler; the label is re-attached untouched so it
# stays an integer class instead of being converted to a scaled float.
feature_cols = ListAttr[:-1]
target_col = ListAttr[-1]

min_max_scaler = MinMaxScaler()

# Fit on the training predictors only, so no test-set statistics leak in.
df_train_scaled = pd.DataFrame(
    data=min_max_scaler.fit_transform(df_train[feature_cols]),
    columns=feature_cols,
)
df_train_scaled[target_col] = df_train[target_col].to_numpy()

# Re-use the ranges learned on the training set for the test predictors.
df_test_scaled = pd.DataFrame(
    data=min_max_scaler.transform(df_test[feature_cols]),
    columns=feature_cols,
)
df_test_scaled[target_col] = df_test[target_col].to_numpy()

## Defining independent and dependent variables

In [None]:
# The last column is the class label; everything before it is a predictor.
X_train, y_train = df_train_scaled.iloc[:, :-1], df_train_scaled.iloc[:, -1]
X_test, y_test = df_test_scaled.iloc[:, :-1], df_test_scaled.iloc[:, -1]


## Dealing with data imbalance

### Using SMOTE to create synthetic data

In [None]:
# To study the effect without SMOTE, comment out this cell.
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training data only; the test set is
# left as-is. A fixed random_state makes the synthetic samples (and therefore
# every score below) repeatable across runs.
sm = SMOTE(sampling_strategy='auto', random_state=0)

# fit_resample replaces the deprecated fit_sample alias, which newer
# imbalanced-learn releases no longer provide.
X_train, y_train = sm.fit_resample(X_train, y_train)




## Training the model


### SVM model

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

# Hyper-parameter grid explored by the cross-validated search.
parameters = {
    'kernel': ('linear', 'rbf', 'poly'),
    'C': [0.1, 1, 10, 100],
    'gamma': [0.1, 1, 10],
}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters).fit(X_train, y_train)  # fit() returns the search object
y_pred_SVM_CV = clf.predict(X_test)

print("Using the score funtion:", clf.score(X_test, y_test))
print("The best param is:", clf.best_params_)

# Evaluate the refit best estimator on the held-out test set.
cm = confusion_matrix(y_test, y_pred_SVM_CV)
print(cm)
print("Accuracy Score:", accuracy_score(y_test, y_pred_SVM_CV))
print("F1 score:", f1_score(y_test, y_pred_SVM_CV, average='macro'))
print("MCC Score:", matthews_corrcoef(y_test, y_pred_SVM_CV))

Using the score funtion: 0.8
The best param is: {'C': 10, 'gamma': 10, 'kernel': 'rbf'}
[[32  5]
 [ 7 16]]
Accuracy Score: 0.8
F1 score: 0.7846889952153109
MCC Score: 0.5713623881818716


### Neural network

In [None]:
# Trying neural networks (multilayer perceptron): every combination of two
# hidden layers with 2..5 units each is trained and scored.
from sklearn.neural_network import MLPClassifier

# With the library default iteration cap, lbfgs stopped at the limit for some
# layer sizes (see the earlier ConvergenceWarning), so give it more room.
MLP_MAX_ITER = 1000

# NOTE(review): the "best" layer sizes are picked by accuracy on the test set,
# so the reported best score is optimistic — consider selecting with
# cross-validation on the training data instead.
maxScore = 0
maxi = 0
maxj = 0
for i in range(2, 6):
    for j in range(2, 6):
        clf_nn = MLPClassifier(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=(i, j),
            max_iter=MLP_MAX_ITER,
            random_state=1,
        )
        clf_nn.fit(X_train, y_train)
        y_pred_nn = clf_nn.predict(X_test)
        print("*"*20)
        print("loss = %f" %(clf_nn.loss_))

        cm = confusion_matrix(y_test, y_pred_nn)
        print("This is when the layer sizes is: %i,%i" %(i,j))
        print(cm)
        Score = accuracy_score(y_test, y_pred_nn)
        print(Score)
        # Strict '>' keeps the first configuration that reaches the top score.
        if Score > maxScore:
            maxScore = Score
            maxi = i
            maxj = j
        print("F1 score:",f1_score(y_test, y_pred_nn, average='macro'))
        print("MCC Score:",matthews_corrcoef(y_test, y_pred_nn))
print("the best score was when the layer size was: %i,%i with a score of %04f" %(maxi,maxj,maxScore))


********************
loss = 0.693147
This is when the layer sizes is: 2,2
[[37  0]
 [23  0]]
0.6166666666666667
F1 score: 0.3814432989690722
MCC Score: 0.0
********************
loss = 0.693147
This is when the layer sizes is: 2,3
[[ 0 37]
 [ 0 23]]
0.38333333333333336
F1 score: 0.27710843373493976
MCC Score: 0.0
********************
loss = 0.693147
This is when the layer sizes is: 2,4
[[37  0]
 [23  0]]
0.6166666666666667
F1 score: 0.3814432989690722
MCC Score: 0.0
********************
loss = 0.693147
This is when the layer sizes is: 2,5
[[ 0 37]
 [ 0 23]]
0.38333333333333336
F1 score: 0.27710843373493976
MCC Score: 0.0
********************
loss = 0.693147
This is when the layer sizes is: 3,2
[[ 0 37]
 [ 0 23]]
0.38333333333333336
F1 score: 0.27710843373493976
MCC Score: 0.0
********************
loss = 0.540080
This is when the layer sizes is: 3,3
[[30  7]
 [ 7 16]]
0.7666666666666667
F1 score: 0.7532314923619272
MCC Score: 0.5064629847238543


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


********************
loss = 0.548120
This is when the layer sizes is: 3,4
[[29  8]
 [ 6 17]]
0.7666666666666667
F1 score: 0.7569444444444444
MCC Score: 0.5156929620972371
********************
loss = 0.666501
This is when the layer sizes is: 3,5
[[30  7]
 [14  9]]
0.65
F1 score: 0.6011396011396011
MCC Score: 0.22221702922157036
********************
loss = 0.693147
This is when the layer sizes is: 4,2
[[ 0 37]
 [ 0 23]]
0.38333333333333336
F1 score: 0.27710843373493976
MCC Score: 0.0
********************
loss = 0.693147
This is when the layer sizes is: 4,3
[[ 0 37]
 [ 0 23]]
0.38333333333333336
F1 score: 0.27710843373493976
MCC Score: 0.0
********************
loss = 0.480192
This is when the layer sizes is: 4,4
[[31  6]
 [ 5 18]]
0.8166666666666667
F1 score: 0.8076362576508307
MCC Score: 0.615761131012764


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


********************
loss = 0.564566
This is when the layer sizes is: 4,5
[[29  8]
 [ 6 17]]
0.7666666666666667
F1 score: 0.7569444444444444
MCC Score: 0.5156929620972371
********************
loss = 0.528026
This is when the layer sizes is: 5,2
[[31  6]
 [ 7 16]]
0.7833333333333333
F1 score: 0.7688888888888887
MCC Score: 0.5382548280972278
********************
loss = 0.542623
This is when the layer sizes is: 5,3
[[31  6]
 [ 6 17]]
0.8
F1 score: 0.7884841363102232
MCC Score: 0.5769682726204466
********************
loss = 0.517731
This is when the layer sizes is: 5,4
[[30  7]
 [ 7 16]]
0.7666666666666667
F1 score: 0.7532314923619272
MCC Score: 0.5064629847238543
********************
loss = 0.527564
This is when the layer sizes is: 5,5
[[31  6]
 [ 6 17]]
0.8
F1 score: 0.7884841363102232
MCC Score: 0.5769682726204466
the best score was when the layer size was: 4,4 with a score of 0.816667


### Random Forest 

In [None]:
import inspect

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV

# CalibratedClassifierCV's wrapped-model keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4).
# Pick whichever the installed version accepts so the cell runs on both.
estimator_kw = (
    'estimator'
    if 'estimator' in inspect.signature(CalibratedClassifierCV).parameters
    else 'base_estimator'
)

# Seed the forest so repeated runs give the same trees and the same scores.
forest = RandomForestClassifier(n_estimators=20, random_state=0)
calibrated_forest = CalibratedClassifierCV(**{estimator_kw: forest})

# Univariate feature selection followed by the calibrated forest.
pipe = Pipeline([('select', SelectKBest()), ('model', calibrated_forest)])
param_grid = {
    'select__k': [1, 2],
    'model__%s__max_depth' % estimator_kw: [2, 4, 6, 8],
}

search = GridSearchCV(pipe, param_grid, cv=5).fit(X_train, y_train)
y_pred_Pipe = search.predict(X_test)

print("Using the score funtion:",search.score(X_test,y_test))
print("The best param is:", search.best_params_)

# Evaluate the refit best pipeline on the held-out test set.
cm = confusion_matrix(y_test, y_pred_Pipe)
print(cm)
print("Accuracy Score:",accuracy_score(y_test, y_pred_Pipe))
print("F1 score:",f1_score(y_test, y_pred_Pipe, average='macro'))
print("MCC Score:",matthews_corrcoef(y_test, y_pred_Pipe))


Using the score funtion: 0.6833333333333333
The best param is: {'model__base_estimator__max_depth': 8, 'select__k': 2}
[[26 11]
 [ 8 15]]
Accuracy Score: 0.6833333333333333
F1 score: 0.6723196320781835
MCC Score: 0.3481897886336749


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression: cross-validated search over solver, regularisation
# strength and whether an intercept is fitted.
parameters = {
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'C': [0.1, 1, 10, 100],
    'fit_intercept': [True, False],
}
clf = GridSearchCV(LogisticRegression(), parameters).fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Using the score funtion:", clf.score(X_test, y_test))
print("The best param is:", clf.best_params_)

# Held-out test-set evaluation of the refit best estimator.
print(confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("MCC Score:", matthews_corrcoef(y_test, y_pred))


Using the score funtion: 0.7666666666666667
The best param is: {'C': 100, 'fit_intercept': True, 'solver': 'newton-cg'}
[[30  7]
 [ 7 16]]
Accuracy Score: 0.7666666666666667
F1 score: 0.7532314923619272
MCC Score: 0.5064629847238543


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes baseline: no hyper-parameters are tuned here.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Held-out test-set evaluation.
cm_nb = confusion_matrix(y_test, y_pred)
print(cm_nb)
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))
print("MCC Score:", matthews_corrcoef(y_test, y_pred))

[[34  3]
 [13 10]]
Accuracy Score: 0.7333333333333333
F1 score: 0.6825396825396826
MCC Score: 0.41742755148618044
