# FDS project 21/22 - Heart Disease Prediction

### Authors
- Elios Buzo
- Laurentiu Adrian Crsturean
- Anthony Giusti
- Ludovico Lentini
- Michele Spina

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import datasets, metrics, model_selection, svm
from sklearn.model_selection import RepeatedKFold, GridSearchCV


from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn import metrics

from gd_functions import *

# Clean and analyze the dataset

## A first look

### What type of data we have

In [None]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('dataset.csv')

# Count the null values in each column (shown as the cell output).
df.isnull().sum()

In [None]:
# Preview the first ten rows of df.
df.head(10)

In [None]:
# Column names, non-null counts and dtypes of df.
df.info()

We can immediately see that there are some categorical columns and no null values, but we still have to check whether all the data are consistent.

In [None]:
# Summary statistics for every column, categorical ones included (include='all').
df.describe(include = 'all')

The minimum value of the attributes **Cholesterol** and **RestingBP** is 0, which is not a consistent value for these measurements.

In [None]:
# Number of rows whose Cholesterol is recorded as 0.
print((df["Cholesterol"] == 0).sum())

In [None]:
# Number of rows whose RestingBP is recorded as 0.
print((df["RestingBP"] == 0).sum())

### How data are distributed

The same graphs but in only one plot

In [None]:
# Stacked histograms of the target and of every feature, split by HeartDisease,
# drawn on a 4x3 grid inside a single figure.
oe = ['g', 'r']  # palette passed to every histogram
fig = plt.figure(figsize=(15, 15))

# One subplot per column, in grid order (left to right, top to bottom).
grid_columns = [
    "HeartDisease", "Sex", "ChestPainType",
    "ExerciseAngina", "RestingECG", "ST_Slope",
    "Cholesterol", "RestingBP", "Age",
    "MaxHR", "Oldpeak", "FastingBS",
]
for position, column in enumerate(grid_columns, start=1):
    plt.subplot(4, 3, position)
    plt.style.use('seaborn')
    plt.tight_layout()
    sns.set_context('talk')
    sns.histplot(data=df, x=column, hue="HeartDisease", multiple="stack", palette=oe)


In [None]:
# Age (x) against chest-pain type (y), coloured by HeartDisease.
sns.catplot( x ='Age', y ='ChestPainType' , hue = 'HeartDisease', data=df)

In [None]:
# Age (x) against resting-ECG category (y), coloured by HeartDisease.
sns.catplot( x ='Age', y ='RestingECG' , hue = 'HeartDisease', data=df)

In [None]:
# Age (x) against ST-slope category (y), coloured by HeartDisease.
sns.catplot( x ='Age', y ='ST_Slope' , hue = 'HeartDisease', data=df)

## From categorical to numeric

We must convert categorical data to numerical data

In [None]:
# Correlation heatmap of the numeric columns only.
# The CSV is read again (rather than reusing `df`) so the result does not depend
# on whatever has been done to `df` when this cell runs.
# TODO: consider drawing it after cleaning instead, or showing two heatmaps:
# one without Cholesterol and one with Cholesterol restricted to the non-zero rows.
categorical_columns = ["ChestPainType", "RestingECG", "ST_Slope", "ExerciseAngina", "Sex"]
onlyNumeric = pd.read_csv('dataset.csv').drop(columns=categorical_columns)

heatmap = sns.heatmap(onlyNumeric.corr(), vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

The variables are not very correlated.

In [None]:
# Convert the categorical columns with one-hot encoding.
# dtype=int makes the indicator columns plain integers on every pandas version
# (pandas 2.x would otherwise create bool columns, which describe() leaves out and
# which are awkward in the min-max arithmetic used further down).
df = pd.get_dummies(df, dtype=int)
df.head()

## How can we manage inconsistent values?

This link contains some useful ways:
https://www.analyticsvidhya.com/blog/2021/05/dealing-with-missing-values-in-python-a-complete-guide/

### Deleting the columns with missing data

In [None]:

# Variant "delete the column": remove the rows whose RestingBP is 0,
# then drop the Cholesterol column altogether.
rows_without_bp = df.index[df["RestingBP"] == 0]
df_noC = df.drop(index=rows_without_bp).drop(columns="Cholesterol")
df_noC.info()


In [None]:
# Summary statistics of the "deleted column" variant.
df_noC.describe()

### Deleting the rows with missing data

In [None]:
# Variant "delete the rows": keep only the rows whose Cholesterol is not 0.
rows_without_cholesterol = df.index[df["Cholesterol"] == 0]
df_noR = df.drop(index=rows_without_cholesterol)
df_noR.info()

In [None]:
# Summary statistics of the "deleted rows" variant.
df_noR.describe()

### Filling the Missing Values – Imputation

In [None]:
# Variant "imputation": replace the 0 entries of Cholesterol and RestingBP with fixed values.
# NOTE(review): the two replacement values are hard-coded; presumably they are column
# means read off an earlier describe() output — TODO confirm and compute them instead.
cleanup_inconsisten = {"Cholesterol":     {0: 244.635389},
                "RestingBP":      {0: 133.022788},
               }
df_mean = df.replace(cleanup_inconsisten)

In [None]:
# Summary statistics of the imputed variant.
df_mean.describe()

### Imputation with an additional column

In [None]:
from sklearn.impute import SimpleImputer

# Variant "imputation with an additional column": flag the rows whose Cholesterol is 0
# in a new CholesterolIsMissing column, then replace the zeros with fixed values.
df_extraC = df.assign(CholesterolIsMissing=df['Cholesterol'] == 0)
my_imputer = SimpleImputer(strategy = 'median')  # created here but not applied in this cell

# Same fixed replacement values as in the plain imputation; the flag is mapped to 0/1.
cleanup_inconsisten = {
    "Cholesterol": {0: 244.635389},
    "RestingBP": {0: 133.022788},
    'CholesterolIsMissing': {False: 0, True: 1},
}
df_extraC = df_extraC.replace(cleanup_inconsisten)

df_extraC.describe()

In [None]:
# Stacked histogram of the CholesterolIsMissing flag, split by HeartDisease.
oe=['g','r']  # palette passed to the histogram
fig = plt.figure(figsize=(10,10))

plt.subplot(1,1,1)
plt.style.use('seaborn')
plt.tight_layout()
sns.set_context('talk')
sns.histplot(data=df_extraC, x="CholesterolIsMissing", hue="HeartDisease",multiple="stack",palette=oe)

In this plot we can observe the distribution of the new attribute CholesterolIsMissing

### Filling with a Regression Model

In [None]:
# Variant "regression model": predict the missing (zero) Cholesterol values with a
# linear regression fitted on the rows whose Cholesterol is known.
# HeartDisease is excluded from the predictors. The values are written back in place,
# so df_RM keeps the row order, index and column order of df.
df_RM = df.astype({"Cholesterol": "float64"})  # new frame; float so the predictions fit the column
cholesterol_missing = df_RM["Cholesterol"] == 0
predictors = df_RM.columns.drop(["Cholesterol", "HeartDisease"])

# Named cholesterol_model (not `lr`) so the cell does not rebind the name of the
# lr() cleaning function defined further down when it is re-run.
cholesterol_model = LinearRegression()
cholesterol_model.fit(
    df_RM.loc[~cholesterol_missing, predictors],
    df_RM.loc[~cholesterol_missing, "Cholesterol"],
)
if cholesterol_missing.any():  # predict() rejects an empty matrix
    df_RM.loc[cholesterol_missing, "Cholesterol"] = cholesterol_model.predict(
        df_RM.loc[cholesterol_missing, predictors]
    )

In [None]:
# Summary statistics of the regression-filled variant.
df_RM.describe()

### Results

#### In this section we test the five different ways to clean the dataset. We split the dataset a single time for all of them, then clone and clean it using the function `clean`. We then measure the results using the accuracy, the AUROC and the AP of a logistic regression model.

In [None]:
def clean(way, X_train, X_test,y_train,y_test):
    """Clean one train/test split with the strategy named by ``way``.

    The four inputs are copied first, so the chosen cleaning function never
    receives the caller's own objects.

    Parameters
    ----------
    way : str
        One of "delCols", "delRows", "addMean", "addCol" or "lr".
        Any other value returns the plain copies.
    X_train, X_test : pandas.DataFrame
        Feature splits.
    y_train, y_test : pandas.Series
        Target splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` after cleaning.
    """
    splits = (X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy())
    if way == "delCols":
        return delCols(*splits)
    if way == "delRows":
        return delRows(*splits)
    if way == "addMean":
        return addMean(*splits)
    if way == "addCol":
        return addCol(*splits)
    if way == "lr":
        # `lr` must still refer to the cleaning function when this branch runs.
        return lr(*splits)
    return splits
    
def delCols(X_train, X_test,y_train,y_test):
    """Drop the Cholesterol column and the rows whose RestingBP is 0, then min-max scale.

    Rows are removed from the features and from the matching targets by index
    label. The scaling range of each column is taken over the train and test
    splits together.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    bad_train = X_train.index[X_train["RestingBP"] == 0]
    bad_test = X_test.index[X_test["RestingBP"] == 0]
    y_train, y_test = y_train.drop(bad_train), y_test.drop(bad_test)
    X_train = X_train.drop(bad_train).drop(columns="Cholesterol")
    X_test = X_test.drop(bad_test).drop(columns="Cholesterol")

    # Per-column minimum / maximum over both splits.
    lo = pd.concat([X_train.min(), X_test.min()], axis=1).min(axis=1)
    hi = pd.concat([X_train.max(), X_test.max()], axis=1).max(axis=1)
    span = hi - lo
    return (X_train - lo) / span, (X_test - lo) / span, y_train, y_test

def delRows(X_train, X_test,y_train,y_test):
    """Drop the rows whose Cholesterol is 0 from both splits, then min-max scale.

    Rows are removed from the features and from the matching targets by index
    label. The scaling range of each column is taken over the train and test
    splits together.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    bad_train = X_train.index[X_train["Cholesterol"] == 0]
    bad_test = X_test.index[X_test["Cholesterol"] == 0]
    y_train, y_test = y_train.drop(bad_train), y_test.drop(bad_test)
    X_train, X_test = X_train.drop(bad_train), X_test.drop(bad_test)

    # Per-column minimum / maximum over both splits.
    lo = pd.concat([X_train.min(), X_test.min()], axis=1).min(axis=1)
    hi = pd.concat([X_train.max(), X_test.max()], axis=1).max(axis=1)
    span = hi - lo
    return (X_train - lo) / span, (X_test - lo) / span, y_train, y_test
 
def addMean(X_train, X_test,y_train,y_test):
    """Replace the 0 entries of Cholesterol and RestingBP with fixed values, then min-max scale.

    The targets are returned unchanged. The scaling range of each column is
    taken over the train and test splits together.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    # NOTE(review): hard-coded fill values, presumably dataset means — TODO confirm.
    fill_values = {
        "Cholesterol": {0: 244.635389},
        "RestingBP": {0: 133.022788},
    }
    X_train, X_test = X_train.replace(fill_values), X_test.replace(fill_values)

    # Per-column minimum / maximum over both splits.
    lo = pd.concat([X_train.min(), X_test.min()], axis=1).min(axis=1)
    hi = pd.concat([X_train.max(), X_test.max()], axis=1).max(axis=1)
    span = hi - lo
    return (X_train - lo) / span, (X_test - lo) / span, y_train, y_test

def addCol(X_train, X_test,y_train,y_test):
    """Flag missing Cholesterol in a new column, fill the zeros, then min-max scale.

    A ``CholesterolIsMissing`` column (1 where Cholesterol is 0, else 0) is
    appended to both splits; the 0 entries of Cholesterol and RestingBP are
    then replaced with fixed values. The caller's frames are not modified:
    the new column is added to fresh copies. The targets are returned
    unchanged. The scaling range of each column is taken over the train and
    test splits together.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    # assign() returns a new frame, so the input frames keep their original columns.
    X_train = X_train.assign(CholesterolIsMissing=(X_train['Cholesterol'] == 0).astype(int))
    X_test = X_test.assign(CholesterolIsMissing=(X_test['Cholesterol'] == 0).astype(int))

    # NOTE(review): hard-coded fill values, presumably dataset means — TODO confirm.
    cleanup_inconsisten = {
        "Cholesterol": {0: 244.635389},
        "RestingBP": {0: 133.022788},
    }
    X_train = X_train.replace(cleanup_inconsisten)
    X_test = X_test.replace(cleanup_inconsisten)

    # Per-column minimum / maximum over both splits.
    dfmin = pd.concat([X_train.min(), X_test.min()], axis=1).min(axis=1)
    dfmax = pd.concat([X_train.max(), X_test.max()], axis=1).max(axis=1)
    X_train = (X_train-dfmin)/(dfmax-dfmin)
    X_test = (X_test-dfmin)/(dfmax-dfmin)
    return X_train, X_test, y_train, y_test

def lr(X_train, X_test,y_train,y_test):
    """Fill missing Cholesterol with a linear-regression prediction, then min-max scale.

    Rows whose RestingBP is 0 are removed from both splits (features and
    targets). A ``LinearRegression`` is fitted on the training rows whose
    Cholesterol is not 0, using every other feature as predictor, and its
    predictions replace the 0 entries of Cholesterol in both the training and
    the test split.

    The predicted values are written back in place, so the returned frames
    keep the row order and index of the (filtered) inputs and therefore stay
    aligned with the returned targets. The scaling range of each column is
    taken over the cleaned train and test splits together, as in the other
    cleaning functions. The caller's frames are not modified.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    bad_train = X_train.index[X_train["RestingBP"] == 0]
    bad_test = X_test.index[X_test["RestingBP"] == 0]
    y_train = y_train.drop(bad_train)
    y_test = y_test.drop(bad_test)
    # astype() returns new frames; the float dtype lets the column hold the predictions.
    X_train = X_train.drop(bad_train).astype({"Cholesterol": "float64"})
    X_test = X_test.drop(bad_test).astype({"Cholesterol": "float64"})

    # Fit on the training rows with a known Cholesterol value only.
    predictors = X_train.columns.drop("Cholesterol")
    known = X_train["Cholesterol"] != 0
    regressor = LinearRegression()
    regressor.fit(X_train.loc[known, predictors], X_train.loc[known, "Cholesterol"])

    # Overwrite the zeros of each split with the model's predictions.
    for frame in (X_train, X_test):
        missing = frame["Cholesterol"] == 0
        if missing.any():  # predict() rejects an empty matrix
            frame.loc[missing, "Cholesterol"] = regressor.predict(frame.loc[missing, predictors])

    # Per-column minimum / maximum over both cleaned splits.
    lo = pd.concat([X_train.min(), X_test.min()], axis=1).min(axis=1)
    hi = pd.concat([X_train.max(), X_test.max()], axis=1).max(axis=1)
    X_train = (X_train - lo) / (hi - lo)
    X_test = (X_test - lo) / (hi - lo)

    return X_train, X_test, y_train, y_test


Applying **LOGISTIC REGRESSION**, which is the best dataset?

In [None]:
# Separate the target from the features without modifying `df`, so this cell can be re-run.
y = df['HeartDisease']
X_all = df.drop(columns="HeartDisease")

# Column-wise minimum / maximum of the uncleaned features, kept under the same notebook-level names.
dfmin = X_all.min()
dfmax = X_all.max()

# A single split (seeded, so the comparison is reproducible) shared by all the cleaning strategies.
X_train, X_test,y_train,y_test = train_test_split(X_all,y,test_size=0.25,random_state=0)

# One cleaned copy of the split per strategy.
X_train_delCols, X_test_delCols,y_train_delCols,y_test_delCols = clean("delCols", X_train, X_test,y_train,y_test)
X_train_delRows, X_test_delRows,y_train_delRows,y_test_delRows = clean("delRows", X_train, X_test,y_train,y_test)
X_train_addMean, X_test_addMean,y_train_addMean,y_test_addMean = clean("addMean", X_train, X_test,y_train,y_test)
X_train_addCol, X_test_addCol, y_train_addCol, y_test_addCol = clean("addCol", X_train, X_test,y_train,y_test)
X_train_lr, X_test_lr, y_train_lr, y_test_lr = clean("lr", X_train, X_test,y_train,y_test)

In [None]:
# Summary statistics of the training features after the "delCols" cleaning.
X_train_delCols.describe()

In [None]:
# Summary statistics of the training features after the "delRows" cleaning.
X_train_delRows.describe()

In [None]:
# Summary statistics of the training features after the "addMean" cleaning.
X_train_addMean.describe()

In [None]:
# Summary statistics of the training features after the "addCol" cleaning.
X_train_addCol.describe()

In [None]:
# Summary statistics of the training features after the "lr" cleaning.
X_train_lr.describe()

In [None]:
# Baseline: logistic regression on the uncleaned split (zeros left as they are).
# The scaled features get their own names so X_train / X_test stay untouched, and
# the model is called log_reg so the lr() cleaning function is not shadowed.
base_min = pd.concat([X_train.min(), X_test.min()], axis=1).min(axis=1)
base_max = pd.concat([X_train.max(), X_test.max()], axis=1).max(axis=1)
X_train_base = (X_train-base_min)/(base_max-base_min)
X_test_base = (X_test-base_min)/(base_max-base_min)

log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train_base,y_train)
pred = log_reg.predict(X_test_base)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test,pred))
plot_confusion_matrix(log_reg, X_test_base, y_test,cmap="binary")
metrics.plot_roc_curve(log_reg, X_test_base, y_test)
metrics.plot_precision_recall_curve(log_reg, X_test_base, y_test)

plt.grid(False)
plt.show()

In [None]:
# Logistic regression on the "delCols" split.
# The model is called log_reg so the lr() cleaning function is not shadowed.
log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train_delCols,y_train_delCols)
pred = log_reg.predict(X_test_delCols)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test_delCols,pred))
plot_confusion_matrix(log_reg, X_test_delCols, y_test_delCols,cmap="binary")
metrics.plot_roc_curve(log_reg, X_test_delCols, y_test_delCols)
metrics.plot_precision_recall_curve(log_reg, X_test_delCols, y_test_delCols)

plt.grid(False)
plt.show()

In [None]:
# Logistic regression on the "delRows" split.
# The model is called log_reg so the lr() cleaning function is not shadowed.
log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train_delRows,y_train_delRows)
pred = log_reg.predict(X_test_delRows)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test_delRows,pred))
plot_confusion_matrix(log_reg, X_test_delRows, y_test_delRows,cmap="binary")
metrics.plot_roc_curve(log_reg, X_test_delRows, y_test_delRows)
metrics.plot_precision_recall_curve(log_reg, X_test_delRows, y_test_delRows)

plt.grid(False)
plt.show()

In [None]:
# Logistic regression on the "addMean" split.
# The model is called log_reg so the lr() cleaning function is not shadowed.
log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train_addMean,y_train_addMean)
pred = log_reg.predict(X_test_addMean)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test_addMean,pred))
plot_confusion_matrix(log_reg, X_test_addMean, y_test_addMean,cmap="binary")
metrics.plot_roc_curve(log_reg, X_test_addMean, y_test_addMean)
metrics.plot_precision_recall_curve(log_reg, X_test_addMean, y_test_addMean)

plt.grid(False)
plt.show()

In [None]:
# Logistic regression on the "addCol" split.
# The model is called log_reg so the lr() cleaning function is not shadowed.
log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train_addCol,y_train_addCol)
pred = log_reg.predict(X_test_addCol)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test_addCol,pred))
plot_confusion_matrix(log_reg, X_test_addCol, y_test_addCol,cmap="binary")
metrics.plot_roc_curve(log_reg, X_test_addCol, y_test_addCol)
metrics.plot_precision_recall_curve(log_reg, X_test_addCol, y_test_addCol)

plt.grid(False)
plt.show()

In [None]:
# Logistic regression on the "lr" (regression-imputed) split.
# The model is called log_reg so the lr() cleaning function is not shadowed.
log_reg = LogisticRegression(max_iter=100000)
log_reg.fit(X_train_lr,y_train_lr)
pred = log_reg.predict(X_test_lr)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test_lr,pred))
plot_confusion_matrix(log_reg, X_test_lr, y_test_lr,cmap="binary")
metrics.plot_roc_curve(log_reg, X_test_lr, y_test_lr)
metrics.plot_precision_recall_curve(log_reg, X_test_lr, y_test_lr)

plt.grid(False)
plt.show()

The best way is **Imputation with an additional column**


# Other models 

### Now we want to explore different models in order to compare the results

## Logistic Regression with Gradient Ascent

In [None]:
# Fit logistic regression with the gradient_ascent helper (not defined in this notebook;
# presumably provided by the star import from gd_functions) on the "addCol" training split.
training_data_matrix = X_train_addCol.to_numpy()
target_array = y_train_addCol.to_numpy()
# Prepend a column of ones to the feature matrix (presumably the intercept term — confirm in gd_functions).
column_of_ones = np.ones(shape=(training_data_matrix.shape[0], 1))
training_data_matrix = np.hstack((column_of_ones, training_data_matrix))
# Start from an all-zero parameter vector, one entry per column of the matrix.
theta0 = np.zeros((training_data_matrix.shape[1]))
theta_final, log_l_history = gradient_ascent(theta0, training_data_matrix, target_array, lr=0.0001 , num_steps=100000)

# Plot the values stored in log_l_history against their position in the history.
fig,ax = plt.subplots(num=2)
ax.set_ylabel('l(Theta)')
ax.set_xlabel('Iterations')
_=ax.plot(range(len(log_l_history)),log_l_history,'b.')

In [None]:
# Build the test matrix the same way as the training one: features plus a leading column of ones.
test_data_matrix = X_test_addCol.to_numpy()
# NOTE: target_array is rebound here and now holds the test labels.
target_array = y_test_addCol.to_numpy()

column_of_ones = np.ones(shape=(test_data_matrix.shape[0], 1))
test_data_matrix = np.hstack((column_of_ones, test_data_matrix))

In [None]:
# Output of the predictions helper for the test matrix, flattened to 1-D
# (presumably probabilities, since they are rounded and fed to ROC / PR curves below — confirm in gd_functions).
# NOTE(review): the "survived" names look like leftovers from another exercise; the values are heart-disease predictions.
survived_array = predictions(test_data_matrix, theta_final).flatten()

# Store the predictions as column "T" at the front of a copy of the test features, aligned on the test index.
Survived_series = pd.Series(data=survived_array, index=X_test_addCol.index)
X_test_addCol_f = X_test_addCol.copy()
X_test_addCol_f.insert(0, "T", Survived_series)


In [None]:
# Number of test samples.
print(y_test_addCol.shape[0])

In [None]:
# Accuracy: round each prediction to the nearest integer and count the matches with the test labels.
correct = np.sum((np.rint(X_test_addCol_f["T"])) == y_test_addCol)
print(correct/y_test_addCol.shape[0])
# Raw (unrounded) predictions.
print(X_test_addCol_f["T"])

https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python

In [None]:
# ROC curve and AUC of the gradient-ascent logistic regression on the test split.
fpr, tpr, _ = metrics.roc_curve(y_test_addCol,X_test_addCol_f["T"])
auc = metrics.roc_auc_score(y_test_addCol,X_test_addCol_f["T"])
# "w/" replaces the former "w\ ", which was an invalid escape sequence.
plt.plot(fpr,tpr,label="LG w/ GA auc="+str(auc))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc=4)
plt.show()

In [None]:
# Precision-recall curve and average precision of the gradient-ascent logistic regression on the test split.
precision, recall, thresholds = metrics.precision_recall_curve(y_test_addCol,X_test_addCol_f["T"])
AP = metrics.average_precision_score(y_test_addCol,X_test_addCol_f["T"])
# The label now names the metric; "w/" replaces the former "w\ " (an invalid escape sequence).
plt.plot(recall,precision,label="LG w/ GA AP="+str(AP))
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend(loc=4)

plt.show()

## Stochastic Gradient Descent

In [None]:

# Logistic regression trained with stochastic gradient descent on the "addCol" split.
# random_state is fixed so the reported scores are the same on every run.
clf = SGDClassifier(loss='log', max_iter=100000, alpha=0.0001, random_state=0)
clf.fit(X_train_addCol,y_train_addCol)
pred = clf.predict(X_test_addCol)

# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(y_test_addCol,pred))
plot_confusion_matrix(clf, X_test_addCol, y_test_addCol,cmap="binary")
metrics.plot_roc_curve(clf, X_test_addCol, y_test_addCol)
metrics.plot_precision_recall_curve(clf, X_test_addCol, y_test_addCol)

plt.grid(False)
plt.show()

## Gaussian discriminant analysis 

https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html

In [None]:
# Linear discriminant analysis on the "addCol" split.
clf = LinearDiscriminantAnalysis()
clf.fit(X_train_addCol,y_train_addCol)
pred = clf.predict(X_test_addCol)
# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(pred,y_test_addCol))
plot_confusion_matrix(clf, X_test_addCol, y_test_addCol,cmap="binary") 
metrics.plot_roc_curve(clf, X_test_addCol, y_test_addCol) 
metrics.plot_precision_recall_curve(clf, X_test_addCol, y_test_addCol) 


plt.grid(False)
plt.show()

## Naive Bayes

In [None]:
# Gaussian naive Bayes on the "addCol" split.
clf = GaussianNB()
clf.fit(X_train_addCol,y_train_addCol)
pred = clf.predict(X_test_addCol)
# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(pred,y_test_addCol))
plot_confusion_matrix(clf, X_test_addCol, y_test_addCol,cmap="binary") 
metrics.plot_roc_curve(clf, X_test_addCol, y_test_addCol) 
metrics.plot_precision_recall_curve(clf, X_test_addCol, y_test_addCol) 


plt.grid(False)
plt.show()

In [None]:
# Bernoulli naive Bayes on the "addCol" split.
# NOTE(review): BernoulliNB binarizes its inputs (threshold 0.0 by default per the scikit-learn docs),
# so the scaled continuous features presumably collapse to 0/1 — confirm this is intended.
clf = BernoulliNB()
clf.fit(X_train_addCol,y_train_addCol)
pred = clf.predict(X_test_addCol)
# Test-set accuracy, followed by the confusion matrix, ROC curve and precision-recall curve.
print(metrics.accuracy_score(pred,y_test_addCol))
plot_confusion_matrix(clf, X_test_addCol, y_test_addCol,cmap="binary") 
metrics.plot_roc_curve(clf, X_test_addCol, y_test_addCol) 
metrics.plot_precision_recall_curve(clf, X_test_addCol, y_test_addCol) 
plt.grid(False)
plt.show()

# Hyperparameter tuning

In this section we explored the different hyperparameters

In [None]:
#dataset
X_df_extraC = df_extraC.drop('HeartDisease', axis=1)
X_df_extraC = pd.get_dummies(X_df_extraC).values
 
#creating the array fo the y 
y = df_extraC['HeartDisease']
y_extraC = y.values 

## Logistic regression

We tuned these hyperparameters:
- max_iter : {1000,5000,10000,50000,100000}
- C : {0.1,1, 10, 100}
- Degree : {1, 2, 3}

In [48]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
X_df_extraC, y_extraC  # bare expression; it is not the cell's last statement, so nothing is displayed
#poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
poly = PolynomialFeatures(degree = 3)

# Expand the feature matrix into polynomial features of degree 3 and show the resulting shape.
X_deg3 = poly.fit_transform(X_df_extraC)
X_deg3.shape

(918, 2024)

In [None]:

# Grid search over C and max_iter for logistic regression on the polynomial features X_deg3,
# scored by accuracy with 10-fold cross-validation repeated 5 times.
# NOTE(review): the output saved with this cell consists of solver warnings that suggest
# increasing max_iter or scaling the data; X_deg3 is built from unscaled features.
from sklearn.model_selection import GridSearchCV
 
Scores = {'accuracy'}   # not used below
##[0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]
 
K=10
repeats=5
cv = RepeatedKFold(n_splits=K, n_repeats=repeats, random_state=0) 
clf = GridSearchCV(LogisticRegression(),{
    'max_iter' : [1000,5000,10000,50000,100000],
    'C': [0.1,1, 10,100],
}, cv=cv,scoring='accuracy')
 
 
clf.fit(X_deg3, y_extraC)
clf.cv_results_               # bare expression; not the last statement, so nothing is displayed
 
 
# Mean and standard deviation of the test accuracy for every parameter combination.
res = pd.DataFrame(clf.cv_results_)
res[['param_max_iter','param_C','mean_test_score','std_test_score']]
 
 
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [48]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
X_df_extraC, y_extraC  # bare expression; it is not the cell's last statement, so nothing is displayed
#poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
poly = PolynomialFeatures(degree = 2)

# Expand the feature matrix into polynomial features of degree 2 and show the resulting shape.
# NOTE(review): the result is still stored in X_deg3, overwriting the degree-3 matrix, and the
# output saved under this cell shows the same shape as the degree-3 cell, so it looks stale — re-run to refresh.
X_deg3 = poly.fit_transform(X_df_extraC)
X_deg3.shape

(918, 2024)

In [None]:

# Grid search over C and max_iter for logistic regression on the features currently stored in X_deg3,
# scored by accuracy with 10-fold cross-validation repeated 5 times.
# NOTE(review): the output saved with this cell consists of solver warnings that suggest
# increasing max_iter or scaling the data; X_deg3 is built from unscaled features.
from sklearn.model_selection import GridSearchCV
 
Scores = {'accuracy'}   # not used below
##[0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0]
 
K=10
repeats=5
cv = RepeatedKFold(n_splits=K, n_repeats=repeats, random_state=0) 
clf = GridSearchCV(LogisticRegression(),{
    'max_iter' : [1000,5000,10000,50000,100000],
    'C': [0.1,1, 10,100],
}, cv=cv,scoring='accuracy')
 
 
clf.fit(X_deg3, y_extraC)
clf.cv_results_               # bare expression; not the last statement, so nothing is displayed
 
 
# Mean and standard deviation of the test accuracy for every parameter combination.
res = pd.DataFrame(clf.cv_results_)
res[['param_max_iter','param_C','mean_test_score','std_test_score']]
 
 
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# Best cross-validated score and the parameter combination that achieved it, for the search held in clf.
clf.best_score_, clf.best_params_

In [None]:
# Grid search over C and max_iter for logistic regression on the original (degree-1) features,
# scored by accuracy with 10-fold cross-validation repeated 5 times.
K=10
repeats=5
cv = RepeatedKFold(n_splits=K, n_repeats=repeats, random_state=0)
# scoring is a single string: a list would switch GridSearchCV to multi-metric mode, which
# needs an explicit refit key and names the result columns mean_test_<metric> instead.
clf = GridSearchCV(LogisticRegression(),{
    'max_iter' : [100, 1000,5000,10000,50000,100000],
    'C': [0.1,1, 10,100],
}, cv=cv,scoring='accuracy')

clf.fit(X_df_extraC, y_extraC)

# Mean and standard deviation of the test accuracy for every parameter combination.
res = pd.DataFrame(clf.cv_results_)
res[['param_max_iter','param_C','mean_test_score','std_test_score']]
 
 

In [None]:
# Best mean cross-validated score found by the grid search, with the winning parameters.
(clf.best_score_, clf.best_params_)

## GDA

https://machinelearningmastery.com/linear-discriminant-analysis-with-python/

We tuned this hyperparameter:
- solver : {'svd', 'lsqr'}

In [None]:
# Grid search over the LinearDiscriminantAnalysis solver,
# scored on accuracy with repeated K-fold cross-validation.
K = 10
repeats = 5
cv = RepeatedKFold(n_splits=K, n_repeats=repeats, random_state=0)

param_grid = {
    'solver': ['svd', 'lsqr'],
}

# `scoring` is a plain string on purpose: a list (even with a single entry)
# puts GridSearchCV in multi-metric mode, which needs an explicit `refit` key
# and names the result columns mean_test_<metric> instead of mean_test_score.
clf = GridSearchCV(LinearDiscriminantAnalysis(), param_grid, cv=cv, scoring='accuracy')
clf.fit(X_df_extraC, y_extraC)

# One row per solver with its mean/std accuracy across folds.
# The parameter column is `param_solver`: cv_results_ only contains
# param_<name> columns for the names present in the grid.
res = pd.DataFrame(clf.cv_results_)
res[['param_solver', 'mean_test_score', 'std_test_score']]
 
 

In [None]:
# Best mean cross-validated score found by the grid search, with the winning parameters.
(clf.best_score_, clf.best_params_)

## GaussianNB

https://www.analyticsvidhya.com/blog/2021/01/gaussian-naive-bayes-with-hyperpameter-tuning/#h2_2

We tuned this hyperparameter:
- var_smoothing : np.logspace(0,-9, num=100)

In [None]:
# Grid search over the GaussianNB variance-smoothing term,
# scored on accuracy with repeated K-fold cross-validation.
K = 10
repeats = 5
cv = RepeatedKFold(n_splits=K, n_repeats=repeats, random_state=0)

# The key must be the quoted estimator parameter name, `var_smoothing`;
# 100 candidates log-spaced from 1 down to 1e-9.
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100),
}

# `scoring` is a plain string on purpose: a list (even with a single entry)
# puts GridSearchCV in multi-metric mode, which needs an explicit `refit` key
# and names the result columns mean_test_<metric> instead of mean_test_score.
clf = GridSearchCV(GaussianNB(), param_grid, cv=cv, scoring='accuracy')
clf.fit(X_df_extraC, y_extraC)

# One row per var_smoothing candidate with its mean/std accuracy across folds.
res = pd.DataFrame(clf.cv_results_)
res[['param_var_smoothing', 'mean_test_score', 'std_test_score']]
 
 

In [None]:
# Best mean cross-validated score found by the grid search, with the winning parameters.
(clf.best_score_, clf.best_params_)