In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [2]:
'''
1. Import the data and make the same changes to the data you made in exercise 5 (solutions are 
posted to Canvas for exercise 5)
'''
df = pd.read_excel('framingham_dataset_mod.xlsx')
df = df.dropna()

df['winter'] = np.where((df.month == 12) | (df.month < 3), 1, 0)
df['spring'] = np.where((df.month > 3) & (df.month < 6), 1, 0)
df['summer'] = np.where((df.month > 5) & (df.month < 9), 1, 0)
df['fall'] = np.where((df.month > 8) & (df.month < 12), 1, 0)

df['log_bmi'] = np.log(df.bmi)


display(df)

Unnamed: 0,id,sex,sbp,dbp,scl,age,bmi,month,chdfate,winter,spring,summer,fall,log_bmi
0,2642,1,120,80,267.0,55,25.0,8,1,0,0,1,0,3.218876
1,4627,1,130,78,192.0,53,28.4,12,1,1,0,0,0,3.346389
2,2568,1,144,90,207.0,61,25.1,8,1,0,0,1,0,3.222868
3,4192,1,92,66,231.0,48,26.2,11,1,0,0,0,1,3.265759
4,3977,1,162,98,271.0,39,28.4,11,1,0,0,0,1,3.346389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4690,1805,2,130,84,175.0,35,27.7,4,0,0,1,0,0,3.321432
4691,1976,2,136,92,197.0,45,23.1,5,0,0,1,0,0,3.139833
4692,3195,1,130,88,213.0,47,28.4,9,0,0,0,0,1,3.346389
4693,1674,2,112,68,252.0,40,22.0,4,0,0,1,0,0,3.091042


In [3]:
'''2. Split the data into a training test split. Use an 80% training and 20% test set split'''
y = df['chdfate']
Xn = sm.add_constant(df[['sex', 'dbp', 'scl', 'age', 'log_bmi', 'spring', 'summer', 'fall']])
X_train, X_test, y_train, y_test = train_test_split(Xn, y, test_size=0.2, random_state=1)

In [4]:
'''3. Scale the predictor variables'''

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
'''4. Fit a logistic regression model on the training data using all the relevant predictor variables 
(Note: use season, not month. Also, ID is not a predictor variable. Do not use it). Use sklearn. This 
reference will be helpful'''

logreg_model = LogisticRegression(random_state=0)
logreg_model.fit(X_train_scaled, y_train)

LogisticRegression(random_state=0)

In [6]:
'''5. Using the test data, generate a confusion matrix and evaluate the logistic regression model
a. Accuracy
b. Precision
c. Recall
'''
y_pred_logreg = logreg_model.predict(X_test_scaled)
conf_matrix_logreg = confusion_matrix(y_test, y_pred_logreg)
print ("Confusion Matrix : \n", conf_matrix_logreg)

accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
precision_logreg = precision_score(y_test, y_pred_logreg)
recall_logreg = recall_score(y_test, y_pred_logreg)
print("Accuracy:", accuracy_logreg)
print("Precision:", precision_logreg)
print("Recall:", recall_logreg)

Confusion Matrix : 
 [[598  43]
 [247  43]]
Accuracy: 0.6885069817400644
Precision: 0.5
Recall: 0.1482758620689655


In [7]:
'''6. Fit a neural net classifier model using the defaults'''
nn_default_model = MLPClassifier(random_state=0)
nn_default_model.fit(X_train_scaled, y_train)



MLPClassifier(random_state=0)

In [8]:
'''7. Generate a confusion matrix and evaluate the neural net model
a. Accuracy
b. Precision
c. Recall'''
y_pred_nn_default = nn_default_model.predict(X_test_scaled)
conf_matrix_nn_default = confusion_matrix(y_test, y_pred_nn_default)
print ("Confusion Matrix : \n", conf_matrix_nn_default)

accuracy_nn_default = accuracy_score(y_test, y_pred_nn_default)
precision_nn_default = precision_score(y_test, y_pred_nn_default)
recall_nn_default = recall_score(y_test, y_pred_nn_default)
print("Accuracy:", accuracy_nn_default)
print("Precision:", precision_nn_default)
print("Recall:", recall_nn_default)

Confusion Matrix : 
 [[599  42]
 [234  56]]
Accuracy: 0.7035445757250268
Precision: 0.5714285714285714
Recall: 0.19310344827586207


In [15]:
'''8. Fit a neural net classifier model using 24 neurons and 2 hidden layers'''
nn_custom_model = MLPClassifier(hidden_layer_sizes=(24, 24), random_state=0)
nn_custom_model.fit(X_train_scaled, y_train)



MLPClassifier(hidden_layer_sizes=(24, 24), random_state=0)

In [16]:
'''9. Generate a confusion matrix and evaluate the new neural net model
a. Accuracy
b. Precision
c. Recall'''
y_pred_nn_custom = nn_custom_model.predict(X_test_scaled)
conf_matrix_nn_custom = confusion_matrix(y_test, y_pred_nn_custom)
print ("Confusion Matrix : \n", conf_matrix_nn_custom)

accuracy_nn_custom = accuracy_score(y_test, y_pred_nn_custom)
precision_nn_custom = precision_score(y_test, y_pred_nn_custom)
recall_nn_custom = recall_score(y_test, y_pred_nn_custom)
print("Accuracy:", accuracy_nn_custom)
print("Precision:", precision_nn_custom)
print("Recall:", recall_nn_custom)

Confusion Matrix : 
 [[575  66]
 [218  72]]
Accuracy: 0.6949516648764769
Precision: 0.5217391304347826
Recall: 0.2482758620689655


In [11]:
'''10. Compare performance of all 3 model'''
results = pd.DataFrame({'Accuracy': [accuracy_logreg, accuracy_nn_default, accuracy_nn_custom], 'Precision': [precision_logreg,precision_nn_default,precision_nn_custom], 'Recall': [recall_logreg,recall_nn_default,recall_nn_custom]}, index = ['Logistic', 'Neural Net (default)', 'Nerual Net (24  neuron)'])
results

Unnamed: 0,Accuracy,Precision,Recall
Logistic,0.688507,0.5,0.148276
Neural Net (default),0.703545,0.571429,0.193103
Nerual Net (24 neuron),0.694952,0.521739,0.248276
