# Neural Network with the `default` Data Set

#### Complete excercise #7 from Section 10.10 of *Introduction to Statistical Learning 2e* (pg. 459).

#### Remember to set random seeds so your work is reproducible.

In [17]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Reload the dataset in case the prior modification was not intended for this task
file_path = '/Users/yuanhanlim/Desktop/DS & ML/14_default_neural_network/default.csv'
default_data = pd.read_csv(file_path)

# Encode categorical variables
default_data['default'] = default_data['default'].apply(lambda x: 1 if x == 'Yes' else 0)
default_data['student'] = default_data['student'].apply(lambda x: 1 if x == 'Yes' else 0)

# Features and target variable
X = default_data[['income', 'balance', 'student']]
y = default_data['default']

# Standardize the features for neural network
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and test sets (70% train, 30% test)
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Compare the neural network's performance with logistic regression
# Logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

# Classification performance 
logistic_report = classification_report(y_test, y_pred_logistic, target_names=['No Default', 'Default'])

In [19]:
# Print Logistic Regression Classification Report
print("\nLogistic Regression Classification Report:\n")
print(classification_report(y_test, y_pred_logistic, target_names=['No Default', 'Default']))



Logistic Regression Classification Report:

              precision    recall  f1-score   support

  No Default       0.98      1.00      0.99      2906
     Default       0.71      0.27      0.39        94

    accuracy                           0.97      3000
   macro avg       0.85      0.63      0.69      3000
weighted avg       0.97      0.97      0.97      3000



In [16]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score

# Define the neural network architecture with dropout
model_with_dropout = Sequential([
    Dense(10, activation='relu', input_shape=(X_train.shape[1],)),  # Hidden layer with 10 units
    Dropout(0.5),  # Dropout layer with 50% rate
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model_with_dropout.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model_with_dropout.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Predict on the test set
y_pred_nn_dropout = (model_with_dropout.predict(X_test) > 0.5).astype(int).flatten()

# Evaluate the accuracy of the model with dropout
accuracy_with_dropout = accuracy_score(y_test, y_pred_nn_dropout)

# Print the classification report for the model with dropout
print("Neural Network with Dropout Classification Report:\n")
print(classification_report(y_test, y_pred_nn_dropout, target_names=['No Default', 'Default']))

# Output the accuracy
accuracy_with_dropout


2024-11-26 13:23:06.725436: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Neural Network with Dropout Classification Report:

              precision    recall  f1-score   support

  No Default       0.97      1.00      0.99      2906
     Default       0.76      0.14      0.23        94

    accuracy                           0.97      3000
   macro avg       0.87      0.57      0.61      3000
weighted avg       0.97      0.97      0.96      3000



0.9716666666666667

The logistic regression model slightly outperforms the neural network with dropout in identifying "Default" cases, achieving a higher F1-score (0.39 vs. 0.23) and recall (0.27 vs. 0.14). Both models achieve similar overall accuracy (97%) due to the imbalanced dataset dominated by "No Default" cases. The neural network with dropout shows potential for better precision in identifying defaults (0.76 vs. 0.71), but its recall performance limits its effectiveness.