# Lab 4: Classification Methods on the Default Dataset


## Import Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from ISLP import load_data
import warnings
warnings.filterwarnings('ignore')


## Question 1: Load and Examine Default Dataset

Load the Default dataset from the ISLP library and examine its structure. Report the dataset dimensions, the column names and their data types, and the distribution of the default variable (how many Yes vs. No). Then fit a logistic regression model to predict default using income, balance, and student as predictors (using the entire dataset). Report the coefficient for balance and interpret its meaning in terms of the log-odds of defaulting.


In [None]:
# Read the Default dataset through ISLP's load_data helper
Default = load_data('Default')

# Structural overview: shape first, then dtypes and a preview of the rows
print("Dataset dimensions:", Default.shape)
for heading, content in (
    ("\nColumn names and data types:", Default.dtypes),
    ("\nFirst few rows:", Default.head()),
):
    print(heading)
    print(content)

# Class balance of the response: raw counts followed by proportions
default_counts = Default['default'].value_counts()
n_customers = len(Default)
print("\nDistribution of default variable:")
print(default_counts)
print(f"\nProportion of defaults: {default_counts['Yes'] / n_customers:.3f}")
print(f"Proportion of non-defaults: {default_counts['No'] / n_customers:.3f}")


In [None]:
# Fit logistic regression model using entire dataset
# Prepare features and target
X = Default[['income', 'balance', 'student']].copy()
# Convert student to binary (Yes=1, No=0)
X['student'] = (X['student'] == 'Yes').astype(int)
y = (Default['default'] == 'Yes').astype(int)

# Fit logistic regression.
# income and balance stay on their raw dollar scale so the balance coefficient
# reads "per $1". On such badly scaled inputs the default lbfgs budget of 100
# iterations can run out before convergence, and this notebook silences
# warnings globally, so the iteration cap is raised and convergence is checked
# explicitly instead of relying on ConvergenceWarning.
# NOTE: scikit-learn applies an L2 penalty (C=1.0) by default, so these are
# lightly regularised estimates rather than the plain maximum-likelihood fit.
log_reg = LogisticRegression(random_state=42, max_iter=10000)
log_reg.fit(X, y)
if log_reg.n_iter_.max() >= log_reg.max_iter:
    print("WARNING: logistic regression reached max_iter without converging; "
          "the coefficients below may be unreliable.")

# Get coefficients, labelled from the design matrix so names and values
# cannot drift apart if the predictor list changes
coefficients = pd.DataFrame({
    'Feature': list(X.columns),
    'Coefficient': log_reg.coef_[0]
})
print("Logistic Regression Coefficients:")
print(coefficients)

# Report balance coefficient specifically (looked up by column name, not position)
balance_coef = log_reg.coef_[0][X.columns.get_loc('balance')]
print(f"\nBalance coefficient: {balance_coef:.6f}")

# Word the interpretation from the fitted sign instead of assuming it is positive
direction = "increases" if balance_coef > 0 else "decreases"
association = "higher" if balance_coef > 0 else "lower"
print(f"\nInterpretation: For every $1 increase in balance, the log-odds of defaulting {direction} by {abs(balance_coef):.6f}.")
print(f"This means that higher credit card balances are associated with {association} probability of defaulting.")


## Question 2: Train/Test Split and LDA/QDA Models

Split the Default dataset into training (70%) and testing (30%) sets using train_test_split with random_state=42. Fit both Linear Discriminant Analysis (LDA) and Quadratic Discriminant Analysis (QDA) models using income and balance as predictors. For the LDA model, report the class means for each predictor and the prior probabilities for each class. Generate predictions on the test set for both LDA and QDA, create confusion matrices, and report their test accuracy.


In [None]:
# Predictor matrix (income, balance) and response labels for the discriminant models
predictor_cols = ['income', 'balance']
X_features = Default[predictor_cols].copy()
y_labels = Default['default']

# 70/30 split, stratified on the response, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_labels,
    test_size=0.3,
    stratify=y_labels,
    random_state=42,
)

# Report sizes first, then the class counts within each split
splits = (("Training", X_train, y_train), ("Test", X_test, y_test))
for split_name, split_X, _ in splits:
    print(f"{split_name} set size: {split_X.shape[0]}")
for split_name, _, split_y in splits:
    print(f"{split_name} set default distribution: {split_y.value_counts().to_dict()}")


In [None]:
# Linear Discriminant Analysis fitted on the training split
lda = LDA()
lda.fit(X_train, y_train)

# Score the held-out test split up front; all reporting follows below
lda_pred = lda.predict(X_test)
lda_accuracy = accuracy_score(y_test, lda_pred)
lda_cm = confusion_matrix(y_test, lda_pred)

# Fitted parameters: class labels and the prior estimated for each class
print("LDA Model Results:")
print(f"Classes: {lda.classes_}")
print("\nPrior probabilities:")
for class_name, prior in zip(lda.classes_, lda.priors_):
    print(f"  {class_name}: {prior:.4f}")

# Per-class predictor means, one row per class
means_df = pd.DataFrame(
    lda.means_,
    index=lda.classes_,
    columns=['income', 'balance'],
)
print("\nClass means for each predictor:")
print(means_df)

# Test-set performance
print(f"\nLDA Test Accuracy: {lda_accuracy:.4f}")
print("\nLDA Confusion Matrix:")
print(lda_cm)


In [None]:
# Quadratic Discriminant Analysis fitted on the same training split as LDA
qda = QDA().fit(X_train, y_train)

# Evaluate on the held-out test split
qda_pred = qda.predict(X_test)
qda_accuracy = accuracy_score(y_test, qda_pred)
qda_cm = confusion_matrix(y_test, qda_pred)

print(f"QDA Test Accuracy: {qda_accuracy:.4f}")
print("\nQDA Confusion Matrix:")
print(qda_cm)

# Side-by-side accuracy of the two discriminant models
print("\nComparison:")
for model_label, model_accuracy in (("LDA", lda_accuracy), ("QDA", qda_accuracy)):
    print(f"{model_label} Accuracy: {model_accuracy:.4f}")


## Question 3: Naive Bayes Classifier

Using the same train/test split from Question 2, fit a Naive Bayes classifier (GaussianNB) with income and balance as predictors. Generate predictions on the test set and create a confusion matrix. Compare the test accuracy of Naive Bayes with the LDA and QDA results. Finally, use the predict_proba() method to find the predicted probability of default for a customer with income = 40000 and balance = 2000.


In [None]:
# Gaussian Naive Bayes fitted on the same training split as LDA/QDA
nb = GaussianNB().fit(X_train, y_train)

# Evaluate on the held-out test split
nb_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_cm = confusion_matrix(y_test, nb_pred)

print(f"Naive Bayes Test Accuracy: {nb_accuracy:.4f}")
print("\nNaive Bayes Confusion Matrix:")
print(nb_cm)

# Accuracy of all three generative classifiers fitted so far
print("\nAccuracy Comparison:")
for model_label, model_accuracy in (
    ("LDA", lda_accuracy),
    ("QDA", qda_accuracy),
    ("Naive Bayes", nb_accuracy),
):
    print(f"{model_label} Accuracy: {model_accuracy:.4f}")


In [None]:
# Predict probability for specific customer (income=40000, balance=2000)
# The classifier was fitted on a DataFrame, so the query is passed as a
# DataFrame with the same column names. A bare ndarray silently depends on
# column order and raises a feature-name warning that this notebook suppresses.
customer_data = pd.DataFrame([[40000, 2000]], columns=['income', 'balance'])
customer_proba = nb.predict_proba(customer_data)

# predict_proba columns follow nb.classes_; look probabilities up by label
# rather than assuming which column is 'No' and which is 'Yes'
proba_by_class = dict(zip(nb.classes_, customer_proba[0]))

print(f"Predicted probabilities for customer with income=40000, balance=2000:")
print(f"Probability of No (non-default): {proba_by_class['No']:.4f}")
print(f"Probability of Yes (default): {proba_by_class['Yes']:.4f}")
print(f"\nPredicted class: {nb.predict(customer_data)[0]}")


## Question 4: K-Nearest Neighbors (KNN)

Using the same train/test split, apply feature scaling to income and balance with StandardScaler. Fit K-Nearest Neighbors (KNN) models with n_neighbors = 1, 3, 5, and 10, and evaluate their test performance. Create a table summarizing the test accuracy for each K value. Identify which K gives the best performance and explain why very small values of K (such as K=1) may not be optimal.


In [None]:
# Standardise the predictors: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: per-column mean and standard deviation of the scaled training data
train_col_means = X_train_scaled.mean(axis=0)
train_col_stds = X_train_scaled.std(axis=0)
print("Feature scaling applied successfully.")
print(f"Scaled training set mean: {train_col_means}")
print(f"Scaled training set std: {train_col_stds}")


In [None]:
# Evaluate KNN on the standardised features for several neighbourhood sizes
k_values = [1, 3, 5, 10]
knn_results = []

for k in k_values:
    # Train on the scaled training split, then score the scaled test split
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    knn_pred = knn.predict(X_test_scaled)
    knn_accuracy = accuracy_score(y_test, knn_pred)
    knn_cm = confusion_matrix(y_test, knn_pred)

    # Keep the metrics for the summary table and best-K selection
    knn_results.append(dict(K=k, Accuracy=knn_accuracy, Confusion_Matrix=knn_cm))

    print(f"K={k}: Test Accuracy = {knn_accuracy:.4f}")
    print("Confusion Matrix:")
    print(knn_cm)
    print()


In [None]:
# Tabulate test accuracy per K from the results collected in the KNN loop
knn_summary = pd.DataFrame({
    'K': [entry['K'] for entry in knn_results],
    'Test_Accuracy': [entry['Accuracy'] for entry in knn_results],
})

print("KNN Performance Summary:")
print(knn_summary.to_string(index=False))

# Pick the K with the highest test accuracy (the first one wins on ties)
best_k_result = max(knn_results, key=lambda entry: entry['Accuracy'])
best_k, best_accuracy = best_k_result['K'], best_k_result['Accuracy']

print(f"\nBest K value: {best_k} with accuracy: {best_accuracy:.4f}")

# Written answer: why a single nearest neighbour is rarely the best choice
print("\nExplanation for why K=1 may not be optimal:")
for reason in (
    "- K=1 is very sensitive to noise and outliers",
    "- It can lead to overfitting, especially with small datasets",
    "- The decision boundary becomes very irregular and complex",
    "- Higher K values provide smoother decision boundaries and better generalization",
):
    print(reason)


## Question 5: Comprehensive Comparison and Analysis

Create a summary table comparing the test accuracy of all methods implemented: Logistic Regression (refit on training data), LDA, QDA, Naive Bayes, and the best KNN from Question 4. Using the confusion matrices, identify which method has the lowest false negative rate. If the cost of missing a default is 10 times higher than a false alarm, recommend which method should be used and explain why. Finally, for your chosen method, adjust the probability threshold from 0.5 to 0.3 and report how this change affects the false positive and false negative rates.


In [None]:
# Refit logistic regression on training data only
# X_train.index / X_test.index hold index *labels* carried over from Default,
# so rows must be selected with label-based .loc. Positional .iloc only gives
# the same rows while Default keeps a 0..n-1 RangeIndex, and would silently
# pick the wrong rows otherwise.
predictor_cols_full = ['income', 'balance', 'student']

X_train_full = Default.loc[X_train.index, predictor_cols_full].copy()
X_train_full['student'] = (X_train_full['student'] == 'Yes').astype(int)
y_train_binary = (y_train == 'Yes').astype(int)

X_test_full = Default.loc[X_test.index, predictor_cols_full].copy()
X_test_full['student'] = (X_test_full['student'] == 'Yes').astype(int)
y_test_binary = (y_test == 'Yes').astype(int)

# The predictors are unscaled, so the default lbfgs budget of 100 iterations
# may not be enough. Warnings are silenced in this notebook, so raise the cap
# and check convergence explicitly.
log_reg_train = LogisticRegression(random_state=42, max_iter=10000)
log_reg_train.fit(X_train_full, y_train_binary)
if log_reg_train.n_iter_.max() >= log_reg_train.max_iter:
    print("WARNING: logistic regression reached max_iter without converging; "
          "test metrics below may be unreliable.")

log_reg_pred = log_reg_train.predict(X_test_full)
log_reg_accuracy = accuracy_score(y_test_binary, log_reg_pred)

print(f"Logistic Regression (training data only) Test Accuracy: {log_reg_accuracy:.4f}")


In [None]:
# Test accuracy of every classifier fitted in this lab, keyed by display name
method_accuracies = {
    'Logistic Regression': log_reg_accuracy,
    'LDA': lda_accuracy,
    'QDA': qda_accuracy,
    'Naive Bayes': nb_accuracy,
    f'KNN (K={best_k})': best_accuracy,
}
comparison_results = [
    {'Method': method, 'Test_Accuracy': accuracy}
    for method, accuracy in method_accuracies.items()
]

# Rank the methods from most to least accurate
comparison_df = pd.DataFrame(comparison_results).sort_values(
    'Test_Accuracy', ascending=False
)

print("Comprehensive Method Comparison:")
print(comparison_df.to_string(index=False))

# After sorting, the top row is the most accurate method
best_method = comparison_df.iloc[0]
best_name, best_score = best_method['Method'], best_method['Test_Accuracy']
print(f"\nBest performing method: {best_name} with accuracy: {best_score:.4f}")


In [None]:
# Calculate false negative rates from confusion matrices
def calculate_fnr(cm):
    """Return the false negative rate FN / (FN + TP) of a binary confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion matrix laid out as [[TN, FP], [FN, TP]]. Nested lists are
        accepted as well as NumPy arrays.

    Returns
    -------
    The false negative rate, or 0 when the matrix contains no actual
    positives (FN + TP == 0), which avoids a division by zero.

    Raises
    ------
    ValueError
        If ``cm`` is not 2x2, e.g. a 1x1 matrix produced when only one class
        is present in both the true and predicted labels.
    """
    matrix = np.asarray(cm)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"expected a 2x2 binary confusion matrix, got shape {matrix.shape}"
        )
    # Only the second row (actual positives) is needed: [FN, TP]
    fn, tp = matrix[1]
    actual_positives = fn + tp
    return fn / actual_positives if actual_positives > 0 else 0

def calculate_fpr(cm):
    """Return the false positive rate FP / (FP + TN) of a binary confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Confusion matrix laid out as [[TN, FP], [FN, TP]]. Nested lists are
        accepted as well as NumPy arrays.

    Returns
    -------
    The false positive rate, or 0 when the matrix contains no actual
    negatives (FP + TN == 0), which avoids a division by zero.

    Raises
    ------
    ValueError
        If ``cm`` is not 2x2, e.g. a 1x1 matrix produced when only one class
        is present in both the true and predicted labels.
    """
    matrix = np.asarray(cm)
    if matrix.shape != (2, 2):
        raise ValueError(
            f"expected a 2x2 binary confusion matrix, got shape {matrix.shape}"
        )
    # Only the first row (actual negatives) is needed: [TN, FP]
    tn, fp = matrix[0]
    actual_negatives = fp + tn
    return fp / actual_negatives if actual_negatives > 0 else 0

# Binary (1 = default) confusion matrices for every fitted classifier
log_reg_cm = confusion_matrix(y_test_binary, log_reg_pred)

# The remaining models were scored against 'Yes'/'No' labels; recode both the
# truth and the predictions to 0/1 so all matrices share one layout
y_test_is_default = (y_test == 'Yes').astype(int)
lda_cm_binary = confusion_matrix(y_test_is_default, (lda_pred == 'Yes').astype(int))
qda_cm_binary = confusion_matrix(y_test_is_default, (qda_pred == 'Yes').astype(int))
nb_cm_binary = confusion_matrix(y_test_is_default, (nb_pred == 'Yes').astype(int))

# The KNN loop did not keep its fitted models, so refit the best K here
best_knn_model = KNeighborsClassifier(n_neighbors=best_k)
best_knn_model.fit(X_train_scaled, y_train)
best_knn_pred = best_knn_model.predict(X_test_scaled)
best_knn_cm_binary = confusion_matrix(y_test_is_default, (best_knn_pred == 'Yes').astype(int))

# False negative rate per method, in the order the methods were introduced
fnr_results = [
    (method_label, calculate_fnr(method_cm))
    for method_label, method_cm in (
        ('Logistic Regression', log_reg_cm),
        ('LDA', lda_cm_binary),
        ('QDA', qda_cm_binary),
        ('Naive Bayes', nb_cm_binary),
        (f'KNN (K={best_k})', best_knn_cm_binary),
    )
]

print("False Negative Rates:")
for method_label, method_fnr in fnr_results:
    print(f"{method_label}: {method_fnr:.4f}")

# Find method with lowest FNR (the first one wins on ties)
lowest_fnr_method = min(fnr_results, key=lambda entry: entry[1])
print(f"\nMethod with lowest False Negative Rate: {lowest_fnr_method[0]} (FNR = {lowest_fnr_method[1]:.4f})")


In [None]:
# Cost-weighted comparison: a missed default (FN) is charged 10 units,
# a false alarm (FP) 1 unit
print("Cost Analysis (Missing default = 10x cost of false alarm):")
print("\nFor each method, calculating total cost:")

candidate_matrices = [
    ('Logistic Regression', log_reg_cm),
    ('LDA', lda_cm_binary),
    ('QDA', qda_cm_binary),
    ('Naive Bayes', nb_cm_binary),
    (f'KNN (K={best_k})', best_knn_cm_binary),
]

cost_results = []
for method_name, cm in candidate_matrices:
    # Matrix layout is [[TN, FP], [FN, TP]]; only the two error cells carry a cost
    fp, fn = cm[0, 1], cm[1, 0]
    total_cost = fp + 10 * fn
    cost_results.append((method_name, total_cost, fp, fn))
    print(f"{method_name}: Total Cost = {total_cost} (FP={fp}, FN={fn})")

# Recommend the cheapest method (the first one wins on ties)
best_cost_method = min(cost_results, key=lambda entry: entry[1])
best_name, best_total, best_fp, best_fn = best_cost_method
print(f"\nRecommended method based on cost analysis: {best_name}")
print(f"Total cost: {best_total} (FP={best_fp}, FN={best_fn})")
print("\nExplanation: This method minimizes the total cost where missing a default (FN) costs 10 times more than a false alarm (FP).")


In [None]:
# Re-threshold the recommended classifier: compare the 0.5 cut-off with 0.3
recommended_method = best_cost_method[0]

# Select the fitted model together with the test features it expects
if 'Logistic Regression' in recommended_method:
    threshold_model, threshold_features = log_reg_train, X_test_full
elif 'LDA' in recommended_method:
    threshold_model, threshold_features = lda, X_test
elif 'QDA' in recommended_method:
    threshold_model, threshold_features = qda, X_test
elif 'Naive Bayes' in recommended_method:
    threshold_model, threshold_features = nb, X_test
else:  # KNN
    # No fitted KNN was kept earlier, so refit the best K on the scaled data
    knn_best = KNeighborsClassifier(n_neighbors=best_k)
    knn_best.fit(X_train_scaled, y_train)
    threshold_model, threshold_features = knn_best, X_test_scaled

# Column 1 of predict_proba is used as the probability of default
# (assumes the positive class is ordered last in classes_)
proba_05 = threshold_model.predict_proba(threshold_features)[:, 1]
pred_05 = (proba_05 > 0.5).astype(int)
pred_03 = (proba_05 > 0.3).astype(int)

# Score both cut-offs against the 0/1-coded test labels
y_test_default_flag = (y_test == 'Yes').astype(int)
cm_05 = confusion_matrix(y_test_default_flag, pred_05)
cm_03 = confusion_matrix(y_test_default_flag, pred_03)

print(f"Threshold Analysis for {recommended_method}:")
for cutoff_label, threshold_cm in (("0.5", cm_05), ("0.3", cm_03)):
    print(f"\nThreshold = {cutoff_label}:")
    print(f"Confusion Matrix: {threshold_cm.ravel()}")
    print(f"False Positive Rate: {calculate_fpr(threshold_cm):.4f}")
    print(f"False Negative Rate: {calculate_fnr(threshold_cm):.4f}")

# Effect of moving the cut-off from 0.5 down to 0.3
fpr_change = calculate_fpr(cm_03) - calculate_fpr(cm_05)
fnr_change = calculate_fnr(cm_03) - calculate_fnr(cm_05)
print(f"\nChange in FPR: {fpr_change:.4f}")
print(f"Change in FNR: {fnr_change:.4f}")
print("\nInterpretation: Lowering the threshold from 0.5 to 0.3 makes the model more sensitive to predicting defaults,")
print("which reduces false negatives but increases false positives.")


## Summary


1. **Logistic Regression**: Provided interpretable coefficients showing the relationship between predictors and log-odds of default
2. **LDA and QDA**: Linear and quadratic discriminant analysis methods with different assumptions about class distributions
3. **Naive Bayes**: Simple probabilistic classifier that assumes the features are conditionally independent given the class
4. **KNN**: Non-parametric method requiring feature scaling, with performance varying by K value
