In [1141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

pd.set_option('display.max_columns', None)

In [1142]:
df = pd.read_csv("DefualtData/UCI_Credit_card.csv")
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [1143]:
#Rename columns to make pay align with bill & pay amt
df = df.rename(columns={"PAY_0":"PAY_1", "default.payment.next.month":"DEFAULT"})
#Remove unnessecary columns
df = df.drop(columns="ID")


In [1144]:
X = df.drop(columns="DEFAULT")
y= df["DEFAULT"]

In [1145]:
#Split Training and Test Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1146]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
#Do X_Train and X_Test seperately or else the model gets indirect access to distribution in the test data, which is data leakage
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

In [1147]:
#Make and train model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_scaled, y_train)

In [1148]:
# Prediction
y_pred = model.predict(X_test_scaled)

In [1149]:
# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6925
Confusion Matrix:
 [[3314 1373]
 [ 472  841]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.71      0.78      4687
           1       0.38      0.64      0.48      1313

    accuracy                           0.69      6000
   macro avg       0.63      0.67      0.63      6000
weighted avg       0.77      0.69      0.72      6000



The accuracy of this model when predicting the test data is 69.25%.<br>

3314 Predicted Non-Defaulters Did not Default (True Negative)<br>
472 Predicted Non-Defaulters Defaulted (False Negative) <br>
1373 Predicted Defaulters did not Defaulted (False Positive) <br>
841 Predicted Defaulters Defaulted (True Positive) <br>
Note: I can read this, but the wording is pretty bad, in the future use better wording <br>

Predicted non defaulters (0) with a 88% accuracy <br>
Predicted defaulters (1) with a 38% accuracy <br>

Correctly gets 71% of people who did not default <br>
Correctly gets 64% of people who did default <br>

Macro Avg: Unweighted average across all classes (0s & 1s) <br>
Weighted Avg: Weights certain parameters higher to be more accurate <br>

New Goal: Increase Accuracy of Default prediciton (1)


In [1150]:
#Get probability of y
y_probs = model.predict_proba(X_test_scaled)[:, 1]

#use a lower threshhold isntead of degault 0.5
threshold = 0.4
y_pred_thresh = (y_probs >= threshold).astype(int)

#evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_thresh))
print(confusion_matrix(y_test, y_pred_thresh))
print(classification_report(y_test, y_pred_thresh))

Accuracy: 0.5078333333333334
[[1982 2705]
 [ 248 1065]]
              precision    recall  f1-score   support

           0       0.89      0.42      0.57      4687
           1       0.28      0.81      0.42      1313

    accuracy                           0.51      6000
   macro avg       0.59      0.62      0.50      6000
weighted avg       0.76      0.51      0.54      6000



Predicts defaulters much better, removing many false negatives.<br>
However, increased false positives, marking many more non defaulters as defaulters.<br>
Try again with different thresholds, maybe a higher threshold will be better.<br>

In [1151]:
#Get probability of y
y_probs = model.predict_proba(X_test_scaled)[:, 1]

#use a lower threshhold isntead of degault 0.5
threshold = 0.760
y_pred_thresh = (y_probs >= threshold).astype(int)

#evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_thresh))
print(confusion_matrix(y_test, y_pred_thresh))
print(classification_report(y_test, y_pred_thresh))

Accuracy: 0.8086666666666666
[[4558  129]
 [1019  294]]
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4687
           1       0.70      0.22      0.34      1313

    accuracy                           0.81      6000
   macro avg       0.76      0.60      0.61      6000
weighted avg       0.79      0.81      0.77      6000



This gets peak accuracy. <br>
But, in futher retrospect, the recall for defaulters is very low, and the f1-score reflects that. <br>
1019 out of 1313 defaulters were missed <br>
Purely looking for accuracy is not the best solution, in actual use case threshold 0.3 is more useful <br>

Trying Feature Selection to focus on higher correlated variables

In [1152]:
#Using SelectKBest to find top k features that have the strongest relationship, and eliminate vairables with low correlation
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=13)
X_selected = selector.fit_transform(X_scaled, y)
selected_features = selector.get_support(indices=True)
print("Selected feature indices:", selected_features)

Selected feature indices: [ 0  5  6  7  8  9 10 17 18 19 20 21 22]


In [1153]:
selected_feature_names = [X.columns.values.tolist()[i] for i in selected_features]
print("Selected features:", selected_feature_names)


Selected features: ['LIMIT_BAL', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']


This selection ends up focusing purely on payment history, which would be correct when looking at the correlation heatmap in Defaults EDA. <br>


In [1154]:
#df1 will be new dataset to work with with filters to only focus on payment history and ignore demographic info
df1 = df[selected_feature_names]

X = df1
y = df["DEFAULT"]

#Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1155]:
# Scale the filtered features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [1156]:
# Train the logistic regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_scaled, y_train)

In [1157]:
# Make predictions
y_pred = model.predict(X_test_scaled)

In [1158]:
#Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.677
Confusion Matrix:
 [[3175 1512]
 [ 426  887]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.68      0.77      4687
           1       0.37      0.68      0.48      1313

    accuracy                           0.68      6000
   macro avg       0.63      0.68      0.62      6000
weighted avg       0.77      0.68      0.70      6000



This seems to have not done anything, perhaps because these were already the main groups used to indentify the likelyhood of defaulting by the model.<br>

Next action: Feature Engineering

In [1159]:
#Setting up new Features
df['TOTAL_BILL'] = df[[f'BILL_AMT{i}' for i in range(1, 7)]].sum(axis=1)
df['TOTAL_PAY'] = df[[f'PAY_AMT{i}' for i in range(1, 7)]].sum(axis=1)
df['MEAN_DELAY'] = df[[f'PAY_{i}' for i in range(1, 7)]].mean(axis=1)
df['BAL_USED'] = df['TOTAL_BILL'] / df['LIMIT_BAL']

#Setting up Data
X = df.drop(columns="DEFAULT")
y= df["DEFAULT"]

In [1160]:
#Split Training and Test Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1161]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
#Do X_Train and X_Test seperately or else the model gets indirect access to distribution in the test data, which is data leakage
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

In [1162]:
#Make and train model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_scaled, y_train)

In [1163]:
# Prediction
y_pred = model.predict(X_test_scaled)

In [1164]:
# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6955
Confusion Matrix:
 [[3345 1342]
 [ 485  828]]
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.71      0.79      4687
           1       0.38      0.63      0.48      1313

    accuracy                           0.70      6000
   macro avg       0.63      0.67      0.63      6000
weighted avg       0.77      0.70      0.72      6000



Results seem similar to before features were added, perhaps even slightly worse than before. <br>
Additional Note: Removing class_weight='Balanced' brought overall accuracy up, but brought recall for Defaulters down. <br>
Before trying a different Model, I will try to split the data using stratified sampling rather than random sampling. Since majority of the data is non-defaulters, it could be causing a bias in the model

In [1165]:
from sklearn.utils import resample
train = pd.concat([X_train, y_train], axis=1)

majority = train[train['DEFAULT'] == 0]
minority = train[train['DEFAULT'] == 1]

majority_downsampled = resample(majority, replace=False, n_samples=len(minority), random_state=42)
balanced_train = pd.concat([majority_downsampled, minority]).sample(frac=1, random_state=42)

# Separate X and y again
X_train_balanced = balanced_train.drop('DEFAULT', axis=1)
y_train_balanced = balanced_train['DEFAULT']

In [1166]:
#Make and train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_balanced, y_train_balanced)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1167]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6415
              precision    recall  f1-score   support

           0       0.86      0.64      0.74      4687
           1       0.33      0.64      0.44      1313

    accuracy                           0.64      6000
   macro avg       0.60      0.64      0.59      6000
weighted avg       0.75      0.64      0.67      6000

[[3014 1673]
 [ 478  835]]


Seems to do slightly worse than before balancing. Seems class_weight='balanced' worked better at weighing data out than through making it perfectly even. <br>

That will be in for the Logistic Regression Model Attempt. Next is 