In [2]:
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
data = Path('Resources/cc_default.csv')
df = pd.read_csv(data)

In [4]:
x_cols = [i for i in df.columns if i not in ('ID', 'default_next_month')]
X = df[x_cols]
y = df['default_next_month']

In [5]:
x_cols

['ln_balance_limit', 'sex', 'education', 'marriage', 'age']

In [6]:
Counter(y)

Counter({1: 6636, 0: 23364})

In [7]:
# Normal train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Use the SMOTEENN technique to perform combination sampling on the data
# Count the resampled classes
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 10783, 1: 5801})

In [9]:
# Fit a Logistic regression model using random undersampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [10]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[4824, 1008],
       [1159,  509]], dtype=int64)

In [11]:
# Calculate the Balanced Accuracy Score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.5661581845634603

In [12]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.81      0.83      0.31      0.82      0.50      0.27      5832
          1       0.34      0.31      0.83      0.32      0.50      0.24      1668

avg / total       0.70      0.71      0.42      0.71      0.50      0.26      7500

