1. Load the credit card data (https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients)

Here are explaination of the variables used:
This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:
X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
X2: Gender (1 = male; 2 = female).
X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
X4: Marital status (1 = married; 2 = single; 3 = others).
X5: Age (year).
X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. 
X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005.

In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
default_of_credit_card_clients = fetch_ucirepo(id=350) 
  
# data (as pandas dataframes) 
X = default_of_credit_card_clients.data.features 
y = default_of_credit_card_clients.data.targets 
  
# metadata 
print(default_of_credit_card_clients.metadata) 
  
# variable information 
print(default_of_credit_card_clients.variables) 

{'uci_id': 350, 'name': 'Default of Credit Card Clients', 'repository_url': 'https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients', 'data_url': 'https://archive.ics.uci.edu/static/public/350/data.csv', 'abstract': "This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods.", 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 30000, 'num_features': 23, 'feature_types': ['Integer', 'Real'], 'demographics': ['Sex', 'Education Level', 'Marital Status', 'Age'], 'target_col': ['Y'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C55S3H', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'title': 'The comparisons of data mining techniques for the predictive accuracy of probability of default of cre

2. Load the tools to be used.

In [2]:
# Import essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu, linear, sigmoid
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
import logging

# Suppress TensorFlow logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Custom modules 
# Add on Demand

# Set TensorFlow float precision and verbosity
tf.keras.backend.set_floatx('float64')
tf.autograph.set_verbosity(0)

3. Data patrition
Training(70%)
Validation(30%)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)
print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

X_train.shape (21000, 23) y_train.shape (21000, 1)
X_test.shape (9000, 23) y_test.shape (9000, 1)


#### take a look at our datasets

In [4]:
print(X_train[0:10])
print(y_train[0:10])
print(np.sum(y_train))
print(np.sum(y_test))

           X1  X2  X3  X4  X5  X6  X7  X8  X9  X10  ...     X14     X15  \
4936    20000   1   1   2  24  -1  -1  -1  -1   -1  ...     390     780   
4788   120000   1   2   1  52   2   0   0   0    0  ...  105262   46605   
8447    70000   2   2   2  26   0   0   0   0    0  ...   26103   28342   
4535   230000   1   2   2  37   1  -2  -2  -2   -2  ...       0       0   
27563  130000   1   2   2  56   0   0   0   0    0  ...  111780  116357   
16891  120000   1   1   2  33   1  -2  -2  -1   -1  ...       0     600   
28568  180000   2   3   1  40   0   0   0  -2   -2  ...    2079     792   
24502   50000   1   1   2  34   1   2   2   2    2  ...   49914   48382   
7425   180000   2   1   1  38   1   2   0   0    0  ...  117004  119542   
27693   50000   2   1   2  25  -1   2  -1  -1   -1  ...    2262    2185   

          X16     X17   X18    X19   X20   X21    X22   X23  
4936        0       0   390    390   780     0      0     0  
4788    45590   46204  4449   2731  3000  5000   3

  return reduction(axis=axis, out=out, **passkwargs)


In [5]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
print(X_train[0:10])
print(y_train[0:10])

[[-1.13811999e+00 -1.23402819e+00 -1.08113355e+00  8.60263312e-01
  -1.25044823e+00 -8.72212469e-01 -7.24864487e-01 -6.97151790e-01
  -6.67326417e-01 -6.49210559e-01 -1.48852344e+00 -6.89268715e-01
  -6.85560012e-01 -6.79853854e-01 -6.60926079e-01 -6.61641735e-01
  -6.50012923e-01 -3.13415732e-01 -2.46141485e-01 -2.51886380e-01
  -3.02330578e-01 -3.12222220e-01 -3.00971068e-01]
 [-3.64733911e-01 -1.23402819e+00  1.80329088e-01 -1.05813210e+00
   1.79323915e+00  1.78016893e+00  1.06856321e-01  1.35433947e-01
   1.84372547e-01  2.29005219e-01  2.49149151e-01  7.61464069e-01
   8.12551402e-01  8.54708826e-01  5.25646906e-02  8.88480589e-02
   1.23000623e-01 -7.02698877e-02 -1.42050882e-01 -1.24115905e-01
   1.77590093e-02 -1.15448614e-01 -1.24388267e-01]
 [-7.51426949e-01  8.10354262e-01  1.80329088e-01  8.60263312e-01
  -1.03304199e+00  1.19146657e-02  1.06856321e-01  1.35433947e-01
   1.84372547e-01  2.29005219e-01  2.49149151e-01 -3.76818019e-01
  -3.64764433e-01 -3.03602708e-01 -2.317

Plot trainning, test and so on...

In [7]:
tf.random.set_seed(1234)
model = Sequential(
    [
        tf.keras.Input(shape=(23,)),
        Dense(units=20, activation='relu', name='L1'),  
        Dense(units=10, activation='relu', name='L2'),
        Dense(units=5, activation='relu', name='L3'),
        Dense(units=1, activation='sigmoid', name='L4')
    ]
)


In [8]:
model.summary()
#### Examine Weights shapes
[layer1, layer2, layer3, layer4] = model.layers
W1,b1 = layer1.get_weights()
W2,b2 = layer2.get_weights()
W3,b3 = layer3.get_weights()
W4,b4 = layer4.get_weights()

print(f"W1 shape = {W1.shape}, b1 shape = {b1.shape}")
print(f"W2 shape = {W2.shape}, b2 shape = {b2.shape}")
print(f"W3 shape = {W3.shape}, b3 shape = {b3.shape}")
print(f"W4 shape = {W4.shape}, b3 shape = {b4.shape}")

W1 shape = (23, 20), b1 shape = (20,)
W2 shape = (20, 10), b2 shape = (10,)
W3 shape = (10, 5), b3 shape = (5,)
W4 shape = (5, 1), b3 shape = (1,)


In [9]:
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
)

model.fit(
    X_train,y_train,
    epochs=20
)


Epoch 1/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 660us/step - loss: 0.5441
Epoch 2/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605us/step - loss: 0.4524
Epoch 3/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step - loss: 0.4458
Epoch 4/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 584us/step - loss: 0.4424
Epoch 5/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 607us/step - loss: 0.4399
Epoch 6/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 668us/step - loss: 0.4378
Epoch 7/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 595us/step - loss: 0.4360
Epoch 8/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 660us/step - loss: 0.4345
Epoch 9/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 583us/step - loss: 0.4333
Epoch 10/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

<keras.src.callbacks.history.History at 0x12dd1140710>

In [10]:
# Evaluate the model on the test data
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

# Make predictions on the test data
predictions = model.predict(X_test)

# Convert y_test to a NumPy array if it's a DataFrame
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.values

# Compare predictions with actual test labels
print('Predictions:', predictions.flatten())
print('Actual Labels:', y_test.flatten())

# Optionally, you can threshold the predictions to get binary outputs
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)
print('Binary Predictions:', binary_predictions.flatten())

# Calculate and print the distribution of predictions
unique, counts = np.unique(binary_predictions, return_counts=True)
prediction_distribution = dict(zip(unique, counts))
print(f'Prediction Distribution: {prediction_distribution}')

# Calculate additional metrics with zero_division parameter to handle undefined precision
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions, zero_division=1)
recall = recall_score(y_test, binary_predictions, zero_division=1)
f1 = f1_score(y_test, binary_predictions, zero_division=1)
auc = roc_auc_score(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'AUC: {auc}')

[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 478us/step - loss: 0.4369
Test Loss: 0.44029490380903186
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 539us/step
Predictions: [0.06698307 0.11341859 0.14472478 ... 0.20425161 0.23159502 0.25014458]
Actual Labels: [0 1 0 ... 0 0 0]
Binary Predictions: [0 0 0 ... 0 0 0]
Prediction Distribution: {0: 7795, 1: 1205}
Accuracy: 0.8138888888888889
Precision: 0.6348547717842323
Recall: 0.3825
F1-Score: 0.47737909516380655
AUC: 0.768516


In [11]:
np.sum(binary_predictions)

1205

Test block here

Implementing SMOTE:

In [12]:
from imblearn.over_sampling import SMOTE

# Create an instance of SMOTE
smote = SMOTE(random_state=42)

# Fit and apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the model with resampled data
model.fit(
    X_train_resampled, y_train_resampled,
    epochs=20,
    class_weight=None  # No need to pass class weights if using SMOTE
)

# Evaluate the model on the test data
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate AUC
auc = roc_auc_score(y_test, predictions)
print(f'AUC: {auc}')

# Convert to binary predictions with a different threshold if needed
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)

# Calculate additional metrics
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions, zero_division=1)
recall = recall_score(y_test, binary_predictions, zero_division=1)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')


Epoch 1/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 652us/step - loss: 0.5604
Epoch 2/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 590us/step - loss: 0.5436
Epoch 3/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 588us/step - loss: 0.5409
Epoch 4/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 571us/step - loss: 0.5390
Epoch 5/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 603us/step - loss: 0.5371
Epoch 6/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 574us/step - loss: 0.5355
Epoch 7/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 573us/step - loss: 0.5337
Epoch 8/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 573us/step - loss: 0.5324
Epoch 9/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 601us/step - loss: 0.5308
Epoch 10/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━