1. Load the credit card data (https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients)

Here are explaination of the variables used:
This research employed a binary variable, default payment (Yes = 1, No = 0), as the response variable. This study reviewed the literature and used the following 23 variables as explanatory variables:
X1: Amount of the given credit (NT dollar): it includes both the individual consumer credit and his/her family (supplementary) credit.
X2: Gender (1 = male; 2 = female).
X3: Education (1 = graduate school; 2 = university; 3 = high school; 4 = others).
X4: Marital status (1 = married; 2 = single; 3 = others).
X5: Age (year).
X6 - X11: History of past payment. We tracked the past monthly payment records (from April to September, 2005) as follows: X6 = the repayment status in September, 2005; X7 = the repayment status in August, 2005; . . .;X11 = the repayment status in April, 2005. The measurement scale for the repayment status is: -1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; . . .; 8 = payment delay for eight months; 9 = payment delay for nine months and above.
X12-X17: Amount of bill statement (NT dollar). X12 = amount of bill statement in September, 2005; X13 = amount of bill statement in August, 2005; . . .; X17 = amount of bill statement in April, 2005. 
X18-X23: Amount of previous payment (NT dollar). X18 = amount paid in September, 2005; X19 = amount paid in August, 2005; . . .;X23 = amount paid in April, 2005.

In [7]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
default_of_credit_card_clients = fetch_ucirepo(id=350) 
  
# data (as pandas dataframes) 
X = default_of_credit_card_clients.data.features 
y = default_of_credit_card_clients.data.targets 
  
# metadata 
print(default_of_credit_card_clients.metadata) 
  
# variable information 
print(default_of_credit_card_clients.variables) 

{'uci_id': 350, 'name': 'Default of Credit Card Clients', 'repository_url': 'https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients', 'data_url': 'https://archive.ics.uci.edu/static/public/350/data.csv', 'abstract': "This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods.", 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 30000, 'num_features': 23, 'feature_types': ['Integer', 'Real'], 'demographics': ['Sex', 'Education Level', 'Marital Status', 'Age'], 'target_col': ['Y'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C55S3H', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'title': 'The comparisons of data mining techniques for the predictive accuracy of probability of default of cre

2. Load the tools to be used.

In [1]:
# Import essential libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
import tensorflow as tf
from tf.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.activations import relu, linear, sigmoid
# from tensorflow.keras.losses import BinaryCrossentropy
# from tensorflow.keras.optimizers import Adam
import logging

# Suppress TensorFlow logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Custom modules 
# Add on Demand

# Set TensorFlow float precision and verbosity
tf.keras.backend.set_floatx('float64')
tf.autograph.set_verbosity(0)

3. Data patrition
Training(70%)
Validation(30%)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)
print("X_train.shape", X_train.shape, "y_train.shape", y_train.shape)
print("X_test.shape", X_test.shape, "y_test.shape", y_test.shape)

X_train.shape (21000, 23) y_train.shape (21000, 1)
X_test.shape (9000, 23) y_test.shape (9000, 1)


#### take a look at our datasets

In [None]:
print(X_train[0:10])
print(y_train[0:10])
print(np.sum(y_train, axis=0))
print(np.sum(y_test, axis=0))

In [10]:
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
print(X_train[0:10])
print(y_train[0:10])

Plot trainning, test and so on...

In [11]:
tf.random.set_seed(1234)
model = Sequential(
    [
        tf.keras.Input(shape=(23,)),
        Dense(units=20, activation='relu', name='L1'),  
        Dense(units=15, activation='relu', name='L2'),
        Dense(units=10, activation='relu', name='L3'),
        Dense(units=1, activation='sigmoid', name='L4')
    ]
)


In [12]:
model.summary()
#### Examine Weights shapes
[layer1, layer2, layer3, layer4] = model.layers
W1,b1 = layer1.get_weights()
W2,b2 = layer2.get_weights()
W3,b3 = layer3.get_weights()
W4,b4 = layer4.get_weights()

print(f"W1 shape = {W1.shape}, b1 shape = {b1.shape}")
print(f"W2 shape = {W2.shape}, b2 shape = {b2.shape}")
print(f"W3 shape = {W3.shape}, b3 shape = {b3.shape}")
print(f"W4 shape = {W4.shape}, b3 shape = {b4.shape}")

W1 shape = (23, 20), b1 shape = (20,)
W2 shape = (20, 15), b2 shape = (15,)
W3 shape = (15, 10), b3 shape = (10,)
W4 shape = (10, 1), b3 shape = (1,)


In [13]:
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
)

model.fit(
    X_train,y_train,
    epochs=20
)


Epoch 1/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 636us/step - loss: 0.5245
Epoch 2/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 635us/step - loss: 0.4580
Epoch 3/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 634us/step - loss: 0.4491
Epoch 4/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 630us/step - loss: 0.4447
Epoch 5/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 610us/step - loss: 0.4416
Epoch 6/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 579us/step - loss: 0.4386
Epoch 7/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step - loss: 0.4365
Epoch 8/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 796us/step - loss: 0.4345
Epoch 9/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step - loss: 0.4331
Epoch 10/20
[1m657/657[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s

<keras.src.callbacks.history.History at 0x1feb7f69e20>

In [14]:
# Evaluate the model on the test data
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

# Make predictions on the test data
predictions = model.predict(X_test)

# Convert y_test to a NumPy array if it's a DataFrame
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.values

# Compare predictions with actual test labels
print('Predictions:', predictions.flatten())
print('Actual Labels:', y_test.flatten())

# Optionally, you can threshold the predictions to get binary outputs
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)
print('Binary Predictions:', binary_predictions.flatten())

# Calculate and print the distribution of predictions
unique, counts = np.unique(binary_predictions, return_counts=True)
prediction_distribution = dict(zip(unique, counts))
print(f'Prediction Distribution: {prediction_distribution}')

# Calculate additional metrics with zero_division parameter to handle undefined precision
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions, zero_division=1)
recall = recall_score(y_test, binary_predictions, zero_division=1)
f1 = f1_score(y_test, binary_predictions, zero_division=1)
auc = roc_auc_score(y_test, predictions)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')
print(f'AUC: {auc}')

[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 462us/step - loss: 0.4352
Test Loss: 0.4390438814805765
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 593us/step
Predictions: [0.04617502 0.1205393  0.05425163 ... 0.17537488 0.26420017 0.28990432]
Actual Labels: [0 1 0 ... 0 0 0]
Binary Predictions: [0 0 0 ... 0 0 0]
Prediction Distribution: {0: 7864, 1: 1136}
Accuracy: 0.8131111111111111
Precision: 0.6399647887323944
Recall: 0.3635
F1-Score: 0.46364795918367346
AUC: 0.7722415


In [15]:
np.sum(binary_predictions)

1136

# Test block here

Implementing SMOTE:

In [16]:
from imblearn.over_sampling import SMOTE

# Create an instance of SMOTE
smote = SMOTE(random_state=42)

# Fit and apply SMOTE to training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the model with resampled data
model.fit(
    X_train_resampled, y_train_resampled,
    epochs=20,
    class_weight=None  # No need to pass class weights if using SMOTE
)

# Evaluate the model on the test data
test_loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate AUC
auc = roc_auc_score(y_test, predictions)
print(f'AUC: {auc}')

# Convert to binary predictions with a different threshold if needed
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)

# Calculate additional metrics
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions, zero_division=1)
recall = recall_score(y_test, binary_predictions, zero_division=1)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')


Epoch 1/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 637us/step - loss: 0.5511
Epoch 2/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 573us/step - loss: 0.5371
Epoch 3/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 599us/step - loss: 0.5345
Epoch 4/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608us/step - loss: 0.5325
Epoch 5/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 572us/step - loss: 0.5305
Epoch 6/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 698us/step - loss: 0.5288
Epoch 7/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 585us/step - loss: 0.5273
Epoch 8/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 622us/step - loss: 0.5258
Epoch 9/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 602us/step - loss: 0.5246
Epoch 10/20
[1m1023/1023[0m [32m━━━━━━━━━━━━━━━━━━━━

In [6]:
print(swat.__version__)

1.6.1


In [1]:
import swat
import pandas as pd

## Data Visualization
from matplotlib import pyplot as plt
import seaborn as sns


In [2]:
## Global Options
swat.options.cas.trace_actions = False      # Enabling tracing of actions (Default is False. Will change to true later)
swat.options.cas.trace_ui_actions = False   # Display the actions behind “UI” methods (Default is False. Will change to true later)
pd.set_option('display.max_columns', 500)   # Modify DataFrame max columns shown
pd.set_option('display.max_colwidth', 1000) # Modify DataFrame max column width

In [None]:

# Connection with cas
conn = swat.CAS("https://rmdemo.unx.sas.com/", 5770, "userid", "pwd", protocol="http")

conn

In [None]:
conn.decisionTree.forestTrain(
    table    = dict(name = 'HomeEquity', where = '_PartInd_ = 1'),
    target   = "Bad", 
    inputs   = ['LOAN' , 'IMP_REASON' , 'IMP_JOB' , 'REGION' , 'IMP_CLAGE' , 'IMP_CLNO' ,    
                       'IMP_DEBTINC' , 'IMP_DELINQ' , 'IMP_DEROG' , 'IMP_MORTDUE' , 'IMP_NINQ' ,  
                       'IMP_VALUE' , 'IMP_YOJ'], 
    nominals = ['BAD','IMP_REASON','IMP_JOB','REGION'],
    varImp=True,
    seed = 1234,
    casOut   = dict(name = 'rf_model', replace = True),
    saveState= dict(name = 'rf_astore', replace = True)
)