In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model



In [7]:
# Load the dataset
data = pd.read_csv('dataset/main/credit_train.csv')
numerical_cols = [ 'credit_sum', 'credit_month', 'score_shk', 'monthly_income', 'credit_count', 'overdue_credit_count']


In [8]:
# Data Preprocessing
# Handle missing values
data.fillna(0, inplace=True)  # Replace missing values with 0 for simplicity

def convert_credit_sum(value):
    try:
        return float(value.replace(',', '.'))
    except:
        return 0
    
# Convert incorrect data types to numerical
data['credit_sum'] = data['credit_sum'].apply(convert_credit_sum)
data['score_shk'] = data['score_shk'].str.replace(',', '.').astype(float)



In [15]:
# Separate features (X) and target variable (y)
X = data.drop(columns=['open_account_flg'])
y = data['open_account_flg']


categorical_cols = ['gender', 'marital_status', 'job_position', 'education', 'living_region']
X[categorical_cols] = X[categorical_cols].astype(str)

# Encode categorical variables using Label Encoding
label_encoders = {}
for col in ['gender', 'marital_status', 'job_position', 'education', 'living_region']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Scale numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the deep learning model
model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(512, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(16, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Make predictions
y_pred_probs = model.predict(X_test)
y_pred = np.round(y_pred_probs)

# Convert probabilities to binary predictions

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', classification_rep)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.82
Classification Report:
               precision    recall  f1-score   support

           0       0.82      1.00      0.90     22450
           1       0.00      0.00      0.00      4870

    accuracy                           0.82     27320
   macro avg       0.41      0.50      0.45     27320
weighted avg       0.68      0.82      0.74     27320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Separate features (X) and target variable (y)
X = data.drop(columns=['open_account_flg'])
y = data['open_account_flg']


categorical_cols = ['gender', 'marital_status', 'job_position', 'education', 'living_region']
X[categorical_cols] = X[categorical_cols].astype(str)

# Encode categorical variables using Label Encoding
label_encoders = {}
for col in ['gender', 'marital_status', 'job_position', 'education', 'living_region']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Scale numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# Define hyperparameter values to try
param_grid = {
    'hidden_units': [32, 64, 128],
    'activation': ['relu', 'tanh'],
    'dropout_rate': [0.2, 0.4],
    'learning_rate': [0.001, 0.01, 0.1],
    'batch_size': [32, 64]
}

best_accuracy = 0
best_model = None

# Iterate over different hyperparameter combinations
for units in param_grid['hidden_units']:
    for activation in param_grid['activation']:
        for dropout_rate in param_grid['dropout_rate']:
            for lr in param_grid['learning_rate']:
                for batch_size in param_grid['batch_size']:
                    model = keras.Sequential([
                        layers.Input(shape=(X_train.shape[1],)),
                        layers.Dense(units, activation=activation),
                        layers.Dropout(dropout_rate),
                        layers.Dense(1, activation='sigmoid')
                    ])
                    
                    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=lr),
                                  loss='binary_crossentropy',
                                  metrics=['accuracy'])
                    
                    model.fit(X_train, y_train, epochs=10, batch_size=batch_size, verbose=0)
                    
                    # Evaluate the model
                    accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
                    
                    if accuracy > best_accuracy:
                        best_accuracy = accuracy
                        best_model = model

# Make predictions using the best model
y_pred_probs = best_model.predict(X_test)
y_pred = np.round(y_pred_probs)

# Convert probabilities to binary predictions

# Evaluate the best model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Best Model Accuracy: {accuracy:.2f}')
print('Classification Report for Best Model:\n', classification_rep)

Best Model Accuracy: 0.82
Classification Report for Best Model:
               precision    recall  f1-score   support

           0       0.82      1.00      0.90     22450
           1       0.00      0.00      0.00      4870

    accuracy                           0.82     27320
   macro avg       0.41      0.50      0.45     27320
weighted avg       0.68      0.82      0.74     27320



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
