In [None]:
!pip install imblearn

import base64
import itertools
import os
import threading
import warnings
from functools import partial
from io import BytesIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTENC
from joblib import Parallel, delayed
from scipy.stats import boxcox
from sklearn.ensemble import IsolationForest
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")


# Two Kaggle inputs: the playground-series training split and the
# credit_risk_dataset file that is appended to it below.
df = pd.read_csv('/kaggle/input/kaggle-s4e10/playground-series-s4e10/train.csv')
df_origin = pd.read_csv('/kaggle/input/ps4e9-original-data-loan-approval-prediction/credit_risk_dataset.csv')

# Mean-impute the two columns that are filled in the appended dataset.
# The result is assigned back to the column: the chained
# `frame[col].fillna(..., inplace=True)` form operates on a temporary object
# when pandas Copy-on-Write is active and would leave the NaNs in place.
df_origin['person_emp_length'] = df_origin['person_emp_length'].fillna(df_origin['person_emp_length'].mean())
df_origin['loan_int_rate'] = df_origin['loan_int_rate'].fillna(df_origin['loan_int_rate'].mean())


df = df.drop(columns=['id'])

# ignore_index gives the stacked frame a unique 0..n-1 index instead of two
# overlapping ranges inherited from the source files.
df = pd.concat([df, df_origin], axis=0, ignore_index=True)


# Columns treated as continuous by the outlier-removal and scaling helpers.
numeric_var = ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']

In [None]:
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# When at least one GPU is present, expose only the first one to TensorFlow
# and switch it to on-demand memory growth. A RuntimeError raised by either
# call is printed instead of being propagated.
gpus = tf.config.experimental.list_physical_devices('GPU')
primary_gpu = gpus[0] if gpus else None
if primary_gpu is not None:
    try:
        tf.config.experimental.set_visible_devices(primary_gpu, 'GPU')
        tf.config.experimental.set_memory_growth(primary_gpu, True)
    except RuntimeError as e:
        print(e)

# Isolation Forest

In [None]:
def isol_forest(df, contamination=0.05, random_state=42, columns=None):
    """Drop the rows an Isolation Forest labels as outliers.

    The forest is fitted on, and then applied to, the frame that is passed in.

    Args:
        df: Frame containing the feature columns. It is not modified.
        contamination: Expected outlier fraction handed to ``IsolationForest``.
        random_state: Seed handed to ``IsolationForest``.
        columns: Feature columns used for detection; defaults to the
            module-level ``numeric_var`` list.

    Returns:
        A new frame holding only the rows predicted as inliers (label ``1``),
        with a fresh 0..n-1 index and the same columns as ``df``.
    """
    feature_cols = numeric_var if columns is None else list(columns)
    features = df[feature_cols]

    model_IF = IsolationForest(contamination=contamination, random_state=random_state)
    model_IF.fit(features)

    # predict() yields 1 for inliers and -1 for outliers. The labels stay in a
    # local array so no helper columns are ever written into the caller's frame.
    inlier_mask = model_IF.predict(features) == 1

    return df[inlier_mask].reset_index(drop=True)

# Interquartile Range (IQR)

In [None]:
def iqr(df, multiplier=2, columns=None):
    """Filter out rows that fall outside per-column IQR fences.

    Columns are processed one after another, so the quartiles of each column
    are computed on the rows that survived the previous columns' filters.

    Args:
        df: Input frame. It is not modified; a filtered frame is returned.
        multiplier: Fence width in IQR units; rows are kept when the value lies
            in ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``.
        columns: Columns to filter on; defaults to the module-level
            ``numeric_var`` list.

    Returns:
        The filtered frame. The original index labels are kept.
    """
    target_cols = numeric_var if columns is None else list(columns)

    for col in target_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        spread = q3 - q1

        lower_bound = q1 - multiplier * spread
        upper_bound = q3 + multiplier * spread

        # Both comparisons are False for NaN, so rows with a missing value in
        # this column are dropped as well.
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

    return df

# Box-Cox Transformation

In [None]:
def box_cox(df, columns=None):
    """Apply a Box-Cox transform to each selected column.

    Each column is shifted by +1 before the transform and the lambda is
    estimated by ``scipy.stats.boxcox`` from the values of the frame that is
    passed in.

    Args:
        df: Input frame. It is not modified.
        columns: Columns to transform; defaults to the module-level
            ``numeric_var`` list.

    Returns:
        A copy of ``df`` with the selected columns replaced by their
        transformed values.
    """
    target_cols = numeric_var if columns is None else list(columns)

    # Work on a copy: the caller's frame may be a slice of a larger frame, and
    # writing columns into it would also change what the caller still holds.
    transformed = df.copy()
    for col in target_cols:
        values, _ = boxcox(transformed[col] + 1)
        transformed[col] = values

    return transformed

# Robust Scaler

In [None]:
def robust_scaler(df, columns=None):
    """Scale the selected columns with scikit-learn's ``RobustScaler``.

    The scaler is fitted on the frame that is passed in.

    Args:
        df: Input frame. It is not modified.
        columns: Columns to scale; defaults to the module-level
            ``numeric_var`` list.

    Returns:
        A copy of ``df`` with the selected columns replaced by their scaled
        values.
    """
    target_cols = numeric_var if columns is None else list(columns)

    # Work on a copy so the caller's frame keeps its original values.
    scaled = df.copy()
    scaled[target_cols] = RobustScaler().fit_transform(scaled[target_cols])

    return scaled

# Normalization

In [None]:
def normalization(df, columns=None):
    """Min-max scale each selected column to the ``[0, 1]`` range.

    The minimum and maximum are taken from the frame that is passed in.

    Args:
        df: Input frame. It is not modified.
        columns: Columns to scale; defaults to the module-level
            ``numeric_var`` list.

    Returns:
        A copy of ``df`` with the selected columns rescaled. A column whose
        minimum equals its maximum is set to ``0.0`` throughout.
    """
    target_cols = numeric_var if columns is None else list(columns)

    scaled = df.copy()
    for col in target_cols:
        low = scaled[col].min()
        span = scaled[col].max() - low

        if span == 0:
            # A constant column would otherwise evaluate 0 / 0 and fill the
            # column with NaN.
            scaled[col] = 0.0
        else:
            scaled[col] = (scaled[col] - low) / span

    return scaled

# Standardization

In [None]:
def standardization(df, columns=None):
    """Centre each selected column on its mean and divide by its std.

    Mean and standard deviation (pandas' default sample estimate) are taken
    from the frame that is passed in.

    Args:
        df: Input frame. It is not modified.
        columns: Columns to standardise; defaults to the module-level
            ``numeric_var`` list.

    Returns:
        A copy of ``df`` with the selected columns standardised. A column
        whose standard deviation is zero or undefined is only centred.
    """
    target_cols = numeric_var if columns is None else list(columns)

    scaled = df.copy()
    for col in target_cols:
        centred = scaled[col] - scaled[col].mean()
        spread = scaled[col].std()

        # `spread > 0` is False for both 0 and NaN (e.g. a single-row frame),
        # which are exactly the cases where dividing would produce NaN/inf.
        scaled[col] = centred / spread if spread > 0 else centred

    return scaled

# SMOTENC

In [None]:
def smotenc(df, sampling_strategy=0.5):
    """Oversample ``df`` with SMOTENC and return the resampled frame.

    Args:
        df: Frame holding the eleven feature columns listed below plus the
            ``loan_status`` target column.
        sampling_strategy: Forwarded unchanged to ``SMOTENC``.

    Returns:
        A frame with the feature columns followed by ``loan_status``, built
        from the output of ``fit_resample``.
    """
    feature_names = [
        'person_age', 'person_income', 'person_home_ownership', 'person_emp_length',
        'loan_intent', 'loan_grade', 'loan_amnt', 'loan_int_rate',
        'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length',
    ]
    categorical_names = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

    # SMOTENC receives the categorical columns by position within
    # `feature_names`; for the lists above this evaluates to [2, 4, 5, 9].
    categorical_positions = [feature_names.index(name) for name in categorical_names]

    sampler = SMOTENC(
        sampling_strategy=sampling_strategy,
        categorical_features=categorical_positions,
        random_state=42,
    )
    features_out, target_out = sampler.fit_resample(df[feature_names], df[['loan_status']])

    return pd.concat(
        [
            pd.DataFrame(features_out, columns=feature_names),
            pd.DataFrame(target_out, columns=['loan_status']),
        ],
        axis=1,
    )


# One-Hot Encoding

In [None]:
def one_hot(df):
    """One-hot encode the four categorical loan columns.

    Args:
        df: Frame containing ``person_home_ownership``, ``loan_intent``,
            ``loan_grade`` and ``cb_person_default_on_file``. It is not
            modified.

    Returns:
        A new frame in which each of those columns is replaced by one
        indicator column per category found in ``df``, named
        ``<column>_<category>`` and stored as floats.
    """
    # dtype=float keeps the encoded frame numeric: with boolean indicators next
    # to float features, DataFrame.to_numpy() falls back to an object array.
    df = pd.get_dummies(
        df,
        columns=['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'],
        dtype=float,
    )

    return df

# Label Encoding

In [None]:
def label_enc(df):
    """Replace the four categorical loan columns with integer labels.

    A ``LabelEncoder`` is fitted per column on the values of the frame that is
    passed in.

    Args:
        df: Frame containing ``person_home_ownership``, ``loan_intent``,
            ``loan_grade`` and ``cb_person_default_on_file``. It is not
            modified.

    Returns:
        A copy of ``df`` with those four columns label-encoded.
    """
    # Work on a copy so the caller's frame keeps its original categories.
    encoded = df.copy()

    for column in ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    return encoded

# Model

In [None]:
# The notebook's driver cell runs Model_and_save from several threads. pyplot
# keeps process-wide figure state and every run appends to the same results
# file, so both activities are serialised with these locks.
_PLOT_LOCK = threading.Lock()
_CSV_LOCK = threading.Lock()


def _figure_to_base64(fig):
    """Render ``fig`` as a PNG, close it, and return the image base64-encoded."""
    try:
        with BytesIO() as buf:
            fig.savefig(buf, format='png', dpi=100, transparent=False)
            return base64.b64encode(buf.getvalue()).decode('utf-8')
    finally:
        # Close even when rendering fails so figures do not pile up in pyplot.
        plt.close(fig)


def _append_row_to_csv(df_save, file_path):
    """Append ``df_save`` to ``file_path``; the header is written only on creation."""
    try:
        # The existence check and the write must happen atomically, otherwise
        # two threads can both decide to write the header or interleave rows.
        with _CSV_LOCK:
            write_header = not os.path.exists(file_path)
            df_save.to_csv(file_path, mode='a', index=False, header=write_header)
    except Exception as e:
        # Best effort: a failed save is reported but does not abort the run.
        print(f"Error saving to CSV: {e}")


def Model_and_save(df_train, df_val, function, save_function):
    """Train the network on one preprocessing variant and log the outcome.

    Args:
        df_train: Training frame; must contain a ``loan_status`` column, every
            other column is used as a feature.
        df_val: Validation frame with the same layout.
        function: Sequence whose last two items are the dropout rate and the
            batch-normalisation flag.
        save_function: Sequence of nine values describing the variant; they are
            written to the results row in the order isol_forest, iqr, box_cox,
            robust_scaler, normalization, SMOTENC, one_hot, Dropout, BatchNorm.

    Returns:
        None. One row is appended to ``info.csv`` containing the variant
        description, the Keras training history, and base64-encoded PNGs of a
        t-SNE scatter plot and a confusion matrix for the validation frame.
    """

    class NeuralNetworkModel:
        """Fully connected binary classifier with optional batch normalisation."""

        def __init__(self, input_dim, dropout_rate=0.2, patience=5, batchnorm=True, random_seed=42):
            self.input_dim = input_dim
            self.dropout_rate = dropout_rate
            self.patience = patience
            self.batchnorm = batchnorm

            self.random_seed = random_seed
            tf.random.set_seed(self.random_seed)

            self.model = self.build_model()

            # AUC is better when larger; stating the direction explicitly avoids
            # relying on Keras inferring it from the metric name.
            self.early_stopping = EarlyStopping(monitor='val_auc', mode='max',
                                                patience=self.patience, restore_best_weights=True)

        def build_model(self):
            """Build and compile the 64-128-64-1 network."""
            model = Sequential()

            model.add(Dense(64, input_dim=self.input_dim, activation='relu'))

            if self.batchnorm:
                model.add(BatchNormalization())

            model.add(Dense(128, activation='relu'))
            model.add(Dropout(self.dropout_rate))

            if self.batchnorm:
                model.add(BatchNormalization())

            model.add(Dense(64, activation='relu'))
            model.add(Dropout(self.dropout_rate))

            model.add(Dense(1, activation='sigmoid'))

            lr_schedule = ExponentialDecay(
                initial_learning_rate=0.0005,
                decay_steps=1000,
                decay_rate=0.85,
                staircase=False
            )

            model.compile(optimizer=Adam(learning_rate=lr_schedule),
                          loss='binary_crossentropy',
                          metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

            return model

        def train(self, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
            """Fit silently with early stopping and return the Keras History."""
            history = self.model.fit(X_train, y_train,
                                     validation_data=(X_val, y_val),
                                     epochs=epochs,
                                     batch_size=batch_size,
                                     callbacks=[self.early_stopping],
                                     verbose=0)
            return history

        def evaluate(self, X_val, y_val):
            """Return ``(loss, accuracy, auc)`` for the given data."""
            val_loss, val_accuracy, val_auc = self.model.evaluate(X_val, y_val)
            return val_loss, val_accuracy, val_auc

        def predict(self, X):
            """Return the model's sigmoid outputs for ``X``."""
            return self.model.predict(X)

    X_train = df_train.drop(columns=['loan_status'])
    y_train = df_train['loan_status']

    X_val = df_val.drop(columns=['loan_status'])
    y_val = df_val['loan_status']

    with tf.device('/GPU:0'):
        nn_model = NeuralNetworkModel(input_dim=X_train.shape[1], dropout_rate=function[-2], patience=30, batchnorm=function[-1])
        history = nn_model.train(X_train, y_train, X_val, y_val, epochs=100, batch_size=512)

        # The network has a single output unit; flatten the (n, 1) result so
        # the plotting and metric calls below receive a plain 1-D vector.
        y_pred = nn_model.predict(X_val).ravel()

    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(X_val)

    y_pred_binary = (y_pred > 0.5).astype(int)

    # Fixing the label set keeps the matrix 2x2 (matching the tick labels)
    # even when only one class occurs in the truth and the predictions.
    cm = confusion_matrix(y_val, y_pred_binary, labels=[0, 1])
    class_names = ["0", "1"]

    with _PLOT_LOCK:
        fig, ax = plt.subplots(figsize=(8, 6))
        scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred, cmap='viridis', edgecolor='k', s=50)
        fig.colorbar(scatter, ax=ax)
        ax.set_title("t-SNE of Test Data (Colored by Model Predictions)")
        ax.set_xlabel("t-SNE Component 1")
        ax.set_ylabel("t-SNE Component 2")
        encoded_image_tsne = _figure_to_base64(fig)

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)
        ax.set_title("Confusion Matrix")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
        encoded_image_conf = _figure_to_base64(fig)

    df_save = pd.DataFrame([{
        'isol_forest': save_function[0],
        'iqr': save_function[1],
        'box_cox': save_function[2],
        'robust_scaler': save_function[3],
        'normalization': save_function[4],
        'SMOTENC': save_function[5],
        'one_hot': save_function[6],
        'Dropout': save_function[7],
        'BatchNorm': save_function[8],
        'history': history.history,
        't-SNE': encoded_image_tsne,
        'confusion_matrix': encoded_image_conf
    }])

    file_path = "info.csv"

    _append_row_to_csv(df_save, file_path)

# Main

In [None]:
def main(combination, save_combination):
    """Run one preprocessing/model configuration end to end.

    The module-level ``df`` is copied and split 80/20 with a fixed seed. Every
    callable entry of ``combination`` is then applied, in order, to the
    training frame and (except for the oversampling step) separately to the
    validation frame, after which ``Model_and_save`` is called.

    Args:
        combination: One entry per pipeline stage. Callables are applied as
            preprocessing steps; non-callable entries (skip flags, dropout
            rate, batch-norm flag) are passed through to ``Model_and_save``.
        save_combination: Loggable description of the same configuration,
            forwarded to ``Model_and_save``.

    NOTE(review): each step is fitted independently on the training and the
    validation frame, so the two are not transformed with identical
    parameters — confirm this is intended.
    """
    # Position of the SMOTENC stage inside `combination`; oversampling is
    # applied to the training frame only.
    oversampling_step = 5

    df_train, df_val = train_test_split(df.copy(), test_size=0.2, random_state=42)

    for step, func in enumerate(combination):
        if not callable(func):
            continue
        df_train = func(df_train)
        if step != oversampling_step:
            df_val = func(df_val)

    # The encoders see each frame on its own, so a category present in only
    # one of them yields different columns. Align validation to the training
    # layout (missing indicator columns become 0, extras are dropped) so both
    # frames give the network the same input width and column order.
    df_val = df_val.reindex(columns=df_train.columns, fill_value=0)

    Model_and_save(df_train, df_val, combination, save_combination)

In [None]:
# Search space, one list of alternatives per pipeline stage in execution order.
# `False` means "skip this stage"; non-callable entries are ignored by main()'s
# preprocessing loop, and the last two stages (dropout rate, batch-norm flag)
# are read by Model_and_save from the end of the tuple.
functions = [
    [False, isol_forest],
    [False, iqr],
    [False, box_cox],
    [False, robust_scaler],
    [False, standardization, normalization],
    [
        False,
        partial(smotenc, sampling_strategy=0.4),
        partial(smotenc, sampling_strategy=0.6),
        partial(smotenc, sampling_strategy=0.8),
    ],
    [label_enc, one_hot],
    [0, 0.15, 0.25, 0.4],
    [False, True],
]

# Loggable mirror of `functions`: same shape and order, with plain values in
# place of the callables so each run can be described in the results file.
save_functions = [
    [False, True],
    [False, True],
    [False, True],
    [False, True],
    [False, 'standardization', 'normalization'],
    [False, 0.4, 0.6, 0.8],
    ['label_enc', 'one_hot'],
    [0, 0.15, 0.25, 0.4],
    [False, True],
]

# Both products enumerate in the same order, so zipping them pairs every
# configuration with its description.
combinations = list(itertools.product(*functions))
save_combinations = list(itertools.product(*save_functions))

runner = Parallel(n_jobs=3, backend="threading")
runner(
    delayed(main)(combination, save_combination)
    for combination, save_combination in zip(combinations, save_combinations)
)