# Data Manipulation Libraries

In [1]:
import pandas as pd
import numpy as np

# Model Selection and Evaluation

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Data Preprocessing

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Machine Learning Models

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Visualization Tools

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec

# Handling Imbalanced Data

In [6]:
from imblearn.over_sampling import SMOTE
%pip install imbalanced-learn



# File Paths and Data Loading

In [7]:
application_record_path = 'application_record.csv'
credit_record_path = 'credit_record.csv'
application_record = pd.read_csv(application_record_path)
credit_record = pd.read_csv(credit_record_path)

# Data Inspection

In [8]:
application_record_info = application_record.info(), application_record.head()
credit_record_info = credit_record.info(), credit_record.head()
application_record.head(), credit_record.head()
application_record_info, credit_record_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

((None,
          ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
  0  5008804           M            Y               Y             0   
  1  5008805           M            Y               Y             0   
  2  5008806           M            Y               Y             0   
  3  5008808           F            N               Y             0   
  4  5008809           F            N               Y             0   
  
     AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
  0          427500.0               Working               Higher education   
  1          427500.0               Working               Higher education   
  2          112500.0               Working  Secondary / secondary special   
  3          270000.0  Commercial associate  Secondary / secondary special   
  4          270000.0  Commercial associate  Secondary / secondary special   
  
       NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
  0        Civil 

# Check for Null Values

In [9]:
# Calculate the number of missing values in each column
application_nulls = application_record.isnull().sum()
credit_nulls = credit_record.isnull().sum()

# Re-check for Null Values

In [10]:
application_nulls = application_record.isnull().sum()
credit_nulls = credit_record.isnull().sum()
application_nulls, credit_nulls

(ID                          0
 CODE_GENDER                 0
 FLAG_OWN_CAR                0
 FLAG_OWN_REALTY             0
 CNT_CHILDREN                0
 AMT_INCOME_TOTAL            0
 NAME_INCOME_TYPE            0
 NAME_EDUCATION_TYPE         0
 NAME_FAMILY_STATUS          0
 NAME_HOUSING_TYPE           0
 DAYS_BIRTH                  0
 DAYS_EMPLOYED               0
 FLAG_MOBIL                  0
 FLAG_WORK_PHONE             0
 FLAG_PHONE                  0
 FLAG_EMAIL                  0
 OCCUPATION_TYPE        134203
 CNT_FAM_MEMBERS             0
 dtype: int64,
 ID                0
 MONTHS_BALANCE    0
 STATUS            0
 dtype: int64)

# Drop Irrelevant Column and Preview Data

In [11]:
application_record = application_record.drop(['OCCUPATION_TYPE'], axis=1)
application_nulls = application_record.isnull().sum()
credit_nulls = credit_record.isnull().sum()
application_nulls, credit_nulls

(ID                     0
 CODE_GENDER            0
 FLAG_OWN_CAR           0
 FLAG_OWN_REALTY        0
 CNT_CHILDREN           0
 AMT_INCOME_TOTAL       0
 NAME_INCOME_TYPE       0
 NAME_EDUCATION_TYPE    0
 NAME_FAMILY_STATUS     0
 NAME_HOUSING_TYPE      0
 DAYS_BIRTH             0
 DAYS_EMPLOYED          0
 FLAG_MOBIL             0
 FLAG_WORK_PHONE        0
 FLAG_PHONE             0
 FLAG_EMAIL             0
 CNT_FAM_MEMBERS        0
 dtype: int64,
 ID                0
 MONTHS_BALANCE    0
 STATUS            0
 dtype: int64)

# Clean the application_record.csv Data

# Create Good/Bad Labels from credit_record.csv


In [12]:
def determine_label(group):
    # Assign 'Bad' if any value in 'STATUS' is from the set ['1', '2', '3', '4', '5']
    if any(group['STATUS'].isin(['1', '2', '3', '4', '5'])):
        return 'Bad'
    return 'Good'
credit_record['STATUS'] = credit_record['STATUS'].astype(str)
labels = credit_record.groupby('ID').apply(determine_label).reset_index(name='label')
merged_data = application_record.merge(labels, on='ID', how='inner')

  labels = credit_record.groupby('ID').apply(determine_label).reset_index(name='label')


# Data Preprocessing

In [13]:
categorical_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
                    'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE']
merged_data = pd.get_dummies(merged_data, columns=categorical_cols, drop_first=True)
merged_data.drop(columns=['ID'], inplace=True)
X = merged_data.drop(columns=['label'])
y = merged_data['label']

# Balance the Data

In [14]:
#Applying SMOTE
X_balanced, Y_balanced = SMOTE().fit_resample(X, y)

# Encode Target Variable

In [15]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Standardize Numerical Features

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])
X.head(), y[:5]

(   CNT_CHILDREN  AMT_INCOME_TOTAL  DAYS_BIRTH  DAYS_EMPLOYED  FLAG_MOBIL  \
 0     -0.579661          2.365845    0.945169      -0.463532         0.0   
 1     -0.579661          2.365845    0.945169      -0.463532         0.0   
 2     -0.579661         -0.728827   -1.309091      -0.438774         0.0   
 3     -0.579661          0.818509   -0.746300      -0.452700         0.0   
 4     -0.579661          0.818509   -0.746300      -0.452700         0.0   
 
    FLAG_WORK_PHONE  FLAG_PHONE  FLAG_EMAIL  CNT_FAM_MEMBERS  CODE_GENDER_M  \
 0         1.853127   -0.646578   -0.313952        -0.217680           True   
 1         1.853127   -0.646578   -0.313952        -0.217680           True   
 2        -0.539628   -0.646578   -0.313952        -0.217680           True   
 3        -0.539628    1.546603    3.185203        -1.314564          False   
 4        -0.539628    1.546603    3.185203        -1.314564          False   
 
    ...  NAME_EDUCATION_TYPE_Secondary / secondary special  

Print the Number of "Good" and "Bad" Labels Before Balancing


In [17]:
print("Before Balancing:")
print(merged_data['label'].value_counts())

Before Balancing:
label
Good    32166
Bad      4291
Name: count, dtype: int64


Print the Number of "Good" and "Bad" Labels After Balancing

In [18]:
print("After Balancing:")
print(pd.Series(Y_balanced).value_counts())

After Balancing:
label
Bad     32166
Good    32166
Name: count, dtype: int64


# Modeling

Split the Data into Training and Test Sets

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_balanced, Y_balanced, test_size=0.3, random_state=42, stratify=Y_balanced)

Define Models for Evaluation

In [20]:
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(C=1.0, kernel='rbf', gamma='auto'),
    "Random Forest": RandomForestClassifier()
}

Train and Evaluate the Models

In [21]:
classification_reports = []
confusion_matrices = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    classification_reports.append(report)
    confusion_matrices.append(cm)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Display Results for Each Model

In [22]:
for i in range(len(models)):
    print(list(models.keys())[i])
    print(classification_reports[i])
    print(confusion_matrices[i])

Logistic Regression
              precision    recall  f1-score   support

         Bad       0.65      0.64      0.64      9650
        Good       0.64      0.66      0.65      9650

    accuracy                           0.65     19300
   macro avg       0.65      0.65      0.65     19300
weighted avg       0.65      0.65      0.65     19300

[[6158 3492]
 [3307 6343]]
SVM
              precision    recall  f1-score   support

         Bad       0.82      0.94      0.87      9650
        Good       0.92      0.79      0.85      9650

    accuracy                           0.86     19300
   macro avg       0.87      0.86      0.86     19300
weighted avg       0.87      0.86      0.86     19300

[[9026  624]
 [2003 7647]]
Random Forest
              precision    recall  f1-score   support

         Bad       0.91      0.91      0.91      9650
        Good       0.91      0.92      0.91      9650

    accuracy                           0.91     19300
   macro avg       0.91      0.91   

 Normalize the Confusion Matrix

In [23]:
confusion_matrices[i] = confusion_matrices[i].astype('float') / confusion_matrices[i].sum(axis=1)[:, np.newaxis]

# **The chosen model is Random Forest**

# Hyperparameter Tuning


Define the Parameter Grid for Random Forest Hyperparameter Tuning

In [24]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

Initialize the Random Forest Classifier

In [25]:
rf = RandomForestClassifier()

Set Up GridSearchCV with the Defined Parameters

In [26]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

Fit the Grid Search to the Training Data

In [None]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


Output the Best Parameters and the Corresponding Score

In [None]:
print(grid_search.best_params_)
print(grid_search.best_score_)

# Test the Model with the Best Parameters




In [None]:
best_rf = RandomForestClassifier(**grid_search.best_params_)


# Train and Evaluate the Model

In [None]:
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))