In [1]:
# Install necessary libraries
!pip install gradio imbalanced-learn catboost lightgbm xgboost scikit-learn pandas matplotlib seaborn

Collecting gradio
  Downloading gradio-4.42.0-py3-none-any.whl.metadata (15 kB)
Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.112.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downlo

In [31]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import itertools

In [32]:
# Load the data
data = pd.read_csv("/content/application_record.csv", encoding='utf-8')
record = pd.read_csv("/content/credit_record.csv", encoding='utf-8')


In [33]:

# Feature Engineering
begin_month = pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
begin_month = begin_month.rename(columns={'MONTHS_BALANCE': 'begin_month'})
new_data = pd.merge(data, begin_month, how="left", on="ID")

In [34]:
# Define target variable
record['dep_value'] = None
record.loc[record['STATUS'].isin(['2', '3', '4', '5']), 'dep_value'] = 'Yes'
record.loc[~record['STATUS'].isin(['2', '3', '4', '5']), 'dep_value'] = 'No'
cpunt = record.groupby('ID').count()
cpunt['dep_value'] = cpunt['dep_value'].apply(lambda x: 'Yes' if x > 0 else 'No')
cpunt = cpunt[['dep_value']]
new_data = pd.merge(new_data, cpunt, how='inner', on='ID')
new_data['target'] = new_data['dep_value'].apply(lambda x: 1 if x == 'Yes' else 0)

In [35]:

# Drop unnecessary columns and handle missing values
new_data = new_data.drop(columns=['dep_value'])
new_data = new_data.replace('NULL', np.nan).dropna()


In [36]:
# Feature Engineering
# Rename columns
new_data.rename(columns={
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car',
    'FLAG_OWN_REALTY': 'Reality',
    'CNT_CHILDREN': 'ChldNo',
    'AMT_INCOME_TOTAL': 'inc',
    'NAME_EDUCATION_TYPE': 'edutp',
    'NAME_FAMILY_STATUS': 'famtp',
    'NAME_HOUSING_TYPE': 'houtp',
    'FLAG_EMAIL': 'email',
    'NAME_INCOME_TYPE': 'inctp',
    'FLAG_WORK_PHONE': 'wkphone',
    'FLAG_PHONE': 'phone',
    'CNT_FAM_MEMBERS': 'famsize',
    'OCCUPATION_TYPE': 'occyp'
}, inplace=True)

In [37]:

# Convert categorical features to numerical values
new_data['Gender'] = new_data['Gender'].replace(['F', 'M'], [0, 1])
new_data['Car'] = new_data['Car'].replace(['N', 'Y'], [0, 1])
new_data['Reality'] = new_data['Reality'].replace(['N', 'Y'], [0, 1])
new_data['phone'] = new_data['phone'].astype(str).replace('nan', '0')
new_data['email'] = new_data['email'].astype(str).replace('nan', '0')
new_data['wkphone'] = new_data['wkphone'].astype(str).replace('nan', '0')

In [38]:
# Process continuous features
new_data['inc'] = new_data['inc'] / 10000
new_data['inc'] = pd.qcut(new_data['inc'], q=3, labels=['low', 'medium', 'high'])

In [39]:

# Convert categorical features using one-hot encoding
new_data['ChldNo'] = new_data['ChldNo'].replace({0: '0', 1: '1', 2: '2More'})
new_data = pd.get_dummies(new_data, columns=['ChldNo', 'inc'], drop_first=True)

In [40]:
# Convert any remaining non-numeric columns to numeric
for col in new_data.select_dtypes(include=['object']).columns:
    new_data[col] = pd.factorize(new_data[col])[0]

In [41]:

# Split the dataset
X = new_data.drop(columns=['target'])
y = new_data['target']


In [43]:
# Debugging: Check the target distribution
print("Target distribution:")
print(y.value_counts())

Target distribution:
target
1    25134
Name: count, dtype: int64


In [48]:
# Proceed if there are at least two classes
if y.nunique() > 1:
    # Balance the dataset
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

    # Define models
    models = {
        'Logistic Regression': LogisticRegression(),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'SVM': SVC(),
        'XGBoost': XGBClassifier(),
        'LightGBM': LGBMClassifier(),
        'CatBoost': CatBoostClassifier(silent=True)
    }
     # Train and evaluate models
    def plot_confusion_matrix(cm, classes,
                              title='Confusion Matrix',
                              cmap=plt.cm.Blues):
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = 'd'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        print(f"{name} Accuracy: {accuracy:.4f}")
        plt.figure()
        plot_confusion_matrix(cm, classes=['No', 'Yes'], title=f'{name} Confusion Matrix')
        plt.show()

else:
    print("Cannot apply SMOTE as the target variable has only one class.")


Cannot apply SMOTE as the target variable has only one class.
