In [1]:
# Data
import numpy as np
import pandas as pd
import random

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

# # styling
%matplotlib inline
# sns.set_style('darkgrid')
# mpl.rcParams['font.size'] = 14
# mpl.rcParams['figure.facecolor'] = '#00000000'
# mpl.rcParams['font.size'] = 14
# mpl.rcParams['figure.facecolor'] = '#00000000'

import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_cr = pd.read_csv("/content/credit_risk_dataset1.csv")
df_cr.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2.0
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.44,N,3.0
2,23,65500,RENT,4.0,MEDICAL,C,23125,15.23,1,0.44,N,2.0
3,24,54400,RENT,8.0,MEDICAL,C,23125,14.27,1,0.44,Y,4.0
4,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2.0


In [3]:
df_cr.isnull().sum()

Unnamed: 0,0
person_age,0
person_income,0
person_home_ownership,0
person_emp_length,0
loan_intent,0
loan_grade,0
loan_amnt,0
loan_int_rate,0
loan_status,0
loan_percent_income,0


In [4]:
df_cr.duplicated().sum()

15

In [5]:
df_cr.drop_duplicates(inplace = True)

In [6]:
df_cr["loan_status"].value_counts()


Unnamed: 0_level_0,count
loan_status,Unnamed: 1_level_1
0,25303
1,7086


In [7]:
X = df_cr.drop(columns= ['loan_status'])
y = df_cr['loan_status']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=12, stratify = y )
print(f"Shape of X train {X_train.shape}, X test {X_test.shape}")
print(f"Shape of Y train {y_train.shape}, Y test {y_test.shape}")

Shape of X train (24291, 11), X test (8098, 11)
Shape of Y train (24291,), Y test (8098,)


In [9]:
from sklearn.preprocessing import OneHotEncoder
ohe_colums = ['loan_grade', 'person_home_ownership','loan_intent']

ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Fit and transform using a DataFrame
temp_X_train = pd.DataFrame(ohe.fit_transform(X_train[ohe_colums]), columns=ohe.get_feature_names_out(ohe_colums), index=X_train.index)
temp_X_test = pd.DataFrame(ohe.transform(X_test[ohe_colums]), columns=ohe.get_feature_names_out(ohe_colums), index=X_test.index)

# Concatenate the one-hot encoded features back to the original DataFrames
X_train_ohe = pd.concat([X_train, temp_X_train], axis=1)
X_test_ohe = pd.concat([X_test, temp_X_test], axis=1)

In [10]:
from sklearn.preprocessing import OrdinalEncoder

ore = OrdinalEncoder()

X_train_ohe["cb_person_default_on_file"] = ore.fit_transform(X_train[["cb_person_default_on_file"]])
X_test_ohe["cb_person_default_on_file"] = ore.transform(X_test[["cb_person_default_on_file"]])

In [11]:
X_train_ohe.drop(columns= ['loan_grade', 'person_home_ownership','loan_intent'], inplace=True)
X_test_ohe.drop(columns= ['loan_grade', 'person_home_ownership','loan_intent'], inplace=True)

In [12]:
X_test_ohe

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_grade_A,loan_grade_B,...,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
22642,27,85000,1.00,7525,11.017265,0.09,0.0,8.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
22066,32,22704,4.00,6800,11.017265,0.30,0.0,8.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4048,21,70000,5.00,4000,7.900000,0.06,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
24041,31,70000,5.00,4000,6.540000,0.06,0.0,7.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
14129,25,100000,9.00,8000,10.250000,0.08,0.0,3.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14964,22,50004,6.00,15000,15.050000,0.30,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
6341,24,49000,7.00,3500,7.880000,0.07,0.0,3.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8756,24,37876,4.00,8000,9.630000,0.21,0.0,2.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
23841,34,30636,4.00,10000,10.370000,0.33,0.0,5.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(sampling_strategy={1: 9700}, random_state=3)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train_ohe, y_train)

In [16]:
y_train_resampled.value_counts(1)

Unnamed: 0_level_0,proportion
loan_status,Unnamed: 1_level_1
0,0.66175
1,0.33825


In [18]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
X_train_scaled= pd.DataFrame(std.fit_transform(X_train_resampled), columns=X_train_resampled.columns, index= X_train_resampled.index)
X_test_scaled= pd.DataFrame(std.transform(X_test_ohe), columns=X_test_ohe.columns, index= X_test_ohe.index)

In [32]:
from sklearn.compose import ColumnTransformer
import pickle
import joblib
from sklearn.preprocessing import OneHotEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

In [33]:
X_train

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1786,24,24000,RENT,8.0,MEDICAL,B,2000,11.017265,0.08,N,3.0
4830,25,42000,MORTGAGE,2.0,PERSONAL,A,6000,8.940000,0.14,N,2.0
4518,24,41000,OWN,9.0,HOMEIMPROVEMENT,D,10000,15.330000,0.24,N,4.0
15233,22,125004,OTHER,6.0,VENTURE,D,10000,14.740000,0.08,N,2.0
4621,25,42000,MORTGAGE,1.0,VENTURE,A,13975,8.900000,0.33,N,4.0
...,...,...,...,...,...,...,...,...,...,...,...
2782,22,57000,RENT,6.0,EDUCATION,C,12000,13.110000,0.21,N,2.0
8946,23,60000,MORTGAGE,8.0,MEDICAL,B,10000,11.017265,0.17,N,3.0
10933,22,45500,RENT,0.0,VENTURE,A,10000,5.990000,0.22,N,4.0
27383,31,20000,OWN,1.0,HOMEIMPROVEMENT,A,5000,8.590000,0.25,N,8.0


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Định nghĩa các chỉ số cột cho các loại dữ liệu
categorical_cols = [2, 4, 5, 9]
ordinal_cols = [3]
numeric_cols = [0, 1, 7, 8, 10]

# Cấu hình ColumnTransformer
ct_encoding = ColumnTransformer(
    transformers=[
        ("ohe_enc", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols),
        ("ord_enc", OrdinalEncoder(), ordinal_cols),
        ("num_scaler", StandardScaler(), numeric_cols)
    ],
    remainder="passthrough"
)


In [35]:
from xgboost import XGBClassifier

model = XGBClassifier(random_state = 15,colsample_bytree =  0.5, n_estimators=500, gamma = 1, min_child_weight = 1, reg_alpha = 1, reg_lambda = 1, max_depth=18, learning_rate = 0.1)

In [36]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(steps=[
    ("encoding", ct_encoding),
    ("scaler", StandardScaler()),
    ("model", model)
])

In [37]:
pipe.fit(X_train, y_train)

In [38]:
y_pred_log = pipe.predict(X_test)

In [39]:
pd.Series(y_pred_log).value_counts()

Unnamed: 0,count
0,6773
1,1325


In [40]:
X_test.index

Index([22642, 22066,  4048, 24041, 14129, 30114,  7315,  6009,  8445, 13920,
       ...
       11121, 20976,  3433, 28602, 31219, 14964,  6341,  8756, 23841, 27272],
      dtype='int64', length=8098)

In [44]:
X_test.loc[4048]

Unnamed: 0,4048
person_age,21
person_income,70000
person_home_ownership,RENT
person_emp_length,5.0
loan_intent,EDUCATION
loan_grade,A
loan_amnt,4000
loan_int_rate,7.9
loan_percent_income,0.06
cb_person_default_on_file,N


In [45]:
y_test.loc[4048]

0

In [46]:
df_cr.sample(5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
27475,29,140250,MORTGAGE,12.0,PERSONAL,A,7000,9.32,0,0.05,N,6.0
1100,22,24900,MORTGAGE,1.0,MEDICAL,C,5500,13.16,1,0.22,N,2.0
15236,25,126000,MORTGAGE,9.0,VENTURE,B,10200,11.017265,0,0.08,N,4.0
6475,26,50000,MORTGAGE,3.0,DEBTCONSOLIDATION,A,8000,6.62,0,0.16,N,4.0
23212,35,62000,MORTGAGE,0.0,EDUCATION,A,9475,6.03,0,0.15,N,5.0


In [49]:
import pandas as pd

columns = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
    'person_income',
    'person_age',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length'
]

# Tạo DataFrame với dữ liệu
temp_df = pd.DataFrame(
    data=[["RENT", "VENTURE", "D", "Y", 13200, 24, 6.0, 4000, 15.65, 3, 0.3]],
    columns=columns
)

print(temp_df)


  person_home_ownership loan_intent loan_grade cb_person_default_on_file  \
0                  RENT     VENTURE          D                         Y   

   person_income  person_age  person_emp_length  loan_amnt  loan_int_rate  \
0          13200          24                6.0       4000          15.65   

   loan_percent_income  cb_person_cred_hist_length  
0                    3                         0.3  


In [50]:
pipe.predict(temp_df)

array([1])

In [51]:
# Count unique values
unique_values, counts = np.unique(y_pred_log, return_counts=True)

# Display unique values and their counts
for value, count in zip(unique_values, counts):
    print(f"{value}: {count}")

0: 6773
1: 1325


In [52]:
joblib.dump(pipe, "best_pipeline.joblib")

['best_pipeline.joblib']

In [53]:
rfc_model = joblib.load("best_pipeline.joblib")

In [54]:
rfc_model.predict(X_test)

array([0, 0, 0, ..., 0, 1, 0])

In [55]:
rfc_model.predict(temp_df)

array([1])