In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay

warnings.filterwarnings("ignore")

In [2]:
pd.set_option("display.max_columns",100)
pd.set_option("display.max_rows",100)

In [3]:
df = pd.read_csv("../data/raw/Loan_approval_data_2025.csv") 

In [4]:
columns = [column for column in df.columns if column != "customer_id"]

numeric_features = [feature for feature in columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in columns if df[feature].dtype == 'O']

In [5]:
continuous_features=[feature for feature in numeric_features if len(df[feature].unique())>=10]
print('Names of continues features:',continuous_features)

Names of continues features: ['age', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'delinquencies_last_2yrs', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio']


In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def compute_vif(considered_features, df):
    
    X = df[considered_features]
    # the calculation of variance inflation requires a constant
    X['intercept'] = 1
    
    # create dataframe to store vif values
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = vif[vif['Variable']!='intercept']
    return vif
#Just to check what will happen if we romove a feature
cont_features = continuous_features.copy()
cont_features.remove('loan_to_income_ratio')
compute_vif(cont_features, df)

Unnamed: 0,Variable,VIF
0,age,2.508155
1,years_employed,1.669854
2,annual_income,6.24159
3,credit_score,1.569388
4,credit_history_years,1.745786
5,savings_assets,1.103103
6,current_debt,6.651593
7,delinquencies_last_2yrs,1.088738
8,loan_amount,5.282914
9,interest_rate,1.337799


In [7]:
df['loan_to_income_ratio'] = df['loan_amount'] / df['annual_income']
df['payment_to_income_ratio'] = (df['loan_amount'] * df['interest_rate']/100) / df['annual_income']
df["risk_score"] = (
    df["defaults_on_file"]*2 +
    df["derogatory_marks"]*1.5 +
    df["delinquencies_last_2yrs"]*1 -
    df["credit_score"]*0.01
)

# df = df.drop(["interest_rate", "payment_to_income_ratio", "loan_to_income_ratio",'debt_to_income_ratio','loan_to_income_ratio'],axis=1)
df=df.drop(['customer_id'],axis=1)

In [8]:
df.shape

(50000, 20)

In [9]:
df.columns

Index(['age', 'occupation_status', 'years_employed', 'annual_income',
       'credit_score', 'credit_history_years', 'savings_assets',
       'current_debt', 'defaults_on_file', 'delinquencies_last_2yrs',
       'derogatory_marks', 'product_type', 'loan_intent', 'loan_amount',
       'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio',
       'payment_to_income_ratio', 'loan_status', 'risk_score'],
      dtype='object')

In [10]:
cat = df.select_dtypes(include='object').columns
num = df.select_dtypes(exclude='object').columns

In [11]:
con_features=[feature for feature in num if len(df[feature].unique())>=10]
print('Names of continues features:',continuous_features)

Names of continues features: ['age', 'years_employed', 'annual_income', 'credit_score', 'credit_history_years', 'savings_assets', 'current_debt', 'delinquencies_last_2yrs', 'loan_amount', 'interest_rate', 'debt_to_income_ratio', 'loan_to_income_ratio', 'payment_to_income_ratio']


In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def compute_vif(considered_features, df):
    
    X = df[considered_features]
    # the calculation of variance inflation requires a constant
    X['intercept'] = 1
    
    # create dataframe to store vif values
    vif = pd.DataFrame()
    vif["Variable"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif = vif[vif['Variable']!='intercept']
    return vif
#Just to check what will happen if we romove a feature
cont_features = con_features.copy()
cont_features.remove('loan_to_income_ratio')
compute_vif(cont_features, df)

Unnamed: 0,Variable,VIF
0,age,2.510598
1,years_employed,1.672626
2,annual_income,5.776801
3,credit_score,2.888161
4,credit_history_years,1.745823
5,savings_assets,1.103099
6,current_debt,6.651453
7,delinquencies_last_2yrs,2.658101
8,loan_amount,4.236177
9,interest_rate,1.622575


In [13]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['occupation_status', 'product_type', 'loan_intent']

encoder = OneHotEncoder(sparse_output=False, drop='first')

encoded_data = encoder.fit_transform(df[categorical_cols])

encoded_cols = encoder.get_feature_names_out(categorical_cols)

encoded_df = pd.DataFrame(encoded_data, columns=encoded_cols)

In [15]:
df_numeric = df.drop(columns=categorical_cols)

df_final = pd.concat([df_numeric, encoded_df], axis=1)

X = df_final.drop('loan_status', axis=1)  

y = df_final['loan_status'] 

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve,confusion_matrix
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn import metrics 

models = {
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Logistic Regression": LogisticRegression(),
     "K-Neighbors Classifier": KNeighborsClassifier(),
    "XGBClassifier": XGBClassifier(), 
     "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

In [23]:
# Create a function which can evaluate models and return a report 
def evaluate_models(X, y, models):
    '''
    This function takes in X and y and models dictionary as input
    It splits the data into Train Test split
    Iterates through the given model dictionary and evaluates the metrics
    Returns: Dataframe which contains report of all models metrics with cost
    '''
    # separate dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)
    

    models_list = []
    scores = []
    
    for i in range(len(list(models))):
        model = list(models.values())[i]
        model.fit(X_train, y_train) # Train model

        # Make predictions
        y_pred = model.predict(X_test)

        score = accuracy_score(y_test,y_pred)
        
        model_name = list(models.keys())[i]
        print(f'---- score for --- {model_name} ----')
        print(f"{score}")
        models_list.append(model_name)
        scores.append(score)
    
    print()
    
    report = pd.DataFrame()
    report['Model_name'] = models_list
    report['Score'] = scores        
    return report

In [24]:
report = evaluate_models(X, y, models)

---- score for --- Random Forest ----
0.9106
---- score for --- Decision Tree ----
0.8679333333333333
---- score for --- Gradient Boosting ----
0.9208666666666666
---- score for --- Logistic Regression ----
0.7386666666666667
---- score for --- K-Neighbors Classifier ----
0.6644666666666666
---- score for --- XGBClassifier ----
0.9278
---- score for --- CatBoosting Classifier ----
0.932
---- score for --- AdaBoost Classifier ----
0.8923333333333333



In [25]:
report.sort_values('Score')

Unnamed: 0,Model_name,Score
4,K-Neighbors Classifier,0.664467
3,Logistic Regression,0.738667
1,Decision Tree,0.867933
7,AdaBoost Classifier,0.892333
0,Random Forest,0.9106
2,Gradient Boosting,0.920867
5,XGBClassifier,0.9278
6,CatBoosting Classifier,0.932


In [26]:
param_grid = {
    'iterations': [100, 200, 300, 500],  # Number of boosting iterations (trees)
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # Step size at each iteration
    'depth': [4, 6, 8, 10],  # Depth of individual trees
    'l2_leaf_reg': [1, 3, 5, 7],  # L2 regularization term on leaf weights
    'random_strength': [0.5, 1, 1.5],  # Controls randomness in feature selection
    'border_count': [32, 64, 128],  # Number of splits for numerical features
    'bagging_temperature': [0.5, 1.0, 1.5],  # Controls randomness in data sampling
    'early_stopping_rounds': [10, 20, 30] # Number of iterations with no improvement after which training stops
}

In [27]:
# Separate data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [None]:
from sklearn.model_selection import GridSearchCV


catboost=CatBoostClassifier(verbose=False)

catboost_cv=GridSearchCV(catboost,param_grid ,cv=10) 
catboost_cv.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",catboost_cv.best_params_) 
print("accuracy :",catboost_cv.best_score_)