# DS Workshop 3 - 29th November 2024
## Team: Anna King

In this notebook I attempt to improve on the model via:
* Alternative models
* Feature importance
* Ensamble
* Hyperparameter tuning
* Outlier removal
* Additional features

In [8]:
!pip install matplotlib seaborn scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ----------------------- ---------------- 6.6/11.1 MB 33.6 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 31.7 MB/s eta 0:00:00
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.15.2-cp311-cp311-win_amd64.whl (41.2 MB)
   ---------------------------------------- 0.0/41.2 MB ? eta -:--:--
   ------- -------------------------------- 7.3/41.2 MB 34.9 MB/s eta 0:00:01
  

In [9]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [10]:
import pandas as pd
import os

data_folder = os.getcwd().split("loan_approval_prediction")[0] + "loan_approval_prediction\\data\\"
df = pd.read_csv(data_folder + 'credit_risk_dataset.csv')  


# Data Cleaning

## Remove Duplicates 

In [11]:
df = pd.read_csv(data_folder + 'credit_risk_dataset.csv')  

# Identify duplicate rows
duplicates = df.duplicated(keep='first')
num_duplicates = duplicates.sum()

# Remove duplicate rows
df = df.drop_duplicates(keep='first')

print('no. duplicates: ', num_duplicates)

df['person_emp_length'] = df['person_emp_length'].fillna(-1)
df['loan_int_rate'] = df['loan_int_rate'].fillna(-1)

# We're also adding 'missing indicator' fields to explicity call out rows with missing data:
df['missing_emp_length'] = (df['person_emp_length'] == -1).astype(int)
df['missing_loan_rate'] = (df['loan_int_rate'] == -1).astype(int)

df['person_age'] = df['person_age'].astype('uint8')
df['person_income'] = df['person_income'].astype('uint32')
df['loan_amnt'] = df['loan_amnt'].astype('uint32')
df['loan_int_rate'] = df['loan_int_rate'].astype('float32')
df['loan_status'] = df['loan_status'].astype('uint8')
df['loan_percent_income'] = df['loan_percent_income'].astype('float32')
df['cb_person_cred_hist_length'] = df['cb_person_cred_hist_length'].astype('uint8')
df['missing_emp_length'] = df['missing_emp_length'].astype('uint8')
df['missing_loan_rate'] = df['missing_loan_rate'].astype('uint8')

# We are using min-max scaling for continuous variables as the underlying data is not normally distributed
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
df[['person_age', 'person_income','loan_amnt']] = min_max_scaler.fit_transform(df[['person_age','person_income','loan_amnt']])
df.sample(5)

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse_output=False)

encoded_data = encoder.fit_transform(df[['person_home_ownership','loan_intent','loan_grade']])

encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['person_home_ownership','loan_intent','loan_grade']))
df_final = pd.concat([df.drop(['person_home_ownership','loan_intent','loan_grade'], axis=1).reset_index(drop=True) , encoded_df.reset_index(drop=True) ], axis=1)

df_final.sample(5)

# Converting Ys and Ns to 1s and 0s
df_final['cb_person_default_on_file'] = df_final['cb_person_default_on_file'].replace({'Y': 1, 'N': 0})
df_final.head()

# Mark NaN values in person_emp_length and loan_int_rate

df_final['missing_emp_length'] = df_final['person_emp_length'].isnull().astype(int)
df_final['missing_int_rate'] = df_final['loan_int_rate'].isnull().astype(int)

# Also mark loans with 0 interest as a missing value
df_final.loc[df_final['loan_int_rate'] == 0, 'missing_int_rate'] = 1

# Let NaNs equal -1 to indicate missing value
df_final.loc[df_final['person_emp_length'].isnull(), 'person_emp_length'] = -1
df_final.loc[df_final['loan_int_rate'].isnull(), 'loan_int_rate'] = -1

df_final["loan_rejected"] =  1 - df_final['loan_status']
df_final = df_final.drop(['loan_status'], axis=1).reset_index(drop=True)

no. duplicates:  165


  df_final['cb_person_default_on_file'] = df_final['cb_person_default_on_file'].replace({'Y': 1, 'N': 0})


In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Prepare the data (features and labels)
X = df_final.drop('loan_rejected', axis=1)  # Features
y = df_final['loan_rejected']  # Target label

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
!pip install shap

^C


Collecting shap
  Downloading shap-0.47.0-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Using cached slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Collecting numba>=0.54 (from shap)
  Downloading numba-0.61.0-cp311-cp311-win_amd64.whl.metadata (2.8 kB)
Collecting cloudpickle (from shap)
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.54->shap)
  Downloading llvmlite-0.44.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting numpy (from shap)
  Downloading numpy-2.1.3-cp311-cp311-win_amd64.whl.metadata (60 kB)
Downloading shap-0.47.0-cp311-cp311-win_amd64.whl (530 kB)
   ---------------------------------------- 0.0/530.3 kB ? eta -:--:--
   ---------------------------------------- 530.3/530.3 kB 8.6 MB/s eta 0:00:00
Using cached slicer-0.0.8-py3-none-any.whl (15 kB)
Downloading numba-0.61.0-cp311-cp311-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8

  You can safely remove it manually.
  You can safely remove it manually.


## Feature Importance

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Example: Train a Random Forest model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Compute permutation importance
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, scoring='accuracy')

# Accessing the results
importances_mean = result.importances_mean
importances_std = result.importances_std
importances = result.importances

# Create a DataFrame for better readability
import pandas as pd
feature_importances = pd.DataFrame({
    'feature': X_test.columns,
    'Importance Mean': importances_mean,
    'Importance Std': importances_std
}).sort_values(by='Importance Mean', ascending=False)

print(round(feature_importances, 2))

                        feature  Importance Mean  Importance Std
5           loan_percent_income             0.09             0.0
12   person_home_ownership_RENT             0.05             0.0
1                 person_income             0.04             0.0
20                 loan_grade_D             0.04             0.0
4                 loan_int_rate             0.03             0.0
14  loan_intent_HOMEIMPROVEMENT             0.02             0.0
2             person_emp_length             0.01             0.0
13        loan_intent_EDUCATION             0.01             0.0
16         loan_intent_PERSONAL             0.01             0.0
17          loan_intent_VENTURE             0.01             0.0
21                 loan_grade_E             0.01             0.0
19                 loan_grade_C             0.01             0.0
15          loan_intent_MEDICAL             0.01             0.0
11    person_home_ownership_OWN             0.00             0.0
0                    pers

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# List of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
    ("LightGBM", LGBMClassifier())
]

# Initialize an empty list to store the results
results = []
feature_importance = pd.DataFrame()
# Iterate through models
for model_name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, scoring='accuracy')

    # Accessing the results
    importances_mean = result.importances_mean
    importances_std = result.importances_std
    importances = result.importances

    # Create a DataFrame for better readability
    import pandas as pd
    model_feature_importances = pd.DataFrame({
        'feature': X_test.columns,
        model_name + '_mean': importances_mean,
        model_name + '_std': importances_std
        
    })

    relevent_features = model_feature_importances[(model_feature_importances[model_name + '_mean'] > 0.005)]["feature"].tolist()

    # Append results to the list
    results.append({
        "Model": model_name,
        "Precision": precision,
        "Recall": recall,
        "f1_score": f1,
        "Accuracy": accuracy,
        "Relevant_features": relevent_features
    })

    if feature_importance.shape[0] > 0:
        feature_importance = feature_importance.merge(model_feature_importances, "outer", on="feature")
    else:
        feature_importance = model_feature_importances.copy()

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 17758, number of negative: 4933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000530 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 937
[LightGBM] [Info] Number of data points in the train set: 22691, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.782601 -> initscore=1.280889
[LightGBM] [Info] Start training from score 1.280889
                 Model  Precision    Recall  f1_score  Accuracy  \
0  Logistic Regression   0.880799  0.955741  0.916741  0.864884   
1                  SVM   0.839908  0.967631  0.899257  0.831260   
2        Decision Tree   0.929997  0.923240  0.926606  0.886170   
3        Random Forest   0.925985  0.990091  0.956966  0.930694   
4              XGBoost   0.931365  0.987845  0.958774  0.933882   
5             LightGBM   0.926250  0.993923 

In [None]:
!pip install Jinja2



In [None]:
import numpy as np
import seaborn as sns
cm = sns.light_palette("yellow", as_cmap=True)

def style_negative(v, props=''):
    return props if v < 0 else None

def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

mean_cols = [col for col in feature_importance.columns if 'mean' in col]
feat_mean_importance = round(feature_importance.set_index("feature"), 3)[mean_cols]

s2 = feat_mean_importance.style.text_gradient(cmap=cm)\
    .map(lambda v: 'opacity: 20%;' if (v < 0.01) and (v > -0.01) else None)\
        .format(precision=2)

s2

Unnamed: 0_level_0,Logistic Regression_mean,SVM_mean,Decision Tree_mean,Random Forest_mean,XGBoost_mean,LightGBM_mean
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cb_person_cred_hist_length,0.0,0.0,0.0,-0.0,0.0,0.0
cb_person_default_on_file,-0.0,0.0,0.0,0.0,-0.0,-0.0
loan_amnt,0.01,0.0,0.02,-0.0,0.01,0.0
loan_grade_B,-0.0,-0.0,0.0,-0.0,0.0,0.0
loan_grade_C,-0.0,0.0,0.03,0.01,0.01,0.01
loan_grade_D,0.03,0.02,0.04,0.04,0.04,0.03
loan_grade_E,0.01,0.0,0.01,0.01,0.01,0.01
loan_grade_F,0.0,-0.0,0.0,0.0,0.0,0.0
loan_grade_G,0.0,0.0,0.0,0.0,0.0,0.0
loan_int_rate,0.03,0.03,0.07,0.03,0.04,0.05


In [None]:
results_df

Unnamed: 0,Model,Precision,Recall,f1_score,Accuracy,Relevant_features
0,Logistic Regression,0.880799,0.955741,0.916741,0.864884,"[loan_amnt, loan_int_rate, loan_percent_income..."
1,SVM,0.839908,0.967631,0.899257,0.83126,"[person_emp_length, loan_int_rate, loan_grade_D]"
2,Decision Tree,0.929997,0.92324,0.926606,0.88617,"[person_age, person_income, person_emp_length,..."
3,Random Forest,0.925985,0.990091,0.956966,0.930694,"[person_income, person_emp_length, loan_int_ra..."
4,XGBoost,0.931365,0.987845,0.958774,0.933882,"[person_age, person_income, person_emp_length,..."
5,LightGBM,0.92625,0.993923,0.958894,0.933676,"[person_age, person_income, person_emp_length,..."


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# List of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
    ("LightGBM", LGBMClassifier())
]

# Initialize an empty list to store the results
results = []
feature_importance = pd.DataFrame()
# Iterate through models
for model_name, model in models:
    # Train the model
    model.fit(X_train[results_df[results_df["Model"] == model_name]["Relevant_features"].values[0]], y_train)
    
    # Make predictions
    y_pred = model.predict(X_test[results_df[results_df["Model"] == model_name]["Relevant_features"].values[0]])
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append results to the list
    results.append({
        "Model": model_name,
        "Precision": precision,
        "Recall": recall,
        "f1_score": f1,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame
results_reduced_features = pd.DataFrame(results)

# Display the results
print(results_reduced_features)

[LightGBM] [Info] Number of positive: 17758, number of negative: 4933
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000497 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 665
[LightGBM] [Info] Number of data points in the train set: 22691, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.782601 -> initscore=1.280889
[LightGBM] [Info] Start training from score 1.280889
                 Model  Precision    Recall  f1_score  Accuracy
0  Logistic Regression   0.873765  0.958383  0.914120  0.859846
1                  SVM   0.822712  0.961950  0.886899  0.809049
2        Decision Tree   0.930631  0.925221  0.927918  0.888123
3        Random Forest   0.924360  0.988109  0.955172  0.927815
4              XGBoost   0.930450  0.986260  0.957542  0.931928
5             LightGBM   0.925188  0.993394  0.958078  0.932339


In [None]:
results_df

Unnamed: 0,Model,Precision,Recall,f1_score,Accuracy,Relevant_features
0,Logistic Regression,0.880799,0.955741,0.916741,0.864884,"[loan_amnt, loan_int_rate, loan_percent_income..."
1,SVM,0.839908,0.967631,0.899257,0.83126,"[person_emp_length, loan_int_rate, loan_grade_D]"
2,Decision Tree,0.929997,0.92324,0.926606,0.88617,"[person_age, person_income, person_emp_length,..."
3,Random Forest,0.925985,0.990091,0.956966,0.930694,"[person_income, person_emp_length, loan_int_ra..."
4,XGBoost,0.931365,0.987845,0.958774,0.933882,"[person_age, person_income, person_emp_length,..."
5,LightGBM,0.92625,0.993923,0.958894,0.933676,"[person_age, person_income, person_emp_length,..."


In [None]:
results_reduced_features

Unnamed: 0,Model,Precision,Recall,f1_score,Accuracy
0,Logistic Regression,0.873765,0.958383,0.91412,0.859846
1,SVM,0.822712,0.96195,0.886899,0.809049
2,Decision Tree,0.930631,0.925221,0.927918,0.888123
3,Random Forest,0.92436,0.988109,0.955172,0.927815
4,XGBoost,0.93045,0.98626,0.957542,0.931928
5,LightGBM,0.925188,0.993394,0.958078,0.932339


## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Define base models
models = [
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ("LightGBM", LGBMClassifier(random_state=42))
]

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=models, voting='soft')  # 'soft' uses predicted probabilities

# Train on the training set
voting_clf.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Ensemble Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 17758, number of negative: 4933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 937
[LightGBM] [Info] Number of data points in the train set: 22691, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.782601 -> initscore=1.280889
[LightGBM] [Info] Start training from score 1.280889
Voting Ensemble Accuracy: 0.9351


## Hyper parameter tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

# Define parameter grids for each model
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

param_grid_lgbm = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [-1, 10, 20],
    'num_leaves': [31, 50, 100],
    'subsample': [0.8, 1.0]
}

# Create models and parameter grids
models = {
    'Random Forest': (RandomForestClassifier(random_state=42), param_grid_rf),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), param_grid_xgb),
    'LightGBM': (LGBMClassifier(random_state=42), param_grid_lgbm)
}

# Perform RandomizedSearchCV for each model
best_estimators = {}
results = []

for model_name, (model, param_grid) in models.items():
    print(f"Running RandomizedSearchCV for {model_name}...")
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=20,  # Number of random combinations
        cv=3,       # 3-fold cross-validation
        scoring='accuracy',
        random_state=42,
        n_jobs=-1   # Use all CPU cores
    )
    search.fit(X_train, y_train)
    best_estimators[model_name] = search.best_estimator_
    
    # Evaluate on the test set
    y_pred = search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({
        'Model': model_name,
        'Best Params': search.best_params_,
        'Test Accuracy': accuracy
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)
print("\nHyperparameter Search Results:")
print(results_df)

Running RandomizedSearchCV for Random Forest...
Running RandomizedSearchCV for XGBoost...


Parameters: { "use_label_encoder" } are not used.



Running RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Number of positive: 17758, number of negative: 4933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 937
[LightGBM] [Info] Number of data points in the train set: 22691, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.782601 -> initscore=1.280889
[LightGBM] [Info] Start training from score 1.280889

Hyperparameter Search Results:
           Model                                        Best Params  \
0  Random Forest  {'n_estimators': 200, 'min_samples_split': 2, ...   
1        XGBoost  {'subsample': 1.0, 'n_estimators': 200, 'max_d...   
2       LightGBM  {'subsample': 0.8, 'num_leaves': 50, 'n_estima...   

   Test Accuracy  
0       0.930900  
1       0.934499  
2       0.933779  


In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Define base models
models = [
    ("Random Forest", RandomForestClassifier(max_depth=30, n_estimators=200, random_state=42)),
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss', subsample= 1.0, n_estimators= 200, max_depth= 6, learning_rate=0.1, colsample_bytree= 1.0, random_state=42)),
    ("LightGBM", LGBMClassifier(max_depth=20, n_estimators=200, num_leaves=50, random_state=42, subsample=0.8))
]

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=models, voting='soft')  # 'soft' uses predicted probabilities

# Train on the training set
voting_clf.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Ensemble Accuracy: {accuracy:.2f}")

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 17758, number of negative: 4933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000571 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 937
[LightGBM] [Info] Number of data points in the train set: 22691, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.782601 -> initscore=1.280889
[LightGBM] [Info] Start training from score 1.280889
Voting Ensemble Accuracy: 0.94


In [None]:
accuracy

0.935012853470437

## Additional features

In [None]:
df = pd.read_csv(data_folder + 'credit_risk_dataset.csv')  

# Identify duplicate rows
duplicates = df.duplicated(keep='first')
num_duplicates = duplicates.sum()

# Remove duplicate rows
df = df.drop_duplicates(keep='first')

print('no. duplicates: ', num_duplicates)

df['person_emp_length'] = df['person_emp_length'].fillna(-1)
df['loan_int_rate'] = df['loan_int_rate'].fillna(-1)

# We're also adding 'missing indicator' fields to explicity call out rows with missing data:
df['missing_emp_length'] = (df['person_emp_length'] == -1).astype(int)
df['missing_loan_rate'] = (df['loan_int_rate'] == -1).astype(int)

df['person_age'] = df['person_age'].astype('uint8')
df['person_income'] = df['person_income'].astype('uint32')
df['loan_amnt'] = df['loan_amnt'].astype('uint32')
df['loan_int_rate'] = df['loan_int_rate'].astype('float32')
df['loan_status'] = df['loan_status'].astype('uint8')
df['loan_percent_income'] = df['loan_percent_income'].astype('float32')
df['cb_person_cred_hist_length'] = df['cb_person_cred_hist_length'].astype('uint8')
df['missing_emp_length'] = df['missing_emp_length'].astype('uint8')
df['missing_loan_rate'] = df['missing_loan_rate'].astype('uint8')


no. duplicates:  165


In [None]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,missing_emp_length,missing_loan_rate
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,0,0
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,0,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,0,0
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2,0,0
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4,0,0


In [None]:
import numpy as np
df = df.drop(df[df['person_age'] > 100].index)
df = df.drop(df[df['person_emp_length'] > 100].index)

df['loantoincome'] = (df['loan_amnt'] / df['person_income']) - df['loan_percent_income']
df['income'] = np.log(df['person_income'])

# # df["loan_percent_incometoincome"] = round((df["loan_percent_income"] / df["person_income"]), 8)
# df['person_age_to_person_income'] = round(df['person_age'] / df['person_income'], 8)
# df['person_emp_length_to_person_age'] = round(df['person_emp_length'] / df['person_age'], 8)
# df['loan_int_rate_to_loan_amnt'] = round(df['loan_int_rate'] / df['loan_amnt'], 8)

In [None]:
# We are using min-max scaling for continuous variables as the underlying data is not normally distributed
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()
df[['person_age', 'person_income','loan_amnt']] = min_max_scaler.fit_transform(df[['person_age','person_income','loan_amnt']])
df.sample(5)

from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse_output=False)

encoded_data = encoder.fit_transform(df[['person_home_ownership','loan_intent','loan_grade']])

encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['person_home_ownership','loan_intent','loan_grade']))
df_final = pd.concat([df.drop(['person_home_ownership','loan_intent','loan_grade'], axis=1).reset_index(drop=True) , encoded_df.reset_index(drop=True) ], axis=1)

df_final.sample(5)

# Converting Ys and Ns to 1s and 0s
df_final['cb_person_default_on_file'] = df_final['cb_person_default_on_file'].replace({'Y': 1, 'N': 0})
df_final.head()

# Mark NaN values in person_emp_length and loan_int_rate

df_final['missing_emp_length'] = df_final['person_emp_length'].isnull().astype(int)
df_final['missing_int_rate'] = df_final['loan_int_rate'].isnull().astype(int)

# Also mark loans with 0 interest as a missing value
df_final.loc[df_final['loan_int_rate'] == 0, 'missing_int_rate'] = 1

# Let NaNs equal -1 to indicate missing value
df_final.loc[df_final['person_emp_length'].isnull(), 'person_emp_length'] = -1
df_final.loc[df_final['loan_int_rate'].isnull(), 'loan_int_rate'] = -1

df_final["loan_rejected"] =  1 - df_final['loan_status']
df_final = df_final.drop(['loan_status'], axis=1).reset_index(drop=True)

  df_final['cb_person_default_on_file'] = df_final['cb_person_default_on_file'].replace({'Y': 1, 'N': 0})


In [None]:
# Step 1: Prepare the data (features and labels)
X = df_final.drop('loan_rejected', axis=1)  # Features
y = df_final['loan_rejected']  # Target label

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Define base models
models = [
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ("LightGBM", LGBMClassifier(random_state=42))
]

# Create Voting Classifier
voting_clf = VotingClassifier(estimators=models, voting='soft')  # 'soft' uses predicted probabilities

# Train on the training set
voting_clf.fit(X_train, y_train)

# Evaluate the ensemble model
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Ensemble Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 17716, number of negative: 4970
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2189
[LightGBM] [Info] Number of data points in the train set: 22686, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.780922 -> initscore=1.271048
[LightGBM] [Info] Start training from score 1.271048
Voting Ensemble Accuracy: 0.9399


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# List of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
    ("LightGBM", LGBMClassifier())
]

# Initialize an empty list to store the results
results = []
feature_importance = pd.DataFrame()
# Iterate through models
for model_name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Append results to the list
    results.append({
        "Model": model_name,
        "Precision": precision,
        "Recall": recall,
        "f1_score": f1,
        "Accuracy": accuracy
    })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 17716, number of negative: 4970
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2189
[LightGBM] [Info] Number of data points in the train set: 22686, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.780922 -> initscore=1.271048
[LightGBM] [Info] Start training from score 1.271048
                 Model  Precision    Recall  f1_score  Accuracy
0  Logistic Regression   0.883633  0.950559  0.915875  0.863417
1                  SVM   0.844737  0.970809  0.903396  0.837602
2        Decision Tree   0.937517  0.929257  0.933369  0.896225
3        Random Forest   0.933308  0.988166  0.959954  0.935514
4              XGBoost   0.935837  0.989612  0.961974  0.938805
5             LightGBM   0.932699  0.993162  0.961982  0.938599


In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# List of models to evaluate
models = [
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("XGBoost", XGBClassifier()),
    ("LightGBM", LGBMClassifier())
]

# Initialize an empty list to store the results
results = []
feature_importance = pd.DataFrame()
# Iterate through models
for model_name, model in models:
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42, scoring='accuracy')

    # Accessing the results
    importances_mean = result.importances_mean
    importances_std = result.importances_std
    importances = result.importances

    # Create a DataFrame for better readability
    import pandas as pd
    model_feature_importances = pd.DataFrame({
        'feature': X_test.columns,
        model_name + '_mean': importances_mean,
        model_name + '_std': importances_std
        
    })

    relevent_features = model_feature_importances[(model_feature_importances[model_name + '_mean'] > 0.005)]["feature"].tolist()

    # Append results to the list
    results.append({
        "Model": model_name,
        "Precision": precision,
        "Recall": recall,
        "f1_score": f1,
        "Accuracy": accuracy,
        "Relevant_features": relevent_features
    })

    if feature_importance.shape[0] > 0:
        feature_importance = feature_importance.merge(model_feature_importances, "outer", on="feature")
    else:
        feature_importance = model_feature_importances.copy()

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Display the results
print(results_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KeyboardInterrupt: 

In [None]:
import numpy as np
import seaborn as sns
cm = sns.light_palette("yellow", as_cmap=True)

def style_negative(v, props=''):
    return props if v < 0 else None

def highlight_max(s, props=''):
    return np.where(s == np.nanmax(s.values), props, '')

mean_cols = [col for col in feature_importance.columns if 'mean' in col]
feat_mean_importance = round(feature_importance.set_index("feature"), 3)[mean_cols]

s2 = feat_mean_importance.style.text_gradient(cmap=cm)\
    .map(lambda v: 'opacity: 20%;' if (v < 0.01) and (v > -0.01) else None)\
        .format(precision=2)

In [None]:
s2

Unnamed: 0_level_0,Logistic Regression_mean,SVM_mean,Decision Tree_mean,Random Forest_mean,XGBoost_mean,LightGBM_mean
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
cb_person_cred_hist_length,-0.0,-0.0,0.0,-0.0,0.0,0.0
cb_person_default_on_file,-0.0,-0.0,0.04,-0.0,0.0,-0.0
income,0.01,0.01,0.07,0.0,0.0,0.0
loan_amnt,0.0,0.0,0.03,-0.0,0.0,0.0
loan_grade_B,0.0,-0.0,0.0,0.0,-0.0,0.0
loan_grade_C,0.0,-0.0,0.05,0.0,0.01,0.01
loan_grade_D,0.02,0.01,0.04,0.03,0.03,0.03
loan_grade_E,0.01,-0.0,0.01,0.0,0.01,0.0
loan_grade_F,0.0,-0.0,0.0,-0.0,0.0,0.0
loan_grade_G,0.0,0.0,0.0,0.0,0.0,0.0
