In [124]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cleaned-dataset-my/cleaned.csv


# Feature selection

In [125]:
import pandas as pd
df = pd.read_csv("/kaggle/input/cleaned-dataset-my/cleaned.csv")

In [126]:
# Remove high dimentional columns

def get_categorical_columns(df):
    """
    Takes a pandas DataFrame as input and identifies categorical columns. Outputs the number of categories in each column
    and drops any columns with more than 20 categories. Returns the updated DataFrame.
    """
    categorical_cols = df.select_dtypes(include=['category', 'object']).columns.tolist()
    for col in categorical_cols:
        if df[col].nunique() > 20:
            df = df.drop(col, axis=1)
            print(f"Dropped column '{col}' because it had more than 20 categories.")
        else:
            print(f"Column '{col}' has {df[col].nunique()} categories.")
    return df

df = get_categorical_columns(df)

Column 'term' has 2 categories.
Column 'grade' has 7 categories.
Dropped column 'sub_grade' because it had more than 20 categories.
Dropped column 'emp_title' because it had more than 20 categories.
Column 'emp_length' has 11 categories.
Column 'home_ownership' has 6 categories.
Column 'verification_status' has 3 categories.
Dropped column 'issue_d' because it had more than 20 categories.
Column 'loan_status' has 6 categories.
Column 'pymnt_plan' has 2 categories.
Dropped column 'url' because it had more than 20 categories.
Column 'purpose' has 14 categories.
Dropped column 'title' because it had more than 20 categories.
Dropped column 'zip_code' because it had more than 20 categories.
Dropped column 'addr_state' because it had more than 20 categories.
Dropped column 'earliest_cr_line' because it had more than 20 categories.
Column 'initial_list_status' has 2 categories.
Column 'last_pymnt_d' has 7 categories.
Column 'next_pymnt_d' has 3 categories.
Dropped column 'last_credit_pull_d' 

In [127]:
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE

def apply_feature_selection(df, target_col, k=15):
    # separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # get numerical and categorical columns
    num_cols = X.select_dtypes(include=['float', 'int']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()

    # select features using SelectKBest and f_classif on numerical columns
    selector = SelectKBest(score_func=f_classif, k=k)
    X_num = X[num_cols]
    X_num_new = selector.fit_transform(X_num, y)
    selected_cols = selector.get_support(indices=True)
    selected_features = X_num.columns[selected_cols]
    X_num_selected = pd.DataFrame(X_num_new, columns=selected_features)

    # concatenate numerical and categorical columns
    X_new = pd.concat([X_num_selected, X[cat_cols]], axis=1)


    # return preprocessed data as DataFrame
    return pd.concat([X_new, y ], axis=1)


In [128]:
df_selected = apply_feature_selection(df, 'loan_status', k=15)

In [129]:
df_selected

Unnamed: 0,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,revol_bal,revol_util,total_pymnt,total_pymnt_inv,...,emp_length,home_ownership,verification_status,pymnt_plan,purpose,initial_list_status,last_pymnt_d,next_pymnt_d,application_type,loan_status
0,6.620000,368.450000,105000.000000,14.050000,0.000000,1.000000,13168.000000,21.600000,8842.800000,8842.800000,...,10+ years,MORTGAGE,Not Verified,n,debt_consolidation,w,Jan-2016,Feb-2016,INDIVIDUAL,Current
1,12.618566,386.423936,67145.999461,18.741797,0.460752,0.445776,12664.435838,56.934101,3596.851282,3595.528095,...,2 years,MORTGAGE,Not Verified,n,debt_consolidation,w,Jan-2016,Feb-2016,INDIVIDUAL,Current
2,12.618566,386.423936,67145.999461,18.741797,0.460752,0.445776,12664.435838,56.934101,3596.851282,3595.528095,...,4 years,RENT,Not Verified,n,debt_consolidation,f,Jan-2016,Feb-2016,INDIVIDUAL,Current
3,9.670000,321.130000,102000.000000,15.550000,2.000000,0.000000,9912.000000,44.400000,7706.810000,7706.810000,...,7 years,MORTGAGE,Not Verified,n,debt_consolidation,f,Jan-2016,Feb-2016,INDIVIDUAL,Current
4,12.618566,386.423936,67145.999461,18.741797,0.460752,0.445776,12664.435838,56.934101,3596.851282,3595.528095,...,6 years,MORTGAGE,Source Verified,n,debt_consolidation,f,Jan-2016,Feb-2016,INDIVIDUAL,Current
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291305,6.490000,306.450000,75000.000000,33.660000,0.000000,0.000000,10529.000000,34.700000,3670.190000,3670.190000,...,1 year,RENT,Verified,n,debt_consolidation,w,Jan-2016,Feb-2016,INDIVIDUAL,Current
291306,12.618566,386.423936,67145.999461,18.741797,0.460752,0.445776,12664.435838,56.934101,3596.851282,3595.528095,...,8 years,RENT,Verified,n,debt_consolidation,f,Jan-2016,Feb-2016,INDIVIDUAL,Current
291307,12.618566,386.423936,67145.999461,18.741797,0.460752,0.445776,12664.435838,56.934101,3596.851282,3595.528095,...,< 1 year,MORTGAGE,Verified,n,credit_card,f,Jan-2016,Feb-2016,INDIVIDUAL,Current
291308,11.990000,797.030000,79000.000000,3.900000,0.000000,1.000000,8621.000000,84.500000,9532.390000,9532.390000,...,10+ years,MORTGAGE,Verified,n,home_improvement,f,Jan-2016,Feb-2016,INDIVIDUAL,Current


In [130]:
from sklearn.preprocessing import StandardScaler

def scale(df, target_col):
    # separate features and target
    X = df.drop(target_col, axis=1)
    y = df[target_col]
    
    # perform feature scaling
    scaler = StandardScaler()
    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
    
    return X , y
    

In [131]:
X , y = scale(df, 'loan_status' )

In [132]:
import pandas as pd

def get_categorical_info(df):
    # Get all the categorical columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # For each categorical column, count the number of unique values and print if more than 50
    for col in cat_cols:
        n_unique = df[col].nunique()
        if n_unique > 50:
            print(f"{col} has {n_unique} unique categories")

get_categorical_info(X)

In [133]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print the shape of the training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (233048, 34)
X_test shape: (58262, 34)
y_train shape: (233048,)
y_test shape: (58262,)


In [134]:
def process_y_train(y_train):
    y_train = y_train.copy()
    default_statuses = {'Late (16-30 days)', 'Late (31-120 days)', 'Default'}
    y_train.replace(default_statuses, 'Default', inplace=True)
    y_train.replace(y_train[y_train != 'Default'].unique(), 'Not Default', inplace=True)
    return y_train


y_train = process_y_train(y_train)

In [135]:
def process_y_train(y_train):
    y_train = y_train.copy()
    default_statuses = {'Late (16-30 days)', 'Late (31-120 days)', 'Default'}
    y_train.replace(default_statuses, 'Default', inplace=True)
    y_train.replace(y_train[y_train != 'Default'].unique(), 'Not Default', inplace=True)
    return y_train


y_test = process_y_train(y_test)

In [136]:
# One hot encoding
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)


In [137]:
def remove_additional_columns(X_train, X_test):
    # Check if number of columns in X_train and X_test are equal
    if X_train.shape[1] != X_test.shape[1]:
        # Find additional columns in X_train
        extra_train_columns = set(X_train.columns) - set(X_test.columns)
        # Remove additional columns from X_train
        X_train = X_train.drop(columns=extra_train_columns)
        
        # Find additional columns in X_test
        extra_test_columns = set(X_test.columns) - set(X_train.columns)
        # Remove additional columns from X_test
        X_test = X_test.drop(columns=extra_test_columns)
        
    return X_train, X_test

X_train, X_test = remove_additional_columns(X_train, X_test)


In [138]:
y_train

Unnamed: 0,Default,Not Default
245294,0,1
264277,0,1
148436,0,1
198460,0,1
149291,0,1
...,...,...
119879,0,1
259178,0,1
131932,0,1
146867,0,1


## Model Building

In [139]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def build_classification_models(X_train, y_train, X_test, y_test):
    models = {}
    
    # Random Forest Classifier
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    models['Random Forest'] = rf
    
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    models['Decision Tree'] = dt
    
    # Evaluate models on testing data
    scores = {}
    for name, model in models.items():
        y_pred = model.predict(X_test)
        scores[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, average='weighted'),
            'Recall': recall_score(y_test, y_pred, average='weighted'),
            'F1': f1_score(y_test, y_pred, average='weighted')
        }
        
    # sort results by F1 score in ascending order
    sorted_scores = sorted(scores.items(), key=lambda x: x[1]['F1'])
    
    return sorted_scores, models


In [140]:
sorted_scores, models=  build_classification_models(X_train, y_train, X_test, y_test)

In [141]:
sorted_scores

[('Decision Tree',
  {'Accuracy': 0.9865778723696406,
   'Precision': 0.9869011703870906,
   'Recall': 0.9865778723696406,
   'F1': 0.9865612493139816}),
 ('Random Forest',
  {'Accuracy': 0.9910919638872678,
   'Precision': 0.9909633525363641,
   'Recall': 0.9910919638872678,
   'F1': 0.9902750444861365})]

#### The best model is random forest

In [146]:
models

{'Random Forest': RandomForestClassifier(),
 'Decision Tree': DecisionTreeClassifier()}

# Model optimization

In [142]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}


def build_best_rf_model(X_train, y_train, X_test, y_test, param_grid):
    # Initialize a random forest classifier object
    rf = RandomForestClassifier()
    
    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(rf, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    
    # Get the best hyperparameters and their corresponding score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    
    # Train a new random forest model using the best hyperparameters
    best_rf_model = RandomForestClassifier(**best_params)
    best_rf_model.fit(X_train, y_train)
    
    # Evaluate the best model on the testing data
    y_pred = best_rf_model.predict(X_test)
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1': f1_score(y_test, y_pred, average='weighted')
    }
    
    return best_params, best_score, best_rf_model, scores


In [143]:
# since , our model accuracy is above 90%, no need to hyper perameter optimization
'''best_params, best_score, best_rf_model, scores= build_best_rf_model(X_train, y_train, X_test, y_test, param_grid)'''

'best_params, best_score, best_rf_model, scores= build_best_rf_model(X_train, y_train, X_test, y_test, param_grid)'

In [144]:
import joblib

In [147]:
# Save the trained model
joblib.dump(models['Random Forest'], 'random_forest_model.joblib')

['random_forest_model.joblib']