## Model Training

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [2]:
# Import Library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Import Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
#Import Train Test Split
from sklearn.model_selection import train_test_split
# Import Hyperparameter Parameter
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
#Import Evalution Metric
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import warnings

#### Import the CSV Data as Pandas DataFrame

In [16]:
df=pd.read_csv("data\loan_data.csv")

#### Show Top 5 Records

In [11]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


#### Convert Categorical data into Numerical Labeled using Label Encoder

In [5]:
from sklearn.preprocessing import LabelEncoder

#Fit Label_Encoder to the categorical column
label_encoder = LabelEncoder()
df['gender_encoded'] = label_encoder.fit_transform(df['person_gender'])
df['education_encoded'] = label_encoder.fit_transform(df['person_education'])
df['home_own_encoded'] = label_encoder.fit_transform(df['person_home_ownership'])
df['loan_intent_encoded'] = label_encoder.fit_transform(df['loan_intent'])
df['previous_loan_default_encoded'] = label_encoder.fit_transform(df['previous_loan_defaults_on_file'])

In [6]:
#Drop encoded column/features
df=df.drop(['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file'],axis=1)

In [15]:
df.head(2)

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status,gender_encoded,education_encoded,home_own_encoded,loan_intent_encoded,previous_loan_default_encoded
0,22.0,71948.0,0,35000.0,16.02,0.49,3.0,561,1,0,4,3,4,0
1,21.0,12282.0,0,1000.0,11.14,0.08,2.0,504,0,0,3,2,1,1


#### Remove Ouliers from Dataset

In [7]:
def remove_outliers(df):
    # Create a copy of the DataFrame to avoid modifying the original one
    df_cleaned = df.copy()

    for column in df_cleaned[['person_age','person_emp_exp']]:
        if pd.api.types.is_numeric_dtype(df_cleaned[column]):  # Only process numeric columns
            # Calculate Q1 (25th percentile) and Q3 (75th percentile)
            Q1 = np.percentile(df_cleaned[column].dropna(), 25)
            Q3 = np.percentile(df_cleaned[column].dropna(), 75)
            IQR = Q3 - Q1
            
            # Define the lower and upper bounds for outliers
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Remove outliers by keeping only values within the bounds
            df_cleaned = df_cleaned[(df_cleaned[column] >= lower_bound) & (df_cleaned[column] <= upper_bound)]

    return df_cleaned

#df = pd.DataFrame(data)
df_no_outliers = remove_outliers(df)

print("Original DataFrame:\n", df)
print("\nDataFrame after removing outliers:\n", df_no_outliers)

Original DataFrame:
        person_age  person_income  person_emp_exp  loan_amnt  loan_int_rate  \
0            22.0        71948.0               0    35000.0          16.02   
1            21.0        12282.0               0     1000.0          11.14   
2            25.0        12438.0               3     5500.0          12.87   
3            23.0        79753.0               0    35000.0          15.23   
4            24.0        66135.0               1    35000.0          14.27   
...           ...            ...             ...        ...            ...   
44995        27.0        47971.0               6    15000.0          15.66   
44996        37.0        65800.0              17     9000.0          14.07   
44997        33.0        56942.0               7     2771.0          10.02   
44998        29.0        33164.0               4    12000.0          13.23   
44999        24.0        51609.0               1     6665.0          17.05   

       loan_percent_income  cb_person_cred

In [8]:
df_no_outliers.shape

(42306, 14)

In [9]:
X = df.drop(columns=['loan_status'],axis=1)

In [10]:
y = df['loan_status']

In [11]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [13]:
# Initialize classifiers with parameters
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
dt_classifier = DecisionTreeClassifier(random_state=42, class_weight='balanced')
svm_classifier = SVC(random_state=42, class_weight='balanced')
lr_classifier = LogisticRegression(random_state=42, class_weight='balanced')
knn_classifier = KNeighborsClassifier()  # Default parameters
ada_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)  # Default parameters

# Store classifiers in a list
classifiers = [rf_classifier, dt_classifier, svm_classifier, lr_classifier, knn_classifier, ada_classifier]

In [15]:
for model in classifiers:
    print(f"{model}: Evaluation Metrics")
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    #Evaluation Metrics Accuracy_Score
    accuracy=accuracy_score(y_test,y_pred)
    print(f"Accuracy: {accuracy}")
    print("---------------------------------------------------------------------")
    #Evaluation Metrics Precision, Recall and F1 score
    precision = precision_score(y_test, y_pred, average='weighted') 
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    #Evaluation Metrics confusionMatrix
    print("---------------------------------------------------------------------")
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_matrix)
    #Evaluation Metrics Classification Report
    print("---------------------------------------------------------------------")
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

RandomForestClassifier(class_weight='balanced', random_state=42): Evaluation Metrics
Accuracy: 0.9265555555555556
---------------------------------------------------------------------
Precision: 0.925432866951538
Recall: 0.9265555555555556
F1-Score: 0.9241379458431996
---------------------------------------------------------------------
Confusion Matrix:
 [[6818  166]
 [ 495 1521]]
---------------------------------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      6984
           1       0.90      0.75      0.82      2016

    accuracy                           0.93      9000
   macro avg       0.92      0.87      0.89      9000
weighted avg       0.93      0.93      0.92      9000

DecisionTreeClassifier(class_weight='balanced', random_state=42): Evaluation Metrics
Accuracy: 0.8946666666666667
---------------------------------------------------------------------
Precision:

### After Evaluation we can see that Highest accuracy we get with RandomForestClassifier is around 92.6%

## Random Forest Classifier