# Loan Approval Prediction

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report

In [3]:
# Load dataset
data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Data preparation

In [11]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

In [13]:
# Categorical features
categorical_columns = X.select_dtypes(include=['object']).columns

# Numerical features
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop('id')

## Pipeline

Already did EDA in a separate notebook.
Key points:
- No missing values
- Categorical columns:

		'person_home_ownership',
		'loan_intent',
		'loan_grade',
		'cb_person_default_on_file'
- Numerical columns:
	
		'person_age',
		'person_income',
		'person_emp_length',
		'loan_amnt',
		'loan_int_rate',
		'loan_percent_income',
		'cb_person_cred_hist_length'

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [14]:
preprocessor = ColumnTransformer(
	transformers=[
		('num', StandardScaler(), numerical_columns),
		('cat', OneHotEncoder(), categorical_columns)
	])

In [15]:
logistic_regression = Pipeline(steps=[
	('preprocessor', preprocessor),
	('classifier', LogisticRegression())
])

random_forest = Pipeline(steps=[
	('preprocessor', preprocessor),
	('classifier', RandomForestClassifier())
])

gradient_boosting = Pipeline(steps=[
	('preprocessor', preprocessor),
	('classifier', GradientBoostingClassifier())
])

## Model Training

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
logistic_regression.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
gradient_boosting.fit(X_train, y_train)

In [19]:
log_preds = logistic_regression.predict(X_test)
rf_preds = random_forest.predict(X_test)
gb_preds = gradient_boosting.predict(X_test)

## Evaluation

In [20]:
def evaluate_model(y_true, y_pred):
	roc_auc = roc_auc_score(y_true, y_pred)
	accuracy = accuracy_score(y_true, y_pred)
	confusion = confusion_matrix(y_true, y_pred)
	classification = classification_report(y_true, y_pred)
	return roc_auc, accuracy, confusion, classification

In [21]:
# Evaluate models

logistic_regression_results = evaluate_model(y_test, log_preds)
random_forest_results = evaluate_model(y_test, rf_preds)
gradient_boosting_results = evaluate_model(y_test, gb_preds)

In [24]:
# Output results
models = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']
results = [logistic_regression_results, random_forest_results, gradient_boosting_results]

for model, result in zip(models, results):
	print(f'{model} results:')
	print(f'ROC AUC: {result[0]}')
	print(f'Accuracy: {result[1]}')
	print(f'Confusion Matrix:')
	print(result[2])
	print(f'Classification Report:')
	print(result[3])
	print('\n')
	print('-'*50)

Logistic Regression results:
ROC AUC: 0.7497456054373237
Accuracy: 0.9115866655298832
Confusion Matrix:
[[9830  257]
 [ 780  862]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95     10087
           1       0.77      0.52      0.62      1642

    accuracy                           0.91     11729
   macro avg       0.85      0.75      0.79     11729
weighted avg       0.90      0.91      0.90     11729



--------------------------------------------------
Random Forest results:
ROC AUC: 0.8554799191008989
Accuracy: 0.9522550942109301
Confusion Matrix:
[[9985  102]
 [ 458 1184]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     10087
           1       0.92      0.72      0.81      1642

    accuracy                           0.95     11729
   macro avg       0.94      0.86      0.89     11729
weighted avg       0.95      0.95      0.95   

## Tuning hyperparameters

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

### Logistic Regression

In [None]:
# Parameter grid for Logistic Regression

param_grid = {
	'classifier__penalty': [None, 'l2', 'elasticnet'],
	'classifier__C': np.logspace(0, 4, 10),
	'classifier__solver': ['saga'],
	'classifier__l1_ratio': np.linspace(0, 1, 5),
	'classifier__max_iter': [1000]
}

logistic_regression_cv = GridSearchCV(logistic_regression, param_grid, cv=3,
									  n_jobs=-1, scoring='roc_auc', verbose=1,
									  return_train_score=True, error_score='raise')
logistic_regression_cv.fit(X_train, y_train)

print(f'Best parameters for Logistic Regression: {logistic_regression_cv.best_params_}')
print(f'Best score for Logistic Regression: {logistic_regression_cv.best_score_}')