<a href="https://colab.research.google.com/github/HimankPatidar/Advance_House_Price_Prediction/blob/main/Credit%20Risk%20Modelling%20-%20Loan%20Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'loan-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5539054%2F9166966%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240814%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240814T102412Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2e66647fac00143b6c27742c61f29d51e2bc76c2fd6954c64f1db761868cd4ef3ba3e1da072f50181b2090272ddaf8b73972569faa4a50aaca6cca03b8d09770a6d86507555d2338761455d35e205969802943ccb4ec3a50fd4096e5be9e04376da0e2d45c043f9b8cd940ebefe9281b6bf2a518ec189549c8c7c067979b971c0cee2dfd06c411a96167ef9dcc70ceeddfdb4bc2e48bfc38fb13803ad3445ce1d80d1b92da1b85d64accf680f668d23ad5383acbaa21fb4d5ac1b634547823224d856dce71b731533d64764b3fe287006a3b7eeca9d325adee620aff44aab55913d878a2eba4e6ed5681a031bc64b5af0479b1e92a0f42a40215ef9a095dd556'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle-style
# directories exist (requested mode 0o777; an existing directory is fine).
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Expose the Kaggle directories as ../input and ../working. A link that
# already exists (e.g. from an earlier run of this cell) is left untouched.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry listed in
# DATA_SOURCE_MAPPING and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The server may omit Content-Length; in that case only the byte
            # counter advances and the progress bar stays empty.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_length) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every buffered byte is on disk before the archive is
            # re-read (tarfile opens the temp file by name).
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would overwrite the module and break the next iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download links are time-limited, so an HTTP error most
        # likely means the link has expired.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Credit Risk Modelling - Loan Classification


# Import relevant Libraries/Modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import warnings
warnings.filterwarnings("ignore")

# Load the Dataset

In [None]:
# Read the loan dataset CSV from the Kaggle input directory and preview
# the first rows.
DATASET_CSV = "/kaggle/input/loan-detection/loan_detection.csv"
df = pd.read_csv(DATASET_CSV)
df.head()

# Basic EDA

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
df.columns

# Handling Missing Values

In [None]:
df.isnull().sum()

In [None]:
# Percentage of missing values per column, rounded to two decimals.
(df.isnull().mean() * 100).round(2)


In [None]:
df.duplicated().sum()

In [None]:
df[df.duplicated()]

In [None]:
# Histogram plus KDE of the `age` column; the legend entry shows its skewness.
# `sns.distplot` is deprecated, so the equivalent `histplot` call is used
# (density-normalised bars with a KDE overlay, as distplot drew them).
sns.histplot(df['age'], kde=True, stat='density', color='r', label=df.age.skew())
plt.legend()


In [None]:
df.age.mean()

In [None]:
df.age.median()

In [None]:
# Fill missing values column by column with that column's mean.
for col in df.columns:
    if df[col].isna().sum() == 0:
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        # A mean does not exist for non-numeric data; report instead of crashing.
        print(f' {col} : skipped (non-numeric column, no mean available)')
        continue
    col_mean = df[col].mean()
    print(f' {col} : {col_mean}')
    # Assign the result back: `df[col].fillna(..., inplace=True)` works on a
    # temporary object and is not guaranteed to update `df` itself.
    df[col] = df[col].fillna(col_mean)

# Duplicate Data

In [None]:
# Remove exact duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [None]:
df.duplicated().sum()

In [None]:
df[df.duplicated()]

In [None]:
df.shape

# Outliers or Anomalies

## Using IQR - Inter Quartile Range

In [None]:
# Quartiles and inter-quartile range for every numeric column.
# numeric_only=True keeps this working when the frame contains non-numeric
# columns (recent pandas versions no longer drop them silently).
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
IQR

In [None]:
# Show the lower and upper 1.5 * IQR fences, separated by a blank line.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
print(lower_fence)
print()
print(upper_fence)

In [None]:
# Outlier fences computed from the current Q1, Q3 and IQR.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR


In [None]:
upper_bound

In [None]:
df.shape

In [None]:

# First and third quartile of the `age` column only.
age_values = df["age"]
Q1 = age_values.quantile(0.25)
Q3 = age_values.quantile(0.75)


In [None]:
IQR = Q3 - Q1
IQR

In [None]:
# Values further than 1.5 * IQR outside the quartiles are treated as outliers.
iqr_margin = 1.5 * IQR
lower_bound = Q1 - iqr_margin
upper_bound = Q3 + iqr_margin


In [None]:
lower_bound


In [None]:
upper_bound


In [None]:
# Keep only the rows whose age lies within the fences (both ends inclusive).
age_in_range = df['age'].between(lower_bound, upper_bound)
df_filtered = df[age_in_range]


In [None]:
# Summary of the age quartiles and the resulting outlier fences, one per line.
print(
    f"Q1: {Q1}, Q3: {Q3}",
    f"IQR: {IQR}",
    f"Lower Bound: {lower_bound}",
    f"Upper Bound: {upper_bound}",
    sep="\n",
)


In [None]:
df_filtered

In [None]:
# Box plot of age after outlier removal, with a grid for easier reading.
age_ax = sns.boxplot(df_filtered['age'])
age_ax.set_title("age")
age_ax.grid()
plt.show()


In [None]:
df=  df_filtered
df

# Feature Selection

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True avoids a failure on non-numeric columns, which recent
# pandas versions no longer exclude automatically.
corr_matrix = df.corr(numeric_only=True)
corr_matrix

# Model Building

## Split the Independent and Dependent Data

In [None]:
# Feature matrix: every column except the target. Dropping the target by name
# (instead of "all but the last column") guarantees the label cannot leak into
# X if the column order of the CSV ever changes.
# NOTE(review): assumes Loan_Status_label is the only column to exclude — confirm.
X = df.drop(columns=['Loan_Status_label'])
X

In [None]:
y = df['Loan_Status_label']
y

In [None]:
y.value_counts()

In [None]:
# Bar chart of how many rows fall into each loan-status class.
# The counts must be plotted, not the raw label series: plotting y.index
# against y.values would draw one bar per row index.
status_counts = df['Loan_Status_label'].value_counts()

plt.figure(figsize=(6,4))
sns.barplot(x=status_counts.index, y=status_counts.values, palette='viridis')
plt.xlabel('Loan Status (0: No, 1: Yes)')
plt.ylabel('Count')
plt.title('Distribution of Loan Status')
plt.show()

In [None]:
# Loan status counts split by whether the previous outcome was a success.
plt.figure(figsize=(6,4))
outcome_ax = sns.countplot(data=df, x='poutcome_success', hue='Loan_Status_label', palette='Set2')
outcome_ax.set_xlabel('Previous Outcome: Success (0: No, 1: Yes)')
outcome_ax.set_ylabel('Count')
outcome_ax.set_title('Loan Status by Previous Outcome Success')
outcome_ax.legend(title='Loan Status', labels=['No', 'Yes'])
plt.show()


In [None]:

# Loan status counts split by whether the contact was made via cellular.
plt.figure(figsize=(6,4))
contact_ax = sns.countplot(data=df, x='contact_cellular', hue='Loan_Status_label', palette='Set2')
contact_ax.set_xlabel('Contacted via Cellular (0: No, 1: Yes)')
contact_ax.set_ylabel('Count')
contact_ax.set_title('Loan Status by Contact Method')
contact_ax.legend(title='Loan Status', labels=['No', 'Yes'])
plt.show()

## Split data into Training and Test Set

In [None]:

# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle
# so the split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# classes turn out to be imbalanced.
X_train, X_test, y_train , y_test = train_test_split(X, y , test_size=0.2, random_state=42)


In [None]:
X_train


In [None]:
X_test

# Feature Scaling

In [None]:
# Standardise the features. The scaler learns mean and standard deviation from
# the training set only; the test set is transformed with those same
# statistics. Re-fitting on the test set would scale it differently from the
# data the models were trained on and leak test information.
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
X_test_sc

In [None]:
X_train_sc

# Model Selection

## Using Logistic Regression

In [None]:
# Logistic regression with default settings, trained on the unscaled features.
lr = LogisticRegression().fit(X_train, y_train)
lr

In [None]:
# Accuracy of the logistic regression on the train and test splits.
train_accuracy = lr.score(X_train, y_train)
test_accuracy = lr.score(X_test, y_test)
print(f'Training Accuracy : {train_accuracy}')
print(f'Test Accuracy : {test_accuracy}')


In [None]:
# On Scaled Data

# Same model, this time trained on the standardised features
# (replaces the `lr` fitted on unscaled data).
lr = LogisticRegression().fit(X_train_sc, y_train)
lr


In [None]:
# Accuracy of the logistic regression on the standardised train and test splits.
train_accuracy = lr.score(X_train_sc, y_train)
test_accuracy = lr.score(X_test_sc, y_test)
print(f'Training Accuracy : {train_accuracy}')
print(f'Test Accuracy : {test_accuracy}')


## Using DecisionTreeClassifier

In [None]:
# Decision tree limited to depth 5, trained on the unscaled features.
dt = DecisionTreeClassifier(max_depth=5).fit(X_train, y_train)
dt


In [None]:
# Accuracy of the decision tree on the train and test splits.
train_accuracy = dt.score(X_train, y_train)
test_accuracy = dt.score(X_test, y_test)
print(f'Training Accuracy : {train_accuracy}')
print(f'Test Accuracy : {test_accuracy}')


## Using RandomForestClassifier

In [None]:
# Random forest of 100 trees with a fixed seed, trained on the unscaled features.
rfc = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rfc

In [None]:
# Accuracy of the random forest on the train and test splits.
train_accuracy = rfc.score(X_train, y_train)
test_accuracy = rfc.score(X_test, y_test)
print(f'Training Accuracy : {train_accuracy}')
print(f'Test Accuracy : {test_accuracy}')


## Using XGBClassifier

In [None]:
# Gradient-boosted trees with explicit gamma and L1/L2 regularisation terms,
# trained on the unscaled features.
xgb = XGBClassifier(gamma=0.7, reg_alpha=0.5, reg_lambda=0.2).fit(X_train, y_train)
xgb


In [None]:
# Accuracy of the XGBoost classifier on the train and test splits.
train_accuracy = xgb.score(X_train, y_train)
test_accuracy = xgb.score(X_test, y_test)
print(f'Training Accuracy : {train_accuracy}')
print(f'Test Accuracy : {test_accuracy}')


In [None]:
# Class predictions of the XGBoost model for the train and test splits.
y_pred_xgtr, y_pred_xgts = map(xgb.predict, (X_train, X_test))


In [None]:
X_train[:3]


In [None]:
y_train[:3]


In [None]:
y_pred_xgtr[:3]


In [None]:
confusion_matrix(y_train, y_pred_xgtr)


In [None]:
# Training-set confusion matrix as an annotated heatmap. The cells are integer
# counts, so they are formatted as integers ('.4g' falls back to scientific
# notation once a count reaches five digits).
sns.heatmap(confusion_matrix(y_train, y_pred_xgtr), annot=True, fmt='d')


In [None]:
accuracy_score(y_train, y_pred_xgtr)


In [None]:
print(classification_report(y_train, y_pred_xgtr))


In [None]:
#TEST

confusion_matrix(y_test, y_pred_xgts)


In [None]:
# Test-set confusion matrix as an annotated heatmap. The cells are integer
# counts, so they are formatted as integers ('.3g' falls back to scientific
# notation once a count reaches four digits).
sns.heatmap(confusion_matrix(y_test, y_pred_xgts), annot=True, fmt='d')


In [None]:
accuracy_score(y_test, y_pred_xgts)


# Hyperparameter Tuning

In [None]:
# Search space for the XGBoost grid search: every combination of the values
# below is evaluated.
parameters = dict(
    n_estimators=[100, 200],
    learning_rate=[0.1, 0.01, 1.0, 0.05],
    max_depth=[3, 4, 5],
    gamma=[0.2, 0.3],
    reg_alpha=[0.1, 1, 0.2],
    reg_lambda=[0.1, 1],
)

parameters


In [None]:
# perform GridSearchCV

# Exhaustive 5-fold cross-validated search over `parameters`, selecting the
# combination with the best accuracy; verbose=3 logs each fit.
search_options = dict(
    estimator=xgb,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    verbose=3,
)
grid_search = GridSearchCV(**search_options)
grid_search.fit(X_train, y_train)


In [None]:
# Report the winning hyperparameter combination and the estimator refitted with it.
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(f'Best Selected Hyperparamters : \n\n{best_params}\n')

print(f'Best Estimators : \n\n{best_estimator}')


In [None]:

# Accuracy of the tuned model (the best estimator found by the search) on the
# train and test splits.
train_accuracy = grid_search.score(X_train, y_train)
test_accuracy = grid_search.score(X_test, y_test)
print(f'Training Accuracy : {train_accuracy}')
print(f'Test Accuracy : {test_accuracy}')
