In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pygwalker as pyg
from ydata_profiling import ProfileReport
##############################################################
# Preprocessing library
from sklearn.preprocessing import LabelEncoder,PolynomialFeatures,StandardScaler,RobustScaler,MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,root_mean_squared_error,confusion_matrix,accuracy_score,classification_report
from sklearn.decomposition import PCA
####################################################################
# Sampling library
from imblearn.combine import SMOTEENN
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
###################################################################
# Algorithm Library
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

In [None]:
# Colour palettes (hex codes) available to the plots in this notebook.

# Ten-colour palette; replace the entries to define your own colours.
colors10 = [
    '#387478', '#4682B4', '#32CD32', '#FFD700', '#001F3F',
    '#B17457', '#F2E5BF', '#DA8359', '#FFD09B', '#A66E38',
]

# Four-colour palettes, one list per theme.
blue_1 = ['#2D4356', '#435B66', '#A76F6F', '#EAB2A0']
blue_2 = ['#0C134F', '#1D267D', '#2D4263', '#347474']
green1 = ['#1A1A19', '#31511E', '#859F3D', '#88C273']
brown1 = ['#A79277', '#D1BB9E', '#EAD8C0', '#FFF2E1']
yel_gre1 = ['#F3CA52', '#F6E9B2', '#0A6847', '#7ABA78']
red_tel = ['#C96868', '#FADFA1', '#FFF4EA', '#7EACB5']
cofee = ['#EAC696', '#C8AE7D', '#765827', '#65451F']
pastel = ['#B5C0D0', '#CCD3CA', '#B4B4B8', '#B3A398']
retro = ['#060047', '#B3005E', '#E90064', '#FF5F9E']
# NOTE(review): '#77ACF1' appears twice in white_blue -- confirm this is intended.
white_blue = ['#04009A', '#77ACF1', '#77ACF1', '#C0FEFC']
cold_blue = ['#240750', '#344C64', '#577B8D', '#57A6A1']
cold_green = ['#006769', '#40A578', '#9DDE8B', '#E6FF94']
happy = ['#D2E0FB', '#F9F3CC', '#D7E5CA', '#8EACCD']
sky = ['#00A9FF', '#89CFF3', '#A0E9FF', '#CDF5FD']
grad_brown = ['#8D7B68', '#A4907C', '#C8B6A6', '#F1DEC9']
grad_black = ['#2C3333', '#2E4F4F', '#0E8388', '#CBE4DE']
grad_green = ['#439A97', '#62B6B7', '#97DECE', '#CBEDD5']
grad_blue = ['#164863', '#427D9D', '#9BBEC8', '#DDF2FD']
night = ['#003C43', '#135D66', '#77B0AA', '#E3FEF7']


<p style="font-size:30px; color:green; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>Data Understanding</b>
 </p>

# Import the data

In [None]:
# Load the loan-approval dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Loan approval prediction.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Print column names, non-null counts and dtypes.
df.info()

<p align="center" style="font-size:20px; color:#1A1A19; background-color:#697565; padding:10px; border-radius:5px;">
    The dataset contains loan approval prediction data with 58,645 rows and 13 columns. Here's a description of each column: <br></br>
    id: Unique identifier for each loan application.<br></br>
    person_age: Age of the loan applicant.<br></br>
    person_income: Annual income of the applicant.<br></br>
    person_home_ownership: Homeownership status (e.g., RENT, OWN).<br></br>
    person_emp_length: Number of years of employment experience.<br></br>
    loan_intent: Purpose of the loan (e.g., EDUCATION, MEDICAL, PERSONAL, VENTURE).<br></br>
    loan_grade: Loan grade representing the risk level.<br></br>
    loan_amnt: Requested loan amount.<br></br>
    loan_int_rate: Interest rate on the loan.<br></br>
    loan_percent_income: Ratio of the loan amount to the applicant's income.<br></br>
    cb_person_default_on_file: Whether the person has a history of defaults ("Y" for yes, "N" for no).<br></br>
    cb_person_cred_hist_length: Length of the applicant's credit history in years.<br></br>
    loan_status: Loan status (0 for rejected, 1 for approved).<br></br>
</p>

# Data Cleaning 

In [None]:
# Count missing values per column.
# (The author's observation for this dataset: there are no missing values.)
df.isna().sum()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

<p align="center" style="font-size:20px; color:#1A1A19; background-color:#697565; padding:10px; border-radius:5px;">
    From the description it seems that: <br></br>
    person_age has outliers and right skewness (mean > median).<br></br>
    person_income has high outliers and right skewness (mean > median).<br></br>
    person_emp_length has small outliers and right skewness (mean > median).<br></br>
    loan_amnt has high outliers and right skewness (mean > median).<br></br>
    loan_int_rate has small outliers and left skewness (mean < median).<br></br>
    loan_percent_income has outliers and right skewness (mean > median).<br></br>
</p>    

In [None]:
# Show the rows whose recorded age is 123.
df.loc[df['person_age'] == 123]

In [None]:
# Show the rows whose employment length is 123 years. The value looks like a
# data-entry error because the applicants' ages are far below 123.
df.loc[df['person_emp_length'] == 123]

<p align="center" style="font-size:20px; color:#508C9B; background-color:#40534C; padding:10px; border-radius:5px;">
    Insights: <br></br>
    The row with the maximum person_age (123) has only 7 years of employment experience, so it is a data-entry error.<br></br>
    The rows with a person_emp_length of 123 have ages of 28 and 21, so they are data-entry errors as well.<br></br>

</p>    

In [None]:
# Index labels of the rows whose person_age is 123.
index_to_drop = df.index[df['person_age'] == 123]

# Remove those rows; re-running this cell is harmless because nothing matches afterwards.
df = df.drop(index=index_to_drop)

In [None]:
# Index labels of the rows whose person_emp_length is 123.
index_to_drop = df.index[df['person_emp_length'] == 123]

# Remove the rows with the implausible 123-year employment length.
df = df.drop(index=index_to_drop)


In [None]:
# Rows with loan_status equal to 1.
df.loc[df['loan_status'] == 1]

In [None]:
# Rows with loan_status equal to 0.
df.loc[df['loan_status'] == 0]

In [None]:
# Summary (count, unique, top, freq) of the object-typed columns, one row per column.
df.describe(include='object').transpose()

# Profile Report

In [None]:
# Build a ydata-profiling report for the cleaned frame and embed it in the notebook.
profile=ProfileReport(df,title='Loan approval prediction')
profile.to_notebook_iframe()

# Exploratory Data Analysis (EDA)

In [None]:
# List the column names as a reference for the univariate plots below.
df.columns

In [None]:
# Scatter plot: each distinct applicant age against the number of applicants with that age.
# value_counts().reset_index() is expected to yield the columns 'person_age' and 'count'
# (that naming assumes pandas >= 2.0 -- TODO confirm the environment).
fig = px.scatter(df['person_age'].value_counts().reset_index(),
                 x='person_age', y='count',
                 color='person_age', color_discrete_sequence=yel_gre1)

# Centre the title above the plot
fig.update_layout(title="Age of the loan applicant", title_x=0.5, title_font=dict(size=20))

# Annotate mean, min, and max values.
# All three arrows are anchored explicitly at y=0 so they sit on the same baseline.
fig.add_annotation(x=df['person_age'].mean(), y=0, text=f"Mean: {df['person_age'].mean():.2f}", showarrow=True, arrowhead=2, ax=0, ay=-50, font=dict(size=12, color="blue"))
fig.add_annotation(x=df['person_age'].min(), y=0, text=f"Min: {df['person_age'].min()}", showarrow=True, arrowhead=2, ax=0, ay=-50, font=dict(size=12, color="green"))
# Last expression of the cell: add_annotation returns the figure, which the notebook displays.
fig.add_annotation(x=df['person_age'].max(), y=0, text=f"Max: {df['person_age'].max()}", showarrow=True, arrowhead=2, ax=0, ay=-50, font=dict(size=12, color="red"))

In [None]:
# Scatter plot: each distinct annual income against the number of applicants reporting it.
# The 'count' column name assumes pandas >= 2.0 -- TODO confirm the environment.
fig = px.scatter(df['person_income'].value_counts().reset_index(),
                 x='person_income', y='count',
                 color='person_income', color_discrete_sequence=yel_gre1)

# Centre the title above the plot
fig.update_layout(
    title=" Annual income of the applicant.",
    title_x=0.5,
    title_font=dict(size=20)
)

# Annotate mean, min, and max values.
# Every arrow is anchored explicitly at y=0 so all three share the same baseline;
# the Mean label is lifted higher (ay=-150) than Min/Max (ay=-100).
fig.add_annotation(
    x=df['person_income'].mean(), y=0, text=f"Mean: {df['person_income'].mean():.2f}",
    showarrow=True, arrowhead=2, ax=0, ay=-150,
    font=dict(size=14, color="blue")
)
fig.add_annotation(
    x=df['person_income'].min(), y=0, text=f"Min: {df['person_income'].min()}",
    showarrow=True, arrowhead=2, ax=0, ay=-100,
    font=dict(size=12, color="green")
)
# Last expression of the cell: the returned figure is what the notebook displays.
fig.add_annotation(
    x=df['person_income'].max(), y=0, text=f"Max: {df['person_income'].max()}",
    showarrow=True, arrowhead=2, ax=0, ay=-100,
    font=dict(size=12, color="red")
)

In [None]:
# Donut chart of the home-ownership categories, sized by number of applicants.
ownership_counts = df['person_home_ownership'].value_counts().reset_index()

fig = px.pie(
    ownership_counts,
    names='person_home_ownership',  # slice labels
    values='count',                 # slice sizes
    color='person_home_ownership',
)

# Centred 20pt title.
fig.update_layout(title=dict(text="Homeownership status", x=0.5, font=dict(size=20)))

# Print label, percentage and raw count on each slice; hole=0.4 turns the pie into a donut.
fig.update_traces(textinfo='label+percent+value', hole=0.4)

fig.show()


In [None]:
# Scatter plot: each distinct employment length (years) against the number of applicants.
# The 'count' column name assumes pandas >= 2.0 -- TODO confirm the environment.
fig = px.scatter(df['person_emp_length'].value_counts().reset_index(),
                 x='person_emp_length', y='count',
                 color='person_emp_length', color_discrete_sequence=yel_gre1)

# Centre the title above the plot
fig.update_layout(
    title="Number of years of employment experience.",
    title_x=0.5,
    title_font=dict(size=20)
)

# Annotate mean, min, and max values.
# Every arrow is anchored explicitly at y=0 so all three share the same baseline.
fig.add_annotation(
    x=df['person_emp_length'].mean(), y=0, text=f"Mean: {df['person_emp_length'].mean():.2f}",
    showarrow=True, arrowhead=2, ax=20, ay=-40,
    font=dict(size=12, color="black")
)
fig.add_annotation(
    x=df['person_emp_length'].min(), y=0, text=f"Min: {df['person_emp_length'].min()}",
    showarrow=True, arrowhead=2, ax=20, ay=-40,
    font=dict(size=12, color="green")
)
# Last expression of the cell: the returned figure is what the notebook displays.
fig.add_annotation(
    x=df['person_emp_length'].max(), y=0, text=f"Max: {df['person_emp_length'].max()}",
    showarrow=True, arrowhead=2, ax=20, ay=-40,
    font=dict(size=12, color="red")
)


In [None]:
# Bar-style histogram of loan purposes: one bar per loan_intent, height = number of applications.
intent_counts = df['loan_intent'].value_counts().reset_index()

fig = px.histogram(
    intent_counts,
    x='loan_intent',
    y='count',
    color='loan_intent',
    color_discrete_sequence=colors10,
)

# Centred 20pt title; update_layout returns the figure, so the cell displays it.
fig.update_layout(title=dict(text="Purpose of the loan", x=0.5, font=dict(size=20)))

In [None]:
# Donut chart of loan grades, sized by number of applications.
grade_counts = df['loan_grade'].value_counts().reset_index()

fig = px.pie(
    grade_counts,
    names='loan_grade',  # slice labels
    values='count',      # slice sizes
    color='loan_grade',
)

# Centred 20pt title.
fig.update_layout(title=dict(text="Loan grade representing the risk level.", x=0.5, font=dict(size=20)))

# Print label and raw count on each slice (no percentage here); the returned
# figure is the cell's last expression and is therefore displayed.
fig.update_traces(textinfo='label+value', hole=0.4)

In [None]:
# Scatter plot: each distinct requested loan amount against the number of applications.
# The 'count' column name assumes pandas >= 2.0 -- TODO confirm the environment.
fig = px.scatter(df['loan_amnt'].value_counts().reset_index(),
                 x='loan_amnt', y='count',
                 color='loan_amnt', color_discrete_sequence=yel_gre1)

# Centre the title above the plot
fig.update_layout(
    title="Requested loan amount",
    title_x=0.5,
    title_font=dict(size=20)
)

# Annotate mean, min, and max values.
# Every arrow is anchored explicitly at y=0 so all three share the same baseline.
fig.add_annotation(
    x=df['loan_amnt'].mean(), y=0, text=f"Mean: {df['loan_amnt'].mean():.2f}",
    showarrow=True, arrowhead=2, ax=20, ay=-40,
    font=dict(size=12, color="black")
)
fig.add_annotation(
    x=df['loan_amnt'].min(), y=0, text=f"Min: {df['loan_amnt'].min()}",
    showarrow=True, arrowhead=2, ax=20, ay=-40,
    font=dict(size=12, color="green")
)
# Last expression of the cell: the returned figure is what the notebook displays.
fig.add_annotation(
    x=df['loan_amnt'].max(), y=0, text=f"Max: {df['loan_amnt'].max()}",
    showarrow=True, arrowhead=2, ax=20, ay=-40,
    font=dict(size=12, color="red")
)



In [None]:
# Scatter plot: each distinct interest rate against the number of loans carrying it.
rate_counts = df['loan_int_rate'].value_counts().reset_index()

fig = px.scatter(
    rate_counts,
    x='loan_int_rate',
    y='count',
    color='loan_int_rate',
    color_discrete_sequence=yel_gre1,
)

# Centred 20pt title.
fig.update_layout(title=dict(text="Interest rate on the loan.", x=0.5, font=dict(size=20)))

# (x position, label text, font colour) for the three summary markers on the baseline.
rate_markers = [
    (df['loan_int_rate'].mean(), f"Mean: {df['loan_int_rate'].mean():.2f}", "black"),
    (df['loan_int_rate'].min(), f"Min: {df['loan_int_rate'].min()}", "green"),
    (df['loan_int_rate'].max(), f"Max: {df['loan_int_rate'].max()}", "red"),
]
for marker_x, marker_text, marker_color in rate_markers:
    fig.add_annotation(
        x=marker_x, y=0, text=marker_text,
        showarrow=True, arrowhead=2, ax=0, ay=-30,
        font=dict(size=12, color=marker_color),
    )

# Display the figure as the cell's last expression.
fig


In [None]:
# Scatter plot: each distinct loan-to-income ratio against the number of applications.
# The 'count' column name assumes pandas >= 2.0 -- TODO confirm the environment.
fig = px.scatter(df['loan_percent_income'].value_counts().reset_index(),
                 x='loan_percent_income', y='count',
                 color='loan_percent_income', color_discrete_sequence=yel_gre1)

# Centre the title above the plot (the apostrophe is plain ASCII so it renders
# correctly regardless of the file's encoding).
fig.update_layout(
    title="Ratio of the loan amount to the applicant's income.",
    title_x=0.5,
    title_font=dict(size=20)
)

# Annotate mean, min, and max values on the y=0 baseline
fig.add_annotation(
    x=df['loan_percent_income'].mean(), y=0, text=f"Mean: {df['loan_percent_income'].mean():.2f}",
    showarrow=True, arrowhead=2, ax=10, ay=-40,
    font=dict(size=12, color="blue")
)
fig.add_annotation(
    x=df['loan_percent_income'].min(), y=0, text=f"Min: {df['loan_percent_income'].min()}",
    showarrow=True, arrowhead=2, ax=10, ay=-40,
    font=dict(size=12, color="green")
)
# Last expression of the cell: the returned figure is what the notebook displays.
fig.add_annotation(
    x=df['loan_percent_income'].max(), y=0, text=f"Max: {df['loan_percent_income'].max()}",
    showarrow=True, arrowhead=2, ax=10, ay=-40,
    font=dict(size=12, color="red")
)



In [None]:
# Scatter plot: each distinct credit-history length (years) against the number of applicants.
# The 'count' column name assumes pandas >= 2.0 -- TODO confirm the environment.
fig = px.scatter(df['cb_person_cred_hist_length'].value_counts().reset_index(),
                 x='cb_person_cred_hist_length', y='count',
                 color='cb_person_cred_hist_length', color_discrete_sequence=yel_gre1)

# Centre the title above the plot (the apostrophe is plain ASCII so it renders
# correctly regardless of the file's encoding).
fig.update_layout(
    title="Length of the applicant's credit history in years.",
    title_x=0.5,
    title_font=dict(size=20)
)

# Annotate mean, min, and max values on the y=0 baseline
fig.add_annotation(
    x=df['cb_person_cred_hist_length'].mean(), y=0, text=f"Mean: {df['cb_person_cred_hist_length'].mean():.2f}",
    showarrow=True, arrowhead=2, ax=0, ay=-30,
    font=dict(size=12, color="blue")
)
fig.add_annotation(
    x=df['cb_person_cred_hist_length'].min(), y=0, text=f"Min: {df['cb_person_cred_hist_length'].min()}",
    showarrow=True, arrowhead=2, ax=0, ay=-30,
    font=dict(size=12, color="green")
)
# Last expression of the cell: the returned figure is what the notebook displays.
fig.add_annotation(
    x=df['cb_person_cred_hist_length'].max(), y=0, text=f"Max: {df['cb_person_cred_hist_length'].max()}",
    showarrow=True, arrowhead=2, ax=0, ay=-30,
    font=dict(size=12, color="red")
)



In [None]:
# Donut chart of the default-on-file flag, sized by number of applicants.
default_counts = df['cb_person_default_on_file'].value_counts().reset_index()

fig = px.pie(
    default_counts,
    names='cb_person_default_on_file',  # slice labels
    values='count',                     # slice sizes
    color='cb_person_default_on_file',
    color_discrete_sequence=cold_green,
)

# Centred 20pt title.
fig.update_layout(
    title=dict(text="Whether the person has a history of defaults.", x=0.5, font=dict(size=20))
)

# Print label, percentage and raw count on each slice; the returned figure is
# the cell's last expression and is therefore displayed.
fig.update_traces(textinfo='label+percent+value', hole=0.4)


In [None]:
# Donut chart of the target column loan_status, sized by number of applications.
status_counts = df['loan_status'].value_counts().reset_index()

fig = px.pie(
    status_counts,
    names='loan_status',  # slice labels
    values='count',       # slice sizes
    color='loan_status',
    color_discrete_sequence=sky,
)

# Centred 20pt title.
fig.update_layout(
    title=dict(text="Loan status (0 for rejected, 1 for approved).", x=0.5, font=dict(size=20))
)

# Print label, percentage and raw count on each slice; the returned figure is
# the cell's last expression and is therefore displayed.
fig.update_traces(textinfo='label+percent+value', hole=0.4)

# Multidimension analysis

In [None]:
# List the column names as a reference for the multi-variable plots below.
df.columns

In [None]:
# Bubble chart of applicant income against age.
# Colour encodes loan_status; bubble size encodes person_income.
fig = px.scatter(
    df,
    x='person_age',
    y='person_income',
    size='person_income',  # bubble size follows income
    color='loan_status',
    color_continuous_scale='Viridis',  # used when loan_status is numeric (0/1)
    # The title states the actual encodings: loan status is the colour, not the size.
    title="Applicant's Income vs Age (colour = Loan Status, bubble size = Income)"
)
# Centre the title above the plot
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=20)
)
fig.show()


<p>
    <div style="text-align:left; font-size:18px; color:#000000; background-color:#77ACF1; padding:15px; border-radius:8px; border:1px solid #04009A;">
        The most common age range with higher income is between 22 and 50 years old.<br><br>
        Applicants aged between 22 and 35 tend to have incomes greater than 500k.   
    </div>
    <div style="text-align:left; font-size:18px; color:yellow; background-color:#77ACF1; padding:15px; border-radius:8px; border:1px solid #04009A;">
        We notice that the accepted loans are mostly for applicants aged between 22 and 60 and do not depend on a higher income at all: the maximum income with an accepted loan is 379k, whereas the applicant with more than 1M income was not accepted.
    </div>
</p>


In [None]:

# Donut chart: total requested loan amount (loan_amnt summed per slice) split by loan_status.
fig = px.pie(data_frame=df, names='loan_status', values='loan_amnt')

# Centred 20pt title.
fig.update_layout(title=dict(text="Loan Status vs Loan Amount", x=0.5, font=dict(size=20)))

# Print label, percentage and summed amount on each slice; hole=0.4 makes it a donut.
fig.update_traces(textinfo='label+percent+value', hole=0.4)

# Render the figure
fig.show()


<p align="center" style="font-size:20px; color:#000000; background-color:#77ACF1; padding:10px; border-radius:5px;">
        <b>the loan amount of The Accepted Loan is 93M with 17.3% out of the total.</b><br><br>
        <b>the loan amount of The rejected Loan is 447M with 82.7% out of the total.</b> <br><br>
         <div style="text-align:center; font-size:20px; color:yellow; background-color:black; padding:15px; border-radius:8px; border:1px solid #04009A;">
                <b> So Most of the Applicant Loans is rejected </b>
        </div>
</p>


In [None]:
# Bubble chart of applicant income against requested loan amount.
# Colour encodes loan_status; bubble size encodes loan_amnt.
fig = px.scatter(
    df,
    x='loan_amnt',
    y='person_income',
    size='loan_amnt',  # bubble size follows the requested amount
    color='loan_status',
    color_continuous_scale='Plasma',  # used when loan_status is numeric (0/1)
    # The title states the actual encodings: loan status is the colour, not the size.
    title="Applicant's Income vs loan_amount (colour = loan_status, bubble size = loan_amount)"
)
# Centre the title above the plot
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=20)
)
fig.show()



<p align="center" style="font-size:20px; color:#00A9FF; background-color:#164863; padding:10px; border-radius:5px;">
        <b>most of approval loan amount between 15k and 30k and the applicant has less income .</b>
</p>


In [None]:
# Bubble chart of requested loan amount against applicant age.
# Colour encodes loan_status; bubble size encodes loan_amnt.
fig = px.scatter(
    df,
    x='person_age',
    y='loan_amnt',
    size='loan_amnt',  # bubble size follows the requested amount
    color='loan_status',
    color_continuous_scale='Viridis',  # used when loan_status is numeric (0/1)
    # The title states the actual encodings: loan status is the colour, not the size.
    title="Applicant's loan_amount vs Age (colour = Loan Status, bubble size = loan_amount)"
)

# Centre the title above the plot
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=20)
)

fig.show()


<p align="center" style="font-size:20px; color:#9EDF9C; background-color:#164863; padding:10px; border-radius:5px;">
        <b>most of approval loan amount between 15k and 30k and the applicant has ages between 22 to 60.</b>
</p>


In [None]:
# Bubble chart of employment length against applicant age.
# Colour encodes loan_status; bubble size encodes person_emp_length.
fig = px.scatter(
    df,
    x='person_age',
    y='person_emp_length',
    size='person_emp_length',  # bubble size follows employment length
    color='loan_status',
    color_continuous_scale='Plasma',  # used when loan_status is numeric (0/1)
    # The title states the actual encodings: loan status is the colour, not the size.
    title="Applicant's person_emp_length vs Age (colour = Loan Status, bubble size = person_emp_length)"
)

# Centre the title above the plot
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=20)
)
fig.show()


<p align="center" style="font-size:20px; color:red; background-color:black; padding:10px; border-radius:5px;">
        <b>there are inconsistencies in the relationship between a person's age (person_age) and their years of employment (person_emp_length).</b>
</p>


In [None]:
# Bubble chart of credit-history length against applicant age.
# Colour encodes loan_status; bubble size encodes cb_person_cred_hist_length.
fig = px.scatter(
    df,
    x='person_age',
    y='cb_person_cred_hist_length',
    size='cb_person_cred_hist_length',  # bubble size follows credit-history length
    color='loan_status',
    color_continuous_scale='Viridis',  # used when loan_status is numeric (0/1)
    # The title states the actual encodings: loan status is the colour, not the size.
    title="Applicant's cb_person_cred_hist_length vs Age (colour = Loan Status, bubble size = cb_person_cred_hist_length)"
)
# Centre the title above the plot
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=20)
)
fig.show()


<p align="center" style="font-size:20px; color:#181C14; background-color:#6CBEC7; padding:10px; border-radius:5px;">
        <b>There is a strong positive relationship between age and the length of the applicant's credit history in years.</b>
</p>


In [None]:
# Treemap of requested loan amount, nested by home ownership -> loan intent ->
# loan grade -> default-on-file flag. Tile area and tile colour both use loan_amnt.
treemap_title = "Loan Amount Distribution by Home Ownership, Loan Intent, Loan Grade, and Default Status"
treemap_levels = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']

fig = px.treemap(
    df,
    path=treemap_levels,
    values='loan_amnt',
    color='loan_amnt',
    color_continuous_scale='Viridis',
    title=treemap_title,
)

# Centre the title near the top, tighten the margins and use a white background.
fig.update_layout(
    title=dict(
        text=treemap_title,
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
        font=dict(size=22, color='black'),
    ),
    margin=dict(t=50, l=25, r=25, b=25),
    paper_bgcolor="white",
)

# Tile text shows label, value and share of the root; the hover box repeats
# the same figures with explicit captions.
fig.update_traces(
    textinfo="label+value+percent root",
    hovertemplate="<b>%{label}</b><br>Loan Amount: %{value}<br>Percentage of Total: %{percentRoot:.2%}",
)
fig.show()


<p align="center" style="font-size:20px; color:#CBDCEB; background-color:#526D82; padding:10px; border-radius:5px;">
        <b>It seems that for higher loan grades, under any home ownership, the person doesn't have a history of defaults, so most are paying on time.</b> <br></br>
        <b>Most of the loan reasons are Education for people with Rent and Mortgage homes, while for Own and Other it is in second place; so most of the people who pay for their living need loans for education and ask for high loan amounts.</b> <br></br>
        <b>Medical loans are in second place for Rent, the last choice for Mortgage and Other, and in 4th place for Own homes.</b> <br></br>
        <b>Venture loans are in 1st place for Own and Other, the last choice for Rent and Other, and in 3rd place for Mortgage homes.</b> <br></br>

</p>

In [None]:
# Donut chart: total requested loan amount (loan_amnt summed per slice) split by loan_intent.
fig = px.pie(data_frame=df, names='loan_intent', values='loan_amnt')

# Centred 20pt title.
fig.update_layout(title=dict(text="loan_intent with the loan amount", x=0.5, font=dict(size=20)))

# Print label, percentage and summed amount on each slice; hole=0.4 makes it a donut.
fig.update_traces(textinfo='label+percent+value', hole=0.4)

# Render the figure
fig.show()

In [None]:
# Box plot of interest rate per loan grade, with one box per loan_status inside each grade.
# NOTE(review): color_discrete_sequence is assigned in order of appearance of
# loan_status in df, so which status gets 'red' depends on the data -- confirm.
fig = px.box(
    data_frame=df,
    x='loan_grade',
    y='loan_int_rate',
    color='loan_status',
    color_discrete_sequence=['red', 'green'],
    title="Applicant's Interest Rate by loan_grade"
)

# Centre the title and label the axes. The x-axis carries loan_grade, so it is
# labelled as such; loan_status is shown through the colour legend.
fig.update_layout(
    title_x=0.5,
    title_font=dict(size=20),
    xaxis_title="Loan Grade",
    yaxis_title="Interest Rate (%)"
)

# Display the plot
fig.show()

In [None]:
# Treemap of applicant income, nested by home ownership -> loan intent -> loan grade.
# Tile area and tile colour both use person_income.
income_levels = ['person_home_ownership', 'loan_intent', 'loan_grade']

fig = px.treemap(
    df,
    path=income_levels,
    values='person_income',
    color='person_income',
    color_continuous_scale='Viridis',
    title="person_income  by Home Ownership, Loan Intent, and Grade",
)

# Replace the title with a centred, styled version; add margins and a white background.
fig.update_layout(
    title=dict(
        text="person_income by Home Ownership, Loan Intent, and Grade",
        x=0.5,
        xanchor='center',
        yanchor='top',
        font=dict(size=22, color='black'),
    ),
    margin=dict(t=50, l=25, r=25, b=25),
    paper_bgcolor="white",
)

# Tile text shows label, share of the entry and value; the hover box repeats
# the same figures with explicit captions.
fig.update_traces(
    textinfo="label+percent entry+value",
    hovertemplate="<b>%{label}</b><br>person_income: %{value}<br>Percentage: %{percentEntry:.2%}",
)

fig.show()

In [None]:
# Donut chart: total applicant income (person_income summed per slice) split by home ownership.
fig = px.pie(
    data_frame=df,
    names='person_home_ownership',
    values='person_income',
    color_discrete_sequence=cofee,
)

# Centred 20pt title.
fig.update_layout(
    title=dict(text="person_home_ownership with the person_income", x=0.5, font=dict(size=20))
)

# Print label, percentage and summed income on each slice; the returned figure
# is the cell's last expression and is therefore displayed.
fig.update_traces(textinfo='label+percent+value', hole=0.3)

<p align="center" style="font-size:20px; color:#201E43; background-color:#227B94; padding:10px; border-radius:5px;">
        <b>Mortgage home Has higher Income , Rent home is the 2nd one and the Own home is the 3rd .</b>
</p>

In [None]:
# Histogram with one bar per home-ownership category; because y is given,
# each bar aggregates loan_amnt for that category.
fig = px.histogram(
    data_frame=df,
    x='person_home_ownership',
    y='loan_amnt',
    color='person_home_ownership',
    color_discrete_sequence=colors10,
)

# Centred 20pt title; update_layout returns the figure, so the cell displays it.
fig.update_layout(
    title=dict(text="person_home_ownership and loan_amnt ", x=0.5, font=dict(size=20))
)

In [None]:
# List the column names as a reference for the box plots below.
df.columns

In [None]:
# Box plot of the loan-to-income ratio for each loan_status value.
fig = px.box(
    df,
    x='loan_status',
    y='loan_percent_income',
    color='loan_status',
    color_discrete_sequence=['red', 'green'],
    title="Applicant's loan_percent_income by Loan Status",
)

# Axis captions kept together so they are easy to adjust.
axis_captions = dict(
    xaxis_title="Loan Status (0 = Rejected, 1 = Approved)",
    yaxis_title="loan_percent_income(%)",
)

# Centre the 20pt title and apply the axis captions.
fig.update_layout(title_x=0.5, title_font=dict(size=20), **axis_captions)

# Render the figure
fig.show()

In [None]:
# Box plot of the loan interest rate for each loan_status value.
fig = px.box(
    df,
    x='loan_status',
    y='loan_int_rate',
    color='loan_status',
    color_discrete_sequence=['red', 'green'],
    title="Applicant's Interest Rate by Loan Status",
)

# Axis captions kept together so they are easy to adjust.
axis_captions = dict(
    xaxis_title="Loan Status (0 = Rejected, 1 = Approved)",
    yaxis_title="Interest Rate (%)",
)

# Centre the 20pt title and apply the axis captions.
fig.update_layout(title_x=0.5, title_font=dict(size=20), **axis_captions)

# Render the figure
fig.show()

<div style="background-color:#EAF2F8; padding:20px; border-radius:8px; border:1px solid #2980B9; font-size:20px;">
    <p align="center" style="color:#2C3E50; font-weight:bold; font-size:22px;">💡 Insights from the Visualization:</p>
    <!-- Insight 1 -->
    <div style="background-color:#D6EAF8; padding:15px; border-radius:8px; margin-bottom:15px;">
        <p style="font-size:18px; color:#2C3E50;">
            The most common age range with higher income is between 22 and 40 years old.
        </p>
        <p style="font-size:18px; color:#2980B9;">
            Most accepted loans are for applicants aged 22-60, and high income does not guarantee loan acceptance.
        </p>
        <p style="font-size:18px; color:#2980B9;">  Applicants aged between 22 and 35 tend to have incomes greater than 500k.   </p>
        <p style="font-size:18px; color:#2980B9;">  The accepted loans are mostly for applicants aged between 22 and 60; the maximum income with an accepted loan is 379k, whereas the applicant with more than 1M income was not accepted.   </p>
    </div>
 <!-- Loan Approval Insight -->
    <div style="background-color:#D6EAF8; padding:15px; border-radius:8px; margin-bottom:15px;">
        <p style="font-size:18px; color:#2C3E50;">
            <b>Accepted Loan Amount: 93M (17.3% of total)</b><br>
            <b>Rejected Loan Amount: 447M (82.7% of total)</b>
        </p>
        <p style="font-size:20px; color:#E74C3C; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>Most Applicant Loans are Rejected</b>
        </p>
    </div>
  <!-- Additional Insights -->
    <div style="background-color:#F9EBEA; padding:15px; border-radius:8px; margin-bottom:15px;">
        <p style="font-size:18px; color:#2C3E50;">
            Most approved loan amounts are between 15k and 30k, often for applicants with lower incomes or ages 22-60.
        </p>
    </div>
 <!-- Relationship between Age and Credit History -->
    <div style="background-color:#E8F8F5; padding:15px; border-radius:8px; margin-bottom:15px;">
        <p style="font-size:18px; color:#1A5276; text-align:center;">
            <b>Strong positive relationship observed between age and length of credit history.</b>
        </p>
    </div>
 <!-- Loan Intent and Home Ownership Insights -->
    <div style="background-color:#FEF5E7; padding:15px; border-radius:8px; color:#7D6608; margin-bottom:15px;">
        <p style="font-size:18px;">
            Higher loan grades correlate with applicants who have no default history and often pay on time.
        </p>
        <p style="font-size:18px;">
            Most loans are for education, especially for those renting or with mortgages, while homeowners often request loans for ventures.
        </p>
        <p style="font-size:18px;">
            Medical loans are primarily for renters, and venture loans are commonly requested by homeowners.
        </p>
    </div>
 <!-- Interest Rate and Loan Grade Insights -->
    <div style="background-color:#D1F2EB; padding:15px; border-radius:8px; margin-bottom:15px;">
        <p style="font-size:18px; color:#117A65; text-align:center;">
            Higher loan grades (higher risk) are associated with higher interest rates.
        </p>
    </div>
  <!-- Income by Home Ownership -->
    <div style="background-color:#EBEDEF; padding:15px; border-radius:8px; text-align:center;">
        <p style="font-size:18px; color:#5D6D7E;">
            Mortgage homeowners have the highest income, followed by renters, and then owners.
        </p>
    </div>
</div>


#
<p style="font-size:50px; color:#EEEEEE; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>Modeling</b>
 </p>


<p style="font-size:30px; color:green; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>Preprocessing</b>
 </p>

## Copy data

In [None]:
# Work on an independent (deep) copy so the modelling steps leave df untouched.
loan = df.copy(deep=True)
loan.head()

# 

<p style="font-size:30px; color:brown; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>Encoding</b>
 </p>

In [None]:
# Single LabelEncoder instance used by the encoding loop below.
label=LabelEncoder()

In [None]:
# Snapshot (a new frame) of the four categorical columns that will be label-encoded.
object_column = loan.loc[:, ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']]

In [None]:
# Integer-encode every categorical column of `loan`.
# `label` is refit on each pass, so the classes of the previous column are
# overwritten; label_mappings keeps each column's {category: code} table so the
# encoding can be inspected, inverted or re-applied to new data later.
label_mappings = {}
for column_name in object_column.columns:
    # Fit on the untouched snapshot in object_column rather than on loan itself,
    # so re-running this cell does not re-encode already-encoded integers.
    loan[column_name] = label.fit_transform(object_column[column_name])
    # LabelEncoder assigns each class its position in classes_ as the code.
    label_mappings[column_name] = {
        category: code for code, category in enumerate(label.classes_.tolist())
    }

In [None]:
# Preview the frame after label encoding.
loan.head()

In [None]:
# Check the column dtypes after label encoding.
loan.info()

# 

<p style="font-size:30px; color:red; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>Handling Outliers</b>
 </p>

# check the outliers 

In [None]:
# Box plot over the columns of `loan` on one shared axis, as a quick first look at outliers.
px.box(loan)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# (column, panel title) pairs in left-to-right panel order.
box_panels = [
    ('person_income', "Person Income"),
    ('person_age', "Person Age"),
    ('person_emp_length', "Employment Length"),
    ('loan_int_rate', "Loan Interest Rate"),
    ('loan_amnt', "Loan Amount"),
]

# One row with one panel per feature.
fig = make_subplots(
    rows=1,
    cols=5,
    subplot_titles=[panel_title for _, panel_title in box_panels],
)

# One box trace per feature, each in its own panel.
for panel_no, (feature, panel_title) in enumerate(box_panels, start=1):
    fig.add_trace(go.Box(y=loan[feature], name=panel_title), row=1, col=panel_no)

# Figure-level title and size; the legend would only repeat the panel titles.
fig.update_layout(
    title_text="Box Plots of Loan Dataset Features",
    title_x=0.5,
    title_font=dict(size=20),
    width=1400,
    height=600,
    showlegend=False,
)

fig.show()


<p style="font-size:30px; color:#CBD2A4; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>There are outliers in the data, with person_income values exceeding 1,000,000.</b>
</p>

<p style="font-size:30px; color:#CBD2A4; background-color:#006A67; padding:10px; border-radius:8px; text-align:center;">
            <b>There are outliers in the data, with person_age values exceeding 80 years.</b>
</p>

<p style="font-size:30px; color:#6A9AB0; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:left;">
            <b>There are outliers in the data, with years of employment experience exceeding 40.</b>
</p>

<p style="font-size:30px; color:#CBD2A4; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>There are outliers in the data with loan amounts exceeding 35,000, indicating that a few individuals requested unusually high loans compared to others. </b>
</p>

<p style="font-size:30px; color:#CBD2A4; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>There are a few outliers in the data, with loan_int_rate values exceeding 20.</b>
</p>

In [None]:
# Annotated pairwise correlations between all columns of `loan`.
correlations = loan.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(correlations, cmap='viridis', annot=True)
plt.show()

# Handling outliers (if any exist)

In [None]:
# The five continuous features whose box plots are shown above.
outliers_column = loan[[
    'person_income',
    'person_age',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
]]

In [None]:
def handling_outliers(loan, lst_of_col):
    """Cap outliers in the given columns with the 1.5 * IQR rule.

    For each column, values below ``Q1 - 1.5 * IQR`` are raised to that
    lower fence and values above ``Q3 + 1.5 * IQR`` are lowered to that
    upper fence; values inside the fences (and NaN) are left as they are.

    Parameters
    ----------
    loan : pandas.DataFrame
        Frame containing the columns to cap. It is not modified.
    lst_of_col : iterable of str
        Names of the numeric columns to cap.

    Returns
    -------
    pandas.DataFrame
        A copy of ``loan`` with the listed columns capped.
    """
    # Work on a copy so re-running the calling cell never silently alters
    # a frame that an earlier cell already displayed.
    capped = loan.copy()

    for col in lst_of_col:
        # Quartiles and fences are computed from the column's current values.
        q1 = capped[col].quantile(0.25)
        q3 = capped[col].quantile(0.75)
        iqr = q3 - q1
        lower_limit = q1 - 1.5 * iqr
        upper_limit = q3 + 1.5 * iqr

        # Vectorised capping instead of a per-element Python lambda.
        capped[col] = capped[col].clip(lower=lower_limit, upper=upper_limit)

    return capped

In [None]:
# Cap only the continuous features selected in `outliers_column`.
# Capping every column would also hit `id`, the label-encoded categoricals and
# the binary `loan_status` target: for a 0/1 column whose minority class is
# under 25% both quartiles are equal, the IQR is 0, and every minority value
# would be overwritten with the majority value.
loan=handling_outliers(loan,outliers_column.columns)

In [None]:
# `make_subplots` and `go` are already imported by the earlier box-plot cell.

# (column, panel title) pairs in left-to-right panel order.
box_panels = [
    ('person_income', "Person Income"),
    ('person_age', "Person Age"),
    ('person_emp_length', "Employment Length"),
    ('loan_int_rate', "Loan Interest Rate"),
    ('loan_amnt', "Loan Amount"),
]

# Same grid as before the outlier step, redrawn from the current `loan`.
fig = make_subplots(
    rows=1,
    cols=5,
    subplot_titles=[panel_title for _, panel_title in box_panels],
)

for panel_no, (feature, panel_title) in enumerate(box_panels, start=1):
    fig.add_trace(go.Box(y=loan[feature], name=panel_title), row=1, col=panel_no)

fig.update_layout(
    title_text="Box Plots of Loan Dataset Features",
    title_x=0.5,
    title_font=dict(size=20),
    width=1400,
    height=600,
    showlegend=False,
)

fig.show()


In [None]:
# Pairwise correlations between all columns of `loan`.
corr_matrix = loan.corr()
feature_names = corr_matrix.columns

# Heatmap with the colour range pinned to the full correlation interval [-1, 1].
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=feature_names,
    y=feature_names,
    colorscale='Viridis',
    zmin=-1, zmax=1,
    colorbar=dict(title="Correlation"),
    hoverongaps=False
)
fig = go.Figure(data=heatmap_trace)

# Write the rounded coefficient into every cell (row by row).
for row_name in feature_names:
    for col_name in feature_names:
        fig.add_annotation(
            x=col_name,
            y=row_name,
            text=f'{corr_matrix.loc[row_name, col_name]:.2f}',
            showarrow=False,
            font=dict(size=12, color='black'),
            align="center"
        )

# Title, axis labels and figure size.
fig.update_layout(
    title="Correlation Heatmap of Loan Data",
    title_x=0.5,
    title_font=dict(size=20),
    xaxis_title="Features",
    yaxis_title="Features",
    height=800,
    width=1000
)

fig.show()


In [None]:
# Preview the frame after the outlier-handling step.
loan.head()

In [None]:
# Cast the two 0/1 columns to integer dtype.
for int_col in ('cb_person_default_on_file', 'loan_status'):
    loan[int_col] = loan[int_col].astype(int)

# 

<p style="font-size:30px; color:green; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>   Model Selection  </b>
 </p>

# Splitting Data

In [None]:
# Feature matrix: everything except the row id, the target and person_emp_length.
# NOTE(review): person_emp_length is excluded here but selected again in a later
# cell (`selected_features`) - confirm which is intended.
X = loan.drop(columns=['id', 'loan_status', "person_emp_length"])
# Target vector.
y = loan['loan_status']

In [None]:
# Donut chart of the loan_status class counts in the original `df`.
status_counts = df['loan_status'].value_counts().reset_index()

fig = px.pie(
    status_counts,
    names='loan_status',   # one slice per loan_status value
    values='count',        # slice size = number of rows with that value
    color='loan_status',
    color_discrete_sequence=sky   # `sky` palette is expected to be defined earlier in the notebook
)

# Centered title.
fig.update_layout(
    title="Loan status (0 for rejected, 1 for approved).",
    title_font=dict(size=20),
    title_x=0.5
)

# Label, percentage and raw count on every slice; hole=0.4 turns the pie into a donut.
fig.update_traces(hole=0.4, textinfo='label+percent+value')

# 

<p style="font-size:30px; color:green; background-color:#2C3E50; padding:10px; border-radius:8px; text-align:center;">
            <b>There is a class imbalance, as shown in the graph, so we have to handle it.</b>
 </p>

# Combined (Hybrid) Sampling

In [None]:
# Combined (Hybrid) Sampling
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_combined, y_combined = smote_enn.fit_resample(X, y)

print(f"Original dataset shape: {Counter(y)}")
print(f"Combined SMOTEENN dataset shape: {Counter(y_combined)}")


In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X_combined,y_combined,test_size=0.2,random_state=42)

# Scaling data

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# xgboost Alg

In [None]:
import xgboost as xgb
gb = xgb.XGBClassifier( n_estimators=600,learning_rate=0.08, gamma=0,subsample=0.75,colsample_bytree=1, max_depth=10)

In [None]:
gb.fit(X_train, y_train)

In [None]:
y_test_preds = gb.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_test_preds)
print(cm)
accuracy_score(y_test, y_test_preds)

In [None]:
gb = pickle.load(open('model loan.pkl','rb'))

In [None]:
pickle.dump(gb, open('model loan.pkl','wb'))

In [None]:
model = pickle.load(open('model loan.pkl','rb'))

In [None]:
selected_features = ['person_age', 'person_income', 'person_home_ownership', 
                        'loan_intent', 'loan_amnt', 'person_emp_length']
X = loan[selected_features]

In [None]:
# Scale the selected features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

with open('scaler loan.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# RandomForestClassifier ALG 

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 300, criterion = 'gini', random_state = 42)

In [None]:
classifier.fit(X_train, y_train)


In [None]:
y_pred = classifier.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)


# DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
classifier1 = DecisionTreeClassifier(criterion = 'entropy',random_state = 42,max_depth=None,min_samples_leaf=1,min_samples_split=5)
classifier1.fit(X_train, y_train)

In [None]:
y_pred_dec=classifier1.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred_dec)

# SVM ( poly )

In [None]:
from sklearn.svm import SVC
svc_model1 = SVC(kernel = 'poly', random_state = 42,C=2,degree=2)

In [None]:
svc_model1.fit(X_train,y_train)

In [None]:
y_pred_svc1=svc_model1.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred_svc1)

# SVM ( rbf )

In [None]:
svc_model1 = SVC(kernel = 'rbf', random_state = 42,C=5)

In [None]:
svc_model1.fit(X_train,y_train)

In [None]:
y_pred_svc1=svc_model1.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred_svc1)

# SVM ( linear )

In [None]:
svc_model=SVC(C=5)

In [None]:
svc_model.fit(X_train,y_train)

In [None]:
y_pred_svc=svc_model.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred_svc)

# LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
log.fit(X_train,y_train)

In [None]:
y_pred_log=log.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred_log)

# KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 1, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)


In [None]:

cm1= confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=classifier.classes_)
plt.show()

print(cm)
accuracy_score(y_test, y_pred)



In [None]:
# Test-set error rate of a KNN classifier for every k from 1 to 19.
test_error_rates = []

for k in range(1, 20):
    # fit() returns the estimator itself, so it can be chained onto the constructor.
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_test = knn_model.predict(X_test)

    # Error rate is the complement of accuracy.
    test_error = 1 - accuracy_score(y_test, y_pred_test)
    test_error_rates.append(test_error)

In [None]:
# Test error as a function of k; the x values match the range swept above.
plt.figure(dpi=200, figsize=(10, 4))
plt.plot(range(1, 20), test_error_rates, label='Test Error')
plt.ylabel('Error Rate')
plt.legend()
plt.xlabel("K Value")