In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 📊 Loan Approval Classification Dataset Overview

This dataset offers a comprehensive view of **loan applications** along with **key financial and demographic attributes** that contribute to assessing **loan approval or rejection**. The data includes both **applicant-specific** and **loan-specific features** that can help model the probability of loan approval. Below is a quick summary:

- **🔢 Number of Records**: 45,000
- **🧩 Total Features**: 14 (mix of Categorical and Continuous types)

### 📝 Feature Descriptions:

| Column                               | Description                                                 | Data Type    |
|--------------------------------------|-------------------------------------------------------------|--------------|
| 👤 **person_age**                     | Applicant's age                                             | Float        |
| 🚻 **person_gender**                  | Applicant's gender                                          | Categorical  |
| 🎓 **person_education**               | Applicant's highest level of education                      | Categorical  |
| 💰 **person_income**                  | Applicant's annual income in currency                       | Float        |
| 📅 **person_emp_exp**                 | Years of employment experience                              | Integer      |
| 🏠 **person_home_ownership**          | Home ownership status (e.g., rent, own, mortgage)           | Categorical  |
| 🏦 **loan_amnt**                      | Amount of loan requested                                    | Float        |
| 🎯 **loan_intent**                    | Intended purpose of the loan (e.g., personal, education)    | Categorical  |
| 📈 **loan_int_rate**                  | Interest rate applicable to the loan                        | Float        |
| 📊 **loan_percent_income**            | Loan amount as a percentage of annual income                | Float        |
| 🕰️ **cb_person_cred_hist_length**     | Number of years of credit history                           | Float        |
| 💳 **credit_score**                   | Applicant's credit score                                    | Integer      |
| ❗ **previous_loan_defaults_on_file**  | Indicator of previous loan defaults (Yes/No)                | Categorical  |
| ✅ **loan_status**                    | Loan status outcome (1 = approved, 0 = rejected)            | Integer      |



<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">1. Imports and Setup</h1>
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

from sklearn.utils import resample
import category_encoders as ce
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

# <span style="color:transparent;">2. Load and Explore Dataset</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">2. Load and Explore Dataset</h1>
</div>

In [None]:
# Read the loan-approval CSV from the attached Kaggle dataset into `df`.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/loan-approval-classification-data/loan_data.csv'
)

In [None]:
# Display basic information about the dataset: shape, first rows, schema and
# summary statistics.
print("Shape of the dataset:", df.shape)
display(df.head())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nStatistical Summary:")
# Transposed so that each feature is a row, which reads better for wide frames.
display(df.describe().T)

A summary of the dataset:

- **Shape of the dataset**: (45,000, 14) - The dataset contains 45,000 records and 14 columns.
  
- **Columns and Data Types**:
   - `float64`: person_age, person_income, loan_amnt, loan_int_rate, loan_percent_income, cb_person_cred_hist_length
   - `int64`: person_emp_exp, credit_score, loan_status
   - `object`: person_gender, person_education, person_home_ownership, loan_intent, previous_loan_defaults_on_file

- **Statistical Summary**:
   - The `person_age` ranges from 20 to 144, which includes potential outliers (age > 100).
   - `person_income` has a mean of approximately 80,319 but also shows a high maximum value (7,200,766), indicating possible income outliers.
   - `loan_amnt` has a median of 8,000, with a max value of 35,000.
   - `credit_score` has a range from 390 to 850, which is typical for credit scores.
   - `loan_status` indicates that about 22% of loans were approved.

In [None]:
# Report the total number of missing cells and of fully duplicated rows
missing_total = df.isna().sum().sum()
duplicate_rows = df.duplicated().sum()

print(f'\nMissing values: {missing_total}')
print(f'Duplicated values: {duplicate_rows}')


The dataset has no missing values or duplicated records:

- **Missing values**: 0
- **Duplicated values**: 0

# <span style="color:transparent;">3. Unique Value Exploration</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">3. Unique Value Exploration</h1>
</div>

In [None]:
# Show how many distinct values each column holds
unique_counts = df.nunique()

print("\nUnique Values in Each Column:")
print(unique_counts)

In [None]:
# Split the column names by dtype: int64/float64 columns are treated as
# numerical, object columns as categorical.
numerical_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
non_numerical_columns = list(df.select_dtypes(include=['object']).columns)

# Print both lists so the split can be checked by eye
print("\nNumerical Columns:", numerical_columns)
print("Categorical Columns:", non_numerical_columns)


In [None]:
# List the distinct values found in every categorical column
for col in non_numerical_columns:
    distinct_values = df[col].unique()
    print(f"\nColumn: {col}")
    print(f"Unique Values: {distinct_values}")

The summary of unique values and column types:

### Unique Values in Each Column
- **person_age**: 60 unique values
- **person_gender**: 2 unique values (`female`, `male`)
- **person_education**: 5 unique values (`Master`, `High School`, `Bachelor`, `Associate`, `Doctorate`)
- **person_income**: 33,989 unique values
- **person_emp_exp**: 63 unique values
- **person_home_ownership**: 4 unique values (`RENT`, `OWN`, `MORTGAGE`, `OTHER`)
- **loan_amnt**: 4,483 unique values
- **loan_intent**: 6 unique values (`PERSONAL`, `EDUCATION`, `MEDICAL`, `VENTURE`, `HOMEIMPROVEMENT`, `DEBTCONSOLIDATION`)
- **loan_int_rate**: 1,302 unique values
- **loan_percent_income**: 64 unique values
- **cb_person_cred_hist_length**: 29 unique values
- **credit_score**: 340 unique values
- **previous_loan_defaults_on_file**: 2 unique values (`No`, `Yes`)
- **loan_status**: 2 unique values (target variable)

### Column Types
- **Numerical Columns**: `person_age`, `person_income`, `person_emp_exp`, `loan_amnt`, `loan_int_rate`, `loan_percent_income`, `cb_person_cred_hist_length`, `credit_score`, `loan_status`
- **Categorical Columns**: `person_gender`, `person_education`, `person_home_ownership`, `loan_intent`, `previous_loan_defaults_on_file`

# <span style="color:transparent;">4. Exploratory Data Analysis (EDA)</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">4. Exploratory Data Analysis (EDA)</h1>
</div>

In [None]:
# Visualise the target distribution: absolute counts (bar chart, left) and
# relative shares (pie chart, right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# Readable names for the encoded target classes (0 = rejected, 1 = approved)
status_names = {0: 'Rejected (0)', 1: 'Approved (1)'}

# Count the occurrences of each loan status (ordered by frequency)
loan_status_counts = df['loan_status'].value_counts()

# Bar plot for loan status distribution
sns.barplot(x=loan_status_counts.index, y=loan_status_counts, ax=axes[0], palette='tab10')
axes[0].set_title('Distribution of Loan Approval Status')
axes[0].set_ylabel('Count')
axes[0].set_xlabel('Loan Status (0 = Rejected, 1 = Approved)')

# Annotate bar plot with counts. Bar heights are floats, so they are cast to
# int to print "35000" rather than "35000.0".
for p in axes[0].patches:
    axes[0].annotate(f'{int(p.get_height())}',
                     (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='center',
                     xytext=(0, 10), textcoords='offset points')

sns.despine(left=True, bottom=True)

# Pie chart for percentage distribution of loan status
loan_status_percentage = loan_status_counts / loan_status_counts.sum() * 100
wedges = axes[1].pie(loan_status_percentage, labels=loan_status_percentage.index, autopct='%1.1f%%',
                     colors=sns.color_palette('tab10'))[0]
axes[1].set_title('Percentage Distribution of Loan Approval Status')

# Build the legend from the wedges and the class each wedge actually represents,
# so the labels stay correct whichever class happens to be more frequent.
legend_labels = [status_names.get(status, str(status)) for status in loan_status_percentage.index]
axes[1].legend(wedges, legend_labels)

# Adjust layout and show plot
plt.tight_layout()
plt.show()

The insights from the loan status distribution visualization:

* The count annotations on top of each bar provide a clear view of the actual number of records in each category, highlighting any imbalance in the dataset.
* This visualization suggests a higher count of rejected loans compared to approved ones, which may indicate a class imbalance that should be addressed when building predictive models.
* The pie chart illustrates the percentage distribution of loan approval status, offering a complementary view of the relative proportions of approvals and rejections.

In [None]:
# Function to perform univariate analysis for numeric columns
def univariate_analysis(data, columns):
    """Plot a histogram with a KDE overlay for each requested column.

    The plots are laid out in a two-column grid whose number of rows is derived
    from the number of columns, so any number of features can be passed
    (eight columns give the 4x2 grid on a 10x12 inch figure).

    Parameters
    ----------
    data : DataFrame-like
        Object indexable by column name.
    columns : iterable of str
        Names of the columns to plot; nothing is drawn when it is empty.
    """
    columns = list(columns)
    if not columns:
        return

    grid_cols = 2
    grid_rows = (len(columns) + grid_cols - 1) // grid_cols  # ceil division
    plt.figure(figsize=(10, 3 * grid_rows))

    for i, column in enumerate(columns, 1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.histplot(data[column], kde=True, bins=30, color='dodgerblue')
        plt.title(f'{column.replace("_", " ")} Distribution with KDE')
        plt.xlabel(column.replace('_', ' '))
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Numeric features to inspect with histograms
columns_to_analyze = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score',
]

univariate_analysis(data=df, columns=columns_to_analyze)

Some insights from the univariate analysis of the numeric columns:

1. **Person Age**:
   - The age distribution is slightly right-skewed, with most individuals in the dataset being between 20 and 40 years old.
   - The presence of a KDE (Kernel Density Estimate) helps smooth out the distribution, making it easier to visualize common age ranges.

2. **Person Income**:
   - The income distribution is highly right-skewed, with a large portion of incomes clustered at lower values.
   - Some very high income values are present, indicating potential outliers that could affect model performance if left unaddressed.

3. **Person Employment Experience**:
   - The majority of individuals have fewer than 10 years of experience, with a rapid decline in frequency as years increase.
   - A few instances show very high values, likely outliers, which could skew the analysis if not addressed.

4. **Loan Amount**:
   - The loan amounts are concentrated at lower values, indicating that most applicants are requesting smaller loans.
   - The distribution gradually tapers off, with a few applicants requesting high loan amounts.

5. **Loan Interest Rate**:
   - Interest rates are mostly clustered around 10% to 15%, which aligns with typical loan rates.
   - There’s a notable density between 5% and 10% as well, possibly indicating applicants with lower-risk profiles.

6. **Loan Percent Income**:
   - This distribution shows that most loan amounts are a small percentage of the applicant’s income, often less than 20%.
   - A few cases have a higher percentage, suggesting higher risk or lower-income applicants relative to their loan amounts.

7. **Credit History Length**:
   - The credit history length peaks at around 3 to 5 years, with fewer individuals having a credit history over 10 years.
   - This pattern can reflect a younger demographic or individuals newer to credit systems.

8. **Credit Score**:
   - Credit scores are distributed normally around the mid-range (600-700).
   - The distribution tails off near 850, which is the maximum score observed in the dataset.

In [None]:
# Boxplot-based univariate analysis for a single numeric column
def univariate_analysis(data, column, title):
    """Draw a horizontal boxplot of one column and print its summary statistics.

    Note: this reuses the name of the histogram helper defined in an earlier
    cell, so once this cell has run the name refers to this three-argument
    version.

    Parameters
    ----------
    data : DataFrame-like
        Object indexable by column name.
    column : str
        Name of the column to plot and summarise.
    title : str
        Human-readable label used in the plot title and the printed header.
    """
    fig, ax = plt.subplots(figsize=(10, 2))

    sns.boxplot(x=data[column], color='sandybrown', ax=ax)
    ax.set_title(f'{title} Boxplot')

    fig.tight_layout()
    plt.show()

    summary = data[column].describe()
    print(f'\nSummary Statistics for {title}:\n', summary)

# Numeric features to inspect with boxplots
columns_to_analyze = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score',
]

for column in columns_to_analyze:
    # Underscores are replaced by spaces to get a readable plot title
    readable_title = column.replace('_', ' ')
    univariate_analysis(df, column, readable_title)

Insights based on the boxplots and summary statistics for each numeric column:

1. **Person Age**:
   - Ages range from 20 to 144, with the median age at 26. The high maximum value suggests a few outliers.
   - The interquartile range (IQR) is fairly narrow, with most values between 24 and 30.

2. **Person Income**:
   - The income distribution has a significant range, from 8,000 to over 7.2 million, with a median around 67,048.
   - The high maximum value indicates extreme outliers, which could potentially skew analyses and models.

3. **Person Employment Experience**:
   - Most values fall below 10 years, with the median at 4 years.
   - A maximum of 125 years is unusual, indicating outliers or data anomalies.

4. **Loan Amount**:
   - Loan amounts vary from 500 to 35,000, with a median of 8,000.
   - The distribution suggests a broad spread, with some applicants requesting higher amounts near the upper limit.

5. **Loan Interest Rate**:
   - Interest rates range from 5.42% to 20%, with a median rate of 11.01%.
   - Most rates cluster within the IQR (8.59% to 12.99%), which is typical for loan interest rates.

6. **Loan Percent Income**:
   - This metric ranges from 0 to 0.66, with a median of 0.12, suggesting that most loans are less than 20% of the borrower’s income.
   - The presence of high values near 0.66 may indicate loans that represent a higher financial burden for some applicants.

7. **Credit History Length**:
   - Credit history spans from 2 to 30 years, with a median of 4 years.
   - Most applicants have shorter credit histories, likely reflecting a younger demographic.

8. **Credit Score**:
   - Credit scores range from 390 to 850, with a median of 640.
   - The distribution appears fairly symmetric around the mean of 632, with most values concentrated around the center of the distribution.

In [None]:
def plot_categorical_distribution(column_name, data=None):
    """Plot the distribution of one categorical column as counts and shares.

    The left panel is a horizontal count plot annotated with the count of each
    category; the right panel is a pie chart of the same value counts.

    Parameters
    ----------
    column_name : str
        Name of the categorical column to plot.
    data : DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level ``df``
        is looked up at call time. The frame passed here is now actually used;
        previously the argument was ignored and ``df`` was always plotted.
    """
    if data is None:
        data = df

    category_counts = data[column_name].value_counts()

    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    sns.countplot(y=column_name, data=data, palette='muted')
    plt.title(f'Distribution of {column_name}')

    # Write each bar's count just to the right of the bar end
    ax = plt.gca()
    for p in ax.patches:
        ax.annotate(f'{int(p.get_width())}', (p.get_width(), p.get_y() + p.get_height() / 2),
                    ha='center', va='center', xytext=(10, 0), textcoords='offset points')

    sns.despine(left=True, bottom=True)

    plt.subplot(1, 2, 2)
    # One explode offset per wedge so that all slices are slightly separated
    category_counts.plot.pie(autopct='%1.1f%%', colors=sns.color_palette('muted'), startangle=90,
                             explode=[0.05] * len(category_counts))
    plt.title(f'Percentage Distribution of {column_name}')
    plt.ylabel('')

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the count/pie pair for every categorical feature, in the same order as before
for categorical_column in (
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
):
    plot_categorical_distribution(categorical_column)

Insights based on the categorical distributions:

1. **Person Gender**:
   - The dataset is relatively balanced in terms of gender, though there might be a slight skew toward one gender, depending on the counts.
   - The pie chart offers a visual summary of the proportion of each gender in the dataset.

2. **Person Education**:
   - Most applicants have a high school, bachelor’s, or master’s education, with fewer holding an associate or doctorate degree.
   - The education distribution could influence loan approval patterns, as education level often correlates with income and creditworthiness.

3. **Person Home Ownership**:
   - The majority of applicants either rent or own homes, with a smaller number having mortgages or classified under "other."
   - Applicants with different homeownership statuses may have varying financial stability, impacting their credit risk.

4. **Loan Intent**:
   - Loan purposes are diverse, with common intents like personal use, debt consolidation, medical expenses, and education.
   - The distribution reveals common reasons for seeking loans, which may impact loan approval criteria depending on risk.

5. **Previous Loan Defaults on File**:
   - Most applicants have no record of previous loan defaults, though there is a significant portion with defaults.
   - This feature can strongly influence loan decisions, as past defaults indicate higher risk.


In [None]:
# Create subplots for loan status by categorical features
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle("Loan Status by Categorical Features", fontsize=18)

# One entry per panel: (column, panel title, x-axis label, x tick rotation)
categorical_panels = [
    ('person_gender', "Loan Status by Gender", "Gender", 0),
    ('person_education', "Loan Status by Education Level", "Education Level", 45),
    ('person_home_ownership', "Loan Status by Home Ownership", "Home Ownership", 0),
    ('loan_intent', "Loan Status by Loan Intent", "Loan Intent", 0),
    ('previous_loan_defaults_on_file', "Loan Status by Previous Loan Defaults", "Previous Loan Defaults", 0),
]
status_labels = ['0 = Rejected', '1 = Approved']

for ax, (column, panel_title, x_label, tick_rotation) in zip(axes.flat, categorical_panels):
    sns.countplot(data=df, x=column, hue='loan_status', ax=ax, palette='muted')
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Count")

    # Pass the hue handles explicitly. With labels alone, matplotlib pairs the
    # labels with the first artists on the axes (two bars of the same hue), so
    # both legend entries would get the same colour.
    # Assumes the handles come back in hue order (0, then 1) - TODO confirm.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=status_labels, title='Loan Status')

    if tick_rotation:
        ax.tick_params(axis='x', rotation=tick_rotation)

# Remove the panels of the 2x3 grid that have no feature assigned
for unused_ax in axes.flat[len(categorical_panels):]:
    fig.delaxes(unused_ax)

# Adjust layout (leaving room for the suptitle) and show plot
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

Insights based on the relationship between **loan status** and various categorical features:

1. **Loan Status by Gender**:
   - Loan approvals and rejections are fairly balanced across genders, though there may be slight variations.
   - This balance suggests that gender may not be a strong determining factor in loan approval outcomes.

2. **Loan Status by Education Level**:
   - Applicants with higher education levels (Bachelor's, Associate's, etc.) seem to have a higher count of loan approvals compared to those with lower education levels.
   - Education level could be a predictor of loan approval, as it often correlates with higher income and creditworthiness.

3. **Loan Status by Home Ownership**:
   - Individuals who rent appear to have a higher rate of loan rejections compared to those with mortgages or who own homes.
   - This trend might indicate that home ownership status is considered a risk factor, as renters may have less financial stability than homeowners.

4. **Loan Status by Loan Intent**:
   - Certain loan purposes, like debt consolidation and personal loans, show more loan rejections than approvals.
   - Conversely, loan intents for ventures and education appear to have relatively balanced approval and rejection rates, possibly due to the perceived potential for income generation or improvement.

5. **Loan Status by Previous Loan Defaults**:
   - Applicants with a history of previous loan defaults have a significantly higher rejection rate compared to those without defaults.
   - This feature likely has a strong influence on loan status, as past defaults signal higher credit risk.

In [None]:
# Numeric features to compare between approved and rejected loans
numerical_columns = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']

# Two plots per row; the number of rows follows from the number of features
kde_grid_cols = 2
kde_grid_rows = (len(numerical_columns) + kde_grid_cols - 1) // kde_grid_cols
fig, axes = plt.subplots(kde_grid_rows, kde_grid_cols, figsize=(16, 5 * kde_grid_rows), squeeze=False)
fig.suptitle('Numerical Features vs Loan Status (Density Plots)', fontsize=16)

for ax, col in zip(axes.flat, numerical_columns):
    # common_norm=False normalises each loan-status group separately
    sns.kdeplot(data=df, x=col, hue='loan_status', ax=ax, fill=True, common_norm=False, palette='muted')
    ax.set_title(f'{col} vs Loan Status')
    ax.set_xlabel(col)
    ax.set_ylabel('Density')

# Delete only the axes that received no plot. The previous unconditional
# fig.delaxes(axes[3, 1]) removed the eighth plot (credit_score), because all
# eight axes of the 4x2 grid are in use.
for unused_ax in axes.flat[len(numerical_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

In [None]:
# One boxplot panel per numerical feature, stacked vertically and split by loan status
n_features = len(numerical_columns)
fig, axes = plt.subplots(n_features, 1, figsize=(14, 20))
fig.suptitle('Boxplots of Numerical Features by Loan Status', fontsize=16)

for panel, feature in zip(axes, numerical_columns):
    sns.boxplot(data=df, x='loan_status', y=feature, ax=panel, palette='muted')
    panel.set(
        title=f'{feature} vs Loan Status',
        xlabel='Loan Status',
        ylabel=feature,
    )

# Leave a little room at the top for the figure title
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

Insights from the boxplots of numerical features by loan status:

1. **Person Age**:
   - Approved loans tend to have a slightly younger median age, though the difference is subtle.
   - The distribution range is wider for rejected loans, with outliers at the upper end, possibly indicating higher age as a minor risk factor.

2. **Person Income**:
   - Approved loans generally correspond to applicants with higher incomes.
   - The median income is notably higher for approved loans, and there are many high-income outliers for approved applications, suggesting that income positively impacts approval.

3. **Person Employment Experience**:
   - Higher employment experience shows a slight correlation with loan approval, as approved applicants have a higher median experience.
   - However, both approved and rejected groups share a wide range, implying other factors might play a stronger role.

4. **Loan Amount**:
   - Loan amounts are relatively similar across approved and rejected groups, though slightly higher medians are observed in rejected loans.
   - This may suggest that larger loans are slightly more prone to rejection, but the difference is not substantial.

5. **Loan Interest Rate**:
   - Approved loans tend to have slightly lower interest rates on average compared to rejected loans.
   - This difference aligns with a higher perceived risk for applicants with higher interest rates, possibly due to lower credit scores.

6. **Loan Percent Income**:
   - Applicants with approved loans generally have lower loan-to-income ratios, indicating that loans making up a smaller percentage of income are more likely to be approved.
   - High loan-to-income ratios in rejected loans suggest that lenders are cautious when loan amounts represent a significant portion of income.

7. **Credit History Length**:
   - Longer credit histories are observed for approved loans, suggesting that applicants with established credit histories have a higher likelihood of approval.
   - The trend reflects lenders’ preference for borrowers with more credit experience.

8. **Credit Score**:
   - Approved loans are associated with higher credit scores, as expected.
   - This significant difference highlights credit score as a strong predictor of loan approval, with higher scores reflecting lower perceived risk.

In [None]:
# Count infinite entries per numerical column (shown as the cell output)
df[numerical_columns].apply(np.isinf).sum()

- There are no infinite values in any of the numerical columns. Each column in `numerical_columns` has zero occurrences of infinite values.

In [None]:
# Create the catplot for person_education vs loan_status by person_gender
g = sns.catplot(
    data=df,
    x='person_education',
    hue='loan_status',
    col='person_gender',
    kind='count',
    height=5,
    aspect=1.2,
    palette='muted',
    legend=False
)

# Set axis labels and titles
g.set_axis_labels("Education Level", "Count")
g.set_titles("Gender: {col_name}")

# Annotate bars with counts
for ax in g.axes.flat:
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center',
                    va='baseline',
                    fontsize=10,
                    color='black',
                    xytext=(0, 5),
                    textcoords='offset points')

# Add legend for loan status on the last facet.
# Handles are passed explicitly: with labels alone, matplotlib pairs the labels
# with the first artists on the axes (two bars of the same hue), so both
# entries would get the same colour.
# Assumes one bar container per hue level, in hue order (0, then 1) - TODO
# confirm on the installed seaborn version.
legend_ax = g.axes.flat[-1]
legend_handles = [bars[0] for bars in legend_ax.containers if len(bars)]
legend_ax.legend(
    handles=legend_handles,
    labels=['0 = Rejected', '1 = Approved'],
    title="Loan Status",
    loc='upper right',
    bbox_to_anchor=(1.15, 1)
)

# Show the plot
plt.show()

Insights for **education level vs. loan status by gender**:

1. **Gender Comparison**:
   - Both male and female applicants show similar trends in loan approval and rejection rates across education levels, though there may be slight variations.

2. **Education Level and Loan Status**:
   - For both genders, applicants with higher education levels (e.g., Bachelor's, Master's) tend to have higher loan approval counts.
   - Applicants with only a high school education appear to have more rejections than approvals, indicating that education level might influence loan outcomes, likely due to its association with income stability.

3. **Approval Pattern by Gender**:
   - Among both genders, there is a noticeable trend where more educated applicants (e.g., those with Master’s and Bachelor’s degrees) are more likely to get approved, which suggests that education level is a positive indicator for loan approval, possibly reflecting a more stable financial profile.

4. **Rejected Applications**:
   - The rejection rate is higher for applicants with lower educational attainment, which might point to a perceived higher risk by lenders.

These patterns suggest that **education level** is an influential factor in loan approval decisions, and it interacts similarly across both genders. 

In [None]:
# Create the catplot for person_home_ownership vs loan_status by person_gender
g = sns.catplot(
    data=df,
    x='person_home_ownership',
    hue='loan_status',
    col='person_gender',
    kind='count',
    height=5,
    aspect=1.2,
    palette='muted',
    legend=False
)

# Set axis labels and titles
g.set_axis_labels("Home Ownership", "Count")
g.set_titles("Gender: {col_name}")

# Annotate bars with counts
for ax in g.axes.flat:
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center',
                    va='baseline',
                    fontsize=10,
                    color='black',
                    xytext=(0, 5),
                    textcoords='offset points')

# Add legend for loan status on the last facet.
# Handles are passed explicitly: with labels alone, matplotlib pairs the labels
# with the first artists on the axes (two bars of the same hue), so both
# entries would get the same colour.
# Assumes one bar container per hue level, in hue order (0, then 1) - TODO
# confirm on the installed seaborn version.
legend_ax = g.axes.flat[-1]
legend_handles = [bars[0] for bars in legend_ax.containers if len(bars)]
legend_ax.legend(
    handles=legend_handles,
    labels=['0 = Rejected', '1 = Approved'],
    title="Loan Status",
    loc='upper right',
    bbox_to_anchor=(1.15, 1)
)

# Show the plot
plt.show()

Insights for **home ownership vs. loan status by gender**:

1. **Home Ownership and Loan Status**:
   - Both male and female applicants with a mortgage or who own their home have higher counts of approved loans compared to those who rent or are categorized under “other.”
   - Renting is associated with a higher rejection rate, possibly indicating that renters may be perceived as having a higher credit risk.

2. **Gender-Specific Patterns**:
   - While both genders show similar trends, males who own homes or have a mortgage appear to have a slightly higher number of loan approvals compared to females with similar homeownership status.
   - In both genders, applicants under the “other” category have relatively fewer approvals, suggesting that homeownership stability (owning or mortgaging) is a positive factor for loan approval.

3. **Rejected Applications**:
   - Among renters, the rejection rate is notably high across both genders, which could imply that renters might lack certain financial security or creditworthiness that homeowners possess.

4. **Homeownership as a Stability Indicator**:
   - The data suggests that applicants who own or have a mortgage are viewed more favorably, likely due to perceived financial stability, which positively influences loan approval chances.

These observations indicate that **homeownership** is a significant predictor of loan approval, reflecting the applicant's financial stability, and is consistent across both genders. 

In [None]:
# Create the catplot for loan_intent vs loan_status by person_gender
g = sns.catplot(
    data=df,
    x='loan_intent',
    hue='loan_status',
    col='person_gender',
    kind='count',
    height=5,
    aspect=1.2,
    palette='muted',
    legend=False
)

# Set axis labels and titles
g.set_axis_labels("Loan Intent", "Count")
g.set_titles("Gender: {col_name}")

# Rotate x-axis labels by 90 degrees. tick_params changes only the rotation,
# so the tick labels do not have to be read back and re-set.
for ax in g.axes.flat:
    ax.tick_params(axis='x', rotation=90)

# Annotate bars with counts
for ax in g.axes.flat:
    for p in ax.patches:
        ax.annotate(f'{int(p.get_height())}',
                    (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center',
                    va='baseline',
                    fontsize=10,
                    color='black',
                    xytext=(0, 5),
                    textcoords='offset points')

# Add legend for loan status on the last facet.
# Handles are passed explicitly: with labels alone, matplotlib pairs the labels
# with the first artists on the axes (two bars of the same hue), so both
# entries would get the same colour.
# Assumes one bar container per hue level, in hue order (0, then 1) - TODO
# confirm on the installed seaborn version.
legend_ax = g.axes.flat[-1]
legend_handles = [bars[0] for bars in legend_ax.containers if len(bars)]
legend_ax.legend(
    handles=legend_handles,
    labels=['0 = Rejected', '1 = Approved'],
    title="Loan Status",
    loc='upper right',
    bbox_to_anchor=(1.15, 1)
)

# Show the plot
plt.show()

Insights from the catplot of **loan intent vs. loan status by gender**:

1. **Loan Intent and Approval Likelihood**:
   - For both male and female applicants, loan intents such as **debt consolidation** and **personal loans** have higher rejection counts compared to approval, suggesting these purposes might carry higher perceived risk.
   - Conversely, loan intents like **education** and **home improvement** show a relatively balanced approval-to-rejection ratio, indicating these are viewed more favorably.

2. **Gender Differences in Loan Intent**:
   - Both genders follow similar trends in loan intent, though some differences exist. For example, females seeking loans for education and medical purposes have relatively high approval rates, whereas male applicants appear to have more approvals for venture loans.
   - This could reflect varying approval policies based on loan intent and demographic profiles.

3. **High Rejection for Debt Consolidation**:
   - Debt consolidation loans have a notably higher rejection rate across both genders. This trend suggests lenders may associate such loans with higher financial risk, possibly due to previous debt issues.

4. **Approved Applications for Productive Purposes**:
   - Loan purposes with productive or asset-building outcomes, like home improvement and education, have relatively higher approval rates. This indicates that lenders may view these purposes as investments in the applicant's long-term stability.

These observations suggest that **loan intent** is an important factor in loan approval, as certain purposes are associated with higher risks, impacting loan decisions across both genders. 

In [None]:
# Faceted count plot: previous-default flag on the x-axis, bars split by
# loan status, one panel per gender. The automatic legend is switched off
# because a custom legend is attached at the end of the cell.
g = sns.catplot(
    data=df, kind='count',
    x='previous_loan_defaults_on_file', hue='loan_status', col='person_gender',
    height=5, aspect=1.2,
    palette='muted', legend=False,
)
g.set_axis_labels("Previous Loan Defaults", "Count")
g.set_titles("Gender: {col_name}")

# Write every bar's count 5 points above its top edge
for ax in g.axes.flat:
    for bar in ax.patches:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        ax.annotate(
            f'{int(bar_top)}',
            (bar_centre, bar_top),
            xytext=(0, 5), textcoords='offset points',
            ha='center', va='baseline',
            fontsize=10, color='black',
        )

# Custom legend for loan status; pyplot attaches it to the current axes and
# the labels are matched to the plotted artists purely by order
plt.legend(
    labels=['0 = Rejected', '1 = Approved'],
    title="Loan Status",
    bbox_to_anchor=(1.15, 1),
    loc='upper right',
)

plt.show()

Insights for **previous loan defaults vs. loan status by gender**:

1. **Impact of Previous Defaults on Loan Approval**:
   - Applicants with previous loan defaults have a significantly higher count of rejections for both genders, indicating that a history of loan defaults is a strong negative factor in loan approval decisions.
   - Those without previous loan defaults have noticeably more approvals, highlighting that a clean credit history is associated with higher approval rates.

2. **Gender-Specific Observations**:
   - Both male and female applicants with no prior defaults show similar approval and rejection patterns, suggesting that previous defaults affect loan decisions consistently across genders.
   - Among applicants with previous defaults, the rejection rate is overwhelmingly high for both genders, reinforcing the importance lenders place on a clean credit history.

3. **High Rejection for Defaults**:
   - The high rejection rate for applicants with prior defaults suggests that lenders view these individuals as higher risk, irrespective of other factors, which strongly influences loan status outcomes.

These patterns underscore that **previous loan defaults** are a crucial factor in loan approval, and maintaining a clear credit record is beneficial for both genders when seeking loan approvals. 

In [None]:
# Two side-by-side violin plots comparing the loan_status groups:
# left panel for the loan amount, right panel for the interest rate.
fig, (ax_amount, ax_rate) = plt.subplots(1, 2, figsize=(12, 6))

sns.violinplot(data=df, x='loan_status', y='loan_amnt', ax=ax_amount)
ax_amount.set_title('Loan Amount Distribution by Loan Status')

sns.violinplot(data=df, x='loan_status', y='loan_int_rate', ax=ax_rate)
ax_rate.set_title('Loan Interest Rate Distribution by Loan Status')

fig.tight_layout()
plt.show()

Insights for **loan amount** and **loan interest rate** by loan status:

1. **Loan Amount Distribution by Loan Status**:
   - The distribution of loan amounts differs between approved and rejected loans.
   - Approved loans show a broader spread across different loan amounts, with a central tendency around mid-range amounts.
   - Rejected loans tend to cluster at both low and high loan amounts, indicating that very small or large loan requests are more likely to be rejected.

2. **Loan Interest Rate Distribution by Loan Status**:
   - Loan interest rates are higher for rejected loans, with a central tendency around a higher rate compared to approved loans.
   - Approved loans have a narrower distribution with lower average interest rates, suggesting that lower-risk (lower-interest) loans have better chances of approval.

In [None]:
# Numerical features to compare pairwise. Despite the variable name, the
# target itself is not in this list; it is appended just below for colouring.
numerical_columns_with_target = [
    'person_age', 'person_income', 'person_emp_exp',
    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
    'cb_person_cred_hist_length', 'credit_score',
]

# Scatter/density matrix of the features above, coloured by loan_status
pairplot_columns = [*numerical_columns_with_target, 'loan_status']
sns.pairplot(df[pairplot_columns], hue='loan_status', palette='muted')
plt.show()

# <span style="color:transparent;">5. Data Preprocessing</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">5. Data Preprocessing</h1>
</div>

In [None]:
def _encode_categories(values, mapping):
    """Translate category labels to integer codes without creating silent NaNs.

    Parameters
    ----------
    values : pd.Series
        Column holding the raw category labels.
    mapping : dict
        Label -> integer code.

    Returns
    -------
    pd.Series
        ``values.map(mapping)``; entries that were already missing stay missing.

    Raises
    ------
    ValueError
        If a non-missing value has no entry in ``mapping``. This also fires
        when the cell is re-run on an already encoded column, instead of
        quietly turning the whole column into NaN.
    """
    encoded = values.map(mapping)
    unmapped = values[encoded.isna() & values.notna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {values.name!r} contains values with no encoding: "
            f"{sorted(map(str, unmapped))}"
        )
    return encoded


# Binary encodings
gender_codes = {'female': 0, 'male': 1}
default_codes = {'No': 0, 'Yes': 1}

# Ordinal encoding: a higher code means a higher level of education
education_order = {'High School': 1, 'Associate': 2, 'Bachelor': 3,
                   'Master': 4, 'Doctorate': 5}

# All three encodings are computed (and validated) before anything is
# assigned, so a failure leaves df exactly as it was.
df = pd.get_dummies(
    df.assign(
        person_gender=_encode_categories(df['person_gender'], gender_codes),
        previous_loan_defaults_on_file=_encode_categories(
            df['previous_loan_defaults_on_file'], default_codes
        ),
        person_education=_encode_categories(df['person_education'], education_order),
    ),
    # One-hot encode the remaining nominal columns, dropping the first level of each
    columns=['person_home_ownership', 'loan_intent'],
    drop_first=True,
)

# Display the transformed DataFrame
display(df.head())

The transformation applied to the dataset:

1. **Binary Encoding**:
   - `person_gender` is now represented as `0` (female) and `1` (male).
   - `previous_loan_defaults_on_file` is represented as `0` (No) and `1` (Yes).

2. **Ordinal Encoding**:
   - `person_education` has been mapped based on the level of education, with higher values indicating higher educational attainment (e.g., `High School` = 1, `Doctorate` = 5).

3. **One-Hot Encoding**:
   - New columns have been created for `person_home_ownership` and `loan_intent`, each representing a unique category (e.g., `person_home_ownership_OWN`, `loan_intent_PERSONAL`), with one category dropped to avoid multicollinearity.

In [None]:
# Ages above this threshold are treated as implausible and replaced
MAX_PLAUSIBLE_AGE = 100

# Replace implausible ages with the column median. The median is taken over
# the full column (outliers included), and the replacement is vectorised
# rather than applied row by row.
median_age = df['person_age'].median()
df['person_age'] = df['person_age'].mask(df['person_age'] > MAX_PLAUSIBLE_AGE, median_age)

- The maximum age value of 144 is indeed an outlier, as it exceeds a reasonable human lifespan. To handle this:

    - **Replacing Outliers with Median:** Replace ages above a certain threshold (e.g., 100) with the median age of the dataset to maintain a realistic distribution.

In [None]:
# Column under inspection and a human-readable version of its name
column = 'person_age'
title = column.replace('_', ' ')

# Print the descriptive statistics of the column under a labelled header
age_summary = df[column].describe()
print(f'\nSummary Statistics for {title}:\n', age_summary)

After replacing outliers with the median age:

   - The maximum age is now **94**, which falls within a reasonable range for human age data.
   - The **mean age** (27.75) and **standard deviation** (5.91) have slightly decreased, suggesting a more compact age distribution.
   - Replacing extreme values with the median has helped to eliminate unrealistic values without removing any rows, preserving the dataset's integrity.
   - This approach retains a realistic spread in the `person_age` data, with the **median** remaining at **26**.



# <span style="color:transparent;">6. Correlation Heatmap</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">6. Correlation Heatmap</h1>
</div>

In [None]:
# Pairwise correlations between all columns of df (reused by the next cell)
corr_matrix = df.corr()

# Annotated heatmap of the full correlation matrix
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Keep only the loan_status column of the correlation matrix (as a
# one-column frame) and order it from most positive to most negative
target_variable = 'loan_status'
target_corr = (
    corr_matrix
    .loc[:, [target_variable]]
    .sort_values(by=target_variable, ascending=False)
)

# Narrow annotated heatmap of each column's correlation with the target
fig, ax = plt.subplots(figsize=(4, 6))
sns.heatmap(target_corr, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title(f'Correlation with {target_variable}')
plt.show()

The correlation values of each feature with the target variable, `loan_status`, provide insights into which features might be significant predictors for loan approval. 

### Positively Correlated Features with `loan_status`
1. **`loan_percent_income` (0.38)**: This is the strongest positive correlation with `loan_status`, suggesting that applicants with higher loan amounts relative to their income may have a higher likelihood of approval.
2. **`loan_int_rate` (0.33)**: Higher interest rates are positively correlated with approval status, which may indicate that riskier applicants, or those with higher interest rates, are still often approved.
3. **`person_home_ownership_RENT` (0.26)**: Renting status has a positive correlation, suggesting that renters might have slightly higher approval rates than other home ownership statuses.

### Weak Positive Correlations
- **`loan_amnt` (0.11)**: The loan amount has a weak positive correlation, indicating a slight tendency for higher loan amounts to be approved.
- **Loan intents like `MEDICAL` (0.07) and `HOMEIMPROVEMENT` (0.03)** also show weak positive correlations, suggesting some specific loan purposes might influence approval slightly.

### Negatively Correlated Features with `loan_status`
1. **`previous_loan_defaults_on_file` (-0.54)**: This is the most substantial negative correlation, indicating that a history of previous loan defaults is a significant factor in reducing approval chances.
2. **`person_income` (-0.14)**: Higher income is weakly negatively correlated with loan approval, which may seem counterintuitive. This could indicate that applicants with higher incomes might apply for higher-risk loans or that lower-income applicants are offered more approvals based on specific criteria.
3. **`person_home_ownership_OWN` (-0.09)**: Owned home status has a slight negative correlation, suggesting it may not necessarily favor approvals over renting or other ownership statuses.
4. **Loan intents like `VENTURE` (-0.09) and `EDUCATION` (-0.06)** have negative correlations with approval, possibly reflecting a perception of higher risk associated with these loan purposes.

### Implications
- The strongest predictors for loan approval appear to be **loan percent income**, **loan interest rate**, and **previous loan defaults**.
- **Credit score**, surprisingly, has a near-zero correlation (-0.007), suggesting it may not play a major role in approval within this dataset, or it may interact in complex ways with other features.

With these relationships in mind, we can proceed with modeling!

# <span style="color:transparent;">7. Model Training and Evaluation</span>

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">7. Model Training and Evaluation</h1>
</div>

In [None]:
# Target vector: the loan_status column
y = df['loan_status']

# Feature matrix: every remaining column
X = df.drop(columns='loan_status')

# Preview the first rows of both
display(X.head(), y.head())

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. No `stratify` argument is passed, so the class proportions
# of the two parts are not forced to match.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the RobustScaler on the training features only, then apply the same
# fitted transformation to both splits. X_train and X_val are overwritten
# with their scaled versions; every feature column is scaled.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [None]:
# Seed shared by every estimator so the runs are reproducible
_seeded = {"random_state": 42}

# Candidate classifiers, keyed by the display name used in reports and plots
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, **_seeded),
    "Random Forest": RandomForestClassifier(**_seeded),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', **_seeded),
    "LightGBM": LGBMClassifier(verbosity=-1, **_seeded),
    "CatBoost": CatBoostClassifier(verbose=0, **_seeded),
}

In [None]:
def _plot_confusion_matrix(cm, model_name):
    """Draw an annotated heatmap of a two-class confusion matrix.

    Parameters
    ----------
    cm : array-like
        Confusion matrix as returned by ``confusion_matrix`` (rows = actual,
        columns = predicted).
    model_name : str
        Name shown in the figure title.
    """
    class_labels = ['Rejected', 'Approved']
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='PuBu',
                xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


def _evaluate_model(name, model):
    """Fit one model on the training split and report on the validation split.

    Prints the classification report, plots the confusion matrix and returns
    one row for the performance table.

    Parameters
    ----------
    name : str
        Display name of the model.
    model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``score``.

    Returns
    -------
    dict
        Keys 'Model', 'Train Score', 'Test Score' and 'Accuracy Score'.
    """
    model.fit(X_train, y_train)

    # Predict the validation set once and reuse the predictions everywhere
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Accuracy on the data the model was fitted on, for spotting overfitting
    train_score = model.score(X_train, y_train)

    print(f"Classification Report for {name}:\n")
    print(classification_report(y_val, y_val_pred))

    _plot_confusion_matrix(confusion_matrix(y_val, y_val_pred), name)

    print("\n" + "="*60 + "\n")

    return {
        'Model': name,
        'Train Score': train_score,
        # A classifier's score() on the validation set is this same accuracy,
        # so the value is reused instead of running a second prediction pass.
        # The column is kept so the table layout stays unchanged.
        'Test Score': val_accuracy,
        'Accuracy Score': val_accuracy,
    }


# Train and evaluate every candidate model, collecting one table row each
results = []
for name, model in models.items():
    results.append(_evaluate_model(name, model))

results_df = pd.DataFrame(results)

# Display the model performance table
print("Model Performance Table:")
display(results_df)

### Key Observations:

* Logistic Regression: Moderate performance, likely due to its linear nature, which may not fully capture the complex relationships in the data.
* Random Forest, XGBoost, LightGBM, and CatBoost: These models performed strongly, achieving high accuracy scores (around 0.93). Ensemble models (Random Forest, XGBoost, LightGBM, CatBoost) generally perform well on structured data, benefiting from the ability to capture non-linear relationships.
* Best Model: XGBoost achieved the highest accuracy of 0.9346, making it the preferred choice for this task.


In [None]:
# Row of the performance table with the highest validation accuracy
# (idxmax returns the first such row when several models tie)
best_idx = results_df['Accuracy Score'].idxmax()
best_model_row = results_df.loc[best_idx]

# Pull the winner's name and score out of that row
best_model_name, best_model_accuracy = best_model_row[['Model', 'Accuracy Score']]

print(f"\nBest Model: {best_model_name} with Accuracy: {best_model_accuracy:.4f}")

In [None]:
# Fitted estimator that won the accuracy comparison
best_model = models[best_model_name]

# Only estimators exposing `feature_importances_` can be plotted
if not hasattr(best_model, 'feature_importances_'):
    print(f"The {best_model_name} model does not support feature importances.")
else:
    feature_importances = best_model.feature_importances_

    # Pair each importance with its column name, most important first
    feature_importance_df = (
        pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
        .sort_values(by='Importance', ascending=False)
    )

    # Horizontal bar chart of the ranked importances
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=feature_importance_df, x='Importance', y='Feature',
                palette='viridis', ax=ax)
    ax.set_title(f'Feature Importances - {best_model_name}', fontsize=16)
    ax.set_xlabel('Importance Score')
    ax.set_ylabel('Feature')
    fig.tight_layout()
    plt.show()

### Feature Importance Analysis
* For XGBoost (the best model), feature importances were derived, providing insights into which features were most influential in predicting loan approval.
* Key Features: Features likely to have high importance scores include `loan_percent_income` and `previous_loan_defaults_on_file`, as these factors are closely related to an applicant's creditworthiness and loan risk.

In [None]:
# Probability of the positive class (second predict_proba column) for each
# validation row, taken from the best model
test_probabilities = best_model.predict_proba(X_val)[:, 1]

# Histogram with a KDE overlay of those probabilities
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(test_probabilities, bins=30, kde=True, color='dodgerblue', ax=ax)
ax.set_title(f'Distribution of Predicted Loan Approval Probabilities - {best_model_name}')
ax.set_xlabel('Predicted Probability of Loan Approval')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 1)
ax.grid()
plt.show()


In [None]:
# Turn probabilities into class labels: strictly above 0.5 -> 1, otherwise 0
binary_predictions = (test_probabilities > 0.5).astype(int)

# Bar chart of how many validation rows fall into each predicted class
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=binary_predictions.flatten(), palette='muted', ax=ax)
ax.set_title('Distribution of Predicted Loan Status')
ax.set_xlabel('Loan Status (0: Not Approved, 1: Approved)')
ax.set_ylabel('Count')

# Replace the numeric tick labels with readable class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Approved', 'Approved'])

ax.grid(axis='y')
plt.show()

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 10px; background-color: #faebd7; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #191970; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 24px;">🚀 If you found this notebook helpful, please consider giving it an upvote! 👍</h1>
    <p style="color: #191970; font-size: 18px; text-align: center;">Your support motivates me to create more useful content like this, and it helps others discover the notebook too! 🙌</p>
    <p style="color: #191970; font-size: 18px; text-align: center;">Thank you for your time, and I hope this notebook brings value to your data science journey! 💡😊</p>
</div>