In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the cleaned loan dataset (path is relative to the notebook's working directory)
# and preview its first rows.
data_path = "../Data/clean_data/cleaned_loan_data.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Target variable analysis (loan_status): inspect the class distribution to gauge
# how hard the prediction problem is.
# The imbalance is not severe, so no resampling is applied.
class_shares = df['loan_status'].value_counts(normalize=True)
print(class_shares)
sns.countplot(data=df, x='loan_status')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Target Variable Analysis (loan_status): print the class shares, then draw a
# colored count plot and save it to disk.
print(df['loan_status'].value_counts(normalize=True))

# The palette carries both string and boolean keys so the colors resolve whether
# loan_status holds 'True'/'False' strings or real booleans.
status_palette = {'True': 'red', 'False': 'lightgray',
                  True: 'red', False: 'lightgray'}

plt.figure(figsize=(8, 6))
# Passing `palette` without `hue` is deprecated in recent seaborn releases, so the
# x variable is also assigned to `hue`; dodge=False keeps a single bar per class.
ax = sns.countplot(x='loan_status', hue='loan_status', data=df,
                   palette=status_palette, dodge=False)
# A legend would only repeat the x tick labels; drop it if seaborn drew one.
if ax.get_legend() is not None:
    ax.get_legend().remove()

# Add title and labels
plt.title('Loan Status Distribution')
plt.xlabel('Loan Status')
plt.ylabel('Count')

# Format the y-axis to display full numbers (no scientific notation)
plt.ticklabel_format(style='plain', axis='y')

# Save before plt.show(), which finalizes the figure in the notebook
plt.savefig('loan_status_distribution.png', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

In [None]:
# Numeric features vs. loan status: look for trends in defaults across numeric features.
# loan_to_income_ratio, person_income_log and loan_amount_log show the clearest
# variation between default and non-default.
num_cols = [
    'person_age',
    'person_income_log',
    'loan_amount_log',
    'loan_to_income_ratio',
    'cb_credit_history_length',
    'person_income',
    'loan_amount',
]
# One boxplot per numeric feature, grouped by loan_status; each gets its own figure.
for feature in num_cols:
    sns.boxplot(data=df, x='loan_status', y=feature)
    plt.show()

In [None]:
# NOTE(review): this cell repeats the boxplot loop of the preceding cell verbatim;
# consider deleting one of the two.
num_cols = [
    'person_age', 'person_income_log', 'loan_amount_log',
    'loan_to_income_ratio', 'cb_credit_history_length',
    'person_income', 'loan_amount',
]
# Draw and display one loan_status-grouped boxplot per listed column.
for numeric_col in num_cols:
    sns.boxplot(y=numeric_col, x='loan_status', data=df)
    plt.show()

In [None]:
# Personal income distribution by loan status, saved as a PNG.
#
# The palette and order below use string keys, so loan_status is cast to str on a
# plotting copy only. This cell does not overwrite df['loan_status']: changing the
# dtype of the shared frame silently alters what later cells see.
income_plot_df = df.assign(loan_status=df['loan_status'].astype(str))

# Define the color palette and the category order
palette = {'True': 'red', 'False': 'lightgray'}
order = ['False', 'True']  # 'False' (non-default) on the left, 'True' (default) on the right

# Create the boxplot with the specified order
plt.figure(figsize=(8, 6))
# `palette` without `hue` is deprecated in recent seaborn releases, so the x variable
# is also assigned to `hue`; dodge=False keeps one centered box per class.
ax = sns.boxplot(
    x='loan_status',
    y='person_income',
    hue='loan_status',
    data=income_plot_df,
    palette=palette,
    order=order,
    hue_order=order,
    dodge=False,
)
# A legend would only repeat the x tick labels; drop it if seaborn drew one.
if ax.get_legend() is not None:
    ax.get_legend().remove()

# Customize the plot
plt.title('Personal Income Distribution by Loan Status', fontsize=14)
plt.xlabel('Loan Status (False = non-default, True = Default)', fontsize=12)
plt.ylabel('Income', fontsize=12)

# Save the plot
plt.savefig('person_income_by_loan_status.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Loan interest rate distribution by loan status, saved as a PNG.
#
# The palette and order below use string keys, so loan_status is cast to str on a
# plotting copy only. This cell does not overwrite df['loan_status']: changing the
# dtype of the shared frame silently alters what later cells see.
rate_plot_df = df.assign(loan_status=df['loan_status'].astype(str))

# Define the color palette and the category order
palette = {'True': 'red', 'False': 'lightgray'}
order = ['False', 'True']  # 'False' (non-default) on the left, 'True' (default) on the right

# Create the boxplot with the specified order
plt.figure(figsize=(8, 6))
# `palette` without `hue` is deprecated in recent seaborn releases, so the x variable
# is also assigned to `hue`; dodge=False keeps one centered box per class.
ax = sns.boxplot(
    x='loan_status',
    y='loan_interest_rate',
    hue='loan_status',
    data=rate_plot_df,
    palette=palette,
    order=order,
    hue_order=order,
    dodge=False,
)
# A legend would only repeat the x tick labels; drop it if seaborn drew one.
if ax.get_legend() is not None:
    ax.get_legend().remove()

# Customize the plot
plt.title('Loan interest rate by Loan Status', fontsize=14)
plt.xlabel('Loan Status (False = non-default, True = Default)', fontsize=12)
plt.ylabel('Interest Rate', fontsize=12)

# Save the plot
plt.savefig('interest_rate_by_loan_status.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
## Categorical Features vs. Loan Status in order to find high-risk categories.

cat_cols = ['person_home_ownership', 'loan_purpose', 'loan_grade', 'cb_person_default_on_file']

# Use string labels for the target so the crosstab columns are always 'False'/'True',
# whether loan_status is still boolean or was already cast to str by an earlier cell.
# (Sorting with by=True raised KeyError once the column held strings.)
status_labels = df['loan_status'].astype(str)

for col in cat_cols:
    # Share of each loan status within every category (rows sum to 1)
    crosstab = pd.crosstab(df[col], status_labels, normalize='index')

    # Pin the column order so the colors and legend labels below always line up
    crosstab = crosstab.reindex(columns=['False', 'True'], fill_value=0.0)

    # Sort categories by default rate, highest first
    crosstab = crosstab.sort_values(by='True', ascending=False)

    # Stacked bars: non-default in blue, default in red
    ax = crosstab.plot.bar(stacked=True, figsize=(10, 6), color=['#1f77b4', '#d62728'])

    # Set legend and title
    ax.legend(title='Loan Status', labels=['Non-Default (False)', 'Default (True)'], loc='upper right', fontsize='small')
    ax.set_title(f'Loan Status Distribution by {col.replace("_", " ").title()} (Sorted by Default Rate)')
    ax.set_ylabel('Percentage')
    plt.xticks(rotation=45, ha='right')

    # Add percentage labels on each bar segment
    for p in ax.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_xy()

        # Skip segments thinner than 5% to avoid unreadable, overlapping labels
        if height > 0.05:
            ax.text(x + width/2,
                    y + height/2,
                    f'{height:.1%}',
                    ha='center',
                    va='center',
                    fontsize=8)

    plt.tight_layout()
    plt.show()


In [None]:
# Correlation analysis: spot multicollinearity and key pairwise relationships.
# Only float64/int64 columns take part (bool and object columns are excluded).
numeric_features = df.select_dtypes(include=['float64', 'int64'])
corr_matrix = numeric_features.corr()
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True)

In [None]:
# Time-based analysis: does credit age relate to defaults?
# Only cb_credit_history_length is plotted here, split by loan_status with a KDE overlay.
sns.histplot(
    data=df,
    x='cb_credit_history_length',
    hue='loan_status',
    kde=True,
)

In [None]:
# Interest rate patterns: how loan_interest_rate relates to defaults
# (despite missing values in that column).
sns.violinplot(data=df, x='loan_status', y='loan_interest_rate')

# Mean interest rate within each loan_status group
mean_rate_by_status = df.groupby('loan_status')['loan_interest_rate'].mean()
print(mean_rate_by_status)

In [None]:
# The loan grade is associated with the interest rate.
# The grades are transformed into an ordinal numeric column and its correlation with
# the interest rate is checked; the correlation is very high.
# 'E_or_lower' is mapped to 6 (5.5 or 7 are alternatives, depending on interpretation).
grade_order = dict(A=1, B=2, C=3, D=4, E_or_lower=6)

# Adds a new column to df; grades missing from grade_order map to NaN.
df['loan_grade_numeric'] = df['loan_grade'].map(grade_order)

# Plot the 2x2 correlation matrix of numeric grade vs. interest rate
grade_rate_corr = df[['loan_grade_numeric', 'loan_interest_rate']].corr()
sns.heatmap(grade_rate_corr, annot=True, cmap='coolwarm')
plt.title('Correlation between Loan Grade and Interest Rate')
plt.show()


In [None]:
### Exploratory Data Analysis (Univariate and bivariate analysis) (EDA) Summary

## Target Variable: loan_status
# The dataset shows a class imbalance: approximately 22% of individuals are in default (True), while around 78% are non-default (False).

## Numerical Feature Analysis

# Age Distribution:
  # There is no significant difference between the mean and median of person_age for defaulted and non-defaulted individuals.

# Credit History Length:
  # The cb_credit_history_length shows no difference in mean or median between the defaulted and non-defaulted groups.

# Income (Log-transformed):
  # The mean and median of person_income_log are higher for non-defaulted individuals compared to those who defaulted.

# Loan Amount (Log-transformed):
  # The median of loan_amount_log is lower for non-defaulted individuals than for those in default.

# Loan-to-Income Ratio:
  # Both the mean and median of the loan_to_income_ratio are lower among non-defaulted individuals, 
  # indicating they tend to borrow more conservatively relative to their income.

# Interest Rate:
  # Higher interest rates are generally associated with a higher likelihood of default.

## Categorical Feature Analysis

# Home Ownership (person_home_ownership):
  # Default rates are highest among individuals who rent, followed by those marked as other, mortgage, and own—the latter showing the lowest default rate.

# Loan Purpose (loan_purpose):
  # The highest proportion of defaults occurs in loans for debt consolidation, followed by medical, home improvement,
  # personal, education, and finally venture, which has the lowest default rate.

# Loan Grade (loan_grade):
  # Defaults are more frequent in lower-grade loans, especially in the E_or_lower category, followed by D, C, B, and A, respectively.

# Credit Default History (cb_person_default_on_file):
  # Individuals with a history of default (Y) are more likely to default again compared to those without such a history (N).

# The loan grade is strongly correlated with the interest rate.

In [None]:
### Exploratory Data Analysis (multivariate analysis)
# Loan-to-income ratio per loan grade, split by default status.
sns.boxplot(data=df, x='loan_grade', y='loan_to_income_ratio', hue='loan_status')
# The title names the variable actually plotted on the y-axis
# (it previously read "Interest Rate" while loan_to_income_ratio was drawn).
plt.title("Loan-to-Income Ratio by Loan Grade (Stratified by Default Status)")

In [None]:
# 2. Segment-specific analysis: the EDA hints that certain subgroups (renters,
# debt-consolidation borrowers) are riskier, so look closer at renters.
is_renter = df['person_home_ownership'] == 'rent'
df_renters = df[is_renter]

# Log income vs. log loan amount for renters, colored by loan status
sns.scatterplot(
    data=df_renters,
    x='person_income_log',
    y='loan_amount_log',
    hue='loan_status',
)

In [None]:
## Temporal patterns: person_age and cb_credit_history_length are correlated, so explore time-based risk.
## Approach:
## Bin cb_credit_history_length into "<5 years", "5-10 years" and "10+ years" and check default rates per bin.
# Left-closed bins [0, 5), [5, 10), [10, inf) match those labels. The previous edges
# [0, 5, 10, 30] with the default right-closed intervals sent a length of 0 and any
# length above 30 to NaN, silently dropping those rows from the crosstab.
df['credit_history_bins'] = pd.cut(
    df['cb_credit_history_length'],
    bins=[0, 5, 10, np.inf],
    right=False,
    labels=['<5 years', '5-10 years', '10+ years'],
)
# Share of each loan status within every credit-history bin (rows sum to 1)
pd.crosstab(df['credit_history_bins'], df['loan_status'], normalize='index')

In [None]:

# Number of rows falling into each credit-history bin
df['credit_history_bins'].value_counts()

In [None]:
## Key Questions to Answer
# For high-risk groups (e.g., renters, debt-consolidation):

# What’s their typical loan_grade and loan_interest_rate?

# Do they have other red flags (e.g., prior defaults)?

# For loan_to_income_ratio:

# Is there a threshold where default risk spikes (e.g., >30%)?

# For cb_person_default_on_file:

# How much does a prior default increase the odds of a new default?

In [None]:
# Share of each loan status per employment length (rows sum to 1)
pd.crosstab(
    index=df['person_employment_length'],
    columns=df['loan_status'],
    normalize='index',
)

In [None]:
# Share of each loan status per home-ownership category (rows sum to 1)
ownership = df['person_home_ownership']
status = df['loan_status']
pd.crosstab(ownership, status, normalize='index')

In [None]:
# Re-check the frame summary before export; earlier cells added
# loan_grade_numeric and credit_history_bins to df.
df.info()

In [None]:
## Save and export the updated cleaned data to a CSV file.
# Written next to the input file using the same project-relative location as the
# loader, instead of an absolute path that only exists on one machine.
df.to_csv('../Data/clean_data/cleaned_loan_updated.csv', index=False)

In [None]:
import os
# Show the kernel's working directory (the base for the relative paths in this notebook)
current_dir = os.getcwd()
print(current_dir)