In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter

# Location of the cleaned loan dataset.
# The default is the original author's local path; set the LOAN_DATA_CSV
# environment variable to point at the file on another machine instead of
# editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "LOAN_DATA_CSV",
        "/Users/souadmouajel/Desktop/Ironhack/lab-sessions/week-8/Ironhack-Final-Project/Data/clean_data/cleaned_loan_data.csv",
    )
)

# Fail early with an actionable message rather than a bare pandas traceback.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Cleaned loan data not found at {DATA_PATH}. "
        "Set the LOAN_DATA_CSV environment variable to the CSV's location."
    )

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Target variable (loan_status): inspect the class balance to gauge how hard
# the prediction task is.
# Author's observation: the imbalance is not severe, so no resampling is applied.
status_share = df['loan_status'].value_counts(normalize=True)
print(status_share)
sns.countplot(data=df, x='loan_status')

In [None]:
# Numeric features vs. loan status: one boxplot per feature to spot trends
# between defaulted and non-defaulted loans.
# Author's observation: loan_to_income_ratio, person_income_log and
# loan_amount_log play an important role in the variation between default
# and non-default.
num_cols = [
    'person_age',
    'person_income_log',
    'loan_amount_log',
    'loan_to_income_ratio',
    'cb_credit_history_length',
]
for col in num_cols:
    # A separate figure per feature: plt.show() flushes the current one.
    sns.boxplot(data=df, y=col, x='loan_status')
    plt.show()

In [None]:
## Categorical Features vs. Loan Status in order to find high-risk categories.

import matplotlib.pyplot as plt
import numpy as np

cat_cols = ['person_home_ownership', 'loan_purpose', 'loan_grade', 'cb_person_default_on_file']

# Bar segments thinner than this share are left unlabelled to avoid clutter.
MIN_LABEL_SHARE = 0.05

for col in cat_cols:
    # Row-normalised crosstab: within each category the loan_status shares sum to 1.
    crosstab = pd.crosstab(df[col], df['loan_status'], normalize='index')

    # Draw on an explicit Axes so every later call targets this figure,
    # rather than whatever pyplot considers "current".
    fig, ax = plt.subplots(figsize=(10, 6))
    crosstab.plot.bar(stacked=True, ax=ax)

    # Pair the legend handles with the labels explicitly.
    # NOTE(review): assumes loan_status is coded 0/1 so the crosstab columns
    # come out in (non-default, default) order -- confirm against the data.
    handles, _ = ax.get_legend_handles_labels()
    ax.legend(
        handles,
        ['Non-Default (0)', 'Default (1)'],
        title='Loan Status',
        loc='upper right',
        fontsize='small',
    )
    ax.set_title(f'Loan Status Distribution by {col.replace("_", " ").title()}')
    ax.set_ylabel('Percentage')
    # The data are fractions in [0, 1]; show the ticks as percentages so the
    # axis agrees with its label and with the in-bar annotations.
    ax.yaxis.set_major_formatter(PercentFormatter(xmax=1.0))
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Annotate each bar segment with its share, centred inside the segment.
    for p in ax.patches:
        width, height = p.get_width(), p.get_height()
        x, y = p.get_xy()

        if height > MIN_LABEL_SHARE:
            ax.text(
                x + width / 2,
                y + height / 2,
                f'{height:.1%}',
                ha='center',
                va='center',
                fontsize=8,
            )

    fig.tight_layout()
    plt.show()
    # Release the figure so repeated runs of the loop do not accumulate open figures.
    plt.close(fig)

In [None]:
# Correlation analysis to detect multicollinearity and key relationships.
# Select every numeric dtype rather than only float64/int64, so columns stored
# as e.g. int32 or float32 are not silently dropped. Bool and object columns
# are still excluded by the 'number' selector.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',          # fixed two-decimal annotations keep cells readable
    cmap='coolwarm',
    vmin=-1,            # pin the diverging colour scale to the full
    vmax=1,             # correlation range so 0 maps to the neutral colour
    center=0,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numeric Features')
fig.tight_layout()
plt.show()

In [None]:
# Time-based analysis: does the length of a borrower's credit history relate
# to default? Only cb_credit_history_length is plotted in this cell.
sns.histplot(
    df,
    x='cb_credit_history_length',
    hue='loan_status',
    kde=True,
)

In [None]:
# Interest-rate patterns: compare the loan_interest_rate distribution of
# defaulted vs. non-defaulted loans, then print the mean rate per class.
# (The original note flags missing values in this column; pandas' mean()
# skips NaN by default.)
sns.violinplot(data=df, y='loan_interest_rate', x='loan_status')
mean_rate_by_status = df['loan_interest_rate'].groupby(df['loan_status']).mean()
print(mean_rate_by_status)