# Loan Prediction Analysis
This notebook analyzes the loan prediction dataset and visualizes key insights.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set the style for all visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8'
# and newer releases no longer accept the plain 'seaborn' name, so pick
# whichever of the two names this Matplotlib installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

## Load and Examine Data

In [None]:
# Load raw data
raw_data = pd.read_csv('../data/raw/test_Y3wMUE5_7gLdaTN.csv')

# Display basic information. The line breaks are written as '\n' escapes:
# a single-quoted string literal cannot span physical lines.
print('Dataset Shape:', raw_data.shape)
print('\nColumns:', raw_data.columns.tolist())
print('\nData Types:\n', raw_data.dtypes)
print('\nMissing Values:\n', raw_data.isnull().sum())

## Analyze Numerical Features

In [None]:
# Columns treated as numerical features by the summary, histogram and
# correlation cells
numerical_features = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Summary statistics for those columns (the bare expression is the cell output)
raw_data.loc[:, numerical_features].describe()

In [None]:
# One histogram with a KDE overlay per numerical feature, laid out on a 2x2 grid
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(numerical_features, start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.histplot(data=raw_data, x=feature, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
fig.tight_layout()
plt.show()

## Analyze Categorical Features

In [None]:
# Columns treated as categorical features
categorical_features = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]

# One count plot per categorical feature on a 2x3 grid; axes are created only
# for the features listed, so the unused grid slot stays blank.
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(categorical_features, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.countplot(data=raw_data, x=feature, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    # Tilt the category labels on the x axis
    plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlations between the numerical features
correlation = raw_data.loc[:, numerical_features].corr()

# Annotated heatmap with the colour scale centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## Load Processed Data

In [None]:
# Load processed data
processed_data = pd.read_csv('../data/processed/processed_loan_data.csv')

# Report shape, columns, dtypes and per-column null counts of the processed
# data. The line breaks are written as '\n' escapes: a single-quoted string
# literal cannot span physical lines.
print('Processed Dataset Shape:', processed_data.shape)
print('\nProcessed Columns:', processed_data.columns.tolist())
print('\nProcessed Data Types:\n', processed_data.dtypes)
print('\nMissing Values after Processing:\n', processed_data.isnull().sum())

## Key Findings and Insights

1. Data Quality:
   - The raw data has missing values in several features
   - The processed data has no missing values

2. Numerical Features:
   - Wide range of applicant incomes
   - Loan amounts follow a right-skewed distribution
   - Most loans have a term of 360 months

3. Categorical Features:
   - Gender distribution shows more male applicants
   - Most applicants are graduates
   - Property area distribution is relatively balanced

4. Correlations:
   - Positive correlation between applicant income and loan amount
   - Weak correlation between co-applicant income and other features