# Exploratory Data Analysis - Personal Loan Dataset

This notebook contains exploratory data analysis for the personal loan prediction project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path
# so modules located there can be imported from this notebook.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once.
project_root = str(Path().resolve().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Set plotting style: seaborn "whitegrid" theme and a 12x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries imported successfully!")

## Load Data

In [None]:
# Read the raw loan CSV into a DataFrame, report its dimensions and column
# names, then preview the first five rows as the cell output.
df = pd.read_csv(
    '../data/raw/loan_data.csv',  # Update path as needed
)
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head(5)

## Basic Information

In [None]:
# Dataset info
# df.info() prints a summary of the columns (dtypes, non-null counts);
# df.describe() is the last expression in the cell, so its summary-statistics
# table is rendered as the cell output.
df.info()
df.describe()

## Missing Values

In [None]:
# Count null entries per column and express each count as a share of all rows.
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)

# Combine both measures into one table, indexed by column name.
missing_df = pd.DataFrame(
    {
        'Missing Count': missing_values,
        'Percentage': missing_percent,
    }
)

# Show only the columns that actually contain missing data.
has_missing = missing_df['Missing Count'].gt(0)
missing_df.loc[has_missing]

## Data Overview

**Dataset Information:**
- Total records: 5000 customers
- Features: 13 features + 1 target variable
- Business Context: AllLife Bank wants to identify customers likely to accept personal loan offers

## Feature Analysis - Numerical Variables

In [None]:
# Descriptive statistics for the columns treated as numerical features.
numerical_cols = [
    'Age',
    'Experience',
    'Income',
    'Family',
    'CCAvg',
    'Education',
    'Mortgage',
]
numerical_data = df.loc[:, numerical_cols]

print("Numerical Features Summary:")
numerical_summary = numerical_data.describe()
print(numerical_summary)

## Feature Analysis - Binary Variables

In [None]:
# Bar chart of value counts for each 0/1 flag column, one panel per column
# laid out on a 2x2 grid.
binary_cols = ['Securities_Account', 'CD_Account', 'Online', 'CreditCard']

fig, grid = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axes = grid.ravel()  # flatten the 2x2 grid so panels can be paired with columns

for panel, column in zip(axes, binary_cols):
    value_counts = df[column].value_counts()
    panel.bar(value_counts.index, value_counts.values)
    panel.set(
        title=f'{column} Distribution',
        xlabel='Value (0=No, 1=Yes)',
        ylabel='Count',
        xticks=[0, 1],
    )

plt.tight_layout()
plt.show()

## Correlation Analysis

In [None]:
# Correlation analysis
#
# The correlation matrix is computed once and reused for both the target
# ranking and the heatmap, so the two views always agree.
#  - `ID` is dropped up front: it is excluded from the heatmap, so it should
#    not appear in the target ranking either. errors='ignore' keeps the cell
#    working if the column is absent.
#  - Only numeric/boolean columns are kept, because DataFrame.corr() raises
#    on non-numeric columns in pandas >= 2.0.
correlation_input = (
    df.drop(columns=['ID'], errors='ignore')
      .select_dtypes(include=['number', 'bool'])
)
correlation_matrix = correlation_input.corr()

# Correlation with target variable, strongest positive first.
# The target's own entry (1.0) is kept so the Series retains every label.
correlation_with_target = correlation_matrix['Personal_Loan'].sort_values(ascending=False)
print("Correlation with Personal_Loan:")
print(correlation_with_target)

# Visualize correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## Distribution by Target Variable

Compare key features between customers who accepted a personal loan and those who did not.

In [None]:
# Compare Income, CCAvg and Education between customers who did not take a
# personal loan (Personal_Loan == 0) and those who accepted one (== 1).
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Overlaid histograms for the two continuous features.
# Each entry: (axis, column, x-axis label, panel title).
histogram_panels = [
    (axes[0], 'Income', 'Income (thousand dollars)',
     'Income Distribution by Loan Status'),
    (axes[1], 'CCAvg', 'CCAvg (thousand dollars)',
     'Credit Card Avg Spending by Loan Status'),
]
loan_groups = [(0, 'No Loan'), (1, 'Accepted Loan')]

for ax, column, xlabel, title in histogram_panels:
    # Derive the 30 bin edges from the full column so both groups share the
    # same bins; separate per-group binning gives bars of different widths
    # whose heights cannot be compared. NaNs are dropped because
    # np.histogram_bin_edges cannot derive a range from them.
    bin_edges = np.histogram_bin_edges(df[column].dropna(), bins=30)
    for loan_flag, label in loan_groups:
        df.loc[df['Personal_Loan'] == loan_flag, column].hist(
            bins=bin_edges, alpha=0.7, label=label, ax=ax
        )
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.legend()

# Education comparison: row-normalised crosstab, i.e. the percentage of each
# education level that did / did not accept a loan.
education_loan = pd.crosstab(df['Education'], df['Personal_Loan'], normalize='index') * 100
education_loan.plot(kind='bar', ax=axes[2])
axes[2].set_xlabel('Education (1=Undergrad, 2=Grad, 3=Advanced)')
axes[2].set_ylabel('Percentage')
axes[2].set_title('Loan Acceptance by Education Level')
# Labels are assigned by position: this assumes crosstab columns are ordered
# 0, 1 and that exactly three education levels (1, 2, 3) are present.
axes[2].legend(['No Loan', 'Accepted Loan'])
axes[2].set_xticklabels(['Undergrad', 'Graduate', 'Advanced'], rotation=0)

plt.tight_layout()
plt.show()

## Target Variable Distribution

In [None]:
# Name of the target column; change it here if your dataset labels it differently.
target_col = 'Personal_Loan'  # Update this

if target_col not in df.columns:
    # Nothing to summarise: report which columns are available instead.
    print(f"Column '{target_col}' not found. Available columns: {df.columns.tolist()}")
else:
    # Absolute and relative class frequencies of the target.
    class_counts = df[target_col].value_counts()
    class_percent = df[target_col].value_counts(normalize=True) * 100
    print(f"Target variable distribution:\n{class_counts}")
    print(f"\nTarget variable percentage:\n{class_percent}")

    # Bar chart of the class counts.
    plt.figure(figsize=(8, 5))
    class_counts.plot.bar()
    plt.title('Target Variable Distribution')
    plt.xlabel('Personal Loan')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.show()