# Exploratory Data Analysis - Credit Risk Model

This notebook contains exploratory data analysis for the credit risk modeling project.


## 1. Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn grid theme and a wide default figure
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})


## 2. Load Data


In [None]:
# Location of the raw credit dataset; update this path to your actual data file.
data_path = Path('../data/raw/credit_data.csv')

# Fail fast when the file is missing: every later cell reads `df`, so carrying
# on would either raise a confusing NameError further down or silently reuse a
# stale `df` left in the kernel by an earlier run.
if not data_path.exists():
    raise FileNotFoundError(
        f"Data file not found at {data_path}. "
        "Please ensure your data file is in the data/raw/ directory"
    )

df = pd.read_csv(data_path)
print(f"Data loaded successfully: {df.shape[0]} rows, {df.shape[1]} columns")


## 3. Data Overview


In [None]:
# Display basic information: schema, a sample of rows, and summary statistics.
separator = "\n" + "="*50 + "\n"

print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(separator)
print("First few rows:")
print(df.head())
print(separator)
print("Dataset Statistics:")
print(df.describe())


## 4. Missing Values Analysis


In [None]:
# Per-column null counts and the percentage of rows they represent
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

# Tabulate both measures, then keep only columns that actually have gaps,
# ordered from most to fewest missing entries
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent,
})
has_gaps = missing_df['Missing Count'] > 0
missing_df = missing_df[has_gaps].sort_values('Missing Count', ascending=False)

if missing_df.empty:
    print("No missing values found in the dataset.")
else:
    print("Missing Values:")
    print(missing_df)

    # Bar chart of the missing percentage for each affected column
    plt.figure(figsize=(10, 6))
    sns.barplot(x=missing_df.index, y=missing_df['Missing Percentage'])
    plt.title('Missing Values by Column')
    plt.xlabel('Columns')
    plt.ylabel('Missing Percentage (%)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


## 5. Target Variable Distribution


In [None]:
# The label column is assumed to be called 'target' - adjust as needed
target_col = 'target'  # Update this to your actual target column name

if target_col not in df.columns:
    print(f"Target column '{target_col}' not found. Please update the target_col variable.")
else:
    # Class counts, then the same counts as a percentage of all rows
    target_dist = df[target_col].value_counts()
    print("Target Variable Distribution:")
    print(target_dist)
    print("\nTarget Variable Proportions:")
    print(target_dist / len(df) * 100)

    # Count plot of the target classes
    plt.figure(figsize=(8, 6))
    sns.countplot(data=df, x=target_col)
    plt.title('Target Variable Distribution')
    plt.xlabel('Target')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()


## 6. Numerical Features Analysis


In [None]:
# Select numerical columns, leaving the target out of the feature list
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if target_col in numeric_cols:
    numeric_cols.remove(target_col)

if numeric_cols:
    print(f"Numerical Features: {len(numeric_cols)}")

    # Lay the histograms out on a grid of at most three per row;
    # the row count is the ceiling of len(numeric_cols) / n_cols.
    n_cols = min(3, len(numeric_cols))
    n_rows = (len(numeric_cols) + n_cols - 1) // n_cols

    # squeeze=False makes subplots() return a 2-D array for every grid shape,
    # so a single flatten() handles the 1x1, 1xN and MxN layouts alike.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    # One histogram per numeric feature; NaNs are dropped before binning
    for ax, col in zip(axes, numeric_cols):
        ax.hist(df[col].dropna(), bins=30, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused cells in the last row of the grid
    for ax in axes[len(numeric_cols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("No numerical features found.")


## 7. Correlation Analysis


In [None]:
# Correlation matrix for numerical features (plus the target when it is numeric)
if numeric_cols:
    # Decide once whether the target can take part in the correlation.
    # The previous check, `dtype in [np.number]`, compared a concrete dtype
    # with the abstract np.number class, so e.g. an int64 0/1 target never
    # matched and the target report below was silently skipped.
    target_is_numeric = (
        target_col in df.columns
        and pd.api.types.is_numeric_dtype(df[target_col])
    )

    # Only add the target when it is numeric: a string-valued target would
    # make .corr() fail on pandas >= 2.0. The membership test avoids a
    # duplicated column if the target is already in numeric_cols.
    corr_cols = list(numeric_cols)
    if target_is_numeric and target_col not in corr_cols:
        corr_cols.append(target_col)

    corr_matrix = df[corr_cols].corr()

    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Rank the features by their correlation with the target
    if target_is_numeric:
        target_corr = corr_matrix[target_col].drop(target_col).sort_values(ascending=False)
        print("\nCorrelation with Target Variable:")
        print(target_corr)


## 8. Categorical Features Analysis


In [None]:
# Columns stored with the object dtype are treated as categorical
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

if not categorical_cols:
    print("No categorical features found.")
else:
    print(f"Categorical Features: {len(categorical_cols)}")

    # Only the first 5 categorical features are reported
    for col in categorical_cols[:5]:
        print(f"\n{col}:")
        print(df[col].value_counts())

        # Skip the chart for features with more than 10 distinct values
        if df[col].nunique() > 10:
            continue

        plt.figure(figsize=(8, 5))
        sns.countplot(data=df, x=col)
        plt.title(f'Distribution of {col}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()


## 9. Summary and Next Steps


In [None]:
# Headline figures gathered from the earlier cells
summary_lines = [
    "EDA Summary:",
    f"Total Rows: {len(df)}",
    f"Total Columns: {len(df.columns)}",
    f"Numerical Features: {len(numeric_cols)}",
    f"Categorical Features: {len(categorical_cols)}",
    f"Missing Values: {df.isnull().sum().sum()}",
]

# Follow-up work after the exploratory pass
next_step_lines = [
    "\nNext Steps:",
    "1. Handle missing values",
    "2. Feature engineering",
    "3. Feature selection",
    "4. Model training",
]

# One line per entry, in the order listed above
print(*summary_lines, *next_step_lines, sep="\n")
