# Task 2 — Exploratory Data Analysis (EDA)

Goal: Explore the dataset to understand distributions, missing values, correlations, and outliers,
and document key insights relevant to credit risk modeling.


In [None]:
# Standard imports and helper
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from src.utils.io import load_csv

# Apply seaborn's "whitegrid" style to the figures produced in this notebook.
sns.set(style="whitegrid")

# Raw data file, given as a relative path (one directory up, then data/raw/).
DATA_PATH = Path('..', 'data', 'raw', 'data.csv')
df = load_csv(DATA_PATH)  # project helper; presumably returns a pandas DataFrame — confirm in src/utils/io.py

# df.shape is printed as a (n_rows, n_cols) tuple.
print(f'Loaded {df.shape} rows and columns')
df.head(5)

In [None]:
# Structural overview of the frame: columns, dtypes and non-null counts
df.info()

# Missing values per column (largest first) with the share of rows affected;
# only the 20 most-affected columns are displayed.
missing = df.isna().sum().sort_values(ascending=False)
missing_percent = missing.mul(100).div(len(df))
missing_summary = pd.DataFrame({'missing_count': missing, 'missing_pct': missing_percent})
missing_summary.head(20)

In [None]:
# Descriptive statistics for the numeric features, one row per feature.
# `num_cols` is reused by the correlation cell further down.
num_cols = list(df.select_dtypes(include='number').columns)
numeric_summary = df.loc[:, num_cols].describe()
numeric_summary.transpose()

## Numerical Distributions
We visualize the distributions of `Amount` and `Value`, check skewness, and identify potential outliers.

In [None]:
# Distribution plots for Amount and Value: one row per column, with a
# tail-trimmed histogram on the left and a boxplot of the full data on the right.
dist_cols = ['Amount', 'Value']
fig, axes = plt.subplots(len(dist_cols), 2, figsize=(14, 5 * len(dist_cols)), squeeze=False)
for row, col in enumerate(dist_cols):
    values = df[col]
    # Drop (rather than clip) observations outside the 0.1%-99.9% quantile range.
    # Clipping would move every extreme value onto the boundary and create
    # artificial spikes in the first and last bins of a plot labelled "trimmed".
    lo, hi = values.quantile([0.001, 0.999])
    trimmed = values[values.between(lo, hi)]
    sns.histplot(trimmed, bins=100, ax=axes[row, 0]).set_title(f'{col} - trimmed')
    # The boxplot uses the untrimmed data so the extreme values stay visible.
    sns.boxplot(x=values, ax=axes[row, 1]).set_title(f'{col} - boxplot')
fig.tight_layout()
plt.show()

# Skewness (computed on the full, untrimmed columns)
for col in dist_cols:
    print(f'{col} skew:', df[col].skew())

## Categorical Distributions
Frequency counts for `ProductCategory`, `ChannelId`, `ProviderId`, and `CurrencyCode`.

In [None]:
# Frequency table and bar chart of the most common levels of each categorical column.
cat_cols = ['ProductCategory','ChannelId','ProviderId','CurrencyCode']
top_n = 20  # number of most frequent categories to tabulate and plot
for c in cat_cols:
    # Skip columns that are not present in the loaded data.
    if c not in df.columns:
        continue
    # Count once and reuse for both the table and the plot ordering.
    counts = df[c].value_counts()
    print(f'\n--- {c} ---')
    display(counts.head(top_n))
    cat_fig, cat_ax = plt.subplots(figsize=(8, 3))
    sns.countplot(data=df, y=c, order=counts.index[:top_n], ax=cat_ax)
    cat_ax.set_title(f'{c} - top categories')
    plt.show()

## Correlation Analysis
Compute the correlation matrix for the numeric variables and show it as a heatmap to identify relationships.

In [None]:
# Pairwise correlations between the numeric columns (pandas' default method),
# drawn as an annotated heatmap with the colour scale centred at zero.
corr = df[num_cols].corr()
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=corr, annot=True, fmt='.2f', cmap='vlag', center=0, ax=corr_ax)
corr_ax.set_title('Correlation matrix (numeric features)')
plt.show()

## Missing Values and Imputation Strategy
Document missingness and recommended imputation approaches.

In [None]:
# Columns that contain at least one missing value, most affected first.
missing = df.isna().sum()
display(missing[missing>0].sort_values(ascending=False))

# Recommended strategies (documented):
# NOTE: the 'categorical' entry was a string literal broken across two lines;
# it is restored as one line, naming the `Missing` label used in the Key Insights.
imputation_notes = {
    'numeric': 'median or domain-specific constant after considering outliers',
    'categorical': "new category (e.g. 'Missing') or most frequent with caution",
    'time': 'forward/backward fill only when it makes sense for series'
}
imputation_notes

## Outlier Detection
Use boxplots and quantile-based rules to identify outliers for `Amount` and `Value`.

In [None]:
# Flag outliers with the 1.5 * IQR rule and show a boxplot for each column.
for col in ['Amount', 'Value']:
    # Skip columns that are not present in the loaded data.
    if col not in df.columns:
        continue
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    # Fences: anything strictly below `lower` or strictly above `upper` is an outlier.
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    is_outlier = (values < lower) | (values > upper)
    outlier_count = int(is_outlier.sum())
    print(f'{col}: outliers (IQR rule):', outlier_count)
    box_fig, box_ax = plt.subplots(figsize=(6, 2))
    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f'Boxplot - {col}')
    plt.show()

## Key Insights (Top 3-5)
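- Insight 1: `Amount` and `Value` distributions are heavily skewed, with many small transactions and a long right tail; consider a log transformation for modeling (using a sign-preserving variant such as `sign(x) * log1p(|x|)` where values can be negative, as with `Amount`).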
- Insight 2: `ProductCategory`, `ProviderId` and `ChannelId` show strong class imbalance — consider grouping rare categories or using target encoding with caution.
- Insight 3: There are negative `Amount` values (refunds/chargebacks) which should be handled explicitly when defining the proxy default/target.
- Insight 4: Few, if any, values are missing. Where imputation is needed, use the median for numeric features and an explicit `Missing` label for categoricals.
- Insight 5: Correlation between `Amount` and `Value` is high (expected), while other numeric features show weak correlations; consider feature engineering for behavioral signals.