In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# Notebook-wide display settings (run once, right after the imports).
sns.set_style('whitegrid')                 # seaborn 'whitegrid' style for the plots below
pd.set_option('display.max_columns', 100)  # let pandas render up to 100 columns

In [None]:
# Load the raw transactions table.
# Point DATA_PATH at your own raw data file if it lives somewhere else.
DATA_PATH = Path('../data/raw/transactions.csv')
if DATA_PATH.exists():
    df = pd.read_csv(DATA_PATH)
else:
    # Fail early and report the fully resolved location that was checked.
    raise FileNotFoundError(f'Expected data file at {DATA_PATH.resolve()}')
# Preview the first rows as the cell output.
df.head(5)

In [None]:
# Overview of the data: print pandas' concise summary of `df`
# (see `DataFrame.info` for the exact fields reported).
df.info()

In [None]:
# Descriptive statistics from `describe()`, transposed so that each
# described column becomes a row and each statistic a column.
df.describe().transpose()

In [None]:
# Histogram grid: one 30-bin panel per numeric column.
# `num_cols` stays a notebook-level name because later cells reuse it.
num_cols = df.select_dtypes(include=[np.number]).columns
df.loc[:, num_cols].hist(figsize=(16, 12), bins=30)
plt.tight_layout()

In [None]:
# Distribution of categorical features (top categories)
# One bar chart per categorical column showing its 20 most frequent values.
# NaN is counted as its own bar (dropna=False) so missingness stays visible.
# 'category' is selected alongside 'object' so columns that were cast to a
# categorical dtype are not silently skipped.
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    # Explicit figure/axes instead of the pyplot state machine, so each
    # figure can be shown and released individually.
    fig, ax = plt.subplots(figsize=(10, 4))
    df[col].value_counts(dropna=False).head(20).plot(kind='bar', ax=ax)
    ax.set_title(f'Top categories for {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    fig.tight_layout()
    # Render and close inside the loop: leaving every figure open until the
    # end of the cell triggers matplotlib's "more than 20 figures" warning
    # and keeps all of them in memory on wide frames.
    plt.show()
    plt.close(fig)

In [None]:
# Correlation analysis
# Pairwise correlation (pandas' default method) between the numeric columns
# selected earlier in `num_cols`, drawn as a heatmap whose diverging
# colormap is centred on 0.
corr = df[num_cols].corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation heatmap')
fig.tight_layout()
# End with show() so the cell renders only the figure, not the Text(...)
# repr that a trailing title call would print.
plt.show()

In [None]:
# Share of missing entries per column (0.0 = complete, 1.0 = entirely
# missing), ordered from most to least missing.
missing = df.isna().mean().sort_values(ascending=False)
# Display the 20 columns with the highest missing share.
missing.iloc[:20]

In [None]:
# Outlier detection using boxplots
# One horizontal boxplot per numeric column in `num_cols` (defined in the
# histogram cell).
for col in num_cols:
    # Explicit figure/axes so each plot can be shown and released on its own.
    fig, ax = plt.subplots(figsize=(8, 3))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot for {col}')
    fig.tight_layout()
    # Render and close inside the loop: accumulating one open figure per
    # column triggers matplotlib's "more than 20 figures" warning and holds
    # every figure in memory until the cell finishes.
    plt.show()
    plt.close(fig)

## Key Insights (update after running)
- Insight 1: ...
- Insight 2: ...
- Insight 3: ...
- Insight 4: ...
- Insight 5: ...