In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# --- Read the raw CSV into `df` and take a first look ---
df = pd.read_csv(filepath_or_buffer="../data/raw/data.csv")
print(df.shape)  # (row count, column count)
df.head(n=5)  # trailing expression: the notebook renders the first five rows


In [None]:
# Null count per column, largest first.
missing = df.isna().sum().sort_values(ascending=False)
# Display only the columns that actually contain nulls.
missing.loc[missing.gt(0)]


In [None]:
# Columns treated as numeric features; later cells reuse this list.
num_cols = ['Amount', 'Value']

# One 30-bin histogram panel per listed column.
df.hist(column=num_cols, bins=30, figsize=(8, 4))
plt.gcf().tight_layout()  # tidy the spacing of the figure just drawn
plt.show()


In [None]:
# Categorical columns to profile, one horizontal count plot each.
cat_cols = ['CurrencyCode', 'CountryCode', 'ProductCategory', 'ChannelId']
for col in cat_cols:
    # Order the bars by frequency so the most common level is drawn first.
    levels_by_count = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.countplot(data=df, y=col, order=levels_by_count, ax=ax)
    ax.set_title(f"{col} distribution")
    plt.show()


In [None]:
# Pairwise correlation matrix of the numeric columns listed in `num_cols`
# (that list is defined in the histogram cell and must have been run first).
corr = df[num_cols].corr()

# Pin the colour scale to the full correlation range [-1, 1]. Without explicit
# limits seaborn stretches the colormap between the smallest and largest value
# in the matrix, so in a small matrix the weakest correlation is always drawn
# in the lightest colour, even when it is close to 1. Fixed limits make the
# colours mean the same thing on every run and every dataset.
sns.heatmap(corr, annot=True, cmap='Blues', vmin=-1, vmax=1)
plt.title("Numerical Feature Correlations")
plt.show()


In [None]:
# Headline takeaways from the exploration, kept as a list so other cells can reuse them.
insights = [
    "1️⃣ Transaction amounts are highly skewed — few customers make very large payments.",
    "2️⃣ Financial services and airtime dominate product categories.",
    "3️⃣ Most transactions occur in one country (UGX currency), potential class imbalance.",
    "4️⃣ Negative transaction values indicate refunds or credits — must handle separately.",
    "5️⃣ FraudResult is rare, implying class imbalance issues for risk modeling.",
]

# Emit one takeaway per line.
print(*insights, sep="\n")
