In [None]:
# imports & paths
import json
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# seaborn is optional; for a minimal setup it can be dropped in favour of plain matplotlib
try:
    import seaborn as sns
    sns.set(style="whitegrid")
except Exception:
    sns = None

from pathlib import Path

# project utils
from src import load_config, paths, get_logger
from src.data_loader import resolve_data_paths, load_raw_tables, clean_transactions, merge_all, feature_engineering

# Notebook-wide handles: logger, parsed config, and the project path registry.
_LOG = get_logger("EDA")
cfg = load_config("config/config.yaml")
prj = paths()

# Artifacts root: the config's artifacts.dir wins; otherwise fall back to the
# project's "artifacts_dir" entry. Resolved to an absolute path.
artifacts_cfg = cfg.get("artifacts", {})
artifacts_root = artifacts_cfg.get("dir", prj["artifacts_dir"])
art_dir = Path(artifacts_root).resolve()

# Every figure saved by the cells below lands here; create it up front.
fig_dir = art_dir / "figures"
fig_dir.mkdir(parents=True, exist_ok=True)

# Echo the loaded configuration as the cell output.
cfg


In [None]:
# load raw
# Resolve the input locations from the config, then read every raw table.
dpaths = resolve_data_paths(cfg)
tables = load_raw_tables(dpaths)

# Print each table's name next to its (rows, columns) shape.
for table_name, frame in tables.items():
    print(table_name, frame.shape)

# Preview the first three transaction rows as the cell output.
tables["transactions"].head(3)


In [None]:
# info & null overview
def df_info(df: pd.DataFrame, name: str):
    """Print a dtype tally for ``df`` and display its most-null columns.

    Args:
        df: Frame to profile.
        name: Label printed in the ``=== name ===`` header line.

    Returns:
        None. Output is a printed header, the printed count of columns per
        dtype, and a displayed one-column frame (``null_ratio``) holding the
        share of missing values for up to the 20 columns with the highest
        share, sorted in descending order.
    """
    header = f"=== {name} ==="
    print(header)

    dtype_counts = df.dtypes.value_counts()
    print(dtype_counts, "\n")

    # Fraction of missing values per column, worst offenders first.
    null_share = df.isna().mean()
    worst_columns = null_share.sort_values(ascending=False).head(20)
    display(worst_columns.to_frame("null_ratio"))

# Profile the raw tables in a fixed order; each name must be a key of `tables`.
table_names = ("transactions", "labels", "users", "cards", "merchants")
for table_name in table_names:
    df_info(tables[table_name], table_name)


In [None]:
# clean + merge + simple features
# The cleaned transactions replace the raw ones before merging; the merged
# frame is then passed through feature engineering.
tx_clean = clean_transactions(tables["transactions"], cfg)
tables_clean = dict(tables)
tables_clean["transactions"] = tx_clean
merged = merge_all(tables_clean, cfg)
fe = feature_engineering(merged, cfg)

# Column roles come from the config's "preprocess" section: the target
# defaults to "fraud_label" and the datetime column to None.
preprocess_cfg = cfg.get("preprocess", {})
target = preprocess_cfg.get("target", "fraud_label")
dt_col = preprocess_cfg.get("datetime_col", None)

# Cell output: frame shape together with the normalised class balance.
(fe.shape, fe[target].value_counts(normalize=True))


In [None]:
# label distribution
# Row count per class ordered by label value, followed by the same as shares.
vc = fe[target].value_counts().sort_index()
print(vc)
print(vc / vc.sum())

# One bar per class; labels are cast to str for the x axis.
plt.figure(figsize=(4, 3))
class_names = vc.index.astype(str)
plt.bar(class_names, vc.values)
plt.title("Label distribution")
plt.xlabel("label")
plt.ylabel("count")
plt.tight_layout()
plt.savefig(fig_dir / "label_distribution.png")
plt.show()


In [None]:
# numeric overview
# All numeric feature columns, with the target itself left out.
numeric_frame = fe.select_dtypes(include=[np.number])
num_cols = [col for col in numeric_frame.columns.tolist() if col != target]

# Summary statistics, one row per feature; show the first 20 features.
desc = fe[num_cols].describe().transpose()
display(desc.head(20))

# histograms (first 8 numeric columns, in frame order)
cols_show = num_cols[:8]
n_panels = len(cols_show)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(6, 2.4 * n_panels))

# With a single panel `axes` is one Axes object rather than a sequence;
# wrap it so the indexing below works in both cases.
if n_panels == 1:
    axes = [axes]

for idx, col_name in enumerate(cols_show):
    panel = axes[idx]
    panel.hist(fe[col_name].dropna(), bins=50)  # missing values are dropped before binning
    panel.set_title(col_name)

plt.tight_layout()
plt.savefig(fig_dir / "numeric_hist.png")
plt.show()

# correlation heatmap (top 15 numerics by variance)
# Rank numeric features by variance and correlate the 15 most variable ones.
top_num = fe[num_cols].var().sort_values(ascending=False).head(15).index
corr = fe[top_num].corr()

plt.figure(figsize=(7, 6))
if sns is not None:
    sns.heatmap(corr, annot=False, cmap="viridis")
else:
    # Plain-matplotlib fallback. A colour bar is added so the image carries a
    # value scale, which the seaborn branch draws by default.
    image = plt.imshow(corr.values, cmap="viridis")
    plt.colorbar(image)
    tick_positions = range(len(top_num))
    plt.xticks(tick_positions, top_num, rotation=90)
    plt.yticks(tick_positions, top_num)
plt.title("Correlation (top numeric)")
plt.tight_layout()
plt.savefig(fig_dir / "corr_heatmap.png")
plt.show()


In [None]:
# categorical overview
# For the first 10 object/category columns, keep the 10 most frequent values.
cat_cols = fe.select_dtypes(include=["object","category"]).columns.tolist()
top_show = {col: fe[col].value_counts().head(10) for col in cat_cols[:10]}

# Display only the first 5 of those columns to keep the output short.
for col, counts in list(top_show.items())[:5]:
    print(f"\n== {col} ==")
    display(counts.to_frame("count"))


In [None]:
# amount vs label
if "amount" in fe.columns:
    # Negative amounts are floored at 0 before log1p, as in the original cell.
    log_amount = np.log1p(fe["amount"].clip(lower=0))

    # Split by class and drop missing values before binning, matching the
    # numeric histogram cell.
    amount_by_label = {lab: log_amount[fe[target] == lab].dropna() for lab in [0, 1]}

    # Use one value range for both classes so the 60 bins share identical
    # edges and the overlaid bars are directly comparable. `None` (matplotlib's
    # own default) is used when neither class has any data.
    populated = [vals for vals in amount_by_label.values() if len(vals)]
    if populated:
        shared_range = (
            min(vals.min() for vals in populated),
            max(vals.max() for vals in populated),
        )
    else:
        shared_range = None

    plt.figure(figsize=(5, 3))
    for lab, vals in amount_by_label.items():
        plt.hist(vals, bins=60, range=shared_range, alpha=0.6, label=f"label={lab}")
    plt.xlabel("log1p(amount)")
    plt.ylabel("count")
    plt.legend()
    plt.title("Amount distribution by label")
    plt.tight_layout()
    plt.savefig(fig_dir / "amount_by_label.png")
    plt.show()

# time-based
# Runs only when a datetime column is configured, present, and datetime-typed.
if dt_col and dt_col in fe.columns and pd.api.types.is_datetime64_any_dtype(fe[dt_col]):
    # These helper columns are written onto the shared `fe` frame, so they are
    # also visible to every cell that runs afterwards.
    fe["hour"] = fe[dt_col].dt.hour
    fe["dow"]  = fe[dt_col].dt.dayofweek

    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    for lab in [0, 1]:
        is_lab = fe[target] == lab
        # value_counts() omits buckets with no rows. Reindex onto the full
        # 0-23 / 0-6 grid with zeros so an empty hour or weekday is drawn as 0
        # rather than being bridged by a straight line between its neighbours.
        by_hour = fe.loc[is_lab, "hour"].value_counts().reindex(range(24), fill_value=0)
        by_dow = fe.loc[is_lab, "dow"].value_counts().reindex(range(7), fill_value=0)
        axes[0].plot(by_hour.index, by_hour.values, label=f"label={lab}")
        axes[1].plot(by_dow.index, by_dow.values, label=f"label={lab}")

    axes[0].set_title("Transactions by Hour")
    axes[0].set_xlabel("hour")
    axes[1].set_title("Transactions by DayOfWeek")
    axes[1].set_xlabel("0=Mon")
    axes[0].legend()
    axes[1].legend()
    plt.tight_layout()
    plt.savefig(fig_dir / "time_by_label.png")
    plt.show()


In [None]:
# merchant & card view
# Use the first merchant-category column name that exists in the frame.
mcc_candidates = ["mcc", "merchant_category", "mcc_code"]
mcc_col = next((name for name in mcc_candidates if name in fe.columns), None)
if mcc_col:
    # Restrict to the 15 most frequent categories, then take the mean of the
    # target per category, highest first.
    top_mcc = fe[mcc_col].value_counts().head(15).index
    in_top_mcc = fe[mcc_col].isin(top_mcc)
    rate_by_mcc = fe[in_top_mcc].groupby([mcc_col])[target].mean()
    df_mcc = rate_by_mcc.sort_values(ascending=False)
    display(df_mcc.to_frame("fraud_rate").head(10))

    plt.figure(figsize=(7, 3))
    df_mcc.plot(kind="bar")
    plt.ylabel("fraud_rate")
    plt.tight_layout()
    plt.savefig(fig_dir / "mcc_fraud_rate.png")
    plt.show()

# Use the first card-identifier column name that exists in the frame.
card_candidates = ["card_id", "cardId", "pan_id"]
card_id = next((name for name in card_candidates if name in fe.columns), None)
if card_id:
    # Mean of the target per card; the ten highest rates are displayed.
    card_rate = fe.groupby(card_id)[target].mean()
    print("Card-level fraud rate (top 10):")
    highest_rates = card_rate.sort_values(ascending=False).head(10)
    display(highest_rates.to_frame("fraud_rate"))


In [None]:
# missingness heatmap (top 30)
# Columns ranked by share of missing values; the 30 worst are plotted with one
# image row per frame row and one image column per feature.
missing_ratio = fe.isna().mean().sort_values(ascending=False)
cols = missing_ratio.head(30).index.tolist()

plt.figure(figsize=(8, 6))
if sns is not None:
    sns.heatmap(fe[cols].isna(), cbar=False)
else:
    # Plain-matplotlib fallback. Column names are put on the x axis so each
    # stripe can be identified, as the seaborn branch does automatically.
    plt.imshow(fe[cols].isna(), aspect="auto")
    plt.xticks(range(len(cols)), cols, rotation=90)
plt.title("Missingness (top 30 columns)")
plt.tight_layout()
plt.savefig(fig_dir / "missingness_heatmap.png")
plt.show()


In [None]:
# quick report summary
# Headline facts about the engineered frame. Every value is coerced to a plain
# Python type (int / float / str / None) so the dict serialises with the
# standard-library JSON encoder.
report = {
    "n_rows": int(len(fe)),
    "n_cols": int(fe.shape[1]),
    "target": target,
    "fraud_ratio": float((fe[target]==1).mean()),
    "n_numeric": int(len(fe.select_dtypes(include=[np.number]).columns)),
    "n_categorical": int(len(fe.select_dtypes(include=['object','category']).columns)),
    "datetime_col": dt_col,
    "figures_dir": str(fig_dir),
}
print(report)

# Persist the summary next to the other artifacts. The standard-library
# `json.dumps` is used instead of `pd.io.json.dumps`, which is not a documented
# public API and is missing from recent pandas releases. `ensure_ascii=False`
# keeps non-ASCII path characters readable; the file is written as UTF-8.
summary_path = art_dir / "eda_summary.json"
summary_path.write_text(
    json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8"
)
