In [1]:
# BLOCK 0 — imports & config
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, RMSE
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, classification_report
import warnings
warnings.filterwarnings("ignore")



  from tqdm.autonotebook import tqdm


In [2]:
# CONFIG — every value a reader might want to tune lives in this cell.

# --- input data -------------------------------------------------------------
CSV_PATH = "daily_per_biller2.csv"  # change if needed
DATE_COL = "txn_date"
BILLER_COL = "blr_name"
TARGET_COUNT = "total_transactions"
TARGET_AMOUNT = "total_amount"

# --- forecasting window (in time steps) ---------------------------------------
MAX_ENCODER_LENGTH = 30
MAX_PREDICTION_LENGTH = 7

# --- training -----------------------------------------------------------------
BATCH_SIZE = 128
NUM_EPOCHS = 20
VAL_PCT = 0.2

# --- reproducibility: seed both torch and numpy from the same value -----------
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)


In [3]:
# Load the raw per-biller daily data from the path set in the config cell,
# so that changing CSV_PATH there actually changes what gets read here.
df = pd.read_csv(CSV_PATH)

In [4]:
# Preview the first rows of the raw frame as a quick sanity check of the load.
df.head()

Unnamed: 0,txn_date,blr_name,total_transactions,total_amount,total_biller_fee
0,2025-08-01,BILLER_01,456,128196.0,512.04
1,2025-08-01,BILLER_02,400,656484.918069,593.25
2,2025-08-01,BILLER_03,104,16063.721948,126.33
3,2025-08-01,BILLER_04,336,98528.0,385.8
4,2025-08-01,BILLER_05,103,17140.285451,132.13


In [5]:
# Show the column names of the raw frame.
df.columns

Index(['txn_date', 'blr_name', 'total_transactions', 'total_amount',
       'total_biller_fee'],
      dtype='object')

In [6]:
# Turn the date column into real timestamps so date arithmetic works below.
df[DATE_COL] = pd.to_datetime(df[DATE_COL])

# Collapse to one row per (biller, day). If the file is already at that grain
# each group holds a single row, so the sums simply carry the values through
# under the new column names.
agg = (
    df.groupby([BILLER_COL, DATE_COL])
    .agg(
        txn_count=(TARGET_COUNT, "sum"),
        txn_amount=(TARGET_AMOUNT, "sum"),
    )
    .reset_index()
)

# time_idx = whole days elapsed since the earliest date in the data.
min_date = agg[DATE_COL].min()
agg = (
    agg.assign(
        **{
            "time_idx": (agg[DATE_COL] - min_date).dt.days,
            BILLER_COL: agg[BILLER_COL].astype(str),  # categorical as str
        }
    )
    .sort_values([BILLER_COL, "time_idx"])
    .reset_index(drop=True)
)

print("Data loaded. Sample:")
print(agg.head())

Data loaded. Sample:
    blr_name   txn_date  txn_count  txn_amount  time_idx
0  BILLER_01 2025-08-01        456    128196.0         0
1  BILLER_01 2025-08-02        376    124632.0         1
2  BILLER_01 2025-08-03        316    104552.0         2
3  BILLER_01 2025-08-04        364    101964.0         3
4  BILLER_01 2025-08-05        384     99676.0         4


In [9]:
# Calendar features derived from the transaction date.
agg = agg.assign(
    day_of_week=agg[DATE_COL].dt.weekday,  # 0..6
    month=agg[DATE_COL].dt.month,
    # weekday values 5 and 6 are flagged as weekend (1), everything else 0
    is_weekend=lambda frame: frame["day_of_week"].isin([5, 6]).astype(int),
)

def add_lags_and_rolls(df, group_col, col, lags=(1,7,14), rolls=(7,14)):
    """Return a copy of ``df`` with per-group lag and rolling-mean features of ``col``.

    For every ``lag`` in ``lags`` a column ``f"{col}_lag{lag}"`` is added holding
    the value of ``col`` ``lag`` rows earlier within the same group. For every
    ``r`` in ``rolls`` a column ``f"{col}_roll{r}"`` is added holding the mean of
    up to ``r`` *previous* values of ``col`` within the same group (the current
    row is excluded by shifting one step before rolling).

    Rows are shifted in their existing order within each group, so the frame
    should already be sorted by time within each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    group_col : str or list of str
        Column(s) identifying each independent series.
    col : str
        Column to derive the features from.
    lags : iterable of int
        Lag distances, in rows.
    rolls : iterable of int
        Rolling window sizes, in rows.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended. Rows without any earlier
        value in their own group get NaN.
    """
    out = df.copy()
    # Group the untouched input once; the new columns are written to the copy.
    by_group = df.groupby(group_col)[col]

    for lag in lags:
        out[f"{col}_lag{lag}"] = by_group.shift(lag)

    for r in rolls:
        # Shift AND roll inside each group. Rolling over the already-shifted,
        # ungrouped series would let a window reach back into the previous
        # group's rows. transform() also keeps the original index, so the
        # assignment aligns even when the index is not 0..n-1.
        out[f"{col}_roll{r}"] = by_group.transform(
            lambda s: s.shift(1).rolling(window=r, min_periods=1).mean()
        )
    return out

# Derive the same lag / rolling features for each of the two target columns.
for _target_name in ("txn_count", "txn_amount"):
    agg = add_lags_and_rolls(agg, BILLER_COL, _target_name, lags=(1, 7), rolls=(7, 14))

# Replace every remaining NaN in the frame with 0 (the shifts above leave NaN
# wherever a series has no earlier row); other imputations are possible.
agg = agg.fillna(0)

print("Features added. Columns snapshot:")
print(agg.columns.tolist())


Features added. Columns snapshot:
['blr_name', 'txn_date', 'txn_count', 'txn_amount', 'time_idx', 'day_of_week', 'month', 'is_weekend', 'txn_count_lag1', 'txn_count_lag7', 'txn_count_roll7', 'txn_count_roll14', 'txn_amount_lag1', 'txn_amount_lag7', 'txn_amount_roll7', 'txn_amount_roll14']


In [15]:
def make_tsd(df, target_col):
    """Build a ``TimeSeriesDataSet`` for one target column.

    NOTE: ``df`` is prepared in place, so the caller's frame is modified:
    the biller, ``day_of_week`` and ``is_weekend`` columns become strings,
    ``time_idx`` becomes int64, and ``month``, the target and its lag/rolling
    features become float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``BILLER_COL``, ``"time_idx"``, ``"month"``,
        ``"day_of_week"``, ``"is_weekend"``, ``target_col`` and the derived
        columns ``{target_col}_lag1``, ``_lag7``, ``_roll7`` and ``_roll14``.
    target_col : str
        Name of the column to forecast.

    Returns
    -------
    TimeSeriesDataSet
        Dataset grouped by ``BILLER_COL`` with encoder / prediction lengths
        taken from ``MAX_ENCODER_LENGTH`` / ``MAX_PREDICTION_LENGTH``.

    Raises
    ------
    KeyError
        If any required column is missing (raised before ``df`` is touched).
    """
    derived_cols = [f"{target_col}_{suffix}" for suffix in ("lag1", "lag7", "roll7", "roll14")]

    static_categoricals = [BILLER_COL]
    time_varying_known_categoricals = ["day_of_week", "is_weekend"]
    time_varying_known_reals = ["time_idx", "month"]
    time_varying_unknown_reals = [target_col, *derived_cols]

    # Fail early, before mutating anything, if the frame is incomplete.
    required = (
        static_categoricals
        + time_varying_known_categoricals
        + time_varying_known_reals
        + time_varying_unknown_reals
    )
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"make_tsd: missing required column(s): {missing}")

    # Categorical inputs are passed to the dataset as strings.
    for c in static_categoricals + time_varying_known_categoricals:
        df[c] = df[c].astype(str)

    # The time index must stay an integer: TimeSeriesDataSet rejects a
    # non-integer time_idx, so it is excluded from the float32 cast below.
    df["time_idx"] = pd.to_numeric(df["time_idx"]).astype("int64")

    # Remaining real-valued inputs are coerced to float32.
    for c in ["month", *time_varying_unknown_reals]:
        df[c] = pd.to_numeric(df[c], errors="coerce").astype("float32")

    tsd = TimeSeriesDataSet(
        df,
        time_idx="time_idx",
        target=target_col,
        group_ids=[BILLER_COL],
        min_encoder_length=1,
        max_encoder_length=MAX_ENCODER_LENGTH,
        min_prediction_length=1,
        max_prediction_length=MAX_PREDICTION_LENGTH,
        static_categoricals=static_categoricals,
        time_varying_known_categoricals=time_varying_known_categoricals,
        time_varying_known_reals=time_varying_known_reals,
        time_varying_unknown_reals=time_varying_unknown_reals,
        target_normalizer=GroupNormalizer(groups=[BILLER_COL], transformation="softplus"),
        add_relative_time_idx=True,
        add_target_scales=True,
        add_encoder_length=True,
    )

    return tsd
