# In [1]:  (notebook cell marker left over from the Colab/Jupyter export)
# =====================================================================
# 0) CONFIG: choose encoder for ablation ("bilstm" or "transformer")
# =====================================================================
# Encoder under test: "bilstm" is the baseline, "transformer" is the ablation.
ENCODER_KIND = "transformer"

# Embedding width; roughly capacity-parity with the baseline.
EMB = 128

# Pooling over the time axis, one of {"last", "mean", "cls"}.
# "last" mirrors the baseline (last hidden state) -- uncertain if the baseline differs.
POOL = "last"

# Optimisation settings.
LR = 3e-4            # Adam learning rate
EPOCHS = 300         # number of passes over the training windows
TEMPERATURE = 0.05   # NT-Xent temperature
BATCH_CAP = 64       # upper bound on the batch size
SEED = 42            # seed handed to set_seed()

# =====================================================================
# ①  Upload & Save (select EXACTLY 3 merged_*.csv training files)
# =====================================================================
from google.colab import files
import pathlib, glob, os, warnings, re, collections, random
warnings.filterwarnings("ignore")
import numpy as np
import torch

def set_seed(s=SEED):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and every CUDA device) with ``s``."""
    import random

    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    torch.cuda.manual_seed_all(s)

# Seed every RNG before any data handling or model construction.
set_seed(SEED)

# Folder the uploaded CSVs are written into (created if it does not exist yet).
DATA_DIR = "/content/merged_csvs"
pathlib.Path(DATA_DIR).mkdir(exist_ok=True)

# ★ Upload exactly 3 training CSVs; each (name, content) pair is written to DATA_DIR.
uploaded = files.upload()
for fname, payload in uploaded.items():
    with open(f"{DATA_DIR}/{fname}", "wb") as out:
        out.write(payload)

saved_csvs = glob.glob(f"{DATA_DIR}/*.csv")
print("📝 Saved:", saved_csvs)

# =====================================================================
# ②  Dedup names + PPIACO→PPI + required columns validation (3 files)
# =====================================================================
import pandas as pd

raw_paths = glob.glob(os.path.join(DATA_DIR, "*.csv"))

# Duplicate copies carry a " (1)", " (2)", ... marker in the file name.
_COPY_MARK = re.compile(r"\s*\((\d+)\)")


def _dedup_order(path):
    """Sort key: canonical name first, then copy number (0 = un-suffixed original)."""
    name = os.path.basename(path)
    marks = _COPY_MARK.findall(name)
    return (_COPY_MARK.sub("", name), int(marks[-1]) if marks else 0, name)


# A plain alphabetical sort places "x (1).csv" BEFORE "x.csv" (a space sorts
# ahead of "."), which would keep the copy and drop the original. Sorting on
# (canonical name, copy number) really does push the "(n)" copies behind.
dedup = collections.OrderedDict()
for p in sorted(raw_paths, key=_dedup_order):
    key = _COPY_MARK.sub("", os.path.basename(p))
    if key not in dedup:     # keep only one per canonical basename
        dedup[key] = p
paths = list(dedup.values())

# Columns every training CSV must provide, and the two feature groups built from them.
need_cols  = ["Date","CPI","PPI","FEDFUNDS","DGS10","DJIA","SP500_PE"]
macro_cols = ["CPI","PPI","FEDFUNDS","DGS10"]
dow_cols   = ["DJIA","SP500_PE"]

assert len(paths) == 3, f"❗ Found {len(paths)} CSVs — need exactly 3 for training!"

# Ensure PPI column name: files that only have "PPIACO" are rewritten in place
# with that column renamed to "PPI".
for p in paths:
    df_tmp = pd.read_csv(p)
    if "PPIACO" in df_tmp.columns and "PPI" not in df_tmp.columns:
        df_tmp.rename(columns={"PPIACO":"PPI"}, inplace=True)
        df_tmp.to_csv(p, index=False)

# Validate the header of each file (only one data row is read for this).
for p in paths:
    miss = set(need_cols) - set(pd.read_csv(p, nrows=1).columns)
    assert not miss, f"{os.path.basename(p)} missing columns: {miss}"

print("✅ 3 files OK:", [os.path.basename(p) for p in paths])

# =====================================================================
# ③  Libraries & Preprocessing
# =====================================================================
!pip -q install tsaug
import torch, torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tsaug import TimeWarp, Drift, AddNoise

DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
WINDOW  = 24  # 24-month windows

# Load each training CSV and tag its rows with the file stem as "Prototype".
df_list = []
for p in paths:
    tmp = pd.read_csv(p, parse_dates=["Date"])
    proto = os.path.basename(p).replace(".csv","")
    # Ensure PPI column name at load time as well
    if "PPIACO" in tmp.columns and "PPI" not in tmp.columns:
        tmp.rename(columns={"PPIACO":"PPI"}, inplace=True)
    tmp["Prototype"] = proto
    df_list.append(tmp)

# Sort by prototype first, then by date. Sorting on "Date" alone interleaves
# rows of prototypes whose date ranges overlap, and the windowing step slices
# contiguous rows, so each prototype has to occupy one contiguous,
# chronologically ordered block.
df = (pd.concat(df_list, ignore_index=True)
        .dropna(subset=need_cols)
        .sort_values(["Prototype", "Date"])
        .reset_index(drop=True))

# Scale macro & mkt columns (scalers are fitted on these training rows only)
sc_macro = StandardScaler().fit(df[macro_cols])
sc_dow   = StandardScaler().fit(df[dow_cols])
Xm_all   = sc_macro.transform(df[macro_cols]).astype("float32")
Xd_all   = sc_dow.transform(df[dow_cols]).astype("float32")
# One integer code per row identifying its prototype.
prot_codes = pd.Categorical(df["Prototype"]).codes

print(f"📊 Training prototypes (3): {df['Prototype'].unique()}")
print(f"📊 Total rows after clean: {len(df)}")

# =====================================================================
# ④  Contrastive Dataset & DataLoader (same as your baseline)
# =====================================================================
# Augmentation pipeline for the second ("alt") view: time warping, then drift,
# then additive noise.
aug = (
    TimeWarp(n_speed_change=3, max_speed_ratio=2.0)
    + Drift(max_drift=(0, 0.1))
    + AddNoise(scale=0.01)
)

class ContrastiveDataset(Dataset):
    """Fixed-length windows paired with an augmented view of the same window.

    Args:
        Xm: 2-D array of macro features, one row per time step.
        Xd: 2-D array of market features, row-aligned with ``Xm``.
        codes: 1-D array with one prototype code per row.
        win: window length in rows.

    Item ``idx`` is ``(anchor, augmented)``: two float32 tensors of shape
    ``[win, Xm.shape[1] + Xd.shape[1]]``. The augmented view is produced by the
    module-level ``aug`` pipeline, applied separately to the macro and market
    slices.

    ``starts`` holds every row index ``s`` for which rows ``s .. s+win-1`` all
    carry the same prototype code, grouped by code and ascending within a code.
    """

    def __init__(self, Xm, Xd, codes, win=24):
        self.Xm, self.Xd, self.codes, self.win = Xm, Xd, codes, win
        self.starts = self._window_starts(np.asarray(codes), win)

    @staticmethod
    def _window_starts(codes, win):
        """Return start rows of all windows that stay inside a single prototype."""
        n = len(codes)
        # run[i] = number of consecutive rows from i onward sharing codes[i].
        # Checking only the window's end row is not enough: interleaved rows
        # would let a window cross prototypes, and requiring row i+win to exist
        # would drop the last full window of every prototype.
        run = np.ones(n, dtype=np.int64)
        for i in range(n - 2, -1, -1):
            if codes[i] == codes[i + 1]:
                run[i] = run[i + 1] + 1
        valid = np.flatnonzero(run >= win)
        return [int(i) for b in np.unique(codes) for i in valid[codes[valid] == b]]

    def __len__(self): return len(self.starts)

    @staticmethod
    def _augment(window):
        """Augment one ``(T, C)`` window along its time axis."""
        # tsaug reads a 2-D array as (N series, T steps); a leading axis makes
        # the window a single (1, T, C) series so warping runs over time rather
        # than across the feature columns.
        return aug.augment(window[np.newaxis])[0]

    def __getitem__(self, idx):
        s = self.starts[idx]
        xm = self.Xm[s:s + self.win]
        xd = self.Xd[s:s + self.win]
        anc = np.hstack([xm, xd])
        alt = np.hstack([self._augment(xm), self._augment(xd)])
        # Force float32 on both views: the augmented arrays may come back as
        # float64 (TODO confirm), which would not match the model's parameters.
        return (torch.tensor(anc, dtype=torch.float32),
                torch.tensor(alt, dtype=torch.float32))

train_ds = ContrastiveDataset(Xm_all, Xd_all, prot_codes, WINDOW)

# Batch size is BATCH_CAP, or the whole dataset when it holds fewer windows.
batch_size = min(BATCH_CAP, len(train_ds))
loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=False,
)
print(f"train_ds={len(train_ds)}, batch_size={batch_size}, len(loader)={len(loader)}")

# =====================================================================
# ⑤  Encoders (BiLSTM baseline + Transformer ablation) + Head
# =====================================================================
import math

class EncoderBiLSTM(nn.Module):
    """Two-layer bidirectional LSTM encoder producing an L2-normalised embedding.

    Args:
        in_dim: number of input features per time step.
        emb: LSTM hidden size and width of the returned embedding.
    """

    def __init__(self, in_dim, emb=128):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=emb,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )
        # Projects the concatenated forward/backward states back down to `emb`.
        self.fc = nn.Linear(2 * emb, emb)

    def forward(self, x):
        """Encode ``x`` of shape ``[B, T, in_dim]`` into ``[B, emb]`` unit vectors."""
        _, (h_n, _) = self.lstm(x)
        # The last two entries of h_n are the final layer's two directions.
        fwd_last, bwd_last = h_n[-2], h_n[-1]
        merged = torch.cat((fwd_last, bwd_last), dim=1)
        return nn.functional.normalize(self.fc(merged), dim=1)

class SinusoidalPE(nn.Module):
    """Fixed sinusoidal positional encoding added to a ``[B, T, d_model]`` input.

    Args:
        d_model: feature width of the encoded sequence (even or odd).
        max_len: longest sequence length the precomputed table covers.

    The table is registered as the buffer ``pe`` with shape
    ``[1, max_len, d_model]``.
    """

    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2)*(-math.log(10000.0)/d_model))
        angles = pos * div                       # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine columns,
        # so trim the angles instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(0))  # [1, max_len, d_model]

    def forward(self, x):  # x: [B, T, d_model]
        """Return ``x`` plus the encodings of its first ``T`` positions.

        Raises:
            ValueError: if ``T`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class EncoderTransformer(nn.Module):
    """Transformer encoder that pools a sequence into one L2-normalised embedding.

    Args:
        in_dim: number of input features per time step.
        emb: model width and size of the returned embedding.
        nhead: number of attention heads per layer.
        num_layers: number of stacked encoder layers.
        dropout: dropout rate inside each encoder layer.
        pool: ``"cls"`` prepends a learned token and returns its output,
            ``"mean"`` averages over time, any other value returns the last
            time step.
    """

    def __init__(self, in_dim, emb=128, nhead=4, num_layers=2, dropout=0.1, pool="last"):
        super().__init__()
        self.input = nn.Linear(in_dim, emb)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=emb,
            nhead=nhead,
            batch_first=True,
            dropout=dropout,
            norm_first=True,
        )
        self.enc = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pos = SinusoidalPE(emb)
        self.pool = pool
        # The learned summary token only exists in "cls" pooling mode.
        if pool == "cls":
            self.cls = nn.Parameter(torch.zeros(1, 1, emb))
        else:
            self.cls = None

    def forward(self, x):
        """Encode ``x`` of shape ``[B, T, in_dim]`` into ``[B, emb]`` unit vectors."""
        tokens = self.input(x)                                    # [B, T, emb]
        if self.cls is not None:
            cls_tok = self.cls.expand(x.size(0), -1, -1)          # [B, 1, emb]
            tokens = torch.cat((cls_tok, tokens), dim=1)          # [B, T+1, emb]
        encoded = self.enc(self.pos(tokens))

        if self.pool == "cls":
            pooled = encoded[:, 0, :]
        elif self.pool == "mean":
            pooled = encoded.mean(dim=1)
        else:
            pooled = encoded[:, -1, :]       # mirror BiLSTM's "last"
        return nn.functional.normalize(pooled, dim=1)

class BubbleDetector(nn.Module):
    """Sequence encoder followed by an MLP head that outputs a probability.

    Args:
        in_dim: number of input features per time step.
        emb: embedding width produced by the encoder.
        kind: ``"bilstm"`` or ``"transformer"``.
        pool, nhead, num_layers, dropout: forwarded to ``EncoderTransformer``;
            unused for the BiLSTM encoder.

    Raises:
        ValueError: if ``kind`` is neither ``"bilstm"`` nor ``"transformer"``.
    """

    def __init__(self, in_dim, emb=128, kind="bilstm", pool="last",
                 nhead=4, num_layers=2, dropout=0.1):
        super().__init__()
        if kind == "bilstm":
            self.encoder = EncoderBiLSTM(in_dim, emb)
        elif kind == "transformer":
            self.encoder = EncoderTransformer(in_dim, emb, nhead, num_layers, dropout, pool)
        else:
            raise ValueError(f"Unknown encoder kind: {kind}")

        # emb -> 64 -> 32 -> 1, sigmoid output in (0, 1).
        self.classifier = nn.Sequential(
            nn.Linear(emb, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 1), nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``(z, prob)``: the embedding ``[B, emb]`` and probabilities ``[B]``."""
        z = self.encoder(x)
        prob = self.classifier(z)
        # squeeze(-1) rather than squeeze(): a batch of one must stay 1-D ([1]),
        # otherwise it collapses to a 0-d tensor that no longer matches the
        # per-sample labels passed to the BCE loss.
        return z, prob.squeeze(-1)

    @torch.no_grad()
    def get_probability(self, x):
        """Return only the probabilities for ``x``, with gradients disabled.

        This does not switch the module to eval mode; call ``.eval()`` first if
        dropout should be inactive.
        """
        _, prob = self.forward(x)
        return prob

# Instantiate model
# Input width = macro features + market features (6 features).
in_dim = len(macro_cols) + len(dow_cols)
model = BubbleDetector(
    in_dim=in_dim,
    emb=EMB,
    kind=ENCODER_KIND,
    pool=POOL,
    nhead=4,
    num_layers=2,
    dropout=0.1,
)
model = model.to(DEVICE)

def count_params(m):
    """Return the total element count of ``m``'s parameters that require gradients."""
    total = 0
    for param in m.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Summarise the chosen configuration and the model size.
n_trainable = count_params(model)
print(f"🧠 Encoder: {ENCODER_KIND} | POOL={POOL} | EMB={EMB}")
print(f"🧮 Trainable params: {n_trainable:,}")

# =====================================================================
# ⑥  Training (NT-Xent + BCE), identical to your recipe
# =====================================================================
temperature = TEMPERATURE
def ntxent(z1, z2, tau=temperature):
    """One-directional NT-Xent loss between two batches of embeddings.

    Row ``i`` of ``z1`` is classified against every row of ``z2`` using cosine
    similarity divided by ``tau``; row ``i`` of ``z2`` is the positive.

    Args:
        z1, z2: ``[B, D]`` embeddings of the two views (re-normalised here).
        tau: temperature dividing the similarities.

    Returns:
        Scalar mean cross-entropy over the batch.
    """
    a = nn.functional.normalize(z1, dim=1)
    b = nn.functional.normalize(z2, dim=1)
    logits = (a @ b.t()) / tau                                  # [B, B]
    targets = torch.arange(a.size(0), device=a.device)          # positives on the diagonal
    return nn.functional.cross_entropy(logits, targets)

bce_loss = nn.BCELoss()
opt = optim.Adam(model.parameters(), lr=LR)

print(f"\n🚀 Training start — all training windows are labeled bubble=1")
for ep in range(1, EPOCHS+1):
    model.train()
    tot_loss = 0.0
    for anc, alt in loader:
        # Cast to float32 so both views match the model parameters even if the
        # augmented view arrives with a wider dtype.
        anc, alt = anc.to(DEVICE).float(), alt.to(DEVICE).float()
        z1, p1 = model(anc)
        z2, p2 = model(alt)

        # Contrastive term between the two views of each window.
        cont = ntxent(z1, z2)
        # Every training window is a positive ("bubble") example.
        labels = torch.ones(anc.size(0), device=DEVICE)
        # reshape(-1) keeps the probabilities 1-D. With drop_last=False the final
        # batch can hold a single window, and a 0-d probability would not match
        # the [1]-shaped labels that BCELoss requires.
        clas = bce_loss(p1.reshape(-1), labels) + bce_loss(p2.reshape(-1), labels)
        loss = cont + 0.5*clas

        opt.zero_grad()
        loss.backward()
        opt.step()
        tot_loss += loss.item()

    # Report the mean batch loss on the first epoch and every tenth one.
    if ep % 10 == 0 or ep == 1:
        print(f"Epoch {ep:03d} | loss {tot_loss/max(1,len(loader)):.4f}")

# =====================================================================
# ⑦  Save package (model + scalers + config)
# =====================================================================
# Everything needed to rebuild the model and preprocess new data the same way.
# NOTE: the scalers are stored as Python objects, so the file is a pickle, not a
# pure tensor archive. Loading it presumably needs
# torch.load(..., weights_only=False) on recent PyTorch; only do that for files
# you trust.
package = {
    "model_state_dict": model.state_dict(),
    "model_config": {
        "encoder_kind": ENCODER_KIND,
        "emb": EMB,
        "pool": POOL,
        "window": WINDOW,
        "in_dim": in_dim,
        "transformer": {"nhead": 4, "num_layers": 2, "dropout": 0.1},
    },
    "scalers": {
        "sc_macro": sc_macro,
        "sc_dow": sc_dow,
        "need_cols": need_cols,
        "macro_cols": macro_cols,
        "dow_cols": dow_cols
    }
}

save_path = "bubble_model_package.pth"
torch.save(package, save_path)
print(f"✅ Saved: {save_path}")

# Optional: auto-download in Colab. This stays best-effort (the file is already
# saved), but the reason for a failure is reported instead of being swallowed.
try:
    from google.colab import files as colab_files
    colab_files.download(save_path)
except Exception as err:
    print(f"⚠️ Auto-download skipped: {err!r}")

print("🎯 Done.")