In [1]:
# =====================================================================
# Approach 2 (variant): TRAIN on 3 NON-BUBBLE CSVs → evaluate on BUBBLE
# Model outputs p(non-bubble). Paper uses p(bubble)=1-p(non-bubble).
# =====================================================================

# 0) CONFIG
ENCODER_KIND = "transformer"    # {"transformer","bilstm"}
EMB          = 128
POOL         = "last"           # {"last","mean","cls"}
LR           = 3e-4
EPOCHS       = 300
TEMPERATURE  = 0.05
BATCH_CAP    = 64
SEED         = 42
EXPECTED_TRAIN_FILES = 3        # ← exactly 3 NON-BUBBLE CSVs

# 1) Upload & Save NON-BUBBLE CSVs
from google.colab import files
import pathlib, glob, os, warnings, re, collections, random, math
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd, torch
def set_seed(s=SEED):
    random.seed(s); np.random.seed(s); torch.manual_seed(s); torch.cuda.manual_seed_all(s)
set_seed(SEED)

DATA_DIR = "/content/merged_csvs"
pathlib.Path(DATA_DIR).mkdir(exist_ok=True)
uploaded = files.upload()  # ★ Upload 3 NON-BUBBLE CSVs
for n, d in uploaded.items():
    with open(f"{DATA_DIR}/{n}", "wb") as f:
        f.write(d)
print("📝 Saved:", glob.glob(f"{DATA_DIR}/*.csv"))

# 2) Dedup names + PPIACO→PPI + required column validation
raw_paths = glob.glob(os.path.join(DATA_DIR, "*.csv"))
dedup = collections.OrderedDict()
for p in sorted(raw_paths):
    key = re.sub(r"\s*\(\d+\)", "", os.path.basename(p))
    if key not in dedup: dedup[key] = p
paths = list(dedup.values())

need_cols  = ["Date","CPI","PPI","FEDFUNDS","DGS10","DJIA","SP500_PE"]
macro_cols = ["CPI","PPI","FEDFUNDS","DGS10"]
dow_cols   = ["DJIA","SP500_PE"]

assert len(paths) == EXPECTED_TRAIN_FILES, f"❗ Found {len(paths)} CSVs — need {EXPECTED_TRAIN_FILES} NON-BUBBLE files!"
for p in paths:
    df_tmp = pd.read_csv(p)
    if "PPIACO" in df_tmp.columns and "PPI" not in df_tmp.columns:
        df_tmp.rename(columns={"PPIACO":"PPI"}, inplace=True); df_tmp.to_csv(p, index=False)
for p in paths:
    miss = set(need_cols) - set(pd.read_csv(p, nrows=1).columns)
    assert not miss, f"{os.path.basename(p)} missing columns: {miss}"
print("✅ NON-BUBBLE files OK:", [os.path.basename(p) for p in paths])

# 3) Libraries & Preprocessing
!pip -q install tsaug
import torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tsaug import TimeWarp, Drift, AddNoise

DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
WINDOW  = 24  # months

# Load & concat
df_list = []
for p in paths:
    tmp = pd.read_csv(p, parse_dates=["Date"])
    if "PPIACO" in tmp.columns and "PPI" not in tmp.columns:
        tmp.rename(columns={"PPIACO":"PPI"}, inplace=True)
    tmp["Prototype"] = os.path.basename(p).replace(".csv","")
    df_list.append(tmp)
df = (pd.concat(df_list, ignore_index=True)
        .sort_values("Date")
        .dropna(subset=need_cols)
        .reset_index(drop=True))

# Scale
sc_macro = StandardScaler().fit(df[macro_cols])
sc_dow   = StandardScaler().fit(df[dow_cols])
Xm_all   = sc_macro.transform(df[macro_cols]).astype("float32")
Xd_all   = sc_dow.transform(df[dow_cols]).astype("float32")
prot_codes = pd.Categorical(df["Prototype"]).codes
print(f"📊 Training prototypes (NON-BUBBLE): {df['Prototype'].unique()} | rows={len(df)}")

# 4) Dataset & Loader
aug = TimeWarp(n_speed_change=3, max_speed_ratio=2.0) + Drift(max_drift=(0,0.1)) + AddNoise(scale=0.01)
class ContrastiveDataset(Dataset):
    def __init__(self, Xm, Xd, codes, win=24):
        self.Xm, self.Xd, self.codes, self.win = Xm, Xd, codes, win
        self.starts = [i for b in np.unique(codes)
                       for i in np.where(codes==b)[0][:-win]
                       if i+win in np.where(codes==b)[0]]
    def __len__(self): return len(self.starts)
    def __getitem__(self, idx):
        s = self.starts[idx]
        anc = np.hstack([self.Xm[s:s+self.win], self.Xd[s:s+self.win]])
        alt = np.hstack([aug.augment(self.Xm[s:s+self.win]),
                         aug.augment(self.Xd[s:s+self.win])])
        return torch.tensor(anc), torch.tensor(alt)

train_ds   = ContrastiveDataset(Xm_all, Xd_all, prot_codes, WINDOW)
batch_size = min(BATCH_CAP, len(train_ds))
loader     = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
print(f"train_ds={len(train_ds)}, batch_size={batch_size}, len(loader)={len(loader)}")

# 5) Encoders (BiLSTM & Transformer) + Non-bubble head
class EncoderBiLSTM(nn.Module):
    def __init__(self, in_dim, emb=128):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, emb, num_layers=2, bidirectional=True, batch_first=True)
        self.fc   = nn.Linear(emb*2, emb)
    def forward(self, x):
        _, (h, _) = self.lstm(x)
        h = torch.cat([h[-2], h[-1]], dim=1)
        return nn.functional.normalize(self.fc(h), dim=1)

class SinusoidalPE(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2)*(-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class EncoderTransformer(nn.Module):
    def __init__(self, in_dim, emb=128, nhead=4, num_layers=2, dropout=0.1, pool="last"):
        super().__init__()
        self.input = nn.Linear(in_dim, emb)
        layer = nn.TransformerEncoderLayer(d_model=emb, nhead=nhead, batch_first=True,
                                           dropout=dropout, norm_first=True)
        self.enc = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.pos = SinusoidalPE(emb)
        self.pool = pool
        self.cls = nn.Parameter(torch.zeros(1,1,emb)) if pool == "cls" else None
    def forward(self, x):
        h = self.input(x)
        if self.cls is not None:
            cls = self.cls.expand(x.size(0), -1, -1); h = torch.cat([cls, h], dim=1)
        h = self.enc(self.pos(h))
        if self.pool == "cls":   z = h[:, 0, :]
        elif self.pool == "mean": z = h.mean(dim=1)
        else:                    z = h[:, -1, :]
        return nn.functional.normalize(z, dim=1)

class NonBubbleDetector(nn.Module):
    """Outputs p(non-bubble)."""
    def __init__(self, in_dim, emb=128, kind="bilstm", pool="last",
                 nhead=4, num_layers=2, dropout=0.1):
        super().__init__()
        if kind == "bilstm":
            self.encoder = EncoderBiLSTM(in_dim, emb)
        elif kind == "transformer":
            self.encoder = EncoderTransformer(in_dim, emb, nhead, num_layers, dropout, pool)
        else:
            raise ValueError(f"Unknown encoder: {kind}")
        self.classifier = nn.Sequential(
            nn.Linear(emb, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 1), nn.Sigmoid()
        )
    def forward(self, x):
        z = self.encoder(x); p_non = self.classifier(z)
        return z, p_non.squeeze()
    @torch.no_grad()
    def get_nonbubble_probability(self, x):
        _, p = self.forward(x); return p

in_dim = len(macro_cols) + len(dow_cols)
model = NonBubbleDetector(in_dim=in_dim, emb=EMB, kind=ENCODER_KIND, pool=POOL,
                          nhead=4, num_layers=2, dropout=0.1).to(DEVICE)
def count_params(m): return sum(p.numel() for p in m.parameters() if p.requires_grad)
print(f"🧠 Encoder: {ENCODER_KIND} | POOL={POOL} | EMB={EMB} | params={count_params(model):,}")

# 6) Train (NT-Xent + BCE with label=1 for NON-BUBBLE)
temperature = TEMPERATURE
def ntxent(z1, z2, tau=temperature):
    z1 = nn.functional.normalize(z1, dim=1); z2 = nn.functional.normalize(z2, dim=1)
    sim = torch.mm(z1, z2.t()) / tau
    labels = torch.arange(z1.size(0), device=z1.device)
    return nn.CrossEntropyLoss()(sim, labels)

bce_loss = nn.BCELoss()
opt = optim.Adam(model.parameters(), lr=LR)

print(f"\n🚀 Training start — all windows labeled NON-BUBBLE=1")
for ep in range(1, EPOCHS+1):
    model.train(); tot_loss = 0.0
    for anc, alt in loader:
        anc, alt = anc.to(DEVICE), alt.to(DEVICE)
        z1, p1 = model(anc); z2, p2 = model(alt)
        cont = ntxent(z1, z2)
        labels = torch.ones(anc.size(0), device=DEVICE)
        clas  = bce_loss(p1, labels) + bce_loss(p2, labels)
        loss  = cont + 0.5*clas
        opt.zero_grad(); loss.backward(); opt.step()
        tot_loss += loss.item()
    if ep % 10 == 0 or ep == 1:
        print(f"Epoch {ep:03d} | loss {tot_loss/max(1,len(loader)):.4f}")

# 7) Save package
package = {
    "model_state_dict": model.state_dict(),
    "model_config": {
        "predicts": "non_bubble",
        "encoder_kind": ENCODER_KIND, "emb": EMB, "pool": POOL,
        "window": WINDOW, "in_dim": in_dim,
        "transformer": {"nhead": 4, "num_layers": 2, "dropout": 0.1},
        "train_prototypes": list(pd.unique(df["Prototype"]))
    },
    "scalers": {
        "sc_macro": sc_macro, "sc_dow": sc_dow,
        "need_cols": need_cols, "macro_cols": macro_cols, "dow_cols": dow_cols
    },
    "param_count": int(count_params(model))
}
save_path = "nonbubble_model_package.pth"
torch.save(package, save_path)
print(f"✅ Saved: {save_path}")

try:
    from google.colab import files as colab_files
    colab_files.download(save_path)
except Exception:
    pass
print("🎯 Training complete.")

Saving 1987.11-1995.12.csv to 1987.11-1995.12.csv
Saving 2001.04-2003.12.csv to 2001.04-2003.12.csv
Saving 1962.01-1965.12.csv to 1962.01-1965.12.csv
📝 Saved: ['/content/merged_csvs/1962.01-1965.12.csv', '/content/merged_csvs/1987.11-1995.12.csv', '/content/merged_csvs/2001.04-2003.12.csv']
✅ NON-BUBBLE files OK: ['1962.01-1965.12.csv', '1987.11-1995.12.csv', '2001.04-2003.12.csv']
📊 Training prototypes (NON-BUBBLE): ['1962.01-1965.12' '1987.11-1995.12' '2001.04-2003.12'] | rows=168
train_ds=96, batch_size=64, len(loader)=2
🧠 Encoder: transformer | POOL=last | EMB=128 | params=1,197,313

🚀 Training start — all windows labeled NON-BUBBLE=1
Epoch 001 | loss 2.9249
Epoch 010 | loss 2.2747
Epoch 020 | loss 2.0567
Epoch 030 | loss 1.9426
Epoch 040 | loss 1.8601
Epoch 050 | loss 1.5249
Epoch 060 | loss 1.3210
Epoch 070 | loss 0.9117
Epoch 080 | loss 0.7993
Epoch 090 | loss 0.6441
Epoch 100 | loss 0.5844
Epoch 110 | loss 0.4954
Epoch 120 | loss 0.4359
Epoch 130 | loss 0.4958
Epoch 140 | loss 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎯 Training complete.
