In [1]:
# =====================================================================
# Approach 3: Train on BOTH (4 non-bubble + 3 bubble) → Evaluate on bubble
# - Rotating held-out bubble file across sessions (upload 3 bubble here).
# - Unified pipeline: same windowing, augs, NT-Xent + BCE, optimizer.
# - Encoder switch via ENCODER_KIND = {"transformer","bilstm"}.
# =====================================================================

# 0) CONFIG
ENCODER_KIND = "transformer"    # {"transformer","bilstm"}
EMB          = 128
POOL         = "last"           # {"last","mean","cls"}; mirrors LSTM last hidden
LR           = 3e-4
EPOCHS       = 300
TEMPERATURE  = 0.05
BATCH_CAP    = 64
SEED         = 42

# 1) Upload files (two-stage): 4 non-bubble, then 3 bubble
from google.colab import files
import pathlib, glob, os, warnings, re, collections, random, math
warnings.filterwarnings("ignore")
import numpy as np, pandas as pd, torch

def set_seed(s=SEED):
    random.seed(s); np.random.seed(s); torch.manual_seed(s); torch.cuda.manual_seed_all(s)
set_seed(SEED)

DATA_DIR = "/content/economic_indicators"
pathlib.Path(DATA_DIR).mkdir(exist_ok=True)

print("📤 STAGE 1: Upload 4 NON-BUBBLE CSVs")
up_non = files.upload()
assert len(up_non) == 4, f"❗ Expected 4 non-bubble files, got {len(up_non)}"
non_bubble_files = list(up_non.keys())
for n, d in up_non.items():
    with open(f"{DATA_DIR}/{n}", "wb") as f: f.write(d)

print("\n📤 STAGE 2: Upload 3 BUBBLE CSVs (held-out bubble is NOT uploaded in this session)")
up_bub = files.upload()
assert len(up_bub) == 3, f"❗ Expected 3 bubble files, got {len(up_bub)}"
bubble_files = list(up_bub.keys())
for n, d in up_bub.items():
    with open(f"{DATA_DIR}/{n}", "wb") as f: f.write(d)

print(f"\n✅ Saved {len(non_bubble_files)} non-bubble + {len(bubble_files)} bubble files.")

# 2) Dedup + PPIACO→PPI + column validation
raw_paths = glob.glob(os.path.join(DATA_DIR, "*.csv"))
dedup = collections.OrderedDict()
for p in sorted(raw_paths):
    key = re.sub(r"\s*\(\d+\)", "", os.path.basename(p))
    if key not in dedup: dedup[key] = p
paths = list(dedup.values())

# Required columns
need_cols  = ["Date","CPI","PPI","FEDFUNDS","DGS10","DJIA","SP500_PE"]
macro_cols = ["CPI","PPI","FEDFUNDS","DGS10"]
dow_cols   = ["DJIA","SP500_PE"]

# Harmonize PPI & validate
for p in paths:
    df_tmp = pd.read_csv(p)
    if "PPIACO" in df_tmp.columns and "PPI" not in df_tmp.columns:
        df_tmp.rename(columns={"PPIACO":"PPI"}, inplace=True); df_tmp.to_csv(p, index=False)

assert len(paths) == 7, f"❗ Found {len(paths)} CSV files — need exactly 7 (4 NB + 3 Bubble)!"
for p in paths:
    miss = set(need_cols) - set(pd.read_csv(p, nrows=1).columns)
    assert not miss, f"{os.path.basename(p)} missing columns: {miss}"
print("✅ Column validation passed for all 7 files")

# 3) Libraries & preprocessing
!pip -q install tsaug
import torch.nn as nn, torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tsaug import TimeWarp, Drift, AddNoise

DEVICE  = "cuda" if torch.cuda.is_available() else "cpu"
WINDOW  = 24

# Build labeled dataframe
df_list = []
for p in paths:
    tmp = pd.read_csv(p, parse_dates=["Date"])
    if "PPIACO" in tmp.columns and "PPI" not in tmp.columns:
        tmp.rename(columns={"PPIACO":"PPI"}, inplace=True)
    fname = os.path.basename(p)
    tmp["Prototype"] = fname.replace(".csv","")
    tmp["IsBubble"]  = 1 if fname in bubble_files else 0  # 1=bubble, 0=non-bubble
    df_list.append(tmp)

df = (pd.concat(df_list, ignore_index=True)
        .sort_values("Date")
        .dropna(subset=need_cols)
        .reset_index(drop=True))

# Scale
sc_macro = StandardScaler().fit(df[macro_cols])
sc_dow   = StandardScaler().fit(df[dow_cols])
Xm_all   = sc_macro.transform(df[macro_cols]).astype("float32")
Xd_all   = sc_dow.transform(df[dow_cols]).astype("float32")
prot_codes = pd.Categorical(df["Prototype"]).codes
labels_all = df["IsBubble"].astype("float32").values

print(f"\n📊 Prototypes: {df['Prototype'].nunique()} files | rows={len(df)}")
print(f"   bubble rows: {(labels_all==1).sum()} | non-bubble rows: {(labels_all==0).sum()}")

# 4) Dataset & Loader (respect prototype boundaries)
aug = TimeWarp(n_speed_change=3, max_speed_ratio=2.0) + Drift(max_drift=(0,0.1)) + AddNoise(scale=0.01)
class ContrastiveDataset(Dataset):
    def __init__(self, Xm, Xd, codes, labels, win=24):
        self.Xm, self.Xd, self.codes, self.labels, self.win = Xm, Xd, codes, labels, win
        self.starts = [i for b in np.unique(codes)
                       for i in np.where(codes==b)[0][:-win]
                       if i+win in np.where(codes==b)[0]]
    def __len__(self): return len(self.starts)
    def __getitem__(self, idx):
        s = self.starts[idx]
        anc = np.hstack([self.Xm[s:s+self.win], self.Xd[s:s+self.win]])
        alt = np.hstack([aug.augment(self.Xm[s:s+self.win]),
                         aug.augment(self.Xd[s:s+self.win])])
        label = self.labels[s]  # window inherits file label
        return torch.tensor(anc), torch.tensor(alt), torch.tensor(label, dtype=torch.float32)

train_ds   = ContrastiveDataset(Xm_all, Xd_all, prot_codes, labels_all, WINDOW)
batch_size = min(BATCH_CAP, len(train_ds))
loader     = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
print(f"train_ds={len(train_ds)}, batch_size={batch_size}, len(loader)={len(loader)}")

# 5) Encoders (BiLSTM & Transformer) + bubble head (predicts p(bubble))
class EncoderBiLSTM(nn.Module):
    def __init__(self, in_dim, emb=128):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, emb, num_layers=2, bidirectional=True, batch_first=True)
        self.fc   = nn.Linear(emb*2, emb)
    def forward(self, x):
        _, (h, _) = self.lstm(x)
        h = torch.cat([h[-2], h[-1]], dim=1)
        return nn.functional.normalize(self.fc(h), dim=1)

class SinusoidalPE(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2)*(-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(pos * div); pe[:, 1::2] = torch.cos(pos * div)
        self.register_buffer("pe", pe.unsqueeze(0))
    def forward(self, x): return x + self.pe[:, :x.size(1)]

class EncoderTransformer(nn.Module):
    def __init__(self, in_dim, emb=128, nhead=4, num_layers=2, dropout=0.1, pool="last"):
        super().__init__()
        self.input = nn.Linear(in_dim, emb)
        layer = nn.TransformerEncoderLayer(d_model=emb, nhead=nhead, batch_first=True,
                                           dropout=dropout, norm_first=True)
        self.enc = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.pos = SinusoidalPE(emb); self.pool = pool
        self.cls = nn.Parameter(torch.zeros(1,1,emb)) if pool == "cls" else None
    def forward(self, x):
        h = self.input(x)
        if self.cls is not None:
            cls = self.cls.expand(x.size(0), -1, -1); h = torch.cat([cls, h], dim=1)
        h = self.enc(self.pos(h))
        if   self.pool == "cls":  z = h[:, 0, :]
        elif self.pool == "mean": z = h.mean(dim=1)
        else:                     z = h[:, -1, :]
        return nn.functional.normalize(z, dim=1)

class BubbleVsNonBubble(nn.Module):
    """Outputs p(bubble) with labels: 1=bubble, 0=non-bubble."""
    def __init__(self, in_dim, emb=128, kind="bilstm", pool="last", nhead=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.encoder = EncoderBiLSTM(in_dim, emb) if kind=="bilstm" else \
                       EncoderTransformer(in_dim, emb, nhead, num_layers, dropout, pool)
        self.classifier = nn.Sequential(
            nn.Linear(emb, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 32), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(32, 1), nn.Sigmoid()
        )
    def forward(self, x):
        z = self.encoder(x); p_bub = self.classifier(z)
        return z, p_bub.squeeze()
    @torch.no_grad()
    def get_bubble_probability(self, x):
        _, p = self.forward(x); return p

in_dim = len(macro_cols) + len(dow_cols)  # 6 features
model = BubbleVsNonBubble(in_dim=in_dim, emb=EMB, kind=ENCODER_KIND, pool=POOL,
                          nhead=4, num_layers=2, dropout=0.1).to(DEVICE)

def count_params(m): return sum(p.numel() for p in m.parameters() if p.requires_grad)
print(f"🧠 Encoder: {ENCODER_KIND} | POOL={POOL} | EMB={EMB} | params={count_params(model):,}")

# 6) Training (NT-Xent + BCE with true labels)
temperature = TEMPERATURE
def ntxent(z1, z2, tau=temperature):
    z1 = nn.functional.normalize(z1, dim=1); z2 = nn.functional.normalize(z2, dim=1)
    sim = torch.mm(z1, z2.t()) / tau
    labels = torch.arange(z1.size(0), device=z1.device)
    return nn.CrossEntropyLoss()(sim, labels)

bce_loss = nn.BCELoss()
opt = optim.Adam(model.parameters(), lr=LR)

print("\n🚀 Training start — labels: 1=bubble, 0=non-bubble")
for ep in range(1, EPOCHS+1):
    model.train(); tot_loss = 0.0
    for anc, alt, y in loader:
        anc, alt, y = anc.to(DEVICE), alt.to(DEVICE), y.to(DEVICE)
        z1, p1 = model(anc); z2, p2 = model(alt)
        cont  = ntxent(z1, z2)
        clas  = bce_loss(p1, y) + bce_loss(p2, y)
        loss  = cont + 0.5*clas
        opt.zero_grad(); loss.backward(); opt.step()
        tot_loss += loss.item()
    if ep % 10 == 0 or ep == 1:
        print(f"Epoch {ep:03d} | loss {tot_loss/max(1,len(loader)):.4f}")

# 7) Save unified package
package = {
    "model_state_dict": model.state_dict(),
    "model_config": {
        "predicts": "bubble",           # <- IMPORTANT for inference semantics
        "encoder_kind": ENCODER_KIND, "emb": EMB, "pool": POOL,
        "window": WINDOW, "in_dim": in_dim,
        "transformer": {"nhead": 4, "num_layers": 2, "dropout": 0.1},
        "train_prototypes": list(pd.unique(df["Prototype"]))
    },
    "scalers": {
        "sc_macro": sc_macro, "sc_dow": sc_dow,
        "need_cols": need_cols, "macro_cols": macro_cols, "dow_cols": dow_cols
    },
    "param_count": int(count_params(model))
}

SAVE_PATH = "approach3_joint_model_package.pth"
torch.save(package, SAVE_PATH)
print(f"✅ Saved: {SAVE_PATH}")

# Auto-download (Colab)
try:
    from google.colab import files as colab_files
    colab_files.download(SAVE_PATH)
except Exception:
    pass

print("🎯 Training complete.")

📤 STAGE 1: Upload 4 NON-BUBBLE CSVs


Saving 1987.11-1995.12.csv to 1987.11-1995.12.csv
Saving 1975.01-1980.01.csv to 1975.01-1980.01.csv
Saving 2001.04-2003.12.csv to 2001.04-2003.12.csv
Saving 1962.01-1965.12.csv to 1962.01-1965.12.csv

📤 STAGE 2: Upload 3 BUBBLE CSVs (held-out bubble is NOT uploaded in this session)


Saving Merged_Nifty_Fifty.csv to Merged_Nifty_Fifty.csv
Saving Merged_Dot_Com.csv to Merged_Dot_Com.csv
Saving Merged_Black_Monday.csv to Merged_Black_Monday.csv

✅ Saved 4 non-bubble + 3 bubble files.
✅ Column validation passed for all 7 files

📊 Prototypes: 7 files | rows=341
   bubble rows: 119 | non-bubble rows: 222
train_ds=173, batch_size=64, len(loader)=3
🧠 Encoder: transformer | POOL=last | EMB=128 | params=1,197,313

🚀 Training start — labels: 1=bubble, 0=non-bubble
Epoch 001 | loss 2.5324
Epoch 010 | loss 1.7195
Epoch 020 | loss 1.5428
Epoch 030 | loss 1.2829
Epoch 040 | loss 1.1145
Epoch 050 | loss 0.8675
Epoch 060 | loss 0.7003
Epoch 070 | loss 0.6164
Epoch 080 | loss 0.4621
Epoch 090 | loss 0.4936
Epoch 100 | loss 0.3333
Epoch 110 | loss 0.3836
Epoch 120 | loss 0.3004
Epoch 130 | loss 0.3466
Epoch 140 | loss 0.2125
Epoch 150 | loss 0.2914
Epoch 160 | loss 0.1942
Epoch 170 | loss 0.2254
Epoch 180 | loss 0.1922
Epoch 190 | loss 0.1710
Epoch 200 | loss 0.1733
Epoch 210 | loss

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎯 Training complete.


In [2]:
# ============================
# MODEL RESCUE & DOWNLOAD CELL
# ============================
import os, glob, time, math, json, sys
import torch

TARGET_NAMES = [
    "approach3_joint_model_package.pth",      # preferred for Approach 3 variants
    "bubble_vs_nonbubble_model.pth",          # older naming
    "bubble_model_package.pth",
    "nonbubble_model_package.pth",
]
SEARCH_ROOTS = ["/content", "/content/economic_indicators", "/content/drive/My Drive"]

def human_dt(ts):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ts))

def try_mount_drive():
    try:
        from google.colab import drive
        drive.mount("/content/drive", force_remount=False)
        print("🔗 Google Drive mounted.")
    except Exception as e:
        print("ℹ️ Skipping Drive mount (not in Colab or user declined).")

def find_candidates():
    found = []
    for root in SEARCH_ROOTS:
        if os.path.exists(root):
            found += glob.glob(os.path.join(root, "**", "*.pth"), recursive=True)
    # sort by preference and recency
    def score(path):
        name = os.path.basename(path)
        pref = -min([i for i, t in enumerate(TARGET_NAMES) if t in name] + [999])
        mtime = os.path.getmtime(path)
        return (pref, mtime)
    found = sorted(set(found), key=lambda p: score(p), reverse=True)
    return found

def try_download(path):
    try:
        from google.colab import files
        print(f"⬇️  Downloading: {path}")
        files.download(path)
        return True
    except Exception as e:
        print(f"⚠️ files.download failed: {e}")
        return False

def package_from_ram(save_path="approach3_joint_model_package.pth"):
    """
    Attempt to re-package the trained objects from RAM without retraining.
    Requires: model, sc_macro, sc_dow, need_cols, macro_cols, dow_cols, cfg-like info.
    Falls back to reasonable defaults if some config fields are missing.
    """
    g = globals()
    required = ["model", "sc_macro", "sc_dow", "need_cols", "macro_cols", "dow_cols"]
    missing = [k for k in required if k not in g]
    if missing:
        print(f"❌ Cannot repackage from RAM. Missing variables: {missing}")
        return None

    # Try to extract config from known names; fallback to safe defaults
    ENCODER_KIND = g.get("ENCODER_KIND", "transformer")
    EMB          = int(g.get("EMB", 128))
    POOL         = g.get("POOL", "last")
    WINDOW       = int(g.get("WINDOW", 24))
    in_dim       = int(g.get("in_dim", len(g["macro_cols"])+len(g["dow_cols"])))

    transformer_cfg = {"nhead": 4, "num_layers": 2, "dropout": 0.1}
    if "cfg" in g and isinstance(g["cfg"], dict) and "transformer" in g["cfg"]:
        # try to keep whatever was used
        transformer_cfg.update({k: g["cfg"]["transformer"].get(k, transformer_cfg[k]) for k in transformer_cfg})

    train_prototypes = []
    if "df" in g and hasattr(g["df"], "__len__") and "Prototype" in g["df"].columns:
        try:
            import pandas as pd
            train_prototypes = list(pd.unique(g["df"]["Prototype"]))
        except Exception:
            pass

    # Build package
    try:
        state = g["model"].state_dict()
    except Exception as e:
        print(f"❌ Could not access model.state_dict(): {e}")
        return None

    package = {
        "model_state_dict": state,
        "model_config": {
            "predicts": "bubble",            # Approach 3 variant predicts p(bubble)
            "encoder_kind": ENCODER_KIND,
            "emb": EMB,
            "pool": POOL,
            "window": WINDOW,
            "in_dim": in_dim,
            "transformer": transformer_cfg,
            "train_prototypes": train_prototypes,
        },
        "scalers": {
            "sc_macro": g["sc_macro"],
            "sc_dow": g["sc_dow"],
            "need_cols": g["need_cols"],
            "macro_cols": g["macro_cols"],
            "dow_cols": g["dow_cols"],
        }
    }
    torch.save(package, save_path)
    print(f"✅ Repackaged from RAM: {save_path}")
    return save_path

# ---- Flow starts here
print("🔎 Searching for existing model packages...")
try_mount_drive()
cands = find_candidates()

if cands:
    print(f"✅ Found {len(cands)} candidate .pth file(s):")
    for i, p in enumerate(cands[:10], 1):
        print(f"  {i:>2}. {p}  (modified {human_dt(os.path.getmtime(p))})")
    # Choose the top-scoring candidate
    best = cands[0]
    print(f"\n🏆 Best match selected: {best}")
    ok = try_download(best)
    if not ok:
        print("👉 You can also manually click in the left file browser to download.")
else:
    print("⚠️ No .pth packages found on disk.\n🔁 Trying to repackage from RAM (no retrain)…")
    rebuilt = package_from_ram(save_path="approach3_joint_model_package.pth")
    if rebuilt and os.path.exists(rebuilt):
        _ = try_download(rebuilt)
    else:
        print("\n🙇 Sorry—cannot recover automatically without either:")
        print("  • An existing .pth file on disk/Drive, or")
        print("  • The live `model` and scaler variables in RAM.")
        print("\nNext best options (no retraining):")
        print("  1) If you previously saved to Google Drive, run the Drive pane (left) and download it.")
        print("  2) If you still have the package locally on your machine, re-upload it and use the inference script.")
        print("  3) As a last resort, if you can re-create ONLY the scalers from the original training CSVs,")
        print("     we can repackage the model if its state_dict is somewhere (e.g., a checkpoint).")


🔎 Searching for existing model packages...
ℹ️ Skipping Drive mount (not in Colab or user declined).
✅ Found 1 candidate .pth file(s):
   1. /content/approach3_joint_model_package.pth  (modified 2025-08-23 07:51:34)

🏆 Best match selected: /content/approach3_joint_model_package.pth
⬇️  Downloading: /content/approach3_joint_model_package.pth


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>