# **Fine-Tuning TimesFM for Anomaly Detection**

## **Objective**
This notebook implements a complete pipeline for fine-tuning a pretrained Time Series Foundation Model (TimesFM) for anomaly detection on weather data.

## **Setup and Dependencies**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, ConcatDataset
import numpy as np
import math
import time
import copy
import os
import glob
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, precision_recall_curve, roc_curve, auc

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Root folder of the patch data; YearDataset reads <DATA_DIR>/<year>/temp_patches.npy
# and <DATA_DIR>/<year>/label_patches.npy.
DATA_DIR = "/content/drive/MyDrive/anomaly_patches"
# Year-based split: train / validation / held-out test.
YEARS_TRAIN = [2013, 2014, 2015, 2016]
YEARS_VAL = [2017, 2018]
YEARS_TEST = [2019]
# Training hyper-parameters (copied into CONFIG further down).
BATCH_SIZE = 128
EPOCHS = 3
LEARNING_RATE = 1e-4
# Folder that receives model checkpoints.
# NOTE(review): SAVE_DIR lives under /content/drive, but drive.mount() is called in a
# later cell. Creating the folder here before the mount may populate the mount point —
# confirm the intended cell order.
SAVE_DIR = "/content/drive/MyDrive/models"
os.makedirs(SAVE_DIR, exist_ok=True)

Using device: cuda


In [None]:
# This notebook runs on Google Colab: mount Google Drive at /content/drive, which is
# the root of the DATA_DIR and SAVE_DIR paths used in this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## **Model Architecture (Pretrained Backbone)**
We reuse the `TimesFMLiteGPT` architecture from pretraining.

In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-feature gain.

    Args:
        d_model: Size of the last (feature) dimension; one gain value per feature.
        eps: Constant added to the mean square before the reciprocal square root.
    """

    def __init__(self, d_model, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(d_model))

    def forward(self, x):
        """Divide ``x`` by its RMS over the last dimension and apply the gain."""
        # Mean of squares over the feature axis, kept so it broadcasts against x.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.eps)
        return self.scale * (x * inv_rms)

class TimesFMLiteGPT(nn.Module):
    """Causally-masked transformer over sequences of fixed-length patches.

    Each patch is linearly embedded, a learned positional embedding is added, the
    sequence goes through a pre-norm ``nn.TransformerEncoder`` under a causal mask,
    and a final RMSNorm is applied. A linear head maps every position back to
    ``patch_len`` values.

    Args:
        config: Dict read for the keys ``patch_len``, ``d_model``, ``context_len``,
            ``n_heads``, ``d_ff``, ``dropout`` and ``n_layers``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        width = config['d_model']
        patch_len = config['patch_len']

        # Input side: patch -> embedding, plus one learned position vector per slot.
        self.patch_embed = nn.Linear(patch_len, width)
        self.pos_embed = nn.Parameter(torch.zeros(1, config['context_len'], width))

        # Transformer trunk (pre-norm, GELU, batch-first tensors).
        self.blocks = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=width,
                nhead=config['n_heads'],
                dim_feedforward=config['d_ff'],
                dropout=config['dropout'],
                activation="gelu",
                batch_first=True,
                norm_first=True,
            ),
            num_layers=config['n_layers'],
        )

        # Output side: final norm and projection back to patch space.
        self.norm_f = RMSNorm(width)
        self.head = nn.Linear(width, patch_len)

    def forward(self, x, return_embeddings=False):
        """Run the model on ``x`` of shape [Batch, Seq_Len, Patch_Len].

        Returns the normalised hidden states when ``return_embeddings`` is true,
        otherwise the head's per-position patch output.
        """
        _, seq_len, _ = x.shape

        # Embed the patches and add the positions for the first seq_len slots.
        tokens = self.patch_embed(x) + self.pos_embed[:, :seq_len, :]

        # Each position may only attend to itself and earlier positions.
        causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)

        hidden = self.norm_f(self.blocks(tokens, mask=causal_mask, is_causal=True))
        return hidden if return_embeddings else self.head(hidden)

## **Data Loading**


In [None]:
class YearDataset(Dataset):
    """Sliding-window view over one year of patch data.

    Reads ``<base_path>/<year>/temp_patches.npy`` and
    ``<base_path>/<year>/label_patches.npy`` and exposes every run of
    ``context_len`` consecutive patches as one sample. A sample's label is taken
    from the last patch of its window (the maximum over that patch's label
    entries when the label row is an array).

    If the files are missing or cannot be loaded the dataset is empty
    (``len(ds) == 0``) instead of raising, so callers can skip the year.

    Args:
        year: Year identifier; used as the sub-directory name.
        base_path: Directory containing one sub-directory per year.
        context_len: Number of consecutive patches per sample (>= 1).

    Raises:
        ValueError: If ``context_len`` is smaller than 1.
    """

    def __init__(self, year, base_path, context_len=32):
        if context_len < 1:
            raise ValueError(f"context_len must be >= 1, got {context_len}")

        self.year = year
        self.context_len = context_len

        patch_path = os.path.join(base_path, str(year), "temp_patches.npy")
        label_path = os.path.join(base_path, str(year), "label_patches.npy")

        print(f"Loading data for year {year}...")
        if not os.path.exists(patch_path) or not os.path.exists(label_path):
            print(f"WARNING: Data for year {year} not found at {patch_path} or {label_path}")
            self._set_empty()
            self.samples = []
            return

        # Best effort by design: a year that fails to load becomes an empty
        # dataset (and is reported) rather than aborting the whole run.
        try:
            self.data = np.load(patch_path).astype(np.float32)
            self.labels = np.load(label_path).astype(np.float32)
            print(f"Loaded {os.path.basename(patch_path)}: {self.data.shape}, Labels: {self.labels.shape}")
        except Exception as e:
            print(f"Error loading year {year}: {e}")
            self._set_empty()

        num_patches = len(self.data)
        if len(self.labels) != num_patches:
            print(f"Warning: Mismatch in data length {num_patches} and labels {len(self.labels)}")

        # Only build windows over the prefix that has BOTH data and labels, so
        # __getitem__ can never index past the end of the shorter array.
        usable = min(num_patches, len(self.labels))

        # (window start, index of the window's last patch) for every full window.
        self.samples = [
            (start, start + context_len - 1)
            for start in range(usable - context_len + 1)
        ]

        print(f"Year {year}: {len(self.samples)} sequences generated.")

    def _set_empty(self):
        """Install zero-length placeholder arrays for an unavailable year."""
        self.data = np.zeros((0, 32), dtype=np.float32)
        self.labels = np.zeros((0,), dtype=np.float32)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(window, label)``: a float32 tensor of ``context_len`` patches
        and a scalar float32 tensor holding the label of the window's last patch."""
        start_idx, target_idx = self.samples[idx]
        seq = self.data[start_idx : start_idx + self.context_len]
        label = self.labels[target_idx]
        # Per-element label rows are collapsed with max; scalar labels pass through.
        label_scalar = np.max(label) if label.ndim > 0 else label

        return torch.from_numpy(seq), torch.tensor(label_scalar, dtype=torch.float32)

## **Loss Functions for Imbalance**
We implement Focal Loss to handle the rarity of anomalies.

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss on logits with class-dependent alpha weighting.

    For each element: ``alpha_t * (1 - p_t) ** gamma * BCE`` where ``p_t`` is the
    probability the model assigns to the true class. ``alpha_t`` is ``alpha`` for
    positive targets and ``1 - alpha`` for negative targets, so ``alpha > 0.5``
    up-weights the positive (anomaly) class. The result is averaged over all
    elements.

    Args:
        alpha: Weight of the positive class, in [0, 1].
        gamma: Focusing exponent; larger values down-weight well-classified
            elements more strongly.
    """

    def __init__(self, alpha=0.75, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` and 0/1 ``targets``
        (same shape)."""
        bce_loss = self.bce(inputs, targets)
        # BCE is -log(p_t), so exp(-BCE) recovers the true-class probability.
        pt = torch.exp(-bce_loss)
        # A single scalar alpha on every element would only rescale the loss;
        # weighting by class is what actually counters the imbalance.
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        focal_loss = alpha_t * (1 - pt) ** self.gamma * bce_loss
        return focal_loss.mean()

## **Evaluation & Plotting Metrics**

In [None]:
def plot_training_history(history, save_name="training_plot.png"):
    """Plot loss and validation-metric curves side by side, save and show them.

    Args:
        history: Dict of per-epoch lists; reads ``train_loss``, ``val_loss``,
            ``val_f1`` and ``val_auroc``.
        save_name: Path of the image file written via ``plt.savefig``.
    """
    epoch_axis = range(1, len(history['train_loss']) + 1)

    # One entry per panel: (title, y-axis label, ((history key, legend label), ...)).
    panels = (
        ('Loss over Epochs', 'Loss',
         (('train_loss', 'Train Loss'), ('val_loss', 'Val Loss'))),
        ('Validation Metrics', 'Score',
         (('val_f1', 'Val F1'), ('val_auroc', 'Val AUROC'))),
    )

    plt.figure(figsize=(12, 5))

    for position, (title, y_label, curves) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for key, legend_label in curves:
            plt.plot(epoch_axis, history[key], label=legend_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.savefig(save_name)
    plt.show()

def save_model_checkpoint(model, name):
    """Write ``model``'s state dict to ``<SAVE_DIR>/<name>.pth`` and print the path."""
    filename = f"{name}.pth"
    destination = os.path.join(SAVE_DIR, filename)
    weights = model.state_dict()
    torch.save(weights, destination)
    print(f"Model saved to {destination}")

def plot_roc_pr_curve(probs, targets, save_name="roc_pr_curve.png"):
    """Draw the ROC curve and the precision-recall curve side by side.

    Args:
        probs: Predicted scores, passed to the sklearn curve functions as ``y_score``.
        targets: Ground-truth labels, passed as ``y_true``.
        save_name: Path of the image file written via ``plt.savefig``.
    """
    pr_precision, pr_recall, _ = precision_recall_curve(targets, probs)
    false_pos_rate, true_pos_rate, _ = roc_curve(targets, probs)
    roc_area = auc(false_pos_rate, true_pos_rate)

    def _finish_panel(x_label, y_label, title, legend_loc):
        # Both panels share the same axis limits; only the texts differ.
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend(loc=legend_loc)

    plt.figure(figsize=(12, 5))

    # Left panel: ROC curve with the chance diagonal for reference.
    plt.subplot(1, 2, 1)
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=f'ROC curve (area = {roc_area:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    _finish_panel('False Positive Rate', 'True Positive Rate',
                  'Receiver Operating Characteristic', "lower right")

    # Right panel: precision-recall curve.
    plt.subplot(1, 2, 2)
    plt.plot(pr_recall, pr_precision, color='blue', lw=2, label='PR curve')
    _finish_panel('Recall', 'Precision', 'Precision-Recall Curve', "lower left")

    plt.tight_layout()
    plt.savefig(save_name)
    plt.show()

## **Wrapper for Anomaly Classification**
We wrap the base model so that it outputs a single scalar anomaly logit per sequence, computed from the embedding of the last patch.

In [None]:
class TimesFMAnomaly(nn.Module):
    """Backbone plus a small MLP head that scores the last patch of a sequence.

    Args:
        pretrained_model: Backbone module; called as
            ``backbone(x, return_embeddings=True)`` and expected to return one
            ``d_model``-sized embedding per sequence position.
        d_model: Embedding width consumed by the classifier head.
    """

    def __init__(self, pretrained_model, d_model):
        super().__init__()
        self.backbone = pretrained_model

        # Head: d_model -> d_model // 2 -> 1, with ReLU and light dropout in between.
        hidden_dim = d_model // 2
        head_layers = [
            nn.Linear(d_model, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return anomaly logits of shape [BATCH, 1]."""
        per_patch = self.backbone(x, return_embeddings=True)
        # Only the final position's embedding feeds the classifier.
        return self.classifier(per_patch[:, -1, :])

# **Loading the data and splitting into train, test, validation**


In [None]:
# Single source of truth for the backbone architecture (keys read by
# TimesFMLiteGPT) and for the training hyper-parameters defined in the setup cell.
CONFIG = dict(
    # --- architecture ---
    patch_len=32,
    d_model=128,
    n_layers=4,
    n_heads=4,
    d_ff=512,
    dropout=0.1,
    context_len=32,
    # --- training ---
    batch_size=BATCH_SIZE,
    lr=LEARNING_RATE,
    epochs=EPOCHS,
)

# Create Datasets by Year
def create_combined_dataset(years, base_path):
    """Build one dataset spanning several years.

    A ``YearDataset`` is created for each entry of ``years`` (using
    ``CONFIG['context_len']`` as the window length); years that yield no samples
    are dropped.

    Returns:
        A ``ConcatDataset`` over the non-empty years in the given order, or
        ``None`` when no year produced any sample.
    """
    per_year = (
        YearDataset(year, base_path, context_len=CONFIG['context_len'])
        for year in years
    )
    non_empty = [dataset for dataset in per_year if len(dataset) > 0]
    return ConcatDataset(non_empty) if non_empty else None

print("Constructing Training Dataset...")
train_ds = create_combined_dataset(YEARS_TRAIN, DATA_DIR)
print("Constructing Validation Dataset...")
val_ds = create_combined_dataset(YEARS_VAL, DATA_DIR)
print("Constructing Test Dataset...")
test_ds = create_combined_dataset(YEARS_TEST, DATA_DIR)

# create_combined_dataset returns None when no year produced any sequence.
# Training cannot proceed without data, so stop here with a clear message instead
# of failing later with a NameError on the undefined train_loader.
if not train_ds:
    raise RuntimeError("CRITICAL ERROR: No training data found! Check paths.")

# Pinned host memory only helps host-to-GPU copies; skip it on CPU-only runs.
train_loader = DataLoader(
    train_ds,
    batch_size=CONFIG['batch_size'],
    shuffle=True,  # reshuffle the training windows every epoch
    num_workers=2,
    pin_memory=(device == "cuda"),
)

if val_ds:
    val_loader = DataLoader(
        val_ds,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=2,
        pin_memory=(device == "cuda"),
    )
else:
    print("WARNING: No validation data found! Check paths.")

if test_ds:
    test_loader = DataLoader(
        test_ds,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=2,
        pin_memory=(device == "cuda"),
    )
else:
    print("WARNING: No test data found! Check paths.")

# Instantiate Base Model (randomly initialised; no checkpoint is loaded into it here)
base_model = TimesFMLiteGPT(CONFIG).to(device)

Constructing Training Dataset...
Loading data for year 2013...
Loaded temp_patches.npy: (1446172, 32), Labels: (1446172, 32)
Year 2013: 1446141 sequences generated.
Loading data for year 2014...
Loaded temp_patches.npy: (1446172, 32), Labels: (1446172, 32)
Year 2014: 1446141 sequences generated.
Loading data for year 2015...
Loaded temp_patches.npy: (1446172, 32), Labels: (1446172, 32)
Year 2015: 1446141 sequences generated.
Loading data for year 2016...
Loaded temp_patches.npy: (1451450, 32), Labels: (1451450, 32)
Year 2016: 1451419 sequences generated.
Constructing Validation Dataset...
Loading data for year 2017...
Loaded temp_patches.npy: (1446172, 32), Labels: (1446172, 32)
Year 2017: 1446141 sequences generated.
Loading data for year 2018...
Loaded temp_patches.npy: (1446172, 32), Labels: (1446172, 32)
Year 2018: 1446141 sequences generated.
Constructing Test Dataset...
Loading data for year 2019...
Loaded temp_patches.npy: (1446172, 32), Labels: (1446172, 32)
Year 2019: 1446141 sequences generated.



# **Transfer Learning**


In [None]:
backbone = TimesFMLiteGPT(CONFIG).to(device)

pretrained_path = "/content/drive/MyDrive/weather_model_runs/best_model.pth"  # YOUR PATH
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load(pretrained_path, map_location=device)
# Drop the "module." key prefix (as added when a model is saved from a wrapper
# such as nn.DataParallel) so the keys line up with TimesFMLiteGPT's own names.
state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}

# strict=False tolerates key mismatches, so report them explicitly: a silent
# mismatch would leave (part of) the backbone at its random initialisation.
load_result = backbone.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    print(f"WARNING: {len(load_result.missing_keys)} backbone keys missing from checkpoint "
          f"(left at random init): {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"WARNING: {len(load_result.unexpected_keys)} checkpoint keys not used by the backbone: "
          f"{load_result.unexpected_keys}")
print("Pretrained backbone loaded")

Pretrained backbone loaded


In [None]:
# Wrap the backbone with the anomaly classifier head.
model = TimesFMAnomaly(backbone, CONFIG["d_model"]).to(device)

# Freeze the whole backbone so only the classifier head receives gradients.
model.backbone.requires_grad_(False)

# Freeze RMSNorm modules explicitly as well (same effect, kept as a safeguard).
for module in model.backbone.modules():
    if isinstance(module, RMSNorm):
        module.requires_grad_(False)

# Sanity check: how many parameters are still trainable?
total = sum(map(torch.numel, model.parameters()))
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable}/{total} ({100*trainable/total:.3f}%)")

Trainable params: 8321/813985 (1.022%)


In [None]:
# Focal loss for the imbalanced anomaly labels; Adam updates the classifier head
# only, at the learning rate stored in CONFIG.
criterion = FocalLoss(gamma=2, alpha=0.75)
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=CONFIG["lr"])

In [None]:
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
    average_precision_score
)
import numpy as np
import os
import torch


def train_transfer_learning(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    epochs,
    save_dir,
    loss_fn=None,
    threshold=0.2,
):
    """Train the classifier head on top of a backbone kept in eval mode.

    Each epoch runs one pass over ``train_loader`` (backbone in eval mode,
    classifier in train mode), then evaluates on ``val_loader``. Whenever the
    validation AUROC improves, the full model state dict is written to
    ``<save_dir>/best_transfer_model.pth``.

    Args:
        model: Module exposing ``backbone`` and ``classifier`` sub-modules and
            returning logits of shape [batch, 1].
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        optimizer: Optimizer stepping the trainable parameters.
        device: Device the batches are moved to.
        epochs: Number of epochs to run.
        save_dir: Directory for the best checkpoint (created if missing).
        loss_fn: Callable ``loss_fn(logits, targets)``. Defaults to the
            notebook-level ``criterion`` so existing calls keep working.
        threshold: Probability cut-off used for precision / recall / F1.

    Returns:
        Dict of per-epoch lists: ``train_loss``, ``val_loss``, ``val_f1``,
        ``val_precision``, ``val_recall``, ``val_auroc``, ``val_aucpr``.
    """
    if loss_fn is None:
        # Backward-compatible default: the loss object defined at notebook level.
        loss_fn = criterion

    os.makedirs(save_dir, exist_ok=True)
    best_path = os.path.join(save_dir, "best_transfer_model.pth")

    history = {
        "train_loss": [],
        "val_loss": [],
        "val_f1": [],
        "val_precision": [],
        "val_recall": [],
        "val_auroc": [],
        "val_aucpr": []
    }

    best_val_auc = 0.0

    for epoch in range(epochs):
        # TRAINING
        # The backbone stays in eval mode (its dropout disabled) while only the
        # classifier is put in train mode.
        model.backbone.eval()
        model.classifier.train()

        train_loss = 0.0

        for x, y in train_loader:
            x = x.to(device)
            y = y.to(device).unsqueeze(1)  # [batch] -> [batch, 1] to match the logits

            optimizer.zero_grad()

            logits = model(x)
            loss = loss_fn(logits, y)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        history["train_loss"].append(train_loss)

        # VALIDATION
        model.eval()
        val_loss = 0.0
        prob_chunks, target_chunks = [], []

        with torch.no_grad():
            for x, y in val_loader:
                x = x.to(device)
                y = y.to(device).unsqueeze(1)

                logits = model(x)
                loss = loss_fn(logits, y)

                val_loss += loss.item()

                # Keep one flat array per batch; they are joined once after the
                # loop instead of growing a Python list one scalar at a time.
                prob_chunks.append(torch.sigmoid(logits).cpu().numpy().ravel())
                target_chunks.append(y.cpu().numpy().ravel())

        val_loss /= len(val_loader)

        val_probs = np.concatenate(prob_chunks)
        val_targets = np.concatenate(target_chunks)

        # METRICS
        # Ranking metrics use the raw probabilities.
        val_auc = roc_auc_score(val_targets, val_probs)
        val_aucpr = average_precision_score(val_targets, val_probs)

        # Threshold 0.5 is often suboptimal for anomalies, hence the lower default.
        val_preds = (val_probs >= threshold).astype(int)

        val_precision = precision_score(val_targets, val_preds, zero_division=0)
        val_recall = recall_score(val_targets, val_preds, zero_division=0)
        val_f1 = f1_score(val_targets, val_preds, zero_division=0)

        history["val_loss"].append(val_loss)
        history["val_auroc"].append(val_auc)
        history["val_aucpr"].append(val_aucpr)
        history["val_precision"].append(val_precision)
        history["val_recall"].append(val_recall)
        history["val_f1"].append(val_f1)

        # LOGGING
        print(
            f"Epoch [{epoch+1}/{epochs}] | "
            f"Train Loss: {train_loss:.4f} | "
            f"Val Loss: {val_loss:.4f} | "
            f"AUC: {val_auc:.4f} | "
            f"AUC-PR: {val_aucpr:.4f} | "
            f"Prec: {val_precision:.4f} | "
            f"Rec: {val_recall:.4f} | "
            f"F1: {val_f1:.4f}"
        )

        # SAVE BEST MODEL (AUROC)
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            torch.save(model.state_dict(), best_path)

    print(f"\nBest Validation AUROC: {best_val_auc:.4f}")
    return history

In [None]:
# Checkpoint folder (same value as in the setup cell; repeated so this cell can be
# re-run on its own).
SAVE_DIR = "/content/drive/MyDrive/models"

# Train the classifier head on the frozen backbone. The returned `history` holds
# the per-epoch loss and validation metrics; the best-AUROC weights are written to
# <SAVE_DIR>/best_transfer_model.pth by the training function.
history = train_transfer_learning(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    epochs=CONFIG["epochs"],
    save_dir=SAVE_DIR
)

Epoch [1/3] | Train Loss: 0.0361 | Val Loss: 0.0362 | AUC: 0.9331 | AUC-PR: 0.6276 | Prec: 0.2464 | Rec: 0.9728 | F1: 0.3932
Epoch [2/3] | Train Loss: 0.0344 | Val Loss: 0.0364 | AUC: 0.9324 | AUC-PR: 0.6250 | Prec: 0.2663 | Rec: 0.9636 | F1: 0.4172
Epoch [3/3] | Train Loss: 0.0339 | Val Loss: 0.0359 | AUC: 0.9346 | AUC-PR: 0.6306 | Prec: 0.2427 | Rec: 0.9773 | F1: 0.3888

Best Validation AUROC: 0.9346


In [None]:
# Evaluate on the held-out test loader.
# NOTE(review): `model` holds the weights from the LAST training epoch. The
# best-validation-AUROC weights are saved to <SAVE_DIR>/best_transfer_model.pth;
# load them first if the test score should reflect the selected checkpoint.
model.eval()
test_preds, test_targets = [], []

with torch.no_grad():
    for x, y in test_loader:
        x = x.to(device)
        y = y.to(device).unsqueeze(1)

        logits = model(x)
        # Flatten each batch to 1-D so the metric receives (N,) arrays, matching
        # how the validation metrics are computed during training.
        test_preds.append(torch.sigmoid(logits).cpu().numpy().ravel())
        test_targets.append(y.cpu().numpy().ravel())

test_preds = np.concatenate(test_preds)
test_targets = np.concatenate(test_targets)

test_auc = roc_auc_score(test_targets, test_preds)
print(f"Test AUC: {test_auc:.4f}")

Test AUC: 0.9392


### Experiment 3: LoRA (Low-Rank Adaptation)

In [None]:
class LoRALayer(nn.Module):
    """Wrap a linear layer with a trainable low-rank update.

    The wrapped layer is frozen and the output becomes
    ``linear(x) + (x @ A @ B) * (alpha / rank)`` with ``A`` of shape
    ``(in_features, rank)`` and ``B`` of shape ``(rank, out_features)``.
    ``B`` starts at zero, so the wrapper initially reproduces the wrapped layer.

    Args:
        linear_layer: Layer to adapt; must expose ``in_features``,
            ``out_features``, ``weight`` and ``bias``.
        rank: Rank of the update.
        alpha: Scaling numerator; the update is multiplied by ``alpha / rank``.
    """

    def __init__(self, linear_layer, rank=4, alpha=16):
        super().__init__()
        self.linear = linear_layer
        self.rank = rank
        self.alpha = alpha

        in_dim = linear_layer.in_features
        out_dim = linear_layer.out_features

        # Create the factors on the wrapped layer's device and dtype. Otherwise a
        # layer that already lives on the GPU (or is not float32) gets CPU/float32
        # factors and the matmul in forward() fails.
        factory = {
            "device": linear_layer.weight.device,
            "dtype": linear_layer.weight.dtype,
        }
        self.lora_A = nn.Parameter(torch.empty(in_dim, rank, **factory))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_dim, **factory))
        self.scaling = alpha / rank

        # Initialize: random A, zero B => the low-rank term starts at exactly zero.
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

        # Freeze original
        self.linear.weight.requires_grad = False
        if self.linear.bias is not None:
            self.linear.bias.requires_grad = False

    def forward(self, x):
        """Return the frozen layer's output plus the scaled low-rank correction."""
        regular_out = self.linear(x)
        lora_out = (x @ self.lora_A @ self.lora_B) * self.scaling
        return regular_out + lora_out

def apply_lora(model, rank=4, alpha=16):
    """Replace ``model.patch_embed`` with a LoRA-wrapped version, in place.

    Only the patch embedding is adapted (simplified for demo); no other layer
    of ``model`` is touched. Calling this again on an already adapted model is a
    no-op, so the embedding is never wrapped twice.

    Args:
        model: Module with a linear ``patch_embed`` attribute.
        rank: LoRA rank passed to ``LoRALayer``.
        alpha: LoRA scaling numerator passed to ``LoRALayer``.

    Returns:
        The same ``model`` object, for call chaining.
    """
    if not isinstance(model.patch_embed, LoRALayer):
        model.patch_embed = LoRALayer(model.patch_embed, rank=rank, alpha=alpha)
    return model

print("Initializing LoRA...")
# Start from `backbone`, the module the pretrained checkpoint was loaded into.
# `base_model` is only ever constructed with random weights in this notebook, so
# adapting a copy of it would not be fine-tuning the pretrained model.
lora_base = copy.deepcopy(backbone)
lora_base = apply_lora(lora_base)
# Move the whole model (backbone, LoRA factors, new classifier head) to the device.
lora_model = TimesFMAnomaly(lora_base, CONFIG['d_model']).to(device)

# Ensure only LoRA params and Head are trainable
param_count = 0
trainable = 0
for n, p in lora_model.named_parameters():
    param_count += p.numel()
    # Parameter names containing 'lora' are the low-rank factors; 'classifier'
    # marks the anomaly head. Everything else stays frozen.
    if 'lora' in n or 'classifier' in n:
        p.requires_grad = True
        trainable += p.numel()
    else:
        p.requires_grad = False

print(f"LoRA Trainable Params: {trainable} / {param_count} ({trainable/param_count:.2%})")
# NOTE(review): `train_experiment` is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
lora_trained = train_experiment(lora_model, "LoRA Fine-Tuning")

### Experiment 4: Model Distillation
We use the fully fine-tuned model as the teacher.

In [None]:
print("Initializing Distillation...")

# Teacher:  Full Fine-Tuned Model (fft_trained)
# NOTE(review): `fft_trained` is not defined in the visible part of this notebook —
# confirm the full fine-tuning experiment has been run before this cell.
teacher_model = fft_trained
teacher_model.eval()

# Student: Smaller config (a shallow copy of CONFIG with depth and width halved;
# all other keys, e.g. n_heads and d_ff, are inherited unchanged)
STUDENT_CONFIG = CONFIG.copy()
STUDENT_CONFIG['n_layers'] = 2 # Half layers
STUDENT_CONFIG['d_model'] = 64 # Half width

# The student is built from scratch (randomly initialised) and wrapped with its
# own anomaly classifier head.
student_base = TimesFMLiteGPT(STUDENT_CONFIG).to(device)
student_model = TimesFMAnomaly(student_base, STUDENT_CONFIG['d_model']).to(device)

def distillation_loss(student_logits, teacher_logits, targets, T=2.0, alpha=0.5):
    """Blend a soft teacher-matching term with a hard focal-loss term.

    The soft term is the mean squared error between the temperature-softened
    sigmoid probabilities of student and teacher (an MSE, not a KL divergence).
    The hard term is a default-configured ``FocalLoss`` between the student
    logits and the ground-truth ``targets``.

    Args:
        student_logits: Raw student outputs.
        teacher_logits: Raw teacher outputs, same shape as ``student_logits``.
        targets: Ground-truth labels for the hard term.
        T: Temperature dividing both sets of logits before the sigmoid.
        alpha: Weight of the soft term; the hard term gets ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * soft + (1 - alpha) * hard``.
    """
    # Soft targets: compare temperature-scaled probabilities.
    soft_student = torch.sigmoid(student_logits / T)
    soft_teacher = torch.sigmoid(teacher_logits / T)
    soft_term = F.mse_loss(soft_student, soft_teacher)

    # Hard targets: focal loss against the labels.
    hard_term = FocalLoss()(student_logits, targets)

    return alpha * soft_term + (1 - alpha) * hard_term

# All student parameters are trained. Note that this rebinds the notebook-level
# `optimizer` (and `history` below) used by the earlier transfer-learning cells.
optimizer = torch.optim.AdamW(student_model.parameters(), lr=CONFIG['lr'])

print("--- Starting Distillation ---")
history = {'train_loss': [], 'val_loss': [], 'val_f1': [], 'val_auroc': []}

for epoch in range(CONFIG['epochs']):
    student_model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        y = y.unsqueeze(1)  # [batch] -> [batch, 1] to match the logits

        # The teacher only provides targets; no gradients are needed for it.
        with torch.no_grad():
            teacher_logits = teacher_model(x)

        optimizer.zero_grad()
        student_logits = student_model(x)

        loss = distillation_loss(student_logits, teacher_logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    # NOTE(review): `evaluate_anomaly` is not defined in the visible part of this
    # notebook; it is used here as returning (metrics_dict, extra) with the keys
    # 'F1', 'AUROC' and 'Loss' — confirm against its definition.
    metrics, _ = evaluate_anomaly(student_model, val_loader, device)
    print(f"Epoch {epoch+1}: Student F1={metrics['F1']:.4f}, AUROC={metrics['AUROC']:.4f}")

    # Mean training loss over the epoch's batches, plus the validation metrics.
    history['train_loss'].append(total_loss / len(train_loader))
    history['val_loss'].append(metrics['Loss'])
    history['val_f1'].append(metrics['F1'])
    history['val_auroc'].append(metrics['AUROC'])

# Persist the student weights after the final epoch and plot the curves.
save_model_checkpoint(student_model, "distilled_student_model")
plot_training_history(history, save_name="distillation_curves.png")

### Experiment 5: Continuous Pre-Training
Continue the self-supervised task (next-patch prediction) on the new domain data, then fine-tune the anomaly head.

In [None]:
print("Initializing Continuous Pre-Training...")
# 1. Unsupervised Phase (Reconstruction / Next Token)
# Start from `backbone`, the module the pretrained checkpoint was loaded into;
# `base_model` is only ever constructed with random weights in this notebook.
# The transfer-learning cell froze `backbone`, so re-enable gradients on the copy.
cpt_base = copy.deepcopy(backbone)
cpt_base.requires_grad_(True)
optimizer = torch.optim.AdamW(cpt_base.parameters(), lr=CONFIG['lr'])
criterion_mse = nn.MSELoss()

print("Phase 1: Self-Supervised Update")
for epoch in range(2): # Short phase e.g. 2 epochs
    cpt_base.train()
    epoch_loss = 0.0
    num_batches = 0
    for x, _ in train_loader:
        x = x.to(device)
        # Simulate Next-Patch Prediction task:
        # Input: 0..N-1, Target: 1..N
        # x shape: [B, Context, Patch]
        inp = x[:, :-1, :]
        target = x[:, 1:, :]

        # A one-patch window leaves nothing to predict from.
        if inp.size(1) == 0: continue

        pred = cpt_base(inp)
        loss = criterion_mse(pred, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1
    # Report the mean over the epoch rather than whichever batch happened to be
    # last; max(..., 1) keeps the print safe if every batch was skipped.
    print(f"CPT Epoch {epoch+1} MSE: {epoch_loss / max(num_batches, 1):.4f}")

# 2. Fine-Tune Head for Anomaly
print("Phase 2: Anomaly Head Tuning")
# Move the model, including the freshly created classifier head, to the device.
cpt_model = TimesFMAnomaly(cpt_base, CONFIG['d_model']).to(device)
# NOTE(review): `train_experiment` is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
cpt_trained = train_experiment(cpt_model, "Continuous Pre-Training + Fine-Tuning")

### Final Evaluation on Test Set
We evaluate the best performing model (example: LoRA or Full FT) on the held-out Test Set (2019).

In [None]:
print("--- TEST SET EVALUATION (LoRA Model) ---")
# Using Lora Model as example
# NOTE(review): `evaluate_anomaly` is not defined in the visible part of this
# notebook; it is used here as returning (metrics_dict, (probs, targets)) with the
# keys 'F1', 'AUROC' and 'AUPRC'. This evaluates `lora_model`, not `lora_trained`
# (the value returned by train_experiment) — confirm which object holds the
# trained weights.
metrics, (probs, targets) = evaluate_anomaly(lora_model, test_loader, device)
print(f"Test Set Results: F1={metrics['F1']:.4f}, AUROC={metrics['AUROC']:.4f}, AUPRC={metrics['AUPRC']:.4f}")

# Plot ROC and PR Curves for Test Set
plot_roc_pr_curve(probs, targets, save_name="final_test_roc_pr.png")