In [1]:
# --- 1. Imports ---
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import os

# --- 2. Utility functions ---
def load(fn):
    with open(fn, 'r', encoding='utf-8') as f:
        return json.load(f)

def process_date_features(articles):
    years = np.array([a['year'] for a in articles]).reshape(-1, 1)
    months = np.array([a['month'] for a in articles]).reshape(-1, 1)
    day_of_week = np.array([a['day_of_week'] for a in articles])
    hour = np.array([a['hour'] for a in articles])

    year_scaler = StandardScaler()
    month_scaler = StandardScaler()
    years_scaled = year_scaler.fit_transform(years)
    months_scaled = month_scaler.fit_transform(months)

    day_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_cos = np.cos(2 * np.pi * day_of_week / 7)
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)

    date_feats = np.concatenate([years_scaled, months_scaled, day_sin[:, None], day_cos[:, None], hour_sin[:, None], hour_cos[:, None]], axis=1)
    return date_feats

def extract_topics_from_url(url):
    parts = url.split('/')
    topic = parts[3] if len(parts) > 3 else 'none'
    subtopic = parts[4] if len(parts) > 4 else 'none'
    return topic, subtopic

# --- 3. Dataset ---
class SequenceDataset(Dataset):
    def __init__(self, features, targets, seq_len=5):
        self.features = features
        self.targets = targets
        self.seq_len = seq_len

    def __len__(self):
        return len(self.features) - self.seq_len

    def __getitem__(self, idx):
        x = self.features[idx:idx+self.seq_len]
        y = self.targets[idx+self.seq_len-1]
        return x, y

# --- 4. Model ---
class LSTMRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Softplus()
        )

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = h_n[-1]
        return self.fc(out).squeeze()

# --- 5. Trainer ---
class SequentialRTVSloModel:
    def __init__(self, batch_size=128, seq_len=5, epochs=70, learning_rate=1e-4):
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def prepare_features(self, articles, bert_vectors, fit_encoders=False):
        topics = []
        subtopics = []
        for a in articles:
            topic, subtopic = extract_topics_from_url(a['url'])
            topics.append(topic)
            subtopics.append(subtopic if subtopic != "NO_SUBTOPIC" else "none")

        if fit_encoders:
            self.topic_encoder = LabelEncoder().fit(topics)
            self.subtopic_encoder = LabelEncoder().fit(subtopics)

        topics = [t if t in self.topic_encoder.classes_ else self.topic_encoder.classes_[0] for t in topics]
        subtopics = [s if s in self.subtopic_encoder.classes_ else self.subtopic_encoder.classes_[0] for s in subtopics]

        topic_ids = self.topic_encoder.transform(topics)
        subtopic_ids = self.subtopic_encoder.transform(subtopics)

        date_feats = process_date_features(articles)

        features = np.hstack([
            bert_vectors,
            topic_ids[:, None],
            subtopic_ids[:, None],
            date_feats
        ])

        return features

    def fit(self, train_articles, bert_path):
        for a in train_articles:
            dt = pd.to_datetime(a['date'])
            a['year'] = dt.year
            a['month'] = dt.month
            a['day_of_week'] = dt.weekday()
            a['hour'] = dt.hour

        bert_vectors = torch.load(bert_path, map_location="cpu").numpy()

        features = self.prepare_features(train_articles, bert_vectors, fit_encoders=True)
        targets = np.array([np.log1p(a['n_comments']) for a in train_articles], dtype=np.float32)

        dataset = SequenceDataset(torch.tensor(features, dtype=torch.float32), torch.tensor(targets, dtype=torch.float32), seq_len=self.seq_len)
        train_loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        input_dim = features.shape[1]
        self.model = LSTMRegressor(input_dim=input_dim).to(self.device)

        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        criterion = nn.HuberLoss(delta=5.0)

        best_loss = float('inf')
        patience = 10
        counter = 0

        for epoch in range(self.epochs):
            self.model.train()
            losses = []

            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(self.device)
                y_batch = y_batch.to(self.device)

                optimizer.zero_grad()
                preds = self.model(x_batch)
                loss = criterion(preds, y_batch)
                loss.backward()
                optimizer.step()

                losses.append(loss.item())

            epoch_loss = np.mean(losses)
            print(f"Epoch {epoch+1} - Train Loss: {epoch_loss:.4f}")

            # Early stopping manually
            if epoch_loss < best_loss - 1e-4:
                best_loss = epoch_loss
                counter = 0
                torch.save(self.model.state_dict(), "best_seq_model.pt")
            else:
                counter += 1
                if counter >= patience:
                    print("Early stopping triggered.")
                    break

    def predict(self, articles, bert_path):
        for a in articles:
            dt = pd.to_datetime(a['date'])
            a['year'] = dt.year
            a['month'] = dt.month
            a['day_of_week'] = dt.weekday()
            a['hour'] = dt.hour

        bert_vectors = torch.load(bert_path, map_location="cpu").numpy()
        features = self.prepare_features(articles, bert_vectors, fit_encoders=False)

        dataset = SequenceDataset(torch.tensor(features, dtype=torch.float32), torch.zeros(features.shape[0]), seq_len=self.seq_len)
        loader = DataLoader(dataset, batch_size=self.batch_size)

        self.model.load_state_dict(torch.load("best_seq_model.pt"))
        self.model.eval()

        preds = []
        with torch.no_grad():
            for x_batch, _ in tqdm(loader):
                x_batch = x_batch.to(self.device)
                y_pred = self.model(x_batch)
                preds.append(y_pred)

        preds = torch.cat(preds).cpu().numpy()
        preds = np.expm1(np.clip(preds, 0, None))
        return preds

# --- 6. Main ---
if __name__ == "__main__":
    print("ðŸ”µ Loading data...")
    train_articles = load("rtvslo_train.json")
    val_articles = load("rtvslo_validation.json")

    model = SequentialRTVSloModel(batch_size=128, seq_len=5, epochs=70, learning_rate=1e-4)
    model.fit(train_articles, bert_path="sloberta_embeddings.pt")

    print("ðŸ”µ Predicting...")
    preds_val = model.predict(val_articles, bert_path="sloberta_embeddings_val.pt")
    np.savetxt("predictions_val_seq.txt", preds_val, fmt="%.4f")
    print("âœ… Done!")


ðŸ”µ Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'rtvslo_train.json'

In [28]:
import numpy as np

# Load base semantic model predictions
preds_semantic = np.loadtxt("predictions_val.txt")  # length 2218

# Load sequential (LSTM) predictions
preds_seq = np.loadtxt("predictions_val_seq.txt")            # length 2213 (because sequence length 5)

# Trim semantic predictions to match LSTM output
preds_semantic = preds_semantic[5:]  # skip first 5 articles for semantic preds too

# Sanity check
assert len(preds_semantic) == len(preds_seq), f"Mismatch: semantic={len(preds_semantic)}, seq={len(preds_seq)}"

# Combine predictions
alpha = 0  # you can tune this later
combined_preds = (1 - alpha) * preds_semantic + alpha * preds_seq

# Save to file
np.savetxt("predictions_val_combined.txt", combined_preds, fmt="%.4f")

print("âœ… Combined predictions saved to predictions_val_combined.txt!")


âœ… Combined predictions saved to predictions_val_combined.txt!


In [29]:
from sklearn.metrics import mean_absolute_error
import json

# Load ground truth
with open("../data/rtvslo_validation.json", "r", encoding="utf-8") as f:
    val_articles = json.load(f)

# Extract true n_comments
y_true = np.array([a["n_comments"] for a in val_articles], dtype=np.float32)
y_true = y_true[5:]  # because LSTM shifted output

# Load your combined predictions
combined_preds = np.loadtxt("predictions_val_combined.txt")

# Final check
assert len(combined_preds) == len(y_true)

# Compute MAE
mae = mean_absolute_error(y_true, combined_preds)
print(f"ðŸ“Š Combined MAE: {mae:.2f}")


ðŸ“Š Combined MAE: 26.84
