In [2]:
# --- 1. Imports ---
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import pandas as pd

# --- 2. Utility Functions ---
def load(fn):
    with open(fn, 'r', encoding='utf-8') as f:
        return json.load(f)

def extract_topics_from_url(url):
    parts = url.split('/')
    topic = parts[3] if len(parts) > 3 else 'none'
    subtopic = parts[4] if len(parts) > 4 else 'none'
    return topic, subtopic

def enrich_articles(articles):
    for a in articles:
        dt = pd.to_datetime(a['date'])
        a['year'] = dt.year
        a['month'] = dt.month
        a['day_of_week'] = dt.weekday()
        a['hour'] = dt.hour
    return articles

# --- 3. Dataset ---
class NewsDataset(Dataset):
    def __init__(self, X, topic_ids, subtopic_ids, hour_ids, weekday_ids, targets):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.topic_ids = torch.tensor(topic_ids, dtype=torch.long)
        self.subtopic_ids = torch.tensor(subtopic_ids, dtype=torch.long)
        self.hour_ids = torch.tensor(hour_ids, dtype=torch.long)
        self.weekday_ids = torch.tensor(weekday_ids, dtype=torch.long)
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return (
            self.X[idx],
            self.topic_ids[idx],
            self.subtopic_ids[idx],
            self.hour_ids[idx],
            self.weekday_ids[idx],
            self.targets[idx]
        )

# --- 4. Model ---
class MLPWithBottleneckAndTopics(nn.Module):
    def __init__(self, input_dim, num_topics, num_subtopics):
        super().__init__()

        # Bottleneck MLP on top of text embeddings
        self.bottleneck = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU()
        )

        # Embeddings
        self.topic_emb = nn.Embedding(num_topics, 16)
        self.subtopic_emb = nn.Embedding(num_subtopics, 24)
        self.hour_emb = nn.Embedding(24, 4)
        self.weekday_emb = nn.Embedding(7, 3)

        # Total input size
        total_input = 128 + 16 + 24 + 4 + 3  # bottleneck + topic + subtopic + hour + weekday

        # Final prediction MLP
        self.net = nn.Sequential(
            nn.Linear(total_input, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Softplus()
        )

    def forward(self, x, topic_id, subtopic_id, hour_id, weekday_id):
        x = self.bottleneck(x)  # ðŸ”¥ transform text embeddings

        topic_vec = self.topic_emb(topic_id)
        subtopic_vec = self.subtopic_emb(subtopic_id)
        hour_vec = self.hour_emb(hour_id)
        weekday_vec = self.weekday_emb(weekday_id)

        all_features = torch.cat([x, topic_vec, subtopic_vec, hour_vec, weekday_vec], dim=1)
        return self.net(all_features).squeeze()

# --- 5. Trainer ---
class RTVSloBottleneck:
    def __init__(self, batch_size=128, epochs=50, lr=1e-4, eval_split=0.05):
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.eval_split = eval_split

    def fit(self, articles, val_articles=None):
        articles = enrich_articles(articles)
        y = [a['n_comments'] for a in articles]

        bert = torch.load("sloberta_embeddings.pt", map_location="cpu").numpy()
        topics, subtopics = zip(*[extract_topics_from_url(a['url']) for a in articles])
        subtopics = [s if s != "NO_SUBTOPIC" else "none" for s in subtopics]

        self.topic_enc = LabelEncoder().fit(topics)
        self.subtopic_enc = LabelEncoder().fit(subtopics)
        topic_ids = self.topic_enc.transform(topics)
        subtopic_ids = self.subtopic_enc.transform(subtopics)

        hour_ids = np.array([a['hour'] for a in articles], dtype=np.int64)
        weekday_ids = np.array([a['day_of_week'] for a in articles], dtype=np.int64)

        X_train, X_val, topic_train, topic_val, subtopic_train, subtopic_val, hour_train, hour_val, weekday_train, weekday_val, y_train, y_val = train_test_split(
            bert, topic_ids, subtopic_ids, hour_ids, weekday_ids, y, test_size=self.eval_split, random_state=42
        )

        train_data = NewsDataset(X_train, topic_train, subtopic_train, hour_train, weekday_train, y_train)
        val_data = NewsDataset(X_val, topic_val, subtopic_val, hour_val, weekday_val, y_val)

        loader_train = DataLoader(train_data, batch_size=self.batch_size, shuffle=True)
        loader_val = DataLoader(val_data, batch_size=self.batch_size)

        self.model = MLPWithBottleneckAndTopics(X_train.shape[1], len(self.topic_enc.classes_), len(self.subtopic_enc.classes_))
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        criterion = nn.L1Loss()

        best_mae = float('inf')
        best_state = None

        for epoch in range(self.epochs):
            self.model.train()
            for batch in loader_train:
                x, t, st, h, wd, y = batch
                optimizer.zero_grad()
                pred = self.model(x, t, st, h, wd)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()

            self.model.eval()
            with torch.no_grad():
                val_preds, val_true = [], []
                for batch in loader_val:
                    x, t, st, h, wd, y = batch
                    pred = self.model(x, t, st, h, wd)
                    val_preds.append(pred)
                    val_true.append(y)
                val_preds = torch.cat(val_preds).numpy()
                val_true = torch.cat(val_true).numpy()
                val_mae = mean_absolute_error(val_true, val_preds)
                print(f"Epoch {epoch+1} - Val MAE: {val_mae:.2f}")

            if val_mae < best_mae:
                best_mae = val_mae
                best_state = self.model.state_dict()

        if best_state:
            self.model.load_state_dict(best_state)

    def predict(self, test_articles):
        test_articles = enrich_articles(test_articles)
        bert = torch.load("sloberta_embeddings_val.pt", map_location="cpu").numpy()

        topics, subtopics = zip(*[extract_topics_from_url(a['url']) for a in test_articles])
        subtopics = [s if s != "NO_SUBTOPIC" else "none" for s in subtopics]
        topic_ids = [self.topic_enc.classes_[0] if t not in self.topic_enc.classes_ else t for t in topics]
        subtopic_ids = [self.subtopic_enc.classes_[0] if s not in self.subtopic_enc.classes_ else s for s in subtopics]
        topic_ids = self.topic_enc.transform(topic_ids)
        subtopic_ids = self.subtopic_enc.transform(subtopic_ids)

        hour_ids = np.array([a['hour'] for a in test_articles], dtype=np.int64)
        weekday_ids = np.array([a['day_of_week'] for a in test_articles], dtype=np.int64)

        self.model.eval()
        with torch.no_grad():
            x = torch.tensor(bert, dtype=torch.float32)
            t = torch.tensor(topic_ids, dtype=torch.long)
            st = torch.tensor(subtopic_ids, dtype=torch.long)
            h = torch.tensor(hour_ids, dtype=torch.long)
            wd = torch.tensor(weekday_ids, dtype=torch.long)

            preds = self.model(x, t, st, h, wd).numpy()
            return preds

# --- 6. Main ---
if __name__ == '__main__':
    train = load("../data/rtvslo_train.json")
    test = load("../data/rtvslo_test.json")

    m = RTVSloBottleneck()
    m.fit(train)
    preds = m.predict(test)

    np.savetxt("predictions.txt", preds, fmt="%.4f")


Epoch 1 - Val MAE: 30.73
Epoch 2 - Val MAE: 28.48
Epoch 3 - Val MAE: 27.38
Epoch 4 - Val MAE: 26.75
Epoch 5 - Val MAE: 26.53
Epoch 6 - Val MAE: 26.55
Epoch 7 - Val MAE: 26.46
Epoch 8 - Val MAE: 25.62
Epoch 9 - Val MAE: 25.82
Epoch 10 - Val MAE: 25.46
Epoch 11 - Val MAE: 25.44
Epoch 12 - Val MAE: 25.38
Epoch 13 - Val MAE: 25.61
Epoch 14 - Val MAE: 25.47
Epoch 15 - Val MAE: 25.04
Epoch 16 - Val MAE: 24.92
Epoch 17 - Val MAE: 24.99
Epoch 18 - Val MAE: 25.05
Epoch 19 - Val MAE: 24.78
Epoch 20 - Val MAE: 24.77
Epoch 21 - Val MAE: 24.76
Epoch 22 - Val MAE: 24.61
Epoch 23 - Val MAE: 26.01
Epoch 24 - Val MAE: 24.55
Epoch 25 - Val MAE: 24.59
Epoch 26 - Val MAE: 24.52
Epoch 27 - Val MAE: 24.56
Epoch 28 - Val MAE: 24.37
Epoch 29 - Val MAE: 24.31
Epoch 30 - Val MAE: 24.45
Epoch 31 - Val MAE: 24.37
Epoch 32 - Val MAE: 24.08
Epoch 33 - Val MAE: 24.40
Epoch 34 - Val MAE: 24.22
Epoch 35 - Val MAE: 24.13
Epoch 36 - Val MAE: 24.11
Epoch 37 - Val MAE: 24.06
Epoch 38 - Val MAE: 24.47
Epoch 39 - Val MAE: 2

In [3]:
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

# 1. Load your predictions
preds = np.loadtxt("predictions.txt")

# 2. Load your true y-values from dataset_val.json
with open("../data/rtvslo_validation.json", "r", encoding="utf-8") as f:
    val_articles = json.load(f)

# 3. Extract true n_comments
y_true = np.array([a["n_comments"] for a in val_articles], dtype=np.float32)

# 4. Check lengths
assert len(preds) == len(y_true), f"Length mismatch: preds={len(preds)}, y_true={len(y_true)}"

# 5. Calculate MAE
mae = mean_absolute_error(y_true, preds)

print(f"ðŸ“Š MAE between predictions and true values: {mae:.2f}")


ðŸ“Š MAE between predictions and true values: 28.77
