In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# --- 1. Imports ---
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
from tqdm import tqdm

# --- 2. Utility functions ---
def load(fn):
    with open(fn, 'r', encoding='utf-8') as f:
        return json.load(f)

def enrich_articles_with_time_features(articles):
    for a in articles:
        dt = pd.to_datetime(a['date'])
        a['year'] = dt.year
        a['month'] = dt.month
        a['day_of_week'] = dt.weekday()
        a['hour'] = dt.hour
    return articles

def process_date_features(articles):
    years = np.array([a['year'] for a in articles]).reshape(-1, 1)
    months = np.array([a['month'] for a in articles]).reshape(-1, 1)
    day_of_week = np.array([a['day_of_week'] for a in articles])
    hour = np.array([a['hour'] for a in articles])

    year_scaler = StandardScaler()
    month_scaler = StandardScaler()
    years_scaled = year_scaler.fit_transform(years)
    months_scaled = month_scaler.fit_transform(months)

    day_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_cos = np.cos(2 * np.pi * day_of_week / 7)
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)

    date_feats = np.concatenate([years_scaled, months_scaled, day_sin[:, None], day_cos[:, None], hour_sin[:, None], hour_cos[:, None]], axis=1)
    return date_feats

def extract_topics_from_url(url):
    parts = url.split('/')
    topic = parts[3] if len(parts) > 3 else 'none'
    subtopic = parts[4] if len(parts) > 4 else 'none'
    return topic, subtopic

# --- 3. Dataset ---
class NewsDataset(Dataset):
    def __init__(self, text_embed, topic_ids, subtopic_ids, date_feats, targets):
        self.text_embed = text_embed
        self.topic_ids = topic_ids
        self.subtopic_ids = subtopic_ids
        self.date_feats = date_feats
        self.targets = targets

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return (
            self.text_embed[idx],
            self.topic_ids[idx],
            self.subtopic_ids[idx],
            self.date_feats[idx],
            self.targets[idx]
        )

# --- 4. Model ---
class NegativeBinomialLoss(nn.Module):
    def forward(self, y_true, mu, alpha):
        eps = 1e-8
        mu = torch.clamp(mu, min=eps)
        alpha = torch.clamp(alpha, min=eps)
        r = 1.0 / (alpha + eps)
        p = r / (r + mu + eps)

        log_prob = (
            torch.lgamma(y_true + r)
            - torch.lgamma(r)
            - torch.lgamma(y_true + 1)
            + r * torch.log(p + eps)
            + y_true * torch.log(1 - p + eps)
        )
        return -torch.mean(log_prob)

class NBRegressionWithTopics(nn.Module):
    def __init__(self, input_dim_text, num_topics, num_subtopics, time_dim=6, hidden_dim=256):
        super().__init__()
        self.topic_embed = nn.Embedding(num_topics, 16)
        self.subtopic_embed = nn.Embedding(num_subtopics, 24)

        total_input_dim = input_dim_text + 16 + 24 + time_dim

        self.fc1 = nn.Linear(total_input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.mu_head = nn.Linear(hidden_dim, 1)
        self.alpha_head = nn.Linear(hidden_dim, 1)

    def forward(self, text_embed, topic_ids, subtopic_ids, date_feats):
        topic_emb = self.topic_embed(topic_ids)
        subtopic_emb = self.subtopic_embed(subtopic_ids)
        x = torch.cat([text_embed, topic_emb, subtopic_emb, date_feats], dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(F.relu(self.fc2(x)))
        mu = torch.exp(self.mu_head(x))
        alpha = F.softplus(self.alpha_head(x))
        return mu.squeeze(1), alpha.squeeze(1)

# --- 5. Main Trainer Class ---
class RTVSloNB:
    def __init__(self, batch_size=64, epochs=50, learning_rate=1e-3, eval_split=0.2):
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.eval_split = eval_split

    def fit(self, train_data):
        train_data = enrich_articles_with_time_features(train_data)

        raw_targets = [a['n_comments'] for a in train_data]
        targets = torch.tensor(raw_targets, dtype=torch.float32)

        bert_vectors = torch.load("sloberta_embeddings.pt", map_location="cpu").to(torch.float32)

        topics = []
        subtopics = []
        for a in train_data:
            topic, subtopic = extract_topics_from_url(a['url'])
            topics.append(topic)
            subtopics.append(subtopic if subtopic != "NO_SUBTOPIC" else "none")

        date_feats = torch.tensor(process_date_features(train_data), dtype=torch.float32)

        self.topic_encoder = LabelEncoder().fit(topics)
        self.subtopic_encoder = LabelEncoder().fit(subtopics)

        topic_ids = torch.tensor(self.topic_encoder.transform(topics), dtype=torch.long)
        subtopic_ids = torch.tensor(self.subtopic_encoder.transform(subtopics), dtype=torch.long)

        # >>> FIX: split numpy arrays
        X_np = bert_vectors.cpu().numpy()
        topic_np = topic_ids.cpu().numpy()
        subtopic_np = subtopic_ids.cpu().numpy()
        date_np = date_feats.cpu().numpy()
        target_np = targets.cpu().numpy()

        X_train, X_val, topic_train, topic_val, subtopic_train, subtopic_val, date_train, date_val, y_train, y_val = train_test_split(
            X_np, topic_np, subtopic_np, date_np, target_np, test_size=self.eval_split, random_state=42
        )

        # >>> FIX: rewrap into torch tensors
        X_train = torch.tensor(X_train, dtype=torch.float32)
        X_val = torch.tensor(X_val, dtype=torch.float32)
        topic_train = torch.tensor(topic_train, dtype=torch.long)
        topic_val = torch.tensor(topic_val, dtype=torch.long)
        subtopic_train = torch.tensor(subtopic_train, dtype=torch.long)
        subtopic_val = torch.tensor(subtopic_val, dtype=torch.long)
        date_train = torch.tensor(date_train, dtype=torch.float32)
        date_val = torch.tensor(date_val, dtype=torch.float32)
        y_train = torch.tensor(y_train, dtype=torch.float32)
        y_val = torch.tensor(y_val, dtype=torch.float32)

        train_dataset = NewsDataset(X_train, topic_train, subtopic_train, date_train, y_train)
        val_dataset = NewsDataset(X_val, topic_val, subtopic_val, date_val, y_val)

        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size)

        input_dim = bert_vectors.shape[1]
        self.model = NBRegressionWithTopics(input_dim, len(self.topic_encoder.classes_), len(self.subtopic_encoder.classes_))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self._train(self.model, train_loader, val_loader)


    def _train(self, model, train_loader, val_loader):
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate)
        criterion = NegativeBinomialLoss()

        best_val_mae = float('inf')
        best_model_state = None

        for epoch in range(self.epochs):
            model.train()
            for text, topic_id, subtopic_id, time_feat, y in train_loader:
                text, topic_id, subtopic_id, time_feat, y = text.to(self.device), topic_id.to(self.device), subtopic_id.to(self.device), time_feat.to(self.device), y.to(self.device)
                optimizer.zero_grad()
                mu, alpha = model(text, topic_id, subtopic_id, time_feat)
                loss = criterion(y, mu, alpha)
                loss.backward()
                optimizer.step()

            model.eval()
            val_preds = []
            val_targets = []
            with torch.no_grad():
                for text, topic_id, subtopic_id, time_feat, y in val_loader:
                    text, topic_id, subtopic_id, time_feat, y = text.to(self.device), topic_id.to(self.device), subtopic_id.to(self.device), time_feat.to(self.device), y.to(self.device)
                    mu, alpha = model(text, topic_id, subtopic_id, time_feat)
                    val_preds.append(mu.cpu())
                    val_targets.append(y.cpu())

            val_preds = torch.cat(val_preds).numpy()
            val_targets = torch.cat(val_targets).numpy()
            val_mae = np.mean(np.abs(val_preds - val_targets))

            if val_mae < best_val_mae:
                best_val_mae = val_mae
                best_model_state = model.state_dict()

            print(f"Epoch {epoch+1} - Val MAE: {val_mae:.2f}")

        if best_model_state is not None:
            model.load_state_dict(best_model_state)

    def predict(self, test_data):
        test_data = enrich_articles_with_time_features(test_data)

        bert_vectors = torch.load("sloberta_embeddings_val.pt", map_location="cpu").to(torch.float32)

        topics = []
        subtopics = []
        for a in test_data:
            topic, subtopic = extract_topics_from_url(a['url'])
            topics.append(topic)
            subtopics.append(subtopic if subtopic != "NO_SUBTOPIC" else "none")

        date_feats = process_date_features(test_data)

        topic_ids = [self.topic_encoder.classes_[0] if t not in self.topic_encoder.classes_ else t for t in topics]
        subtopic_ids = [self.subtopic_encoder.classes_[0] if s not in self.subtopic_encoder.classes_ else s for s in subtopics]

        topic_ids = self.topic_encoder.transform(topic_ids)
        subtopic_ids = self.subtopic_encoder.transform(subtopic_ids)

        X = torch.tensor(bert_vectors, dtype=torch.float32)
        topic_ids = torch.tensor(topic_ids, dtype=torch.long)
        subtopic_ids = torch.tensor(subtopic_ids, dtype=torch.long)
        date_feats = torch.tensor(date_feats, dtype=torch.float32)

        self.model.eval()
        with torch.no_grad():
            X, topic_ids, subtopic_ids, date_feats = X.to(self.device), topic_ids.to(self.device), subtopic_ids.to(self.device), date_feats.to(self.device)
            preds, _ = self.model(X, topic_ids, subtopic_ids, date_feats)
            return np.clip(preds.cpu().numpy(), 0, None)

# --- 6. Main ---
if __name__ == '__main__':
    train = load("../data/rtvslo_train.json")
    test = load("../data/rtvslo_test.json")

    m = RTVSloNB(
        eval_split=0.05,
        batch_size=170,
        epochs=50,
        learning_rate=1e-4
    )

    m.fit(train)
    p = m.predict(test)
    np.savetxt("final_predictions.txt", p, fmt="%.4f")


Epoch 1 - Val MAE: 38.44
Epoch 2 - Val MAE: 33.99
Epoch 3 - Val MAE: 32.05
Epoch 4 - Val MAE: 31.25
Epoch 5 - Val MAE: 30.24
Epoch 6 - Val MAE: 29.07
Epoch 7 - Val MAE: 28.01
Epoch 8 - Val MAE: 27.75
Epoch 9 - Val MAE: 27.33
Epoch 10 - Val MAE: 28.09
Epoch 11 - Val MAE: 27.54
Epoch 12 - Val MAE: 27.62
Epoch 13 - Val MAE: 27.97
Epoch 14 - Val MAE: 28.82
Epoch 15 - Val MAE: 27.67
Epoch 16 - Val MAE: 26.39
Epoch 17 - Val MAE: 26.93
Epoch 18 - Val MAE: 26.42
Epoch 19 - Val MAE: 26.76
Epoch 20 - Val MAE: 26.46
Epoch 21 - Val MAE: 25.74
Epoch 22 - Val MAE: 25.34
Epoch 23 - Val MAE: 26.18
Epoch 24 - Val MAE: 26.58
Epoch 25 - Val MAE: 26.16
Epoch 26 - Val MAE: 26.76
Epoch 27 - Val MAE: 25.56
Epoch 28 - Val MAE: 26.41
Epoch 29 - Val MAE: 25.09
Epoch 30 - Val MAE: 24.92
Epoch 31 - Val MAE: 24.80
Epoch 32 - Val MAE: 25.73
Epoch 33 - Val MAE: 24.97
Epoch 34 - Val MAE: 25.77
Epoch 35 - Val MAE: 27.84
Epoch 36 - Val MAE: 26.24
Epoch 37 - Val MAE: 23.48
Epoch 38 - Val MAE: 23.79
Epoch 39 - Val MAE: 2

  X = torch.tensor(bert_vectors, dtype=torch.float32)


In [3]:
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

# 1. Load your predictions
preds = np.loadtxt("predictions.txt")

# 2. Load your true y-values from dataset_val.json
with open("../data/rtvslo_validation.json", "r", encoding="utf-8") as f:
    val_articles = json.load(f)

# 3. Extract true n_comments
y_true = np.array([a["n_comments"] for a in val_articles], dtype=np.float32)

# 4. Check lengths
assert len(preds) == len(y_true), f"Length mismatch: preds={len(preds)}, y_true={len(y_true)}"

# 5. Calculate MAE
mae = mean_absolute_error(y_true, preds)

print(f"ðŸ“Š MAE between predictions and true values: {mae:.2f}")


ðŸ“Š MAE between predictions and true values: 26.36


In [16]:
# Load your new sloberta embeddings and time features
text_embeddings_test = torch.load("sloberta_embeddings_final.pt", weights_only=True, map_location='cpu')
# 3. Preprocess time features
df = pd.read_json("../data/rtvslo_test.json")  # or whatever your test set is

# 3. Preprocess time features
df['date'] = pd.to_datetime(df['date'])
    
# Extract raw features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.weekday  # Monday = 0
df['hour'] = df['date'].dt.hour

df['year_scaled'] = year_scaler.transform(df['year'].values.reshape(-1, 1))
df['month_scaled'] = month_scaler.transform(df['month'].values.reshape(-1, 1))

df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Only using 'days_since' here
time_features_test = torch.tensor(
    df[['year_scaled', 'month_scaled', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos']].values,
    dtype=torch.float32
)

# Move to TensorDataset and Loader
test_dataset = TensorDataset(text_embeddings_test, time_features_test)
test_loader = DataLoader(test_dataset, batch_size=128)

# Predict
model.eval()
preds = []

with torch.no_grad():
    for text, time in test_loader:
        text, time = text.to(device), time.to(device)
        mu, _ = model(text, time)
        preds.append(mu.cpu())

# Stack predictions
final_preds = torch.cat(preds).numpy()

# Save to txt
np.savetxt("final_predictions_nb.txt", final_preds, fmt="%.3f")

print("âœ… Final predictions saved to final_predictions_nb.txt")


âœ… Final predictions saved to final_predictions_nb.txt
