In [12]:
# --- 1. Imports ---
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
from tqdm import tqdm

# --- 2. Utility functions ---
def load(fn):
    with open(fn, 'r', encoding='utf-8') as f:
        return json.load(f)

def process_date_features(articles):
    years = np.array([a['year'] for a in articles]).reshape(-1, 1)
    months = np.array([a['month'] for a in articles]).reshape(-1, 1)
    day_of_week = np.array([a['day_of_week'] for a in articles])
    hour = np.array([a['hour'] for a in articles])

    year_scaler = StandardScaler()
    month_scaler = StandardScaler()
    years_scaled = year_scaler.fit_transform(years)
    months_scaled = month_scaler.fit_transform(months)

    day_sin = np.sin(2 * np.pi * day_of_week / 7)
    day_cos = np.cos(2 * np.pi * day_of_week / 7)
    hour_sin = np.sin(2 * np.pi * hour / 24)
    hour_cos = np.cos(2 * np.pi * hour / 24)

    date_feats = np.concatenate([years_scaled, months_scaled, day_sin[:, None], day_cos[:, None], hour_sin[:, None], hour_cos[:, None]], axis=1)
    return date_feats

def extract_topics_from_url(url):
    parts = url.split('/')
    topic = parts[3] if len(parts) > 3 else 'none'
    subtopic = parts[4] if len(parts) > 4 else 'none'
    return topic, subtopic

# --- 3. Dataset and Model ---
class NewsDataset(Dataset):
    def __init__(self, bert_vectors, topic_ids, subtopic_ids, date_feats, targets):
        self.X = torch.tensor(bert_vectors, dtype=torch.float32)
        self.topic_ids = torch.tensor(topic_ids, dtype=torch.long)
        self.subtopic_ids = torch.tensor(subtopic_ids, dtype=torch.long)
        self.date_feats = torch.tensor(date_feats, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.topic_ids[idx], self.subtopic_ids[idx], self.date_feats[idx], self.y[idx]

class MLPWithEmbeddings(nn.Module):
    def __init__(self, input_dim, num_topics, num_subtopics):
        super().__init__()
        self.topic_embedding = nn.Embedding(num_topics, 16)
        self.subtopic_embedding = nn.Embedding(num_subtopics, 24)

        self.model = nn.Sequential(
            nn.Linear(input_dim + 16 + 24 + 6, 1024),
            nn.BatchNorm1d(1024),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Linear(128, 1),
            nn.Softplus()
        )

    def forward(self, x_text, topic_ids, subtopic_ids, date_feats):
        topic_embed = self.topic_embedding(topic_ids)
        subtopic_embed = self.subtopic_embedding(subtopic_ids)
        x = torch.cat([x_text, topic_embed, subtopic_embed, date_feats], dim=1)
        return self.model(x)

# --- 4. Main Model Trainer ---
class RTVSloBERT:
    def __init__(self, batch_size=64, epochs=10, learning_rate=1e-3, l2_lambda=1e-4, eval_split=0.2):
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.l2_lambda = l2_lambda
        self.eval_split = eval_split

    def fit(self, train_data):
        for a in train_data:
            dt = pd.to_datetime(a['date'])
            a['year'] = dt.year
            a['month'] = dt.month
            a['day_of_week'] = dt.weekday()
            a['hour'] = dt.hour

        raw_targets = [a['n_comments'] for a in train_data]
        targets = [np.log1p(t) for t in raw_targets]
        bert_vectors = torch.load("train/sloberta_embeddings.pt", map_location="cpu").numpy()

        topics = []
        subtopics = []
        for a in train_data:
            topic, subtopic = extract_topics_from_url(a['url'])
            topics.append(topic)
            subtopics.append(subtopic if subtopic != "NO_SUBTOPIC" else "none")

        date_feats = process_date_features(train_data)

        self.topic_encoder = LabelEncoder().fit(topics)
        self.subtopic_encoder = LabelEncoder().fit(subtopics)

        topic_ids = self.topic_encoder.transform(topics)
        subtopic_ids = self.subtopic_encoder.transform(subtopics)

        X_train, X_val, topic_train, topic_val, subtopic_train, subtopic_val, date_train, date_val, y_train, y_val = train_test_split(
            bert_vectors, topic_ids, subtopic_ids, date_feats, targets, test_size=self.eval_split, random_state=42
        )

        train_dataset = NewsDataset(X_train, topic_train, subtopic_train, date_train, y_train)
        val_dataset = NewsDataset(X_val, topic_val, subtopic_val, date_val, y_val)

        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=self.batch_size)

        input_dim = bert_vectors.shape[1]
        self.model = MLPWithEmbeddings(input_dim, len(self.topic_encoder.classes_), len(self.subtopic_encoder.classes_))

        self._train(self.model, train_loader, val_loader)

    def _train(self, model, train_loader, val_loader):
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate, weight_decay=self.l2_lambda)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        criterion = nn.HuberLoss(delta=5.0)

        best_val_mae = float('inf')
        best_model_state = None

        for epoch in range(self.epochs):
            model.train()
            train_mae_list = []
            for x_batch, topic_ids, subtopic_ids, date_feats, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(x_batch, topic_ids, subtopic_ids, date_feats).squeeze()
                loss = criterion(y_pred, y_batch)
                loss.backward()
                optimizer.step()

                y_pred_clamped = torch.clamp(y_pred, -10, 10)
                y_batch_clamped = torch.clamp(y_batch, -10, 10)
                mae = torch.mean(torch.abs(torch.expm1(y_pred_clamped) - torch.expm1(y_batch_clamped))).item()
                train_mae_list.append(mae)

            model.eval()
            val_mae_list = []
            with torch.no_grad():
                for x_batch, topic_ids, subtopic_ids, date_feats, y_batch in val_loader:
                    y_pred = model(x_batch, topic_ids, subtopic_ids, date_feats).squeeze()
                    y_pred_clamped = torch.clamp(y_pred, -10, 10)
                    y_batch_clamped = torch.clamp(y_batch, -10, 10)
                    mae = torch.mean(torch.abs(torch.expm1(y_pred_clamped) - torch.expm1(y_batch_clamped))).item()
                    val_mae_list.append(mae)

            train_mae = np.mean(train_mae_list)
            val_mae = np.mean(val_mae_list)
            scheduler.step(val_mae)

            if val_mae < best_val_mae:
                best_val_mae = val_mae
                best_model_state = model.state_dict()

            print(f"Epoch {epoch+1} - Train MAE: {train_mae:.2f} | Val MAE: {val_mae:.2f}")

        if best_model_state is not None:
            model.load_state_dict(best_model_state)

    def predict(self, test_data, bert_vectors):
        for a in test_data:
            dt = pd.to_datetime(a['date'])
            a['year'] = dt.year
            a['month'] = dt.month
            a['day_of_week'] = dt.weekday()
            a['hour'] = dt.hour

        topics = []
        subtopics = []
        for a in test_data:
            topic, subtopic = extract_topics_from_url(a['url'])
            topics.append(topic)
            subtopics.append(subtopic if subtopic != "NO_SUBTOPIC" else "none")

        date_feats = process_date_features(test_data)

        topic_ids = [self.topic_encoder.classes_[0] if t not in self.topic_encoder.classes_ else t for t in topics]
        subtopic_ids = [self.subtopic_encoder.classes_[0] if s not in self.subtopic_encoder.classes_ else s for s in subtopics]

        topic_ids = self.topic_encoder.transform(topic_ids)
        subtopic_ids = self.subtopic_encoder.transform(subtopic_ids)

        X = torch.tensor(bert_vectors, dtype=torch.float32)
        topic_ids = torch.tensor(topic_ids, dtype=torch.long)
        subtopic_ids = torch.tensor(subtopic_ids, dtype=torch.long)
        date_feats = torch.tensor(date_feats, dtype=torch.float32)

        self.model.eval()
        with torch.no_grad():
            preds = self.model(X, topic_ids, subtopic_ids, date_feats).squeeze().numpy()
            return np.clip(np.expm1(preds), 0, None)

# --- 5. Main ---
if __name__ == '__main__':
    train = load("../data/rtvslo_train.json")
    validation = load("../data/rtvslo_validation.json")
    test = load("../data/rtvslo_test.json")

    m = RTVSloBERT(
        eval_split=0.05,
        batch_size=170,
        epochs=50,
        learning_rate=1e-4,
        l2_lambda=1e-3
    )

    m.fit(train)

    bert_vectors_validation = torch.load("validation/sloberta_embeddings_val.pt", map_location="cpu").numpy()
    p = m.predict(validation, bert_vectors_validation)
    np.savetxt("predictions_val.txt", p, fmt="%.4f")

    bert_vectors_test = torch.load("test_final/sloberta_embeddings_final.pt", map_location="cpu").numpy()
    p = m.predict(test, bert_vectors_test)
    np.savetxt("predictions_test.txt", p, fmt="%.4f")


  bert_vectors = torch.load("train/sloberta_embeddings.pt", map_location="cpu").numpy()


Epoch 1 - Train MAE: 30.59 | Val MAE: 26.73
Epoch 2 - Train MAE: 27.59 | Val MAE: 26.55
Epoch 3 - Train MAE: 26.77 | Val MAE: 25.23
Epoch 4 - Train MAE: 26.33 | Val MAE: 24.75
Epoch 5 - Train MAE: 25.67 | Val MAE: 24.91
Epoch 6 - Train MAE: 25.43 | Val MAE: 24.48
Epoch 7 - Train MAE: 25.01 | Val MAE: 24.37
Epoch 8 - Train MAE: 25.01 | Val MAE: 24.12
Epoch 9 - Train MAE: 24.52 | Val MAE: 23.98
Epoch 10 - Train MAE: 24.36 | Val MAE: 24.21
Epoch 11 - Train MAE: 24.18 | Val MAE: 23.57
Epoch 12 - Train MAE: 24.04 | Val MAE: 23.66
Epoch 13 - Train MAE: 23.88 | Val MAE: 23.30
Epoch 14 - Train MAE: 23.66 | Val MAE: 23.54
Epoch 15 - Train MAE: 23.56 | Val MAE: 23.95
Epoch 16 - Train MAE: 23.34 | Val MAE: 23.34
Epoch 17 - Train MAE: 23.27 | Val MAE: 23.48
Epoch 18 - Train MAE: 23.26 | Val MAE: 23.48
Epoch 19 - Train MAE: 22.99 | Val MAE: 23.24
Epoch 20 - Train MAE: 22.90 | Val MAE: 23.26
Epoch 21 - Train MAE: 22.82 | Val MAE: 23.78
Epoch 22 - Train MAE: 22.80 | Val MAE: 23.25
Epoch 23 - Train MA

  bert_vectors_validation = torch.load("validation/sloberta_embeddings_val.pt", map_location="cpu").numpy()
  bert_vectors_test = torch.load("test_final/sloberta_embeddings_final.pt", map_location="cpu").numpy()


In [74]:
import numpy as np
import json
import matplotlib.pyplot as plt

# --- 1. Load ---
preds = np.loadtxt("final_predictions_val_postprocessed.txt")

# Assuming you have the true labels:
with open("../data/rtvslo_validation.json", "r", encoding="utf-8") as f:
    val_articles = json.load(f)

y_true = np.array([a['n_comments'] for a in val_articles], dtype=np.float32)

# --- 2. Quick sanity check ---
assert len(preds) == len(y_true), "Length mismatch between preds and ground truth."

# --- 3. Define Buckets ---
bucket_edges = [0, 5, 20, 50, 100, 300, 1000, np.inf]
bucket_names = [
    "0â€“5",
    "5â€“20",
    "20â€“50",
    "50â€“100",
    "100â€“300",
    "300â€“1000",
    "1000+"
]

# --- 4. Bucket analysis ---
buckets = {name: [] for name in bucket_names}

for p, y in zip(preds, y_true):
    for i in range(len(bucket_edges)-1):
        if bucket_edges[i] <= p < bucket_edges[i+1]:
            buckets[bucket_names[i]].append((p, y))
            break

# --- 5. Report stats per bucket ---
print("\nðŸ“Š Error Analysis by Prediction Size Bucket:")
print(f"{'Bucket':<10} | {'N Samples':>10} | {'Mean Pred':>10} | {'Mean True':>10} | {'MAE':>8}")
print("-" * 60)

bucket_maes = []

for name, samples in buckets.items():
    if len(samples) == 0:
        continue
    preds_b, trues_b = zip(*samples)
    preds_b = np.array(preds_b)
    trues_b = np.array(trues_b)
    mae = np.mean(np.abs(preds_b - trues_b))
    bucket_maes.append(mae)
    print(f"{name:<10} | {len(samples):>10} | {np.mean(preds_b):>10.1f} | {np.mean(trues_b):>10.1f} | {mae:>8.2f}")



ðŸ“Š Error Analysis by Prediction Size Bucket:
Bucket     |  N Samples |  Mean Pred |  Mean True |      MAE
------------------------------------------------------------
0â€“5        |        731 |        1.8 |        4.5 |     3.82
5â€“20       |        585 |       10.9 |       18.1 |    13.07
20â€“50      |        404 |       31.8 |       42.3 |    29.45
50â€“100     |        239 |       72.2 |       84.8 |    50.52
100â€“300    |        233 |      166.7 |      192.2 |    86.05
300â€“1000   |         26 |      354.7 |      342.8 |   116.95


In [7]:
import numpy as np
import json
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# --- 1. Load ---
preds = np.loadtxt("final_predictions_test.txt")

# --- 2. Postprocessing: Quantile Smoothing ---

# Step 1: Clip extreme values
max_allowed = 3000
preds = np.clip(preds, 0, max_allowed)

# Step 2: Quantile smoothing
q_low = np.percentile(preds, 1)
q_high = np.percentile(preds, 99)

# Linearly squash very small and very large predictions
def quantile_smooth(x):
    if x < q_low:
        return x * 0.7  # shrink low values
    elif x > q_high:
        return q_high # shrink heavy tails
    else:
        return x

vectorized_smooth = np.vectorize(quantile_smooth)
smoothed_preds = vectorized_smooth(preds)

# --- 3. Save ---
np.savetxt("adj_predictions.txt", smoothed_preds, fmt="%.4f")

print("âœ… Saved adj_predictions.txt with quantile smoothing.")

âœ… Saved adj_predictions.txt with quantile smoothing.


In [8]:
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

# 1. Load your predictions
preds = np.loadtxt("adj_predictions.txt")

# 2. Load your true y-values from dataset_val.json
with open("../data/rtvslo_validation.json", "r", encoding="utf-8") as f:
    val_articles = json.load(f)

# 3. Extract true n_comments
y_true = np.array([a["n_comments"] for a in val_articles], dtype=np.float32)

# 4. Check lengths
assert len(preds) == len(y_true), f"Length mismatch: preds={len(preds)}, y_true={len(y_true)}"

# 5. Calculate MAE
mae = mean_absolute_error(y_true, preds)

print(f"ðŸ“Š MAE between predictions and true values: {mae:.2f}")

ðŸ“Š MAE between predictions and true values: 25.85


In [5]:
import numpy as np
import json
from sklearn.metrics import mean_absolute_error

# Load predictions
preds = np.loadtxt("predictions.txt")

# Load validation set
with open("../data/rtvslo_validation.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

# Extract true values
y_true = np.array([a["n_comments"] for a in val_data], dtype=np.float32)

# Filter to just articles with 0 true comments
zero_mask = (y_true == 0)
zero_preds = preds[zero_mask]
zero_true = y_true[zero_mask]

# Evaluate
mae_zero = mean_absolute_error(zero_true, zero_preds)
print(f"ðŸ“‰ MAE on zero-comment articles: {mae_zero:.2f} (should be close to 0)")
print(f"Average predicted comments for zero-articles: {np.mean(zero_preds):.2f}")


ðŸ“‰ MAE on zero-comment articles: 1.91 (should be close to 0)
Average predicted comments for zero-articles: 1.91


## ------------------

In [None]:
# --- 1. Load best model ---
model.load_state_dict(torch.load("best_model_topic.pt"))  # âš¡ Match training model name!
model.eval()

# --- 2. Prepare validation data ---
val_df = pd.read_json("../data/rtvslo_validation.json")
val_df = enrich_articles_with_time_features(val_df)

# Load sloberta validation embeddings
text_embeddings_val = torch.load("sloberta_embeddings_val.pt", weights_only=True)
targets_val = torch.load("targets_val.pt", weights_only=True)

# Process time features
time_features_val, _ = process_time_features(val_df, scaler=time_scaler)

# Encode topics/subtopics using the SAME encoders!
topic_ids_val = torch.tensor(topic_encoder.transform(val_df['topic']), dtype=torch.long)
subtopic_ids_val = torch.tensor(subtopic_encoder.transform(val_df['subtopic']), dtype=torch.long)

# --- 3. TensorDataset and DataLoader ---
val_dataset = TensorDataset(text_embeddings_val, time_features_val, topic_ids_val, subtopic_ids_val, targets_val)
val_loader = DataLoader(val_dataset, batch_size=128)

# --- 4. Evaluation loop ---
y_preds = []
y_trues = []

with torch.no_grad():
    for text, time, topic_id, subtopic_id, y in val_loader:
        text, time, topic_id, subtopic_id, y = text.to(device), time.to(device), topic_id.to(device), subtopic_id.to(device), y.to(device)
        y_pred = model(text, time, topic_id, subtopic_id)
        y_preds.append(torch.expm1(y_pred).cpu())  # ðŸ”¥ Reverse log1p
        y_trues.append(torch.expm1(y).cpu())

y_preds = torch.cat(y_preds).numpy()
y_trues = torch.cat(y_trues).numpy()

# --- 5. Metrics ---
mae = mean_absolute_error(y_trues, y_preds)
rmse = np.sqrt(mean_squared_error(y_trues, y_preds))
r2 = r2_score(y_trues, y_preds)

print("\nðŸ“Š Validation Results with Topics:")
print(f"  MAE : {mae:.2f}")
print(f"  RMSE: {rmse:.2f}")
print(f"  RÂ²  : {r2:.4f}")


ValueError: y contains previously unseen labels: 'dopolnitev-strategije-drzavnih-nalozb-cilj-invalidskih-podjetij-tudi-druzbeno-odgovorno-upravljanje'

### Test set output

In [48]:
import torch
import pandas as pd
import numpy as np

# 1. Load your saved model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LinearRegressionPredictor(
    input_dim_text=text_embeddings.shape[1],  # match your trained model
    input_dim_time=6,                         # if you use only days_since
    hidden_dim=128,
    dropout_prob=0.5
).to(device)
model.load_state_dict(torch.load("best_model_linear_log.pt", map_location=device))
model.eval()

# 2. Load new articles
df = pd.read_json("../data/rtvslo_test.json")  # or whatever your test set is

# 3. Preprocess time features
df['date'] = pd.to_datetime(df['date'])
    
# Extract raw features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.weekday  # Monday = 0
df['hour'] = df['date'].dt.hour

df['year_scaled'] = year_scaler.transform(df['year'].values.reshape(-1, 1))
df['month_scaled'] = month_scaler.transform(df['month'].values.reshape(-1, 1))

df['day_sin'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
df['day_cos'] = np.cos(2 * np.pi * df['day_of_week'] / 7)
df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Only using 'days_since' here
time_features_new = torch.tensor(
    df[['year_scaled', 'month_scaled', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos']].values,
    dtype=torch.float32
)

# 4. Load the corresponding SloBERTa embeddings for new articles
text_embeddings_new = torch.load("sloberta_embeddings_final.pt", weights_only=True)

# 5. Predict
model.eval()
preds = []

batch_size = 128
dataset = torch.utils.data.TensorDataset(text_embeddings_new, time_features_new)
loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
print(len(dataset))

with torch.no_grad():
    for text, time in loader:
        text, time = text.to(device), time.to(device)
        y_pred = model(text, time)
        y_pred = torch.expm1(y_pred)  # ðŸ”¥ reverse log1p to real counts
        preds.append(y_pred.cpu())

# Stack predictions
preds = torch.cat(preds).numpy()
print(len(preds))

# 6. Save predictions to .txt
np.savetxt("final_predictions.txt", preds, fmt="%.3f")  # or "%.0f" if you want integer predictions

print("âœ… Saved predictions to final_predictions.txt")

2218
2218
âœ… Saved predictions to final_predictions.txt
