In [1]:
!pip install gdown einops

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m635.1 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: einops
Successfully installed einops-0.8.0


In [2]:
!gdown 1viFKqtYTtTiP9_EdBXVpCmWbNmxDiXWG

Downloading...
From (original): https://drive.google.com/uc?id=1viFKqtYTtTiP9_EdBXVpCmWbNmxDiXWG
From (redirected): https://drive.google.com/uc?id=1viFKqtYTtTiP9_EdBXVpCmWbNmxDiXWG&confirm=t&uuid=f00608a6-0d27-4246-8ba3-457e350e0ae7
To: /content/intern_task.csv
100% 222M/222M [00:01<00:00, 162MB/s]


In [24]:
import torch
from tqdm.notebook import tqdm

from dataset import Dataset, split_data
from metrics import ndcgk
from model import Encoder
from scheduler import CosineScheduler

epochs = 80
warmup_epochs = 16
decay_epochs = 8
batch_size = 64
start_lr = 1e-4
max_lr = 6e-3
min_lr = 1e-7
weight_decay = 0.05
grad_clip = 1.0
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Encoder().to(device)
train_data, test_data = split_data("/content/intern_task.csv", 0.1, 42)
train_ds = Dataset(train_data, batch_size=batch_size)
test_ds = Dataset(test_data, batch_size=batch_size)

epoch_iters = len(train_ds)
optimizer = torch.optim.AdamW(model.parameters(), weight_decay)
scaler = torch.cuda.amp.GradScaler()
criterion = torch.nn.L1Loss()
scheduler = CosineScheduler(
    optimizer,
    warmup_epochs * epoch_iters,
    (epochs - decay_epochs) * epoch_iters,
    start_lr,
    min_lr,
    max_lr,
)

for epoch in tqdm(range(1, epochs + 1)):
    train_loss = 0.0
    ndcg = torch.empty((0,))
    for iterations, (x, y, weights) in enumerate(train_ds.batch(), start=1):
        x = x.to(device)
        y = y.to(device)
        weights = weights.to(device)
        scheduler.step()
        optimizer.zero_grad()
        pred = weights * model(x)
        loss = criterion(pred, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        train_loss += loss.item()
        with torch.no_grad():
            ndcg = torch.cat([ndcg, ndcgk(y.cpu(), pred.cpu(), k=5, rel="exp2")])
    ndcg = ndcg.mean().item()
    print(
        f"Epochs: {epoch}/{epochs} | Train_loss {train_loss / iterations:.4f} | ndcg {ndcg:.4f}"
    )
test_loss = 0.0
ndcg = torch.empty((0,))
model.eval()
for iterations, (x, y, weights) in enumerate(train_ds.batch(), start=1):
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)
        weights = weights.to(device)
        pred = weights * model(x)
        loss = criterion(pred, y)
        test_loss += loss.item()
        ndcg = torch.cat([ndcg, ndcgk(y.cpu(), pred.cpu(), k=5, rel="exp2")])
ndcg = ndcg.mean().item()
print(f"Test loss {test_loss / iterations:.4f} | ndcg {ndcg:.4f}")

  0%|          | 0/80 [00:00<?, ?it/s]

Epochs: 1/80 | Train_loss 0.2532 | ndcg 0.2019
Epochs: 2/80 | Train_loss 0.2409 | ndcg 0.2059
Epochs: 3/80 | Train_loss 0.2425 | ndcg 0.2314
Epochs: 4/80 | Train_loss 0.2557 | ndcg 0.2194
Epochs: 5/80 | Train_loss 0.2475 | ndcg 0.2375
Epochs: 6/80 | Train_loss 0.2416 | ndcg 0.2548
Epochs: 7/80 | Train_loss 0.2615 | ndcg 0.2494
Epochs: 8/80 | Train_loss 0.2573 | ndcg 0.2370
Epochs: 9/80 | Train_loss 0.2485 | ndcg 0.2544
Epochs: 10/80 | Train_loss 0.2467 | ndcg 0.2459
Epochs: 11/80 | Train_loss 0.2446 | ndcg 0.2404
Epochs: 12/80 | Train_loss 0.2369 | ndcg 0.2587
Epochs: 13/80 | Train_loss 0.2405 | ndcg 0.2614
Epochs: 14/80 | Train_loss 0.2416 | ndcg 0.2799
Epochs: 15/80 | Train_loss 0.2487 | ndcg 0.2588
Epochs: 16/80 | Train_loss 0.2334 | ndcg 0.2662
Epochs: 17/80 | Train_loss 0.2426 | ndcg 0.2596
Epochs: 18/80 | Train_loss 0.2418 | ndcg 0.2616
Epochs: 19/80 | Train_loss 0.2436 | ndcg 0.2748
Epochs: 20/80 | Train_loss 0.2505 | ndcg 0.2625
Epochs: 21/80 | Train_loss 0.2506 | ndcg 0.2645
E

In [23]:
import torch
from tqdm.notebook import tqdm

from dataset import Dataset, split_data
from metrics import ndcgk, dcg
from model import Encoder
from scheduler import CosineScheduler

epochs = 80
warmup_epochs = 16
decay_epochs = 8
batch_size = 64
start_lr = 1e-4
max_lr = 6e-3
min_lr = 1e-7
weight_decay = 0.05
grad_clip = 1.0
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Encoder().to(device)
train_data, test_data = split_data("/content/intern_task.csv", 0.1, 42)
train_ds = Dataset(train_data, batch_size=batch_size)
test_ds = Dataset(test_data, batch_size=batch_size)

epoch_iters = len(train_ds)
optimizer = torch.optim.AdamW(model.parameters(), weight_decay)
scaler = torch.cuda.amp.GradScaler()
criterion = torch.nn.L1Loss()
scheduler = CosineScheduler(
    optimizer,
    warmup_epochs * epoch_iters,
    (epochs - decay_epochs) * epoch_iters,
    start_lr,
    min_lr,
    max_lr,
)

for epoch in tqdm(range(1, epochs + 1)):
    ndcg = torch.empty((0,))
    for iterations, (x, y, weights) in enumerate(train_ds.batch(), start=1):
        x = x.to(device)
        y = y.to(device)
        weights = weights.to(device)
        scheduler.step()
        optimizer.zero_grad()
        pred = weights * model(x)

        # LambdaRank gradient computation
        sigma = 1.0
        N = 1 / (dcg(y.cpu(), y.cpu(), x.size(1), "exp2").sum(-1) + 1e-10)
        rank_order = pred.argsort(descending=True, dim=-1).argsort(dim=-1) + 1

        pos_pairs_score_diff = 1.0 + torch.exp(
            sigma * (pred.unsqueeze(1) - pred.unsqueeze(-1))
        )

        rel_diff = y.unsqueeze(1) - y.unsqueeze(-1)
        pos_pairs = (rel_diff > 0.0).type(torch.float32)
        neg_pairs = (rel_diff < 0.0).type(torch.float32)
        Sij = pos_pairs - neg_pairs

        gain_diff = torch.pow(2, y.unsqueeze(1)) - torch.pow(2, y.unsqueeze(-1))
        decay_diff = 1.0 / torch.log2(rank_order.unsqueeze(1) + 1.0) - 1.0 / torch.log2(
            rank_order.unsqueeze(-1) + 1.0
        )
        delta_ndcg = torch.abs(N.view(-1, 1, 1).to(device) * gain_diff * decay_diff)
        lambda_update = (
            sigma * (0.5 * (1 - Sij) - 1 / pos_pairs_score_diff) * delta_ndcg
        )

        pred.backward(torch.sum(lambda_update, 1))
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        with torch.no_grad():
            ndcg = torch.cat([ndcg, ndcgk(y.cpu(), pred.cpu(), k=5, rel="exp2")])
    ndcg = ndcg.mean().item()
    print(f"Epochs: {epoch}/{epochs} | ndcg {ndcg:.4f}")
test_loss = 0.0
ndcg = torch.empty((0,))
model.eval()
for iterations, (x, y, weights) in enumerate(train_ds.batch(), start=1):
    with torch.no_grad():
        x = x.to(device)
        y = y.to(device)
        weights = weights.to(device)
        pred = weights * model(x)
        ndcg = torch.cat([ndcg, ndcgk(y.cpu(), pred.cpu(), k=5, rel="exp2")])
ndcg = ndcg.mean().item()
print(f"Test Ndcg {ndcg:.4f}")

  0%|          | 0/80 [00:00<?, ?it/s]

Epochs: 1/80 | ndcg 0.1507
Epochs: 2/80 | ndcg 0.1856
Epochs: 3/80 | ndcg 0.2059
Epochs: 4/80 | ndcg 0.2305
Epochs: 5/80 | ndcg 0.2457
Epochs: 6/80 | ndcg 0.2555
Epochs: 7/80 | ndcg 0.2619
Epochs: 8/80 | ndcg 0.2719
Epochs: 9/80 | ndcg 0.2800
Epochs: 10/80 | ndcg 0.2915
Epochs: 11/80 | ndcg 0.2919
Epochs: 12/80 | ndcg 0.2855
Epochs: 13/80 | ndcg 0.2934
Epochs: 14/80 | ndcg 0.2973
Epochs: 15/80 | ndcg 0.3078
Epochs: 16/80 | ndcg 0.3129
Epochs: 17/80 | ndcg 0.3205
Epochs: 18/80 | ndcg 0.3259
Epochs: 19/80 | ndcg 0.3158
Epochs: 20/80 | ndcg 0.3212
Epochs: 21/80 | ndcg 0.3338
Epochs: 22/80 | ndcg 0.3210
Epochs: 23/80 | ndcg 0.3272
Epochs: 24/80 | ndcg 0.3220
Epochs: 25/80 | ndcg 0.3386
Epochs: 26/80 | ndcg 0.3299
Epochs: 27/80 | ndcg 0.3365
Epochs: 28/80 | ndcg 0.3252
Epochs: 29/80 | ndcg 0.3350
Epochs: 30/80 | ndcg 0.3416
Epochs: 31/80 | ndcg 0.3354
Epochs: 32/80 | ndcg 0.3378
Epochs: 33/80 | ndcg 0.3330
Epochs: 34/80 | ndcg 0.3241
Epochs: 35/80 | ndcg 0.3358
Epochs: 36/80 | ndcg 0.3425
E