In [1]:
import os
from collections import defaultdict

import numpy as np

import torch
from torch import nn
from torch.autograd import Variable, grad
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm_notebook as tqdm

In [2]:
def load_txt(path, dtype=np.float):
    res = []
    with open(path) as f:
        for line in f:
            res.append(line.split())
    return np.array(res, dtype=dtype)

def load_txt_fast(path, shape, dtype=np.float):
    res = np.empty(shape, dtype=dtype)
    with open(path) as f:
        i = 0
        for line in f:
            res[i] = line.split()
            i += 1
    return res

In [3]:
## NDCG

def calc_dcg(m, p, k=5):
    assert len(m) == len(p)
    order = np.argsort(-p)[:k]
    dcg = 0
    for i in order:
        dcg += (2 ** m[i] - 1) / (np.log2(i + 2))
    return dcg

def calc_ndcg(marks, preds, groups):
    ndcgs = []
    
    start = 0
    cur_group = groups[0]
    for i in range(len(marks)):
        if groups[i] != cur_group:
            m = marks[start:i]
            p = preds[start:i]
            ndcg = calc_dcg(m, p) / (calc_dcg(m, m) + 1e-5)
            ndcgs.append(ndcg)
            start = i
            cur_group = groups[i]
    m = marks[start:]
    p = preds[start:]
    ndcg = calc_dcg(m, p) / (calc_dcg(m, m) + 1e-5)
    ndcgs.append(ndcg)
    
    return np.mean(ndcgs)

In [4]:
X = load_txt_fast('./data/processed.tsv', (161626, 200), dtype=np.int)
y_tr = load_txt_fast('../../data/raw/m_train.tsv', (161626, 1)).ravel()
groups_tr = load_txt_fast('../../data/raw/groups.tr.tsv', (161626, 1), dtype=np.int).ravel()
y = ((y_tr > 2)).astype(np.float32)

In [5]:
y_te = load_txt_fast('../../data/raw/m_test.tsv', (40101, 1), dtype=np.int).ravel()
groups_te = load_txt_fast('../../data/raw/groups.te.tsv', (40101, 1), dtype=np.int).ravel()

X_te = load_txt_fast('./data/processed_te.tsv', (40101, 200), dtype=np.int)
X_te = X_te.reshape(-1, 2, 100)

In [6]:
K = 100
assert X.shape[1] == K * 2
X = X.reshape(-1, 2, K)

In [7]:
class LambdaGroup:
    def __init__(self, marks, begin, end):
        self.begin = begin
        self.end = end
        marks = marks[begin:end]
        sg = defaultdict(list)
        for i, m in enumerate(marks):
            sg[m].append(i)
        self.subgroups = list(map(lambda x:x[1], sorted(sg.items(), key=lambda x:-x[0])))
        
class LambdaDssmDataloader:
    def __init__(self, marks, data, groups, shuffle=False):
        assert len(marks) == len(data)
        assert len(marks) == len(groups)
        
        self.marks = marks
        self.data = data
        self.shuffle = shuffle
        
        cur_group = groups[0]
        self.groups = []
        begin = 0
        for i in range(len(groups)):
            if groups[i] != cur_group:
                cur_group = groups[i]
                self.groups.append(LambdaGroup(marks, begin, i))
                begin = i
        self.groups.append(LambdaGroup(marks, begin, len(groups)))
        
    def __len__(self):
        return len(self.groups)
    
    def __getitem__(self, idx):
        g = self.groups[idx]
        begin = g.begin
        end = g.end
        return self.marks[begin:end], self.data[begin:end], g.subgroups
    
    def __iter__(self):
        order = np.arange(len(self))
        if self.shuffle:
            np.random.shuffle(order)
        for i in order:
            yield self[i]

In [8]:
train_dataloader = LambdaDssmDataloader(y_tr, X, groups_tr, True)
train_dataloader_lin = LambdaDssmDataloader(y_tr, X, groups_tr, False)

test_dataloader = LambdaDssmDataloader(y_te, X_te, groups_te, False)

In [13]:
def eval_ndcg(model, dl, a_y, a_g):
    preds = np.empty(len(a_y))

    model.cuda()
    model.eval()
    with torch.no_grad():
        i = 0
        for targets, qd_pairs, _ in dl:
            qd_pairs = Variable(torch.tensor(qd_pairs)).cuda()
            pred = model(qd_pairs).cpu().data.numpy()
            preds[i:i+len(pred)] = pred
            i += len(pred)
    
    return calc_ndcg(a_y, preds, a_g)

In [14]:
def calc_lambdas(s, subgroups):
    lambdas = np.zeros(len(s), dtype=np.float32)
    for best_sg in range(len(subgroups) - 1):
        for worse_sg in range(best_sg + 1, len(subgroups)):
            for i in subgroups[best_sg]:
                for j in subgroups[worse_sg]:
                    delta = np.clip(s[i] - s[j], -30, 30)
                    lmb = - 1.0 / (1.0 + np.exp(delta))
                    lambdas[i] += lmb
                    lambdas[j] -= lmb
    return lambdas

In [32]:
class DSSM(nn.Module):
    def __init__(self):
        super(DSSM, self).__init__()
        
        self.embedding = nn.Embedding(13678, 300)
        self.linear_1 = nn.Linear(300, 300)
        self.linear_2 = nn.Linear(300, 300)
        self.linear_3 = nn.Linear(300, 128)
        
        self.dropout_1 = nn.Dropout()
        self.dropout_2 = nn.Dropout()
        
        self.NL = nn.functional.elu
    
    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout_1(x)
        x = torch.mean(x, 2)
#         x = self.NL(x)
#         x = self.linear_1(x)
#         x = self.NL(x)
#         x = self.linear_2(x)
#         x = self.NL(x)
        x = self.linear_3(x)
        
        ## Cosine
#         dot_prod = torch.sum(torch.prod(x, 1), 1)
#         norm = torch.sqrt(torch.prod(torch.sum(x * x, 2), 1))
#         self.last_dot_prod = dot_prod.cpu().data.numpy()
#         self.last_norm = norm.cpu().data.numpy()
#         cosine = dot_prod / (norm + 1e-4)
#         pred = (cosine + 1.0) / 2.0
        
#         cosine = cosine.view(-1, 1)
#         pred = torch.sigmoid(self.cos_final(cosine))

        ## Dot
        prod = torch.prod(x, 1)
        dot_prod = torch.mean(prod, 1)
        pred = dot_prod
        
        return pred.view(-1)

In [33]:
model = DSSM().cuda()
opt = torch.optim.Adam(model.parameters(), 0.01)
scheduler = torch.optim.lr_scheduler.StepLR(opt, 999999999, 0.5)

In [34]:
epochs = 10

for e in range(1, epochs+1):
    model.train()
    model.cuda()
    cnt = 10000
    for marks, qd_pairs, subgroups in train_dataloader:
        cnt -= 1
        if cnt == 0:
            break
        qd_pairs = Variable(torch.tensor(qd_pairs)).cuda()
        
        model.zero_grad()
        
        s = model(qd_pairs)
        lambdas = calc_lambdas(s.data.cpu().numpy(), subgroups)
        
        lambdas = torch.tensor(lambdas).cuda()
        s.backward(lambdas)
        opt.step()
    train_ndcg = eval_ndcg(model, train_dataloader_lin, y_tr, groups_tr)
    test_ndcg = eval_ndcg(model, test_dataloader, y_te, groups_te)
    scheduler.step()
    print('[%d / %d]\t%.3lf\t%.3lf' % (e, epochs, train_ndcg, test_ndcg))

[1 / 10]	0.486	0.446
[2 / 10]	0.560	0.454
[3 / 10]	0.609	0.459
[4 / 10]	0.660	0.502
[5 / 10]	0.682	0.478
[6 / 10]	0.718	0.496
[7 / 10]	0.724	0.494
[8 / 10]	0.725	0.478
[9 / 10]	0.749	0.500
[10 / 10]	0.756	0.485


## Predict

In [35]:
%%time

preds = np.empty(len(y_te))

model.cuda()
i = 0
for qd_pairs, targets in test_dataloader:
    qd_pairs = Variable(qd_pairs).cuda()
    targets = Variable(targets).cuda()
    pred = model(qd_pairs).cpu().data.numpy()
    preds[i:i+len(pred)] = pred
    i += len(pred)

CPU times: user 2.03 s, sys: 108 ms, total: 2.14 s
Wall time: 2.13 s


In [36]:
calc_ndcg(y_te, preds, groups_te)

0.4653389935005265

In [30]:
np.savetxt('./data/pred.tsv', preds)