In [1]:
!pip install -q numpy pandas tqdm pymystem3 tensorflow torch transformers qdrant-client ranx

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 21.12.2 requires cupy-cuda115, which is not installed.
cudf 21.12.2 requires cupy-cuda115, which is not installed.
xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.20.3 which is incompatible.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.79.0 which is incompatible.
tfx-bsl 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
tensorflow-transform 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.
librosa 0.10.0 requires soundfile>=0.12.1, but you have soundfile 0.11.0 which is incompatible.
featuretools 1.11.1 requires numpy>=1.21.0, but you have numpy 1.20.3 which is inco

In [2]:
import shutil

import numpy as np
import pandas as pd

import torch
from torch import nn, optim

from transformers import AutoTokenizer, AutoModel
from transformers import logging
logging.set_verbosity_error()

from functools import partial

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import CollectionStatus

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

from ranx import Qrels, Run, evaluate, compare

from tqdm import tqdm
from copy import deepcopy

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

DIR = '/kaggle/input/aaa-project-search/'

cuda:0


In [3]:
df = pd.read_hdf(DIR + 'search_relevance_dataset_v1.hdf', 'table')
df.drop(columns=['query_category_id', 'query_microcat_id', 'query_location_id'], inplace=True)

df.query_id = df.query_id.astype(str)
df.item_id = df.item_id.astype(str)

df.head(3)

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target
0,274025,2108 ссср,964140459,Советские бутыли канистры 60-80-х СССР ваз газ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0
1,274025,2108 ссср,990433426,Ваз 2108 СССР цвет салатовый 1/43 идеальный,красивый салатовый цвет\nвсе детали в наличии\...,модель,1
2,274025,2108 ссср,994402610,Модели советских машин ваз 2102 почта М 1/43 №10,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1


In [4]:
client = QdrantClient(':memory:')

In [5]:
def index_dataset(client, df, model, vector_size, collection_name='collection'):
    client.recreate_collection(
        collection_name=collection_name, vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    df = df.drop_duplicates(subset=['item_id'])

    points = []
    for row in tqdm(df.itertuples(), total=len(df)):
        points.append(
            PointStruct(
                id=row.Index,
                vector=list(map(float, model(row.title))), # list(map(float.. to make proper type
                payload={
                    'title': row.title, 'description': row.description, 'keywords': row.keywords, 'item_id': row.item_id
                },
            )
        )

    operation_info = client.upsert(collection_name=collection_name, wait=True, points=points)

    return operation_info

In [6]:
# random model
random_model = lambda text: np.random.randn(32)

# baseline model 1
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
model = AutoModel.from_pretrained('cointegrated/rubert-tiny2').to(device)

def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

baseline_model_1 = partial(embed_bert_cls, model=model, tokenizer=tokenizer)

# baseline model 2
tokenizer = AutoTokenizer.from_pretrained('cointegrated/LaBSE-en-ru')
model = AutoModel.from_pretrained('cointegrated/LaBSE-en-ru').to(device)

def embed_labse_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.pooler_output
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

baseline_model_2 = partial(embed_labse_cls, model=model, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/118M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/516M [00:00<?, ?B/s]

In [7]:
baseline_model_1('hello').shape, baseline_model_2('hello').shape # ((312,), (768,))

((312,), (768,))

In [8]:
class ProjectorModel(nn.Module):
    def __init__(self, model_name: str = 'cointegrated/rubert-tiny2', final_emb_size: int = 32):
        super().__init__()

        self.model_name = model_name
        self.final_emb_size = final_emb_size
        
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.backbone = AutoModel.from_pretrained(self.model_name, output_hidden_states=True).to(device)

        for n, p in self.backbone.named_parameters():
            p.requires_grad = False

        self.initial_emd_size = 312 if self.model_name == 'cointegrated/rubert-tiny2' else 768

        self.projection_head = nn.Sequential(
            nn.Linear(self.initial_emd_size, self.final_emb_size, device=device),            
        )

    def backbone_forward(self, text):
        t = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        model_output = self.backbone(**{k: v.to(self.backbone.device) for k, v in t.items()})

#         embeddings = model_output.pooler_output # torch.concat([model_output['hidden_states'][-i].mean(dim=1) for i in range(1, 5, 1)], dim=1)

        embeddings = model_output.last_hidden_state[:, 0, :] if self.model_name == 'cointegrated/rubert-tiny2' else model_output.pooler_output
        embeddings = nn.functional.normalize(embeddings)

        return embeddings

    def forward(self, text):
        embeddings = self.backbone_forward(text)

        compressed_embeddings = self.projection_head(embeddings)
        compressed_embeddings = nn.functional.normalize(compressed_embeddings)

        return compressed_embeddings

ProjectorModel()('some text here').shape, ProjectorModel()(['some text here', 'lalala']).shape

(torch.Size([1, 32]), torch.Size([2, 32]))

In [9]:
def train_eval(model, df_train, df_test, batch_size=64, n_epochs=5, lr=1e-5):
    model.train()

    queries = df_train.query_id.unique()
    df_train = df_train.set_index('query_id')

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)

    for e in range(n_epochs):
        train_set = df_train.loc[np.random.choice(queries, size=len(queries), replace=False), ['query_text', 'title', 'target', 'query_len', 'keywords']].values
        num_batches = len(train_set) // batch_size

        for i in tqdm(range(num_batches)):
            start, end = i * batch_size, (i + 1) * batch_size
            batch = train_set[start:end]

            x1, x2, y = list(batch[:, 0]), list(batch[:, 1]), torch.tensor(batch[:, 2].astype(int), device=device)

            loss = 0.5 * ((y - (model.backbone_forward(x1) * model.backbone_forward(x2)).sum(dim=1))**2).mean() +\
                   0.5 * ((y - (model(x1) * model(x2)).sum(dim=1))**2).mean()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            optimizer.zero_grad()

        if e == 0:
            for n, p in model.named_parameters():
                if n[:5] != 'embed':
                    p.requires_grad = True

            optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)

        model.eval()

        test_set = df_test.loc[:, ['query_text', 'title', 'target', 'query_len', 'keywords']].values
        num_batches = len(test_set) // batch_size

        pred1, pred2, true = [], [], []
        for i in tqdm(range(num_batches + 1)):
            start, end = i * batch_size, (i + 1) * batch_size
            batch = test_set[start:end]

            x1, x2, y = list(batch[:, 0]), list(batch[:, 1]), batch[:, 2].astype(int)

            pred1.extend((model.backbone_forward(x1) * model.backbone_forward(x2)).sum(dim=1).abs().cpu().detach().numpy())
            pred2.extend((model(x1) * model(x2)).sum(dim=1).abs().cpu().detach().numpy())
            true.extend(y)

        print(f'Epoch {e}:')
        print(f'Test ROC-AUC backbone: {roc_auc_score(true, pred1):.3f}')
        print(f'Test ROC-AUC full net: {roc_auc_score(true, pred2):.3f}')

        df_test['true'] = true
        df_test['pred1'] = pred1
        df_test['pred2'] = pred2

        print('Test ROC-AUC by query length:')
        display(df_test.groupby('query_len').apply(
            lambda d: [f'{roc_auc_score(d.true, d.pred1):.3f}', f'{roc_auc_score(d.true, d.pred2):.3f}']
        ))

        model.train()

In [10]:
df['query_len'] = df['query_text'].apply(lambda x: len(x.split(' ')))
df.loc[df.query_len > 3, 'query_len'] = 3
df.query_len.value_counts()

3    62707
2    62379
1    36343
Name: query_len, dtype: int64

In [11]:
%%time

kf = GroupKFold(n_splits=2)

for train_indices, test_indices in kf.split(X=df, groups=df.query_id):    
    df_train, df_test = df.loc[train_indices], df.loc[test_indices]
    print('-' * 80)
    print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
          'Test:', df_test.shape, df_test.query_id.nunique(), '  ',
          'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
    print()

    runs = []
    for model_name in ['cointegrated/rubert-tiny2']: # ['cointegrated/rubert-tiny2', 'cointegrated/LaBSE-en-ru']:
        for final_emb_size in [64]:
            print('Processing...', model_name, final_emb_size)

            model = ProjectorModel(model_name=model_name, final_emb_size=final_emb_size)
            train_eval(model, df_train, df_test, batch_size=512, n_epochs=1+10, lr=1e-5)

            model.eval()

            final_model = lambda text: model(text).cpu().detach().numpy()[0]
            _ = index_dataset(client, df, final_model, vector_size=final_emb_size,
                              collection_name=f'train_collection_{model_name}_{final_emb_size}')

            qrels = Qrels.from_df(df_test, q_id_col='query_id', doc_id_col='item_id', score_col='target')
            test_examples = df_test.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

            run_dict = {}
            for row in tqdm(test_examples.itertuples(), total=len(test_examples)):
                search_result = client.search(
                    collection_name=f'train_collection_{model_name}_{final_emb_size}', 
                    query_vector=list(map(float, final_model(row.query_text))), limit=50
                )
                run_dict[row.query_id] = {i.payload['item_id']: i.score for i in search_result}

            run = Run(run_dict) # print(evaluate(qrels, run, ['map@10', 'map@50', 'ndcg@10', 'ndcg@50']))
            runs.append(run)

            client.delete_collection(collection_name=f'train_collection_{model_name}_{final_emb_size}')
            print('Done')

    report = compare(
        qrels=qrels, runs=runs,
        metrics=['map@10', 'map@50', 'ndcg@10', 'ndcg@50'],
        n_permutations=1000, stat_test='student', max_p=0.01,
    )
    print(); print(report); print()

    break

# 0.752 - base
# 0.788-0.789 8-9 epochs - new loss
# 0.790 8-9 epochs - normalize both

# 0.789 (0.803 backbone) 9-11 epochs - 1/1 loss
# 0.791-0.792 (0.804-0.805 backbone) 10-11 epochs - 0.5 / 0.5 loss (batch 256 as I remember)

--------------------------------------------------------------------------------
Train: (80714, 8) 6172    Test: (80715, 8) 6173    Intersection: set()

Processing... cointegrated/rubert-tiny2 64


100%|██████████| 157/157 [00:18<00:00,  8.39it/s]
100%|██████████| 158/158 [00:20<00:00,  7.76it/s]


Epoch 0:
Test ROC-AUC backbone: 0.642
Test ROC-AUC full net: 0.623
Test ROC-AUC by query length:


query_len
1    [0.669, 0.637]
2    [0.623, 0.606]
3    [0.669, 0.650]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.00it/s]
100%|██████████| 158/158 [00:20<00:00,  7.77it/s]


Epoch 1:
Test ROC-AUC backbone: 0.710
Test ROC-AUC full net: 0.699
Test ROC-AUC by query length:


query_len
1    [0.735, 0.709]
2    [0.692, 0.681]
3    [0.732, 0.719]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.00it/s]
100%|██████████| 158/158 [00:20<00:00,  7.60it/s]


Epoch 2:
Test ROC-AUC backbone: 0.741
Test ROC-AUC full net: 0.734
Test ROC-AUC by query length:


query_len
1    [0.756, 0.729]
2    [0.714, 0.708]
3    [0.766, 0.761]
dtype: object

100%|██████████| 157/157 [00:38<00:00,  4.03it/s]
100%|██████████| 158/158 [00:20<00:00,  7.69it/s]


Epoch 3:
Test ROC-AUC backbone: 0.756
Test ROC-AUC full net: 0.748
Test ROC-AUC by query length:


query_len
1    [0.764, 0.736]
2    [0.726, 0.721]
3    [0.781, 0.777]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.02it/s]
100%|██████████| 158/158 [00:20<00:00,  7.78it/s]


Epoch 4:
Test ROC-AUC backbone: 0.767
Test ROC-AUC full net: 0.760
Test ROC-AUC by query length:


query_len
1    [0.773, 0.746]
2    [0.737, 0.733]
3    [0.791, 0.785]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  3.98it/s]
100%|██████████| 158/158 [00:20<00:00,  7.74it/s]


Epoch 5:
Test ROC-AUC backbone: 0.776
Test ROC-AUC full net: 0.768
Test ROC-AUC by query length:


query_len
1    [0.779, 0.753]
2    [0.746, 0.743]
3    [0.798, 0.792]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.00it/s]
100%|██████████| 158/158 [00:20<00:00,  7.78it/s]


Epoch 6:
Test ROC-AUC backbone: 0.778
Test ROC-AUC full net: 0.767
Test ROC-AUC by query length:


query_len
1    [0.781, 0.753]
2    [0.752, 0.745]
3    [0.802, 0.794]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.00it/s]
100%|██████████| 158/158 [00:20<00:00,  7.73it/s]


Epoch 7:
Test ROC-AUC backbone: 0.786
Test ROC-AUC full net: 0.777
Test ROC-AUC by query length:


query_len
1    [0.784, 0.758]
2    [0.758, 0.755]
3    [0.808, 0.803]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  3.99it/s]
100%|██████████| 158/158 [00:20<00:00,  7.75it/s]


Epoch 8:
Test ROC-AUC backbone: 0.790
Test ROC-AUC full net: 0.779
Test ROC-AUC by query length:


query_len
1    [0.788, 0.761]
2    [0.763, 0.757]
3    [0.812, 0.804]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.00it/s]
100%|██████████| 158/158 [00:20<00:00,  7.71it/s]


Epoch 9:
Test ROC-AUC backbone: 0.793
Test ROC-AUC full net: 0.783
Test ROC-AUC by query length:


query_len
1    [0.786, 0.760]
2    [0.767, 0.763]
3    [0.816, 0.809]
dtype: object

100%|██████████| 157/157 [00:39<00:00,  4.00it/s]
100%|██████████| 158/158 [00:20<00:00,  7.65it/s]


Epoch 10:
Test ROC-AUC backbone: 0.795
Test ROC-AUC full net: 0.785
Test ROC-AUC by query length:


query_len
1    [0.789, 0.762]
2    [0.769, 0.763]
3    [0.819, 0.812]
dtype: object

100%|██████████| 150268/150268 [10:23<00:00, 241.14it/s]
100%|██████████| 6173/6173 [09:31<00:00, 10.80it/s]


Done

#    Model      MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1       0.046     0.057      0.092      0.104

CPU times: user 36min 59s, sys: 4min 46s, total: 41min 45s
Wall time: 31min 39s


In [12]:
model.backbone.save_pretrained('model_tuned/')
model.tokenizer.save_pretrained('model_tuned/')

torch.save(model, 'model_tuned/model_tuned_full.pkl')

shutil.make_archive('model_tuned', 'zip', 'model_tuned/')

'/kaggle/working/model_tuned.zip'

In [13]:
model = torch.load('model_tuned/model_tuned_full.pkl', map_location=device)
model.eval()

ProjectorModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affin