In [1]:
!apt-get install python3-dev -y
!apt-get install libpcre3-dev -y
!apt-get install gcc -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-dev is already the newest version (3.10.6-1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 62 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libpcre3-dev is already the newest version (2:8.39-13ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 62 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
gcc is already the newest version (4:11.2.0-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 62 not upgraded.


In [2]:
!pip install -q qdrant-client ranx

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.6 which is incompatible.
apache-beam 2.46.0 requires protobuf<4,>3.12.2, but you have protobuf 4.21.12 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 11.0.0 which is incompatible.
google-cloud-bigquery 2.34.4 requires protobuf<4.0.0dev,>=3.12.0, but you have protobuf 4.21.12 which is incompatible.
google-cloud-pubsub 2.16.1 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 1.33.2 which is incompatible.
ray 2.4.0 requires grpcio<=1.51.3,>=1.42.0; python_version >= "3.10" and sys_platform != "darwin", but you have grpcio 1.54.2 which is incompatible.
tensorflow-transform 0.14.

In [3]:
import shutil
import re

import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

import torch
from torch import nn, optim

from transformers import AutoTokenizer, AutoModel
from transformers import logging
logging.set_verbosity_error()

from functools import partial

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import CollectionStatus

from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from string import punctuation

from ranx import Qrels, Run, evaluate, compare

from tqdm import tqdm
tqdm.pandas()

from copy import deepcopy

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

DIR = '/kaggle/input/aaa-project-search/'

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
cuda:0


In [4]:
df = pd.read_hdf(DIR + 'search_relevance_dataset_v1.hdf', 'table')
df.drop(columns=['query_category_id', 'query_microcat_id', 'query_location_id'], inplace=True)

df.query_id = df.query_id.astype(str)
df.item_id = df.item_id.astype(str)

df.head(3)

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target
0,274025,2108 ссср,964140459,Советские бутыли канистры 60-80-х СССР ваз газ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0
1,274025,2108 ссср,990433426,Ваз 2108 СССР цвет салатовый 1/43 идеальный,красивый салатовый цвет\nвсе детали в наличии\...,модель,1
2,274025,2108 ссср,994402610,Модели советских машин ваз 2102 почта М 1/43 №10,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1


In [5]:
df['query_len'] = df['query_text'].apply(lambda x: len(x.split(' ')))
df.loc[df.query_len > 3, 'query_len'] = 3
df.query_len.value_counts()

3    62707
2    62379
1    36343
Name: query_len, dtype: int64

In [6]:
russian_stopwords = stopwords.words('russian') + ['стопсфинксstop']

preprocess_text_simple = lambda text: ' '.join(text.lower().split())

def preprocess_text(text):
    text = preprocess_text_simple(text)

    tokens = [t for t in text.split() if t not in russian_stopwords]
    tokens = list(dict.fromkeys(tokens))

    text = ' '.join(tokens)
    return text

In [7]:
df.query_text = df.query_text.progress_apply(preprocess_text_simple)
df.title = df.title.progress_apply(preprocess_text_simple)

df.description = df.description.progress_apply(lambda x: re.sub(r'[^a-zа-я0-9 ]+', ' ', x))
df.description = df.description.progress_apply(preprocess_text_simple)

df.keywords = df.keywords.progress_apply(preprocess_text)

df.head(3)

100%|██████████| 161429/161429 [00:00<00:00, 311134.55it/s]
100%|██████████| 161429/161429 [00:00<00:00, 241116.73it/s]
100%|██████████| 161429/161429 [00:07<00:00, 21008.45it/s]
100%|██████████| 161429/161429 [00:03<00:00, 47203.07it/s]
100%|██████████| 161429/161429 [00:01<00:00, 110057.59it/s]


Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target,query_len
0,274025,2108 ссср,964140459,советские бутыли канистры 60-80-х ссср ваз газ...,ля ценителей и понимающих одробные фотографии ...,,0,2
1,274025,2108 ссср,990433426,ваз 2108 ссср цвет салатовый 1/43 идеальный,красивый салатовый цвет все детали в наличии и...,модель,1,2
2,274025,2108 ссср,994402610,модели советских машин ваз 2102 почта м 1/43 №10,родается модель автомобиля 2102 почта 1 43 оде...,модель,1,2


In [8]:
# df.keywords = df.keywords.apply(lambda x: ' '.join(x.split()[:5]))
# df.keywords.value_counts()

In [9]:
# df.description = df.description.progress_apply(lambda text: ' '.join(text.split()[:50]) + ' ... ' + ' '.join(text.split()[-50:]) if len(text.split()) > 125 else text)
# df.description.apply(lambda text: len(text.split())).describe()

In [10]:
client = QdrantClient(':memory:')

In [11]:
def index_dataset(client, df, model, vector_size, collection_name='collection'):
    client.recreate_collection(
        collection_name=collection_name, vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )
    df = df.drop_duplicates(subset=['item_id'])

    points = []
    for row in tqdm(df.itertuples(), total=len(df)):
        points.append(
            PointStruct(
                id=row.Index,
                vector=list(map(float, model(row.title))), # list(map(float.. to make proper type
                payload={
                    'title': row.title, 'description': row.description, 'keywords': row.keywords, 'item_id': row.item_id
                },
            )
        )

    operation_info = client.upsert(collection_name=collection_name, wait=True, points=points)

    return operation_info

In [12]:
tokenizer = AutoTokenizer.from_pretrained('cointegrated/rubert-tiny2')
backbone = AutoModel.from_pretrained('cointegrated/rubert-tiny2', torchscript=False).to(device)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [13]:
special_tokens_dict = {'additional_special_tokens': ['[Q]', '[I]']} # {'additional_special_tokens': ['[Q]', '[I]', '[K]', '[D]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
backbone.resize_token_embeddings(len(tokenizer))

Embedding(83830, 312)

In [14]:
df['len_title'] = df.title.progress_apply(lambda text: len(tokenizer(text)['input_ids']))
df['len_description'] = df.description.progress_apply(lambda text: len(tokenizer(text)['input_ids']))
df['len_keywords'] = df.keywords.progress_apply(lambda text: len(tokenizer(text)['input_ids']))

100%|██████████| 161429/161429 [00:14<00:00, 11479.32it/s]
100%|██████████| 161429/161429 [01:35<00:00, 1690.78it/s]
100%|██████████| 161429/161429 [00:11<00:00, 13849.36it/s]


In [15]:
pd.concat([df.len_title.describe(), df.len_description.describe(), df.len_keywords.describe(), (df.len_title + df.len_description + df.len_keywords).describe()], axis=1)

Unnamed: 0,len_title,len_description,len_keywords,0
count,161429.0,161429.0,161429.0,161429.0
mean,11.026885,144.936746,5.439816,161.403447
std,4.830343,153.501985,3.124448,154.558729
min,3.0,2.0,2.0,7.0
25%,7.0,36.0,3.0,51.0
50%,10.0,89.0,5.0,106.0
75%,15.0,200.0,6.0,218.0
max,42.0,2719.0,31.0,2736.0


In [16]:
class ProjectorModel(nn.Module):
    def __init__(self, tokenizer, backbone, initial_emd_size, final_emb_size: int = 32):
        super().__init__()

        self.tokenizer = tokenizer
        self.backbone = backbone
        self.initial_emd_size = initial_emd_size

        self.final_emb_size = final_emb_size

        for n, p in self.backbone.named_parameters():
            p.requires_grad = False

        self.projection_head = nn.Linear(self.initial_emd_size, self.final_emb_size, device=device)

    def backbone_forward(self, text):
        t = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=64)
        model_output = self.backbone(**{k: v.to(self.backbone.device) for k, v in t.items()})

        embeddings = model_output.last_hidden_state[:, 0, :]
        embeddings = nn.functional.normalize(embeddings)

        return embeddings

    def forward(self, text):
        embeddings = self.backbone_forward(text)

        compressed_embeddings = self.projection_head(embeddings)
        compressed_embeddings = nn.functional.normalize(compressed_embeddings)

        return compressed_embeddings

ProjectorModel(tokenizer, backbone, 312)('some text here').shape, ProjectorModel(tokenizer, backbone, 312)(['some text here', 'lalala']).shape

(torch.Size([1, 32]), torch.Size([2, 32]))

In [17]:
def train_eval(model, df_train, df_test, batch_size=64, lrs=[1e-5] * 6):
    model.train()

    queries = df_train.query_id.unique()
    df_train = df_train.set_index('query_id')

    for e, lr in enumerate(lrs):
        model.train()
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-2)

        train_set = df_train.loc[np.random.choice(queries, size=len(queries), replace=False), 
                                 ['query_text', 'title', 'target', 'query_len', 'keywords']].values
        num_batches = len(train_set) // batch_size

        for i in tqdm(range(num_batches)):
            start, end = i * batch_size, (i + 1) * batch_size
            batch = train_set[start:end]

            x1, x2, y = list(batch[:, 0]), list(batch[:, 1]), torch.tensor(batch[:, 2].astype(int), device=device)

            loss = 0.5 * ((y - (model.backbone_forward(x1) * model.backbone_forward(x2)).sum(dim=1))**2).mean() +\
                   0.5 * ((y - (model(x1) * model(x2)).sum(dim=1))**2).mean()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            optimizer.zero_grad()

        if e == 0:
            for n, p in model.named_parameters():
                if n[:5] != 'embed':
                    p.requires_grad = True

        model.eval()

        test_set = df_test.loc[:, ['query_text', 'title', 'target', 'query_len', 'keywords']].values
        num_batches = len(test_set) // batch_size

        pred1, pred2, true = [], [], []
        for i in tqdm(range(num_batches + 1)):
            start, end = i * batch_size, (i + 1) * batch_size
            batch = test_set[start:end]

            x1, x2, y = list(batch[:, 0]), list(batch[:, 1]), batch[:, 2].astype(int)

            pred1.extend((model.backbone_forward(x1) * model.backbone_forward(x2)).sum(dim=1).abs().cpu().detach().numpy())
            pred2.extend((model(x1) * model(x2)).sum(dim=1).abs().cpu().detach().numpy())
            true.extend(y)

        print(f'Epoch {e}:')
        print(f'Test ROC-AUC backbone: {roc_auc_score(true, pred1):.3f}')
        print(f'Test ROC-AUC full net: {roc_auc_score(true, pred2):.3f}')

        df_test['true'] = true
        df_test['pred1'] = pred1
        df_test['pred2'] = pred2

        print('Test ROC-AUC by query length:')
        display(df_test.groupby('query_len').apply(
            lambda d: [f'{roc_auc_score(d.true, d.pred1):.3f}', f'{roc_auc_score(d.true, d.pred2):.3f}']
        ))

In [18]:
df.title = '[Q]' + df.title #+ tokenizer.sep_token + '[K]' + df.keywords + tokenizer.sep_token + '[D]' + df.description
df.query_text = '[I]' + df.query_text
df.head()

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target,query_len,len_title,len_description,len_keywords
0,274025,[I]2108 ссср,964140459,[Q]советские бутыли канистры 60-80-х ссср ваз ...,ля ценителей и понимающих одробные фотографии ...,,0,2,21,199,2
1,274025,[I]2108 ссср,990433426,[Q]ваз 2108 ссср цвет салатовый 1/43 идеальный,красивый салатовый цвет все детали в наличии и...,модель,1,2,16,18,3
2,274025,[I]2108 ссср,994402610,[Q]модели советских машин ваз 2102 почта м 1/4...,родается модель автомобиля 2102 почта 1 43 оде...,модель,1,2,15,163,3
3,274025,[I]2108 ссср,1069135877,[Q]книга автомобиль ваз-2108 ссср,путник устройство автомобиля 2108 1986 год смо...,учебный литература,1,2,12,30,4
4,274025,[I]2108 ссср,1217235061,"[Q]плакаты, ваз 2108, ссср, 1988 год",родам комплект плакатов по устройству автомоби...,учебный литература,0,2,15,102,4


In [19]:
model = ProjectorModel(tokenizer, backbone, 312, 64)
train_eval(model, df, df, batch_size=1024, lrs=[5e-5] * 3 + [1e-5] * 3)

model.eval()

100%|██████████| 157/157 [00:50<00:00,  3.10it/s]
100%|██████████| 158/158 [00:53<00:00,  2.94it/s]


Epoch 0:
Test ROC-AUC backbone: 0.672
Test ROC-AUC full net: 0.681
Test ROC-AUC by query length:


query_len
1    [0.679, 0.694]
2    [0.678, 0.678]
3    [0.694, 0.701]
dtype: object

100%|██████████| 157/157 [01:31<00:00,  1.72it/s]
100%|██████████| 158/158 [00:53<00:00,  2.96it/s]


Epoch 1:
Test ROC-AUC backbone: 0.825
Test ROC-AUC full net: 0.836
Test ROC-AUC by query length:


query_len
1    [0.832, 0.837]
2    [0.816, 0.828]
3    [0.830, 0.839]
dtype: object

100%|██████████| 157/157 [01:30<00:00,  1.73it/s]
100%|██████████| 158/158 [00:53<00:00,  2.97it/s]


Epoch 2:
Test ROC-AUC backbone: 0.859
Test ROC-AUC full net: 0.867
Test ROC-AUC by query length:


query_len
1    [0.863, 0.869]
2    [0.854, 0.861]
3    [0.858, 0.866]
dtype: object

100%|██████████| 157/157 [01:30<00:00,  1.73it/s]
100%|██████████| 158/158 [00:53<00:00,  2.96it/s]


Epoch 3:
Test ROC-AUC backbone: 0.865
Test ROC-AUC full net: 0.874
Test ROC-AUC by query length:


query_len
1    [0.874, 0.882]
2    [0.859, 0.867]
3    [0.863, 0.872]
dtype: object

100%|██████████| 157/157 [01:31<00:00,  1.72it/s]
100%|██████████| 158/158 [00:54<00:00,  2.91it/s]


Epoch 4:
Test ROC-AUC backbone: 0.870
Test ROC-AUC full net: 0.879
Test ROC-AUC by query length:


query_len
1    [0.879, 0.887]
2    [0.864, 0.871]
3    [0.867, 0.876]
dtype: object

100%|██████████| 157/157 [01:31<00:00,  1.72it/s]
100%|██████████| 158/158 [00:54<00:00,  2.89it/s]


Epoch 5:
Test ROC-AUC backbone: 0.875
Test ROC-AUC full net: 0.883
Test ROC-AUC by query length:


query_len
1    [0.883, 0.891]
2    [0.868, 0.875]
3    [0.872, 0.880]
dtype: object

ProjectorModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83830, 312)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
 

In [20]:
# %%time

# kf = GroupKFold(n_splits=2)

# for train_indices, test_indices in kf.split(X=df, groups=df.query_id):    
#     df_train, df_test = df.loc[train_indices], df.loc[test_indices]
#     print('-' * 80)
#     print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
#           'Test:', df_test.shape, df_test.query_id.nunique(), '  ',
#           'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
#     print()

#     model = ProjectorModel(tokenizer, backbone, 312, 64)
#     train_eval(model, df_train, df_test, batch_size=1024, lrs=[5e-5] * 5 + [1e-5] * 3)

#     model.eval()
#     break

# # Epoch 5:
# # Test ROC-AUC backbone: 0.820
# # Test ROC-AUC full net: 0.813

# # Test ROC-AUC backbone: 0.811
# # Test ROC-AUC full net: 0.807

In [21]:
model.to('cpu')

ProjectorModel(
  (backbone): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83830, 312)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
 

In [22]:
model.backbone.save_pretrained('model_tuned/')
model.tokenizer.save_pretrained('model_tuned/')

with open('model_tuned/projection_head.pkl', 'wb') as f:
    pickle.dump(model.projection_head.state_dict(), f)

shutil.make_archive('model_tuned', 'zip', 'model_tuned/')

'/kaggle/working/model_tuned.zip'

In [23]:
path_to_model = '/kaggle/working/model_tuned/'

In [24]:
tokenizer = AutoTokenizer.from_pretrained(path_to_model)
backbone = AutoModel.from_pretrained(path_to_model, torchscript=False)

with open(path_to_model + 'projection_head.pkl', 'rb') as f:
    projection_head = nn.Linear(312, 64)
    projection_head.load_state_dict(pickle.load(f))

In [25]:
def prepare_tokenized_text(text, tokenizer, type_of_text='query'):
    text = ' '.join(text.lower().split())

    text = ('[I]' + text) if type_of_text == 'query' else ('[Q]' + text)

    return tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=64)

def embed_tokenized_text(tokenized_text, model, projection_head):
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in tokenized_text.items()})
    
        embeddings = model_output.last_hidden_state[:, 0, :]
        embeddings = nn.functional.normalize(embeddings)

        embeddings = projection_head(embeddings)
        embeddings = torch.nn.functional.normalize(embeddings)
    
    return embeddings[0].cpu().numpy()

tokenizer_full = partial(prepare_tokenized_text, tokenizer=tokenizer)
model_full = partial(embed_tokenized_text, model=backbone, projection_head=projection_head)

model_full(tokenizer_full('hello')).shape

(64,)