In [1]:
!pip install -q numpy pandas tqdm tensorflow torch transformers gensim optuna qdrant-client ranx

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.20.3 which is incompatible.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.79.0 which is incompatible.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.
librosa 0.10.0 requires soundfile>=0.12.1, but you have soundfile 0.11.0 which is incompatible.
featuretools 1.11.1 requires numpy>=1.21.0, but you have numpy 1.20.3 which is incompatible.
cmdstanpy 1.1.0 requires numpy>=1.21, but you have numpy 1.20.3 which is incompatible.
apache-beam 2.44.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.6 which is incompatible.[0m[31m
[0m

In [2]:
import re

import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel
from transformers import logging
logging.set_verbosity_error()

import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

import optuna

from functools import partial

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import CollectionStatus

from sklearn.model_selection import GroupKFold
from ranx import Qrels, Run, evaluate, compare

from tqdm import tqdm

# Root folder of the input dataset; file names are appended to it directly.
DIR = '/kaggle/input/aaa-project-search/'

# Run torch models on the first CUDA GPU when one is visible, otherwise on the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(device)

class LossLogger(CallbackAny2Vec):
    """Training callback that prints a loss figure at the end of every epoch.

    For the first epoch the value of ``model.get_latest_training_loss()`` is
    printed as is; for later epochs the printed figure is the difference
    between the current reading and the reading stored at the previous epoch
    (presumably because the reported loss accumulates across epochs).
    """

    def __init__(self):
        # Zero-based index of the epoch that is about to finish.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss for the epoch that just ended and remember the reading."""
        loss = model.get_latest_training_loss()
        # `loss_previous_step` only exists after the first call, so it is
        # read exclusively on later epochs.
        shown = loss if self.epoch == 0 else loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, shown))
        self.loss_previous_step = loss
        self.epoch += 1

class EpochLogger(CallbackAny2Vec):
    """Training callback that prints the zero-based index of each finished epoch."""

    def __init__(self):
        # Number of epochs completed so far.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Announce the epoch that just ended, then advance the counter."""
        print(f'Epoch {self.epoch}')
        self.epoch = self.epoch + 1

cpu


In [3]:
class Word2VecTransformer:
    """Embed a text as the average of the Word2Vec vectors of its known tokens.

    Parameters
    ----------
    w2v_model
        Object exposing ``wv.vector_size``, ``wv.key_to_index`` and
        ``wv.get_vector(token)``.
    word_pattern
        Regular expression used with ``findall`` to split lower-cased text
        into tokens.
    """

    def __init__(self, w2v_model, word_pattern):
        self.w2v_model = w2v_model
        self.word_pattern = word_pattern
        self.re = re.compile(pattern=self.word_pattern)

    def fit(self, X):
        """Do nothing and return ``self``; the wrapped model is used as given."""
        return self

    def predict(self, text):
        """Return the embedding of a single string.

        The text is lower-cased and tokenised; vectors of the tokens present
        in the vocabulary are summed and divided by their count. When no token
        is in the vocabulary the all-zero vector is returned.
        """
        keyed_vectors = self.w2v_model.wv
        total = np.zeros((keyed_vectors.vector_size,))
        known = 0
        for word in self.re.findall(text.lower()):
            if word not in keyed_vectors.key_to_index:
                continue
            total += keyed_vectors.get_vector(word)
            known += 1

        if known == 0:
            return total
        return total / known

    def transform(self, X):
        """Embed every text in ``X`` and return one row per text."""
        rows = np.zeros((len(X), self.w2v_model.wv.vector_size))
        for position, title in enumerate(X):
            rows[position] = self.predict(title)
        return rows

In [4]:
# Load the relevance dataset, discard the query-side categorical ids that are
# not used below, and store both identifiers as strings.
df = (
    pd.read_hdf(DIR + 'search_relevance_dataset_v1.hdf', 'table')
    .drop(columns=['query_category_id', 'query_microcat_id', 'query_location_id'])
    .astype({'query_id': str, 'item_id': str})
)

df.head(3)

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target
0,274025,2108 ссср,964140459,Советские бутыли канистры 60-80-х СССР ваз газ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0
1,274025,2108 ссср,990433426,Ваз 2108 СССР цвет салатовый 1/43 идеальный,красивый салатовый цвет\nвсе детали в наличии\...,модель,1
2,274025,2108 ссср,994402610,Модели советских машин ваз 2102 почта М 1/43 №10,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1


In [5]:
# Qdrant instance that lives inside this process; nothing is persisted.
client = QdrantClient(":memory:")


def index_dataset(client, df, model, vector_size, collection_name='collection'):
    """(Re)create a collection and fill it with one embedded point per unique item.

    Parameters
    ----------
    client
        Qdrant client to write to.
    df
        Frame with ``item_id``, ``title``, ``description`` and ``keywords``
        columns; only the first row of every ``item_id`` is indexed.
    model
        Callable mapping a title string to an iterable of numbers; it is
        called once per indexed row.
    vector_size
        Dimensionality declared for the collection.
    collection_name
        Name of the collection; an existing one with that name is recreated.

    Returns
    -------
    The value returned by ``client.upsert``.
    """
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.DOT),
    )

    unique_items = df.drop_duplicates(subset=['item_id'])

    points = [
        PointStruct(
            id=item.Index,
            # Cast every component to a plain Python float before handing it to Qdrant.
            vector=[float(component) for component in model(item.title)],
            payload={
                'title': item.title,
                'description': item.description,
                'keywords': item.keywords,
                'item_id': item.item_id,
            },
        )
        for item in tqdm(unique_items.itertuples(), total=len(unique_items))
    ]

    return client.upsert(collection_name=collection_name, wait=True, points=points)

In [6]:
# Split the rows into two halves such that no query_id appears in both.
kf = GroupKFold(n_splits=2)

# Only the first of the two splits is needed here, so take it directly.
train_indices, test_indices = next(iter(kf.split(X=df, groups=df.query_id)))

# The splitter yields positional row numbers, so rows must be selected with
# .iloc; .loc would only be correct while the index happens to be 0..n-1.
df_train, df_test = df.iloc[train_indices], df.iloc[test_indices]
print('-' * 80)
print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
      'Valid:', df_test.shape, df_test.query_id.nunique(), '  ',
      'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
print()

# Relevance judgements and distinct queries of the first half; these are the
# ones the hyper-parameter search (`objective`) scores against.
qrels_valid = Qrels.from_df(df_train, q_id_col='query_id', doc_id_col='item_id', score_col='target')
valid_examples = df_train.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

# The same two objects for the other half.
qrels_test = Qrels.from_df(df_test, q_id_col='query_id', doc_id_col='item_id', score_col='target')
test_examples = df_test.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

--------------------------------------------------------------------------------
Train: (80714, 7) 6172    Valid: (80715, 7) 6173    Intersection: set()



In [7]:
# Token pattern: runs of two or more word characters.
WORD_PATTERN = '(?u)\\b\\w\\w+\\b'
reg_exp = re.compile(pattern=WORD_PATTERN)

# Word2Vec corpus: every distinct item title, lower-cased and tokenised.
w2v_data = df.title.drop_duplicates()
sentences = [reg_exp.findall(title.lower()) for title in w2v_data]
print(sentences[:2])

def objective(trial):
    """Optuna objective: train Word2Vec with sampled settings and score retrieval.

    A Word2Vec model is trained on the global ``sentences`` with the sampled
    hyper-parameters, all items of ``df`` are indexed with the averaged title
    embeddings, every query in ``valid_examples`` is searched (top 50), and
    the resulting run is scored against ``qrels_valid``.

    Parameters
    ----------
    trial
        Optuna trial used to sample ``sg``, ``vector_size``, ``window``,
        ``min_count`` and ``epochs``.

    Returns
    -------
    The ``map@50`` value computed by ``evaluate``.
    """
    # `step` is passed by keyword: newer Optuna releases no longer accept it positionally.
    param = {
        'sg': trial.suggest_categorical('sg', [0, 1]),
        'vector_size': trial.suggest_int('vector_size', 8, 32, step=8),
        'window': trial.suggest_int('window', 3, 10, step=1),
        'min_count': trial.suggest_int('min_count', 3, 11, step=2),
        'epochs': trial.suggest_int('epochs', 5, 25, step=5),
    }

    w2v_model = Word2Vec(
        sg=param['sg'], vector_size=param['vector_size'], window=param['window'], min_count=param['min_count']
    )
    w2v_model.build_vocab(sentences)
    w2v_model.train(
        sentences, total_examples=w2v_model.corpus_count, epochs=param['epochs'],
    )

    w2v_transformer = Word2VecTransformer(w2v_model=w2v_model, word_pattern=WORD_PATTERN)
    word2vec_model = w2v_transformer.predict

    collection_name = 'train_collection'
    index_dataset(client, df, word2vec_model, vector_size=param['vector_size'], collection_name=collection_name)

    run_dict = {}
    try:
        for row in tqdm(valid_examples.itertuples(), total=len(valid_examples)):
            search_result = client.search(
                collection_name=collection_name,
                query_vector=list(map(float, word2vec_model(row.query_text))),
                limit=50,
            )
            run_dict[row.query_id] = {hit.payload['item_id']: hit.score for hit in search_result}
    finally:
        # Drop the collection even when the search loop is interrupted, so a
        # failed trial does not leave its vectors behind in the client.
        client.delete_collection(collection_name=collection_name)

    return evaluate(qrels_valid, Run(run_dict), 'map@50')

# Maximise the objective; stop after 50 trials or 8 hours, whichever comes first.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, timeout=60*60*8, n_jobs=1)

print(f'Number of finished trials: {len(study.trials)}')

print('Best trial:')
# `trial` is read again by later cells to rebuild the best model.
trial = study.best_trial

print(f'  Value: {trial.value}')

print('  Params:')
for name, chosen in trial.params.items():
    print(f'    {name}: {chosen}')

[32m[I 2023-05-13 21:01:25,495][0m A new study created in memory with name: no-name-0264323b-34f9-4396-9bc2-ce37cbbc57e8[0m


[['советские', 'бутыли', 'канистры', '60', '80', 'ссср', 'ваз', 'газ', 'заз'], ['ваз', '2108', 'ссср', 'цвет', 'салатовый', '43', 'идеальный']]


100%|██████████| 150268/150268 [00:20<00:00, 7410.91it/s]
100%|██████████| 6172/6172 [03:48<00:00, 27.06it/s]
[32m[I 2023-05-13 21:06:21,327][0m Trial 0 finished with value: 0.0006679801054824195 and parameters: {'sg': 0, 'vector_size': 8, 'window': 4, 'min_count': 7, 'epochs': 10}. Best is trial 0 with value: 0.0006679801054824195.[0m
100%|██████████| 150268/150268 [00:27<00:00, 5508.68it/s]
100%|██████████| 6172/6172 [03:56<00:00, 26.06it/s]
[32m[I 2023-05-13 21:11:06,254][0m Trial 1 finished with value: 0.0028442298857403662 and parameters: {'sg': 1, 'vector_size': 16, 'window': 8, 'min_count': 11, 'epochs': 10}. Best is trial 1 with value: 0.0028442298857403662.[0m
100%|██████████| 150268/150268 [00:19<00:00, 7536.09it/s]
100%|██████████| 6172/6172 [03:40<00:00, 27.98it/s]
[32m[I 2023-05-13 21:15:29,176][0m Trial 2 finished with value: 0.0009635233049614884 and parameters: {'sg': 1, 'vector_size': 8, 'window': 3, 'min_count': 5, 'epochs': 15}. Best is trial 1 with value: 0.

Number of finished trials: 50
Best trial:
  Value: 0.015342042724173412
  Params:
    sg: 1
    vector_size: 32
    window: 10
    min_count: 9
    epochs: 25


In [8]:
# Token pattern: runs of two or more word characters.
WORD_PATTERN = '(?u)\\b\\w\\w+\\b'
reg_exp = re.compile(pattern=WORD_PATTERN)

# Word2Vec corpus: every distinct item title, lower-cased and tokenised.
w2v_data = df.title.drop_duplicates()
sentences = [reg_exp.findall(title.lower()) for title in w2v_data]
print(sentences[:2])

# Train the final model with the hyper-parameters of the best Optuna trial,
# printing a loss figure after every epoch.
best_params = trial.params
w2v_model = Word2Vec(
    sg=best_params['sg'],
    vector_size=best_params['vector_size'],
    window=best_params['window'],
    min_count=best_params['min_count'],
)
w2v_model.build_vocab(sentences)
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=best_params['epochs'],
    compute_loss=True,
    callbacks=[LossLogger()],
)

w2v_transformer = Word2VecTransformer(w2v_model=w2v_model, word_pattern=WORD_PATTERN)


def word2vec_model(text):
    """Embed one string with the trained Word2Vec transformer."""
    return w2v_transformer.predict(text)

[['советские', 'бутыли', 'канистры', '60', '80', 'ссср', 'ваз', 'газ', 'заз'], ['ваз', '2108', 'ссср', 'цвет', 'салатовый', '43', 'идеальный']]
Loss after epoch 0: 1162408.125
Loss after epoch 1: 809816.875
Loss after epoch 2: 728367.5
Loss after epoch 3: 593518.5
Loss after epoch 4: 586281.75
Loss after epoch 5: 516641.25
Loss after epoch 6: 522304.5
Loss after epoch 7: 508797.0
Loss after epoch 8: 496806.0
Loss after epoch 9: 502462.5
Loss after epoch 10: 544772.5
Loss after epoch 11: 440718.5
Loss after epoch 12: 496243.5
Loss after epoch 13: 465582.0
Loss after epoch 14: 443863.5
Loss after epoch 15: 423317.0
Loss after epoch 16: 405617.0
Loss after epoch 17: 404013.0
Loss after epoch 18: 396282.0
Loss after epoch 19: 463763.0
Loss after epoch 20: 417595.0
Loss after epoch 21: 417684.0
Loss after epoch 22: 404111.0
Loss after epoch 23: 425373.0
Loss after epoch 24: 408415.0


In [9]:
# Random baseline: ignores the text and draws a fresh 32-dimensional
# standard-normal vector on every call.
def random_model(text):
    return np.random.randn(32)


# Baseline 1: pretrained rubert-tiny2 tokenizer and encoder, with the encoder
# moved to the selected device.
BERT_CHECKPOINT = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModel.from_pretrained(BERT_CHECKPOINT).to(device)

def embed_bert_cls(text, model, tokenizer):
    """Return the normalised first-token embedding of ``text`` as a NumPy array.

    Parameters
    ----------
    text
        Input passed to ``tokenizer`` (with padding and truncation enabled).
    model
        Callable accepting the tokenizer's tensors as keyword arguments and
        returning an object with ``last_hidden_state``; must expose ``device``.
    tokenizer
        Callable producing a mapping of PyTorch tensors.

    Returns
    -------
    1-D array: the hidden state at sequence position 0 of the first batch
    element, after ``torch.nn.functional.normalize`` with default arguments.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # Put every input tensor on the same device as the model.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
    first_token = torch.nn.functional.normalize(hidden_states[:, 0, :])
    return first_token[0].cpu().numpy()

# Text -> embedding callable with the BERT encoder and tokenizer already bound.
baseline_model_1 = partial(
    embed_bert_cls,
    tokenizer=tokenizer,
    model=model,
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [10]:
# Smoke check: embed one word with each encoder and display the vector shapes.
w2v_shape = word2vec_model('iphone').shape
bert_shape = baseline_model_1('hello').shape
w2v_shape, bert_shape

((32,), (312,))

In [11]:
%%time

kf = GroupKFold(n_splits=2)

# (encoder, embedding size) pairs to compare; the order fixes the run order in the report.
encoders = [(random_model, 32), (baseline_model_1, 312), (word2vec_model, trial.params['vector_size'])]

for train_indices, test_indices in kf.split(X=df, groups=df.query_id):
    # The splitter yields positional row numbers, so rows must be selected with
    # .iloc; .loc would only be correct while the index happens to be 0..n-1.
    df_train, df_test = df.iloc[train_indices], df.iloc[test_indices]
    print('-' * 80)
    print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
          'Test:', df_test.shape, df_test.query_id.nunique(), '  ',
          'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
    print()

    # Judgements and distinct queries depend only on the fold, not on the
    # encoder, so build them once per fold.
    qrels = Qrels.from_df(df_test, q_id_col='query_id', doc_id_col='item_id', score_col='target')
    test_examples = df_test.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

    runs = []
    # The loop variable is deliberately not called `model`, so the global BERT
    # `model` object is not overwritten.
    for j, (encoder, vs) in enumerate(encoders):
        collection_name = f'train_collection_{j}'

        index_dataset(client, df, encoder, vector_size=vs, collection_name=collection_name)

        run_dict = {}
        for row in tqdm(test_examples.itertuples(), total=len(test_examples)):
            search_result = client.search(
                collection_name=collection_name,
                query_vector=list(map(float, encoder(row.query_text))),
                limit=50,
            )
            run_dict[row.query_id] = {hit.payload['item_id']: hit.score for hit in search_result}

        runs.append(Run(run_dict))

        client.delete_collection(collection_name=collection_name)

    report = compare(
        qrels=qrels, runs=runs,
        metrics=['map@10', 'map@50', 'ndcg@10', 'ndcg@50'],
        n_permutations=1000, stat_test='student', max_p=0.01,
    )
    print(); print(report); print()

--------------------------------------------------------------------------------
Train: (80714, 7) 6172    Test: (80715, 7) 6173    Intersection: set()



100%|██████████| 150268/150268 [00:37<00:00, 3959.41it/s]
100%|██████████| 6173/6173 [04:59<00:00, 20.62it/s]
100%|██████████| 150268/150268 [20:16<00:00, 123.54it/s]
100%|██████████| 6173/6173 [21:03<00:00,  4.88it/s]
100%|██████████| 150268/150268 [00:41<00:00, 3650.80it/s]
100%|██████████| 6173/6173 [04:39<00:00, 22.05it/s]



#    Model    MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1    0.000     0.000     0.000      0.000
b    run_2    0.045ᵃᶜ   0.053ᵃᶜ   0.091ᵃᶜ    0.094ᵃᶜ
c    run_3    0.010ᵃ    0.015ᵃ    0.027ᵃ     0.040ᵃ

--------------------------------------------------------------------------------
Train: (80715, 7) 6173    Test: (80714, 7) 6172    Intersection: set()



100%|██████████| 150268/150268 [00:37<00:00, 4047.83it/s]
100%|██████████| 6172/6172 [05:04<00:00, 20.30it/s]
100%|██████████| 150268/150268 [19:55<00:00, 125.71it/s]
100%|██████████| 6172/6172 [21:40<00:00,  4.74it/s]
100%|██████████| 150268/150268 [00:40<00:00, 3733.17it/s]
100%|██████████| 6172/6172 [04:33<00:00, 22.58it/s]



#    Model    MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1    0.000     0.000     0.000      0.000
b    run_2    0.048ᵃᶜ   0.056ᵃᶜ   0.091ᵃᶜ    0.097ᵃᶜ
c    run_3    0.010ᵃ    0.015ᵃ    0.027ᵃ     0.039ᵃ

CPU times: user 3h 14min 8s, sys: 1h 27min 58s, total: 4h 42min 7s
Wall time: 1h 46min 34s
