In [1]:
!pip install -q numpy pandas tqdm tensorflow torch transformers qdrant-client ranx

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dask-cudf 21.12.2 requires cupy-cuda115, which is not installed.
cudf 21.12.2 requires cupy-cuda115, which is not installed.
xarray-einstats 0.2.2 requires numpy>=1.21, but you have numpy 1.20.3 which is incompatible.
tfx-bsl 1.12.0 requires google-api-python-client<2,>=1.7.11, but you have google-api-python-client 2.79.0 which is incompatible.
tfx-bsl 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
tensorflow-transform 1.12.0 requires pyarrow<7,>=6, but you have pyarrow 5.0.0 which is incompatible.
onnx 1.13.1 requires protobuf<4,>=3.20.2, but you have protobuf 3.19.6 which is incompatible.
librosa 0.10.0 requires soundfile>=0.12.1, but you have soundfile 0.11.0 which is incompatible.
featuretools 1.11.1 requires numpy>=1.21.0, but you have numpy 1.20.3 which is inco

In [2]:
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, AutoModel
from transformers import logging
logging.set_verbosity_error()

from functools import partial

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from qdrant_client.http.models import CollectionStatus

from sklearn.model_selection import GroupKFold
from ranx import Qrels, Run, evaluate, compare

from tqdm import tqdm

# Run the encoders on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

# Root folder of the input data (absolute Kaggle path). File names are appended
# with `+`, so the trailing '/' is required.
DIR = '/kaggle/input/aaa-project-search/'

cuda:0


In [3]:
# Load the relevance dataset, drop the query metadata columns that are not used
# in this notebook, and store both identifiers as strings.
df = (
    pd.read_hdf(DIR + 'search_relevance_dataset_v1.hdf', 'table')
    .drop(columns=['query_category_id', 'query_microcat_id', 'query_location_id'])
    .astype({'query_id': str, 'item_id': str})
)

df.head(3)

Unnamed: 0,query_id,query_text,item_id,title,description,keywords,target
0,274025,2108 ссср,964140459,Советские бутыли канистры 60-80-х СССР ваз газ...,Для ценителей и понимающих.\n\nПодробные фотог...,стопсфинксstop,0
1,274025,2108 ссср,990433426,Ваз 2108 СССР цвет салатовый 1/43 идеальный,красивый салатовый цвет\nвсе детали в наличии\...,модель,1
2,274025,2108 ссср,994402610,Модели советских машин ваз 2102 почта М 1/43 №10,Продается модель автомобиля ВАЗ 2102 почта . ...,модель,1


In [4]:
# random model
def random_model(text):
    """Return a fresh 32-dimensional standard-normal vector; `text` is ignored.

    Serves as the chance-level reference system in the comparisons.
    """
    return np.random.randn(32)

# baseline model 1: tokenizer and encoder are loaded from the same checkpoint,
# and the encoder is moved to the selected device.
RUBERT_TINY2_CHECKPOINT = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(RUBERT_TINY2_CHECKPOINT)
model = AutoModel.from_pretrained(RUBERT_TINY2_CHECKPOINT).to(device)

def embed_bert_cls(text, model, tokenizer):
    """Embed `text` with the hidden state of the first token of `model`.

    Args:
        text: input passed straight to `tokenizer` (padding and truncation enabled).
        model: callable accepting the tokenizer output as keyword arguments and
            returning an object with `last_hidden_state`; must expose `.device`.
        tokenizer: callable returning a mapping of PyTorch tensors.

    Returns:
        1-D numpy array: the L2-normalised first-token vector of the first
        sequence in the batch.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # The inputs have to live on the same device as the model weights.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():  # inference only, no autograd graph needed
        hidden_states = model(**inputs).last_hidden_state
    first_token = torch.nn.functional.normalize(hidden_states[:, 0, :], dim=1)
    # Only the first row is returned, even if several texts were tokenized.
    return first_token[0].cpu().numpy()

# Bind the rubert-tiny2 model/tokenizer now: `baseline_model_1(text)` keeps using these
# objects even after the names `model` and `tokenizer` are reassigned further down.
baseline_model_1 = partial(embed_bert_cls, model=model, tokenizer=tokenizer)

# baseline model 2: tokenizer and encoder are loaded from the same checkpoint,
# and the encoder is moved to the selected device.
LABSE_EN_RU_CHECKPOINT = 'cointegrated/LaBSE-en-ru'
tokenizer = AutoTokenizer.from_pretrained(LABSE_EN_RU_CHECKPOINT)
model = AutoModel.from_pretrained(LABSE_EN_RU_CHECKPOINT).to(device)

def embed_labse_cls(text, model, tokenizer):
    """Embed `text` with the pooled output (`pooler_output`) of `model`.

    Args:
        text: input passed straight to `tokenizer` (padding and truncation enabled).
        model: callable accepting the tokenizer output as keyword arguments and
            returning an object with `pooler_output`; must expose `.device`.
        tokenizer: callable returning a mapping of PyTorch tensors.

    Returns:
        1-D numpy array: the L2-normalised pooled vector of the first sequence
        in the batch.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    # The inputs have to live on the same device as the model weights.
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    with torch.no_grad():  # inference only, no autograd graph needed
        pooled = model(**inputs).pooler_output
    pooled = torch.nn.functional.normalize(pooled, dim=1)
    # Only the first row is returned, even if several texts were tokenized.
    return pooled[0].cpu().numpy()

# Bind the LaBSE model/tokenizer: `baseline_model_2(text)` returns one embedding
# vector for a single input string.
baseline_model_2 = partial(embed_labse_cls, model=model, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/118M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/516M [00:00<?, ?B/s]

In [5]:
# Sanity check: embedding size of each baseline encoder. These are the values
# passed as `vector_size` when the matching collections are created.
baseline_model_1('hello').shape, baseline_model_2('hello').shape

((312,), (768,))

In [6]:
# Single Qdrant client shared by all experiments below.
# NOTE(review): ":memory:" presumably selects qdrant-client's in-process,
# non-persistent mode — confirm against the library documentation.
client = QdrantClient(":memory:")

In [7]:
def index_dataset(client, df, model, vector_size, collection_name='collection', text_columns=('title',)):
    """(Re)create a collection and fill it with one embedded point per unique item.

    Args:
        client: vector-store client providing `recreate_collection` and `upsert`.
        df: DataFrame with the columns `item_id`, `title`, `description` and
            `keywords` (plus every column named in `text_columns`). Rows are
            de-duplicated on `item_id`, keeping the first occurrence.
        model: callable mapping one text value to a 1-D numeric vector of
            length `vector_size`.
        vector_size: dimensionality declared for the collection.
        collection_name: name of the collection; it is recreated on every call.
        text_columns: column name, or sequence of column names, whose embeddings
            are summed to form the item vector. The default `('title',)` embeds
            the title only. Names must be valid Python identifiers because the
            values are read from `itertuples()` rows.

    Returns:
        Whatever `client.upsert` returns for the single bulk upsert.

    Raises:
        ValueError: if `text_columns` is empty.
    """
    if isinstance(text_columns, str):
        text_columns = (text_columns,)
    text_columns = tuple(text_columns)
    if not text_columns:
        raise ValueError('text_columns must name at least one column')

    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.DOT),
    )

    unique_items = df.drop_duplicates(subset=['item_id'])

    def embed_row(row):
        # Sum the per-column embeddings in the order given by `text_columns`.
        first_column, *other_columns = text_columns
        vector = model(getattr(row, first_column))
        for column in other_columns:
            vector = vector + model(getattr(row, column))
        # Convert numpy scalars to plain Python floats for the point payload.
        return list(map(float, vector))

    points = [
        PointStruct(
            id=row.Index,  # DataFrame index label doubles as the point id
            vector=embed_row(row),
            payload={
                'title': row.title, 'description': row.description, 'keywords': row.keywords, 'item_id': row.item_id
            },
        )
        for row in tqdm(unique_items.itertuples(), total=len(unique_items))
    ]

    return client.upsert(collection_name=collection_name, wait=True, points=points)

In [8]:
%%time

# Two-fold evaluation. Splitting by `query_id` keeps every query entirely inside
# one fold (the printed intersection of query ids should be empty).
kf = GroupKFold(n_splits=2)

# (report label, encoder callable, embedding size) for every system compared.
candidates = [
    ('random', random_model, 32),
    ('rubert-tiny2', baseline_model_1, 312),
    ('LaBSE-en-ru', baseline_model_2, 768),
]

for train_indices, test_indices in kf.split(X=df, groups=df.query_id):
    # `split` yields positional row numbers, so use `.iloc`; `.loc` would only
    # be equivalent while `df` carries a default 0..n-1 index.
    df_train, df_test = df.iloc[train_indices], df.iloc[test_indices]
    print('-' * 80)
    print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
          'Test:', df_test.shape, df_test.query_id.nunique(), '  ',
          'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
    print()

    # Relevance judgements and the distinct test queries depend only on the
    # fold, not on the encoder, so build them once per fold.
    qrels = Qrels.from_df(df_test, q_id_col='query_id', doc_id_col='item_id', score_col='target')
    test_examples = df_test.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

    runs = []
    # The loop variable is deliberately not called `model`, so the notebook-level
    # `model` object is not overwritten by this cell.
    for j, (run_name, encoder, vs) in enumerate(candidates):
        collection_name = f'train_collection_{j}'

        # The searchable corpus is built from the full `df` (all unique items),
        # not only from the rows of the current fold.
        _ = index_dataset(client, df, encoder, vector_size=vs, collection_name=collection_name)

        run_dict = {}
        for row in tqdm(test_examples.itertuples(), total=len(test_examples)):
            search_result = client.search(
                collection_name=collection_name, query_vector=list(map(float, encoder(row.query_text))), limit=50
            )
            run_dict[row.query_id] = {i.payload['item_id']: i.score for i in search_result}

        # Naming the run makes the report rows readable instead of run_1, run_2, ...
        runs.append(Run(run_dict, name=run_name))

        # Free the collection before the next encoder is indexed.
        client.delete_collection(collection_name=collection_name)

    report = compare(
        qrels=qrels, runs=runs,
        metrics=['map@10', 'map@50', 'ndcg@10', 'ndcg@50'],
        n_permutations=1000, stat_test='student', max_p=0.01,
    )
    print(); print(report); print()

--------------------------------------------------------------------------------
Train: (80714, 7) 6172    Test: (80715, 7) 6173    Intersection: set()



100%|██████████| 150268/150268 [00:21<00:00, 6965.81it/s]
100%|██████████| 6173/6173 [04:14<00:00, 24.24it/s]
100%|██████████| 150268/150268 [09:42<00:00, 258.01it/s]
100%|██████████| 6173/6173 [21:31<00:00,  4.78it/s]
100%|██████████| 150268/150268 [28:19<00:00, 88.43it/s]
100%|██████████| 6173/6173 [42:54<00:00,  2.40it/s]



#    Model    MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1    0.000     0.000     0.000      0.000
b    run_2    0.045ᵃ    0.053ᵃ    0.091ᵃ     0.094ᵃ
c    run_3    0.084ᵃᵇ   0.103ᵃᵇ   0.150ᵃᵇ    0.170ᵃᵇ

--------------------------------------------------------------------------------
Train: (80715, 7) 6173    Test: (80714, 7) 6172    Intersection: set()



100%|██████████| 150268/150268 [00:21<00:00, 6941.89it/s]
100%|██████████| 6172/6172 [04:39<00:00, 22.09it/s]
100%|██████████| 150268/150268 [10:00<00:00, 250.35it/s]
100%|██████████| 6172/6172 [14:45<00:00,  6.97it/s]
100%|██████████| 150268/150268 [26:41<00:00, 93.81it/s]
100%|██████████| 6172/6172 [43:09<00:00,  2.38it/s]



#    Model    MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1    0.000     0.000     0.000      0.000
b    run_2    0.048ᵃ    0.056ᵃ    0.091ᵃ     0.097ᵃ
c    run_3    0.088ᵃᵇ   0.106ᵃᵇ   0.150ᵃᵇ    0.171ᵃᵇ

CPU times: user 3h 16min 56s, sys: 1h 19min 54s, total: 4h 36min 51s
Wall time: 3h 29min 4s


In [9]:
def index_dataset(client, df, model, vector_size, collection_name='collection',
                  text_columns=('title', 'description')):
    """(Re)create a collection and fill it with one embedded point per unique item.

    From this cell on, this definition replaces the earlier title-only
    `index_dataset`. By default the item vector is the sum of the title and
    description embeddings; pass `text_columns=('title',)` to get the
    title-only behaviour without redefining the function.

    Args:
        client: vector-store client providing `recreate_collection` and `upsert`.
        df: DataFrame with the columns `item_id`, `title`, `description` and
            `keywords` (plus every column named in `text_columns`). Rows are
            de-duplicated on `item_id`, keeping the first occurrence.
        model: callable mapping one text value to a 1-D numeric vector of
            length `vector_size`.
        vector_size: dimensionality declared for the collection.
        collection_name: name of the collection; it is recreated on every call.
        text_columns: column name, or sequence of column names, whose embeddings
            are summed (in the given order) to form the item vector. The sum is
            stored as is, without re-normalisation. Names must be valid Python
            identifiers because the values are read from `itertuples()` rows.

    Returns:
        Whatever `client.upsert` returns for the single bulk upsert.

    Raises:
        ValueError: if `text_columns` is empty.
    """
    if isinstance(text_columns, str):
        text_columns = (text_columns,)
    text_columns = tuple(text_columns)
    if not text_columns:
        raise ValueError('text_columns must name at least one column')

    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.DOT),
    )

    unique_items = df.drop_duplicates(subset=['item_id'])

    def embed_row(row):
        # Sum the per-column embeddings in the order given by `text_columns`.
        first_column, *other_columns = text_columns
        vector = model(getattr(row, first_column))
        for column in other_columns:
            vector = vector + model(getattr(row, column))
        # Convert numpy scalars to plain Python floats for the point payload.
        return list(map(float, vector))

    points = [
        PointStruct(
            id=row.Index,  # DataFrame index label doubles as the point id
            vector=embed_row(row),
            payload={
                'title': row.title, 'description': row.description, 'keywords': row.keywords, 'item_id': row.item_id
            },
        )
        for row in tqdm(unique_items.itertuples(), total=len(unique_items))
    ]

    return client.upsert(collection_name=collection_name, wait=True, points=points)

In [10]:
%%time

# Same two-fold evaluation as before, now run with the `index_dataset` that is
# in effect at this point of the notebook (title + description item vectors).
# Splitting by `query_id` keeps every query entirely inside one fold.
kf = GroupKFold(n_splits=2)

# (report label, encoder callable, embedding size) for every system compared.
candidates = [
    ('random', random_model, 32),
    ('rubert-tiny2', baseline_model_1, 312),
    ('LaBSE-en-ru', baseline_model_2, 768),
]

for train_indices, test_indices in kf.split(X=df, groups=df.query_id):
    # `split` yields positional row numbers, so use `.iloc`; `.loc` would only
    # be equivalent while `df` carries a default 0..n-1 index.
    df_train, df_test = df.iloc[train_indices], df.iloc[test_indices]
    print('-' * 80)
    print('Train:', df_train.shape, df_train.query_id.nunique(), '  ',
          'Test:', df_test.shape, df_test.query_id.nunique(), '  ',
          'Intersection:', set(df_train.query_id).intersection(set(df_test.query_id)))
    print()

    # Relevance judgements and the distinct test queries depend only on the
    # fold, not on the encoder, so build them once per fold.
    qrels = Qrels.from_df(df_test, q_id_col='query_id', doc_id_col='item_id', score_col='target')
    test_examples = df_test.drop_duplicates(subset=['query_id', 'query_text']).loc[:, ['query_id', 'query_text']]

    runs = []
    # The loop variable is deliberately not called `model`, so the notebook-level
    # `model` object is not overwritten by this cell.
    for j, (run_name, encoder, vs) in enumerate(candidates):
        collection_name = f'train_collection_{j}'

        # The searchable corpus is built from the full `df` (all unique items),
        # not only from the rows of the current fold.
        _ = index_dataset(client, df, encoder, vector_size=vs, collection_name=collection_name)

        run_dict = {}
        for row in tqdm(test_examples.itertuples(), total=len(test_examples)):
            search_result = client.search(
                collection_name=collection_name, query_vector=list(map(float, encoder(row.query_text))), limit=50
            )
            run_dict[row.query_id] = {i.payload['item_id']: i.score for i in search_result}

        # Naming the run makes the report rows readable instead of run_1, run_2, ...
        runs.append(Run(run_dict, name=run_name))

        # Free the collection before the next encoder is indexed.
        client.delete_collection(collection_name=collection_name)

    report = compare(
        qrels=qrels, runs=runs,
        metrics=['map@10', 'map@50', 'ndcg@10', 'ndcg@50'],
        n_permutations=1000, stat_test='student', max_p=0.01,
    )
    print(); print(report); print()

--------------------------------------------------------------------------------
Train: (80714, 7) 6172    Test: (80715, 7) 6173    Intersection: set()



100%|██████████| 150268/150268 [00:25<00:00, 5974.75it/s]
100%|██████████| 6173/6173 [04:56<00:00, 20.79it/s]
100%|██████████| 150268/150268 [24:09<00:00, 103.63it/s]
100%|██████████| 6173/6173 [15:50<00:00,  6.49it/s]
100%|██████████| 150268/150268 [1:08:12<00:00, 36.72it/s]
100%|██████████| 6173/6173 [43:47<00:00,  2.35it/s]



#    Model    MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1    0.000     0.000     0.000      0.000
b    run_2    0.012ᵃ    0.015ᵃ    0.032ᵃ     0.034ᵃ
c    run_3    0.070ᵃᵇ   0.089ᵃᵇ   0.134ᵃᵇ    0.158ᵃᵇ

--------------------------------------------------------------------------------
Train: (80715, 7) 6173    Test: (80714, 7) 6172    Intersection: set()



100%|██████████| 150268/150268 [00:23<00:00, 6279.50it/s]
100%|██████████| 6172/6172 [04:59<00:00, 20.60it/s]
100%|██████████| 150268/150268 [24:02<00:00, 104.16it/s]
100%|██████████| 6172/6172 [22:36<00:00,  4.55it/s]
100%|██████████| 150268/150268 [1:08:00<00:00, 36.82it/s]
100%|██████████| 6172/6172 [43:24<00:00,  2.37it/s]



#    Model    MAP@10    MAP@50    NDCG@10    NDCG@50
---  -------  --------  --------  ---------  ---------
a    run_1    0.000     0.000     0.000      0.000
b    run_2    0.013ᵃ    0.016ᵃ    0.032ᵃ     0.035ᵃ
c    run_3    0.071ᵃᵇ   0.089ᵃᵇ   0.132ᵃᵇ    0.158ᵃᵇ

CPU times: user 5h 8min 16s, sys: 1h 21min 28s, total: 6h 29min 44s
Wall time: 5h 22min 32s
