In [2]:
import os
import sys
sys.path.insert(0, '../')
import time
import json
import pandas as pd

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

In [3]:
def read_df(pth_models, pth_offers):
    df_models = pd.read_csv(pth_models, sep=";")
    df_models = df_models.drop(columns=['average_price', 'comment'])
    df_offers = pd.read_csv(pth_offers, sep=";")
    df_offers = df_offers.drop(columns=['true_match', 'false_match'])

    return df_models, df_offers

def small_df(df, thresh_upper=10):
    ans = [y for x, y in df.groupby('category_id')]
    for i in range(len(ans)):
        #ans[i] = ans[i].drop_duplicates(subset='name', keep="last")
        ans[i] = ans[i].sample(frac=1).sample(frac=1)[:thresh_upper]
    df_new = pd.concat(ans, axis=0)
    
    return df_new

def search(checkpoint, offers, models, nbits, doc_maxlen, tmp_fld):
    index_name = f'models.18_categories.{nbits}bits'

    offers = Queries(path=offers)
    models = Collection(path=models)
    with open(f"{tmp_fld}/logs.txt", "a") as txt:
        txt.write(f"Loaded {len(offers)} queries and {len(models):,} passages")

    start_time = time.time()
    with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use.
        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits)

        indexer = Indexer(checkpoint=checkpoint, config=config)
        indexer.index(name=index_name, collection=models, overwrite=True)
    indexer.get_index() # You can get the absolute path of the index, if needed.

    with Run().context(RunConfig(experiment='notebook')):
        searcher = Searcher(index=index_name)
    with open(f"{tmp_fld}/logs.txt", "a") as txt:
        txt.write(f"Подготовка моделей категории: time_spent = {time.time() - start_time}\n")
    print(f"Подготовка моделей категории: time_spent = {time.time() - start_time}\n")

    start_time = time.time()
    rankings = searcher.search_all(offers, k=5).todict()
    with open(f"{tmp_fld}/logs.txt", "a") as txt:
        txt.write(f"Инференс на всех офферах категории: time_spent = {time.time() - start_time}\n")
    print(f"Инференс на всех офферах категории: time_spent = {time.time() - start_time}\n\n\n")

    return rankings

def ranking_index(rankings, category_rankings, df, index_of_first):
    """
    сделать сдвиг passage_id на величину index_of_first для приведения к формату в котором поиск оффера ведется среди моделей всех категорий 
    упорядочить (passage_id, rank, score) в rankings согласно изначальным индексам в df_offers
    """
    assert len(category_rankings) == len(df)

    for i in category_rankings:
        for j in range(len(category_rankings[i])):
            category_rankings[i][j] = (category_rankings[i][j][0] + index_of_first, category_rankings[i][j][1], category_rankings[i][j][2])

        for k in range(5 - len(category_rankings[i])): # все предикты top-k делаем длиной 5
            category_rankings[i].append((0, 0, 0))#((index_of_first, 0, 0))#(category_rankings[i][- 1])
        assert len(category_rankings[i]) == 5

    i = -1
    for index, row in df.iterrows():
        i += 1
        rankings[index] = category_rankings[i]

    return rankings

def df_split(df, col="name"):
    df = df.reset_index(drop=True)

    df1 = pd.DataFrame()
    df1["id"], df1[col] = [i for i in range(len(df))], df[col]
    
    df2 = pd.DataFrame()
    df2["id"], df2["model_id"] = [i for i in range(len(df))], df['model_id']

    return df1, df2

def prepare_tsv(category_offers, category_models, pth_offers, pth_models):
    query, query_id = df_split(category_offers, col="name")
    query.to_csv(pth_offers, sep='\t', header=False, index=False)

    document, document_id = df_split(category_models, col="full_name")
    document.to_csv(pth_models, sep='\t', header=False, index=False)
    

In [4]:
pth_models = "/home/sondors/Documents/price/ColBERT_data/18_categories/test/models_18_categories.csv"
pth_offers = "/home/sondors/Documents/price/ColBERT_data/18_categories/test/triplets_test_18_categories.csv"
tmp_fld = "/home/sondors/Documents/price/ColBERT/tmp"
ckpt_pth = "/home/sondors/HYPERPARAM/none/2024-01/10/08.58.24/checkpoints/colbert-9555"
pth_dst_json = "/home/sondors/Documents/price/ColBERT/tmp/triples_colbert-9555.json"

df_models, df_offers = read_df(pth_models, pth_offers)
df_offers_small = small_df(df_offers, thresh_upper=10)

df_models

  df_models = pd.read_csv(pth_models, sep=";")


Unnamed: 0,model_id,name,brand_name,full_name,category_name,category_id
0,623742,920-005619,Logitech,Logitech 920-005619,"чехлы, обложки для гаджетов (телефонов, планше...",3994
1,721952,Zipper Bag,Hama,Hama Zipper Bag,"чехлы, обложки для гаджетов (телефонов, планше...",3994
2,721970,CC-3064,Nokia,Nokia CC-3064,"чехлы, обложки для гаджетов (телефонов, планше...",3994
3,751488,CKS-X7/R,Sony,Sony CKS-X7/R,"чехлы, обложки для гаджетов (телефонов, планше...",3994
4,751989,EP-031023,Era Pro,Era Pro EP-031023,"чехлы, обложки для гаджетов (телефонов, планше...",3994
...,...,...,...,...,...,...
103209,7049424,MD-108,Mivo,Mivo MD-108,портативная акустика,3904
103210,7049425,MD-165,Mivo,Mivo MD-165,портативная акустика,3904
103211,7049426,Boost 20W,Rocket,Rocket Boost 20W,портативная акустика,3904
103212,7049427,Motion 10W,Rocket,Rocket Motion 10W,портативная акустика,3904


In [5]:
df_models.category_name.value_counts()

category_name
чехлы, обложки для гаджетов (телефонов, планшетов etc)                                     37045
наушники, гарнитуры, наушники c микрофоном                                                 11118
мобильные телефоны                                                                          9494
корм для собак                                                                              9299
кухонные мойки                                                                              9143
портативная акустика                                                                        7399
парфюмерия                                                                                  6690
планшетные компьютеры и мини-планшеты                                                       3839
смарт-часы и браслеты                                                                       2577
GPS-навигаторы                                                                              1709
радиоприемники, 

In [6]:
df_offers

Unnamed: 0,name,model_id,category_name,category_id
0,Портативная акустика Hoco HC2 Xpress темно-зел...,4771613,портативная акустика,3904
1,Акустическая система HYUNDAI Колонка порт. H-P...,4570006,портативная акустика,3904
2,Кухонная мойка из искусственного камня Vigro V...,5769579,кухонные мойки,740101
3,Хиллс 604304 Puppy Large сух.д/щенков крупных ...,4386018,корм для собак,921401
4,"Смартфон Xiaomi POCO M5 4/128 GB Global, серый",5506115,мобильные телефоны,2801
...,...,...,...,...
135891,Кухонная Мойка Zorg X-78-2-44,3601922,кухонные мойки,740101
135892,"Balenciaga, Cristobal Pour Homme, 100 мл., туа...",3081853,парфюмерия,911906
135893,"Умная колонка MAIL Капсула, белый [mrc01(white)]",4855069,портативная акустика,3904
135894,Наушники беспроводные JBL Tune 750BTNC черные ...,4455835,"наушники, гарнитуры, наушники c микрофоном",2102


In [7]:
df_offers_small

Unnamed: 0,name,model_id,category_name,category_id
97044,"Наушники Jabra Elite 65t, золотой беж",4049479,"наушники, гарнитуры, наушники c микрофоном",2102
127769,"Гарнитура беспроводная Rombica Mysound BH-04, ...",4050007,"наушники, гарнитуры, наушники c микрофоном",2102
132292,Гарнитура Motorola Pulse Escape White,3537883,"наушники, гарнитуры, наушники c микрофоном",2102
18111,SV-012694,745831,"наушники, гарнитуры, наушники c микрофоном",2102
58717,Наушники Harper HB 517 white,4455798,"наушники, гарнитуры, наушники c микрофоном",2102
...,...,...,...,...
68484,Сухой корм Purina Pro Plan Veterinary Diets NC...,4451020,корм для собак,921401
23936,Сухой корм SIRIUS PREMIUM для взрослых собак с...,4836897,корм для собак,921401
57723,KARMY Сухой корм Maxi Junior для щенков крупны...,4388908,корм для собак,921401
18802,Сухой корм FLORIDA для взрослых собак средних ...,6258433,корм для собак,921401


In [8]:
categories = [
    "диктофоны, портативные рекордеры",
    "электронные книги",
    "автомобильные телевизоры, мониторы",
    "смарт-часы и браслеты",
    "портативные медиаплееры",
    "чехлы, обложки для гаджетов (телефонов, планшетов etc)",
    "портативная акустика",
    "мобильные телефоны",
    "VR-гарнитуры (VR-очки, шлемы, очки виртуальной реальности, FPV очки для квадрокоптеров)",
    "планшетные компьютеры и мини-планшеты",
    "наушники, гарнитуры, наушники c микрофоном",
    "радиоприемники, радиобудильники, радиочасы",
    "магнитолы",
    "GPS-навигаторы"
    ]

doc_maxlen = 300
nbits = 2   # encode each dimension with 2 bits
category = categories[0]
print(category)
index_of_first = df_models.index[df_models['category_name'] == category].tolist()[0]
index_of_first

диктофоны, портативные рекордеры


57368

In [9]:
df_models[df_models['category_name'] == category]

Unnamed: 0,model_id,name,brand_name,full_name,category_name,category_id
57368,301025,XR-ONE,Axelvox,Axelvox XR-ONE,"диктофоны, портативные рекордеры",3902
57369,301026,DDD 602,Denn,Denn DDD 602,"диктофоны, портативные рекордеры",3902
57370,301027,DDD 635,Denn,Denn DDD 635,"диктофоны, портативные рекордеры",3902
57371,301028,DDD 655,Denn,Denn DDD 655,"диктофоны, портативные рекордеры",3902
57372,301029,VR-A71,Explay,Explay VR-A71,"диктофоны, портативные рекордеры",3902
...,...,...,...,...,...,...
57851,5491259,DR-44WLB,Tascam,Tascam DR-44WLB,"диктофоны, портативные рекордеры",3902
57852,5491260,F2,Zoom,Zoom F2,"диктофоны, портативные рекордеры",3902
57853,6238826,RR-920 8Gb,Ritmix,Ritmix RR-920 8Gb,"диктофоны, портативные рекордеры",3902
57854,6353700,RR-145 16Gb,Ritmix,Ritmix RR-145 16Gb,"диктофоны, портативные рекордеры",3902


In [10]:
category_models = df_models[df_models['category_name'] == category]
category_offers = df_offers[df_offers['category_name'] == category]

pth_models = f"{tmp_fld}/models.tsv"
pth_offers = f"{tmp_fld}/offers.tsv"
prepare_tsv(category_offers, category_models, pth_offers, pth_models)

category_rankings = search(ckpt_pth, pth_offers, pth_models, nbits, doc_maxlen, tmp_fld)


[Jan 12, 17:31:04] #> Loading the queries from /home/sondors/Documents/price/ColBERT/tmp/offers.tsv ...
[Jan 12, 17:31:04] #> Got 251 queries. All QIDs are unique.

[Jan 12, 17:31:04] #> Loading collection...
0M 


[Jan 12, 17:31:04] #> Note: Output directory /home/sondors/Documents/price/ColBERT/experiments/notebook/indexes/models.18_categories.2bits already exists


[Jan 12, 17:31:04] #> Will delete 10 files already at /home/sondors/Documents/price/ColBERT/experiments/notebook/indexes/models.18_categories.2bits in 20 seconds...


#> Starting...
{
    "query_token_id": "[unused0]",
    "doc_token_id": "[unused1]",
    "query_token": "[Q]",
    "doc_token": "[D]",
    "ncells": null,
    "centroid_score_threshold": null,
    "ndocs": null,
    "load_index_with_mmap": false,
    "index_path": null,
    "nbits": 2,
    "kmeans_niters": 4,
    "resume": false,
    "similarity": "cosine",
    "bsize": 64,
    "accumsteps": 1,
    "lr": 1e-5,
    "maxsteps": 9555,
    "save_every": null,
    "warmup": 0,
    "warmup_bert": null,
    "relu": false,
    "nway": 2,
    "use_ib_negatives": false,
    "reranker": false,
    "distillation_alpha": 1.0,
    "ignore_scores": false,
    "model_name": null,
    "query_maxlen": 32,
    "attend_to_mask_tokens": false,
    "interaction": "colbert",
    "dim": 128,
    "doc_maxlen": 300,
    "mask_punctuation": true,
    "checkpoint": "\/home\/sondors\/HYPERPARAM\/none\/2024-01\/10\/08.58.24\/checkpoints\/colbert-9555",
    "triples": "\/mnt\/vdb1\/Datasets\/ColBERT\/18_categories\/



[Jan 12, 17:31:26] [0] 		 # of sampled PIDs = 488 	 sampled_pids[:3] = [213, 375, 5]
[Jan 12, 17:31:26] [0] 		 #> Encoding 488 passages..


100%|██████████| 8/8 [00:06<00:00,  1.23it/s]
0it [00:00, ?it/s]
  0%|          | 0/8 [00:00<?, ?it/s][A

[Jan 12, 17:31:33] [0] 		 avg_doclen_est = 9.829917907714844 	 len(local_sample) = 488
[Jan 12, 17:31:33] [0] 		 Creaing 1,024 partitions.
[Jan 12, 17:31:33] [0] 		 *Estimated* 4,796 embeddings.
[Jan 12, 17:31:33] [0] 		 #> Saving the indexing plan to /home/sondors/Documents/price/ColBERT/experiments/notebook/indexes/models.18_categories.2bits/plan.json ..
Clustering 4558 points in 128D to 1024 clusters, redo 1 times, 4 iterations
  Preprocessing in 0.00 s
  Iteration 3 (0.01 s, search 0.01 s): objective=437.747 imbalance=1.609 nsplit=0       
[0.02, 0.026, 0.022, 0.024, 0.019, 0.022, 0.019, 0.021, 0.022, 0.02, 0.019, 0.019, 0.021, 0.023, 0.018, 0.023, 0.018, 0.021, 0.017, 0.022, 0.021, 0.02, 0.021, 0.02, 0.018, 0.023, 0.018, 0.023, 0.022, 0.02, 0.02, 0.022, 0.02, 0.019, 0.022, 0.02, 0.021, 0.021, 0.021, 0.024, 0.018, 0.022, 0.019, 0.019, 0.02, 0.021, 0.021, 0.023, 0.02, 0.019, 0.023, 0.023, 0.022, 0.018, 0.021, 0.021, 0.019, 0.023, 0.021, 0.02, 0.019, 0.019, 0.022, 0.023, 0.019, 0.022


 12%|█▎        | 1/8 [00:00<00:05,  1.25it/s][A
 25%|██▌       | 2/8 [00:01<00:04,  1.23it/s][A
 38%|███▊      | 3/8 [00:02<00:04,  1.21it/s][A
 50%|█████     | 4/8 [00:03<00:03,  1.19it/s][A
 62%|██████▎   | 5/8 [00:04<00:02,  1.20it/s][A
 75%|███████▌  | 6/8 [00:04<00:01,  1.21it/s][A
 88%|████████▊ | 7/8 [00:05<00:00,  1.22it/s][A
100%|██████████| 8/8 [00:06<00:00,  1.27it/s][A
1it [00:06,  6.32s/it]
100%|██████████| 1/1 [00:00<00:00, 2008.77it/s]
100%|██████████| 1024/1024 [00:00<00:00, 114832.56it/s]


[Jan 12, 17:31:39] [0] 		 #> Saving chunk 0: 	 488 passages and 4,797 embeddings. From #0 onward.
[Jan 12, 17:31:39] [0] 		 #> Checking all files were saved...
[Jan 12, 17:31:39] [0] 		 Found all files!
[Jan 12, 17:31:39] [0] 		 #> Building IVF...
[Jan 12, 17:31:39] [0] 		 #> Loading codes...
[Jan 12, 17:31:39] [0] 		 Sorting codes...
[Jan 12, 17:31:39] [0] 		 Getting unique codes...
[Jan 12, 17:31:39] #> Optimizing IVF to store map from centroids to list of pids..
[Jan 12, 17:31:39] #> Building the emb2pid mapping..
[Jan 12, 17:31:39] len(emb2pid) = 4797
[Jan 12, 17:31:39] #> Saved optimized IVF to /home/sondors/Documents/price/ColBERT/experiments/notebook/indexes/models.18_categories.2bits/ivf.pid.pt
[Jan 12, 17:31:39] [0] 		 #> Saving the indexing metadata to /home/sondors/Documents/price/ColBERT/experiments/notebook/indexes/models.18_categories.2bits/metadata.json ..
#> Joined...
[Jan 12, 17:31:39] #> Loading collection...
0M 
[Jan 12, 17:31:40] Loading segmented_maxsim_cpp extensi



[Jan 12, 17:31:40] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 2976.79it/s]

[Jan 12, 17:31:40] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 829.73it/s]

[Jan 12, 17:31:40] Loading filter_pids_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...





[Jan 12, 17:31:41] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
Подготовка моделей категории: time_spent = 37.27238655090332



251it [00:00, 489.59it/s]


Инференс на всех офферах категории: time_spent = 4.542740821838379





In [54]:
category_rankings

{0: [(115181, 1, 24.89482879638672),
  (114985, 2, 20.12149429321289),
  (115182, 3, 19.668903350830078),
  (115149, 4, 19.111059188842773),
  (115123, 5, 17.828418731689453)],
 1: [(115150, 1, 29.155689239501953),
  (115149, 2, 25.068798065185547),
  (115182, 3, 23.130203247070312),
  (115008, 4, 21.960613250732422),
  (115111, 5, 21.674766540527344)],
 2: [(114921, 1, 25.650287628173828),
  (114922, 2, 22.73077964782715),
  (114920, 3, 22.153362274169922),
  (114930, 4, 19.867382049560547),
  (114744, 5, 18.332942962646484)],
 3: [(115206, 1, 19.075637817382812),
  (115205, 2, 19.040904998779297),
  (115148, 3, 18.836071014404297),
  (115147, 4, 16.632816314697266),
  (115195, 5, 16.534320831298828)],
 4: [(115211, 1, 28.32815170288086),
  (115210, 2, 21.43598747253418),
  (115187, 3, 19.790164947509766),
  (115140, 4, 19.71548080444336),
  (115011, 5, 18.522918701171875)],
 5: [(115155, 1, 27.070810317993164),
  (115145, 2, 24.684616088867188),
  (114769, 3, 21.717538833618164),
  (

In [57]:
rankings = {}
rankings = ranking_index(rankings, category_rankings, category_offers, index_of_first)
rankings

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250


{363: [(172549, 1, 24.89482879638672),
  (172353, 2, 20.12149429321289),
  (172550, 3, 19.668903350830078),
  (172517, 4, 19.111059188842773),
  (172491, 5, 17.828418731689453)],
 929: [(172518, 1, 29.155689239501953),
  (172517, 2, 25.068798065185547),
  (172550, 3, 23.130203247070312),
  (172376, 4, 21.960613250732422),
  (172479, 5, 21.674766540527344)],
 1794: [(172289, 1, 25.650287628173828),
  (172290, 2, 22.73077964782715),
  (172288, 3, 22.153362274169922),
  (172298, 4, 19.867382049560547),
  (172112, 5, 18.332942962646484)],
 1925: [(172574, 1, 19.075637817382812),
  (172573, 2, 19.040904998779297),
  (172516, 3, 18.836071014404297),
  (172515, 4, 16.632816314697266),
  (172563, 5, 16.534320831298828)],
 2738: [(172579, 1, 28.32815170288086),
  (172578, 2, 21.43598747253418),
  (172555, 3, 19.790164947509766),
  (172508, 4, 19.71548080444336),
  (172379, 5, 18.522918701171875)],
 3069: [(172523, 1, 27.070810317993164),
  (172513, 2, 24.684616088867188),
  (172137, 3, 21.71753