In [None]:
import pandas as pd

# Input CSVs: the models table and the offers file (top-n model id predictions, judging by its name).
pth_models = "/home/sondors/Documents/price/BERT_data/data/27-03-2024_Timofey/740101_models.csv"
pth_offers = "/home/sondors/Documents/price/ColBERT_data/10_categories/740101/test/740101_lr04_bsize230_offers_top_n_model_id_0.csv"

# Both files are ';'-separated; the models file is read first, then the offers file.
df_models, df_offers = (pd.read_csv(csv_path, sep=";") for csv_path in (pth_models, pth_offers))

# Optional column pruning, currently disabled:
# df_models = df_models.drop(columns=['average_price', 'comment'])
# df_offers = df_offers.drop(columns=['true_match', 'false_match'])

# Show the offers frame as the cell output.
df_offers

In [None]:
# Column names holding the five predicted model ids and their similarity scores.
model_id_cols = [f'model_id_pred_{rank}' for rank in range(1, 6)]
similarity_cols = [f'similarity_{rank}' for rank in range(1, 6)]

# One dictionary per offer row, in the row order of df_offers.
top_n = [
    {
        'model_ids': [offer_row[col] for col in model_id_cols],
        'similarity': [offer_row[col] for col in similarity_cols],
    }
    for _, offer_row in df_offers.iterrows()
]

In [None]:
import sys
sys.path.insert(0, '../')

from interface import prepare_tsv, save_index, top_n_similar, Collection, pair_scores, load_model, get_query_emb_batch, cosine_similarity_batch
import pandas as pd
import numpy as np
import os 

# Checkpoint path handed to load_model.
ckpt_pth = "/home/sondors/Documents/ColBERT_weights/740101_lr04_bsize230/colbert-489-finish"

# Settings (descriptions translated from the original notes).
doc_maxlen = 300
nbits = 2          # number of bits for each dimension of the semantic space during indexing
nranks = 1         # number of GPUs to use, if available
kmeans_niters = 4  # number of k-means clustering iterations; 4 is a good and fast default

# Lookup table: model_id -> full_name, built from the models table.
id_to_name = {
    model_id: full_name
    for model_id, full_name in zip(df_models['model_id'], df_models['full_name'])
}

# NOTE(review): nranks is defined above but not passed to load_model; the last
# argument "cpu" presumably selects the device - confirm against interface.load_model.
checkpoint = load_model(ckpt_pth, doc_maxlen, nbits, kmeans_niters, "cpu")


In [None]:

from typing import Tuple, List, Dict, Union, Any

def top_n_similar_cos(checkpoint, id_to_name, offers: List[str], top_n: List[Dict[str, List]], batch_size: int, batch_size2: int) -> List[Dict[str, List]]:
    """
    Add cosine similarities between each offer and the full names of its candidate models.

    For the i-th offer, every id in ``top_n[i]['model_ids']`` is mapped to a full name
    through ``id_to_name``; the offer embedding is then compared with the embedding of
    each of those names.

    Note:
        ``top_n`` is updated IN PLACE and the very same list object is returned: every
        dictionary gains the keys ``'full_names'`` and ``'cosine_sims'``. Pass a deep
        copy if the input must stay untouched.

    Args:
        checkpoint: The checkpoint object used for generating embeddings.
        id_to_name: Mapping from model id to full model name, e.g.
            ``dict(zip(df_models['model_id'], df_models['full_name']))``.
        offers (List[str]): Offer descriptions; ``offers[i]`` belongs to ``top_n[i]``.
        top_n (List[Dict[str, List]]): One dictionary per offer with at least the key
            ``'model_ids'``.
        batch_size (int): Forwarded to ``get_query_emb_batch`` and ``cosine_similarity_batch``.
        batch_size2 (int): Forwarded to ``get_query_emb_batch``.

    Returns:
        List[Dict[str, List]]: ``top_n`` itself, with ``'full_names'`` and
        ``'cosine_sims'`` added to every dictionary (both ordered like ``'model_ids'``).

    Raises:
        ValueError: If ``offers`` and ``top_n`` have different lengths.
        KeyError: If a model id is missing from ``id_to_name``.
    """
    # Fail early (before touching top_n) instead of raising an opaque IndexError
    # or silently leaving trailing items without scores.
    if len(offers) != len(top_n):
        raise ValueError(
            f"offers and top_n must have the same length, got {len(offers)} and {len(top_n)}"
        )

    # Resolve ids to names for every item first, so a missing id is reported
    # before any embedding work is done.
    for item in top_n:
        item['full_names'] = [id_to_name[model_id] for model_id in item['model_ids']]

    offer_embs = get_query_emb_batch(offers, checkpoint, batch_size, batch_size2)

    # The same model name usually shows up under many offers; embed each distinct
    # name only once and reuse the result.
    name_emb_cache = {}

    def _embed_name(full_name):
        if full_name not in name_emb_cache:
            name_emb_cache[full_name] = get_query_emb_batch(
                [full_name], checkpoint, batch_size, batch_size2
            )[0]
        return name_emb_cache[full_name]

    for item, offer_emb in zip(top_n, offer_embs):
        # [0][0] picks the single score out of the one-vs-one comparison.
        item['cosine_sims'] = [
            cosine_similarity_batch([offer_emb], [_embed_name(full_name)], batch_size)[0][0]
            for full_name in item['full_names']
        ]
    return top_n

# Score every offer name against the full names of its candidate models.
top_n_cos_sim = top_n_similar_cos(
    checkpoint,
    id_to_name,
    offers=list(df_offers['name']),
    top_n=top_n,
    batch_size=100,
    batch_size2=1000,
)
top_n_cos_sim

In [None]:
# Independent (deep) copy, so later column additions do not touch df_offers.
df_offers_cos = df_offers.copy(deep=True)
df_offers_cos

In [None]:
def top_n_to_df(df, top_n, n):
    """
    Write the first ``n`` cosine similarities of every item as columns of ``df``.

    Columns are named ``cosine_sims1`` .. ``cosine_sims<n>``. Item ``k`` of ``top_n``
    goes to the ``k``-th row of ``df`` BY POSITION, so the frame's index labels do not
    matter (a filtered or re-indexed frame is filled correctly and never grows).

    Args:
        df: DataFrame to extend. It is modified in place and also returned.
        top_n: Sequence of dictionaries, each with a ``'cosine_sims'`` sequence of at
            least ``n`` values convertible to ``float``.
        n: Number of similarity columns to write.

    Returns:
        The same ``df`` object with the similarity columns set. Each value is rounded
        to 2 decimals. Rows without a matching ``top_n`` item get NaN; surplus
        ``top_n`` items are ignored. An existing column of the same name is replaced.

    Raises:
        IndexError: If an item holds fewer than ``n`` cosine similarities.
    """
    # Pair rows and items positionally; extra items beyond the frame are dropped.
    entries = list(top_n)[:len(df)]
    n_missing = len(df) - len(entries)

    for rank in range(n):
        scores = [round(float(entry['cosine_sims'][rank]), 2) for entry in entries]
        # A plain list is assigned positionally, one whole column at a time;
        # rows past the end of top_n are padded with NaN.
        df[f'cosine_sims{rank + 1}'] = scores + [float('nan')] * n_missing
    return df

# Start from a fresh copy of df_offers so re-running this cell always yields the same frame.
# Use the list returned by top_n_similar_cos (top_n_cos_sim) rather than relying on
# the fact that the function also mutated `top_n` in place.
df_offers_cos = top_n_to_df(df_offers.copy(), top_n_cos_sim, 5)

df_offers_cos

In [None]:
# Save the offers frame with its cosine-similarity columns as a ';'-separated CSV, without the index column.
df_offers_cos.to_csv(
    path_or_buf='/home/sondors/Documents/price/ColBERT/EVAL/740101_lr04_bsize230_offers_top_n_model_id_0_cos.csv',
    index=False,
    sep=';',
)