In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm
from torch.nn import CosineSimilarity
from collections import Counter

# Load the startup table; the CSV uses ';' as its field separator.
data = pd.read_csv("База стартапов_.csv", sep=';')
# Keep only the columns whose name does not contain 'Unn'
# (presumably pandas' auto-generated "Unnamed: N" columns -- verify against the file).
data = data.loc[:, [column for column in data if 'Unn' not in column]]
# Keep only the rows that have a value in the "Рынок" column.
data = data[data["Рынок"].notna()]

class Embedder:
    """Nearest-neighbour search over project descriptions using sentence embeddings.

    ``fit`` embeds every row of a DataFrame; calling the instance embeds one
    or more query descriptions and returns, for each query, the most similar
    fitted rows ordered by their revenue rank.
    """

    # Revenue label -> rank. The highest revenue bucket has the lowest rank,
    # so an ascending sort by rank lists the highest-revenue rows first.
    REVENUE_RANKS = {'До 100 млн.руб': 2, '100-500 млн.руб.': 1, '500-1000 млн.руб': 0}

    def __init__(self, model_name="cointegrated/LaBSE-en-ru", batch_size=8, max_length=128):
        """Load the tokenizer and the model.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            batch_size: Number of texts encoded per forward pass (must be >= 1).
            max_length: Token limit; longer texts are truncated by the tokenizer.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # The model is only used for inference, so make sure dropout is off.
        self.model.eval()
        self.bs = batch_size
        self.max_length = max_length

    @staticmethod
    def _compose(summary, description):
        """Build the text that is embedded for one project."""
        return f"Что делают: {summary}  Подробное описание проекта: {description}"

    def get_emb(self, texts):
        """Return one L2-normalised embedding (a list of floats) per text.

        Args:
            texts: A single string or an iterable of strings.

        Returns:
            A list with one embedding per input text, in input order.
        """
        texts = [texts] if isinstance(texts, str) else list(texts)
        res = []
        # No gradients are needed for inference; skipping them avoids building
        # an autograd graph for every batch.
        with torch.no_grad():
            for start in range(0, len(texts), self.bs):
                cur_batch = self.tokenizer(
                    texts[start:start + self.bs],
                    padding=True,
                    truncation=True,
                    max_length=self.max_length,
                    return_tensors='pt',
                )
                pooled = self.model(**cur_batch).pooler_output
                res.extend(torch.nn.functional.normalize(pooled).tolist())
        return res

    def fit(self, df):
        """Embed every row of ``df`` and store the result in ``self.df``.

        ``self.df`` is a copy of ``df`` with two extra columns: ``embs`` (the
        embedding of the row's text) and ``rank`` (from ``REVENUE_RANKS``).
        The caller's frame is not modified, and ``self.df`` is only replaced
        once the whole frame has been processed successfully.

        Args:
            df: DataFrame with the columns 'Что делают',
                'Подробное описание проекта' and ' Выручка (млн)'.

        Raises:
            KeyError: If a required column is missing or a revenue value is
                not one of the labels in ``REVENUE_RANKS``.
        """
        fitted = df.copy()
        texts = [
            self._compose(summary, description)
            for summary, description in zip(fitted['Что делают'], fitted['Подробное описание проекта'])
        ]
        fitted['embs'] = self.get_emb(texts)

        revenue = fitted[' Выручка (млн)']
        rank = revenue.map(self.REVENUE_RANKS)
        unknown = revenue[rank.isna()]
        if not unknown.empty:
            labels = sorted({str(value) for value in unknown})
            raise KeyError(f"Unknown revenue label(s) in ' Выручка (млн)': {labels}")
        fitted['rank'] = rank.astype(int)
        self.df = fitted

    def __call__(self, text_summary, text_desc, profiles=None, top_k=3):
        """Find the fitted rows most similar to each query.

        Args:
            text_summary: A summary string, or a sequence of summaries.
            text_desc: A description string, or a sequence of descriptions
                matching ``text_summary``.
            profiles: Optional value (or sequence of values, one per query) of
                the 'Рынок' column; when given, only rows with that value are
                considered for the query.
            top_k: Maximum number of rows returned per query.

        Returns:
            A list with one DataFrame per query. Each frame holds at most
            ``top_k`` rows of ``self.df`` with an added ``sim`` column (cosine
            similarity to the query), chosen by highest similarity and then
            ordered by ascending ``rank``; ties in rank keep similarity order.
            The frame is empty when no row matches the profile.

        Raises:
            ValueError: If ``top_k`` is smaller than 1 or the batched inputs
                have different lengths.
        """
        if top_k < 1:
            raise ValueError(f"top_k must be >= 1, got {top_k}")

        if isinstance(text_summary, str):
            querys = [self._compose(text_summary, text_desc)]
            if profiles is not None:
                profiles = [profiles]
        else:
            # Materialise the inputs so that pandas Series are read by
            # position rather than by index label.
            summaries, descriptions = list(text_summary), list(text_desc)
            if len(summaries) != len(descriptions):
                raise ValueError("text_summary and text_desc must have the same length")
            querys = [self._compose(s, d) for s, d in zip(summaries, descriptions)]
            if profiles is not None:
                profiles = list(profiles)
        if profiles is not None and len(profiles) != len(querys):
            raise ValueError("profiles must contain one entry per query")

        results = []
        for i, cur_emb in enumerate(self.get_emb(querys)):
            if profiles is not None:
                candidates = self.df[self.df['Рынок'] == profiles[i]].copy()
            else:
                candidates = self.df.copy()

            if candidates.empty:
                # Nothing to compare against; keep the 'sim' column so every
                # returned frame has the same layout.
                sims = np.zeros(0, dtype=np.float32)
            else:
                matrix = torch.tensor(candidates['embs'].tolist())
                query = torch.tensor(cur_emb).unsqueeze(0)
                sims = torch.nn.functional.cosine_similarity(matrix, query, dim=1).numpy()
            candidates['sim'] = sims

            nearest = candidates.sort_values(by='sim', ascending=False, kind='stable').head(top_k)
            results.append(nearest.sort_values(by='rank', kind='stable'))
        return results
        

# Build the embedder, index the whole table, then query it with the table's
# own first row (restricted to that row's "Рынок" value) as a smoke test.
model = Embedder()
model.fit(data)
res = model(
    text_summary=data['Что делают'].iloc[0],
    text_desc=data['Подробное описание проекта'].iloc[0],
    profiles=data['Рынок'].iloc[0],
)
print(res)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
