# Хакатон Просепт

In [1]:
from google.colab import drive
import numpy as np
import os
import pandas as pd
import re
from scipy.spatial.distance import cosine
import torch
from transformers import BertModel, BertTokenizer
from typing import List

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


## Изучение данных

In [None]:
# Directory the CSV tables are read from (a path under the mounted Drive).
DATA_FOLDER = '/content/drive/MyDrive/yandex/data/'

Импорт данных

In [None]:
# Read the four semicolon-separated CSV tables from DATA_FOLDER; each file
# name is paired, in order, with the variable of the same name.
(marketing_dealer,
 marketing_product,
 marketing_dealerprice,
 marketing_productdealerkey) = (
    pd.read_csv(os.path.join(DATA_FOLDER, file_name), sep=';')
    for file_name in ('marketing_dealer.csv',
                      'marketing_product.csv',
                      'marketing_dealerprice.csv',
                      'marketing_productdealerkey.csv')
)

In [None]:
# Keep only the first row for every distinct product_name value,
# then show the resulting (rows, columns) shape.
marketing_dealerprice = marketing_dealerprice.drop_duplicates(
    subset=['product_name'],
    keep='first',
)
marketing_dealerprice.shape

(1953, 7)

После удаления дубликатов по `product_name` остаётся 1953 различных описания товаров с маркетплейсов.

In [None]:
# Join the price rows to the key-mapping table (product_key == key) and then to
# the product table (product_id == id); keep only the two text columns that are
# compared later.
data = (
    marketing_dealerprice
    .merge(marketing_productdealerkey, how='inner', left_on='product_key', right_on='key')
    .merge(marketing_product, how='inner', left_on='product_id', right_on='id')
    .loc[:, ['product_name', 'name']]
)

## Предобработка данных

Для определения схожести описаний товаров используем `product_name` (описание товара у дилера) и `name` (название товара у производителя).

In [None]:
# Zero-width boundary between a Cyrillic letter and a Latin letter (in either
# order); substituting a space at this boundary separates glued words.
# The Cyrillic classes list ё/Ё explicitly because those two letters lie
# outside the а-я / А-Я code-point ranges.
SPACES = r'(?<=[а-яёА-ЯЁ])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[а-яёА-ЯЁ])'
# A run of two or more consecutive space characters; group 1 is a single space.
DUP_SPACES = r'([ ])\1+'

In [None]:
def clean_string(input_string: str) -> str:
    """Normalize a product description before it is embedded.

    Steps, in order: lower-case the text and insert a space at every
    Cyrillic/Latin letter boundary (``SPACES``); remove the brand name in both
    spellings; delete hyphens and `` /`` sequences; collapse repeated spaces
    (``DUP_SPACES``); replace every run of characters other than Cyrillic
    letters, Latin letters and digits with one space; strip both ends.

    Args:
        input_string: Raw description text.

    Returns:
        The normalized text: lower-case tokens separated by single spaces,
        without leading or trailing whitespace.
    """
    text = re.sub(SPACES, ' ', input_string.lower())
    # Remove the brand name in its Latin and Cyrillic spellings.
    for brand in ('prosept', 'просепт'):
        text = text.replace(brand, '')
    # Hyphens are deleted rather than replaced by a space, so the parts of a
    # hyphenated token end up glued together.
    text = text.replace('-', '')
    text = text.replace(' /', '')
    text = re.sub(DUP_SPACES, r'\1', text)
    # 'ё' sits outside the а-я range and must be listed explicitly; otherwise
    # it is treated as punctuation and splits the word in two.
    text = re.sub(r'[^а-яёa-z0-9]+', ' ', text)
    # Dropping the brand name or punctuation at either end leaves a stray space.
    return text.strip()

In [None]:
# Normalize both text columns with clean_string. Note the naming:
# `dealer_name` comes from the `product_name` column, while `product_name`
# comes from the `name` column.
dealer_name = data['product_name'].map(clean_string)
product_name = data['name'].map(clean_string)

Эмбеддинги делаем для уникальных описаний продуктов производителя

In [None]:
# Distinct cleaned values of `product_name`, re-wrapped in a new Series
# (so they get a fresh default index).
unique_products = pd.Series(product_name.unique())

In [None]:
# Number of distinct cleaned product names.
unique_products.shape

(414,)

## Подготовка модели

Используем версию BERT, которая поддерживает русский и английский, т.к. в описании есть наименования на английском.

In [None]:
# Pretrained model identifier; the tokenizer and the encoder are both loaded for it.
bert_version = 'cointegrated/LaBSE-en-ru'
tokenizer = BertTokenizer.from_pretrained(bert_version)
model = BertModel.from_pretrained(bert_version)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/521k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/516M [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Inference only: eval() switches the module to evaluation mode in place.
model.eval()
model.to(device)

Функция для получения эмбеддинга описания

In [None]:
def sentence_embedding(sentence):
    """Embed one text as the mean of the model's last-layer token vectors.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: Text to embed.

    Returns:
        numpy.ndarray: the first model output averaged over the token
        positions of the (single) input sequence.
    """
    # Cap the token count at the model's position limit so an unusually long
    # text is truncated instead of failing inside the model.
    token_ids = tokenizer.encode(
        sentence,
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  # add a batch dimension
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs[0].squeeze(0)  # drop the batch dimension again
    pooled = torch.mean(last_hidden_states, dim=0)
    return pooled.cpu().numpy()

In [None]:
%%time
# Embed every distinct product name once; the result keeps unique_products' index.
products_embedding = unique_products.apply(sentence_embedding)

CPU times: user 1min, sys: 113 ms, total: 1min
Wall time: 1min 5s


In [None]:
def cos_similarity(embedding1, embedding2):
  """Return the cosine similarity of two vectors.

  SciPy's ``cosine`` yields the cosine *distance*; the similarity is one
  minus that distance.
  """
  distance = cosine(embedding1, embedding2)
  return 1 - distance

In [None]:
def rank_products(dealer_product, products, products_embedding):
  """Score every product embedding against one dealer description.

  Args:
    dealer_product: Description text; embedded with ``sentence_embedding``.
    products: Not used by this function.
    products_embedding: pandas Series holding one embedding per product.

  Returns:
    pandas Series of cosine similarities, indexed like ``products_embedding``.
  """
  query_vector = sentence_embedding(dealer_product)

  def similarity_to_query(candidate_vector):
    return cos_similarity(query_vector, candidate_vector)

  return products_embedding.apply(similarity_to_query)

In [None]:
%%time
# Timing check: score all products against the first dealer description.
# .iloc makes the "first row" lookup positional, so it does not depend on the
# Series carrying an index label equal to 0.
pred = rank_products(dealer_name.iloc[0], unique_products, products_embedding)

CPU times: user 131 ms, sys: 0 ns, total: 131 ms
Wall time: 132 ms


## Результаты работы модели

Для оценки результатов используем метрику accuracy@n. Метрика показывает долю предсказаний, для которых целевой объект входит в первые *n* рекомендованных объектов.

In [None]:
def accuracy_at(dealer_names: pd.Series,
                target: pd.Series,
                products: pd.Series,
                product_embeddings: pd.Series,
                n_list: List[int]):
  """Compute accuracy@n of the similarity ranking for several cut-offs.

  For each dealer description the products are ordered by descending score
  from ``rank_products``; a prediction counts as correct at ``n`` when the
  expected product text is among the first ``n`` ranked products.

  Args:
    dealer_names: Descriptions to evaluate.
    target: Expected product text for each description, in the same order
      (paired by position, not by index label).
    products: Candidate product texts; must share its index with
      ``product_embeddings``.
    product_embeddings: One embedding per candidate product.
    n_list: Cut-offs ``n`` to evaluate.

  Returns:
    dict mapping ``'accuracy@<n>'`` to the share of descriptions whose target
    is within the top ``n`` ranked products.

  Raises:
    ValueError: If ``dealer_names`` and ``target`` differ in length.
  """
  if len(dealer_names) != len(target):
    raise ValueError('dealer_names and target must have the same length')

  hits = {n: [] for n in n_list}
  # zip pairs descriptions with targets by position; a label lookup such as
  # target[i] would only be right for an index that is exactly 0..len-1.
  for name, expected in zip(dealer_names, target):
    scores = rank_products(name, products, product_embeddings)
    # Product texts ordered from the most to the least similar.
    ranked = products.loc[scores.sort_values(ascending=False).index].to_numpy()
    for n in n_list:
      hits[n].append(expected in ranked[:n])

  return {f'accuracy@{n}': np.array(flags).mean() for n, flags in hits.items()}

In [None]:
%%time
# Evaluate every (dealer description, product name) pair at cut-offs 1, 3, 5 and 10.
results = accuracy_at(dealer_name, product_name, unique_products, products_embedding, n_list=[1, 3, 5, 10])

CPU times: user 4min 28s, sys: 1.01 s, total: 4min 29s
Wall time: 4min 47s


Метрика на всём наборе данных

In [None]:
# Display the accuracy@n dictionary.
results

{'accuracy@1': 0.7622504537205081,
 'accuracy@3': 0.9098608590441621,
 'accuracy@5': 0.940713853599516,
 'accuracy@10': 0.9673321234119783}