## Загрузка данных

### Импорты

In [1]:
!pip install catboost -q
!pip install shap -q

In [2]:
import json
import ast
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances, roc_curve, precision_recall_curve
from sklearn.metrics import  auc, accuracy_score as acc, roc_auc_score, f1_score, precision_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OrdinalEncoder

from nltk.metrics.distance import edit_distance

from tqdm import tqdm
import warnings

# настройки
tqdm.pandas()
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# константы
RANDOM_STATE = 42

### Read/Head/Info

In [3]:
# Load the provided competition files.
# The author's local directory is tried first; when those files cannot be opened,
# the same files are read from the current working directory instead.
try:
    train_data = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/train_data.parquet')
    train_pairs = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/train_pairs.parquet')
    test_pairs_wo_target = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/test_pairs_wo_target.parquet')
    test_data = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/test_data.parquet')
    submission_example3 = pd.read_csv('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/submission_example3.csv')
except OSError:
    # Only a missing or unreadable local path triggers the fallback
    # (FileNotFoundError is an OSError). Any other failure, e.g. a corrupt file
    # or a keyboard interrupt, is raised instead of being silently masked.
    train_data = pd.read_parquet('train_data.parquet', engine='auto')
    train_pairs = pd.read_parquet('train_pairs.parquet', engine='auto')
    test_data = pd.read_parquet('test_data.parquet', engine='auto')
    test_pairs_wo_target = pd.read_parquet('test_pairs_wo_target.parquet', engine='auto')
    # NOTE(review): the fallback reads 'submission_example.csv' while the primary
    # branch reads 'submission_example3.csv' — confirm both are the same file.
    submission_example3 = pd.read_csv('submission_example.csv')

In [4]:
%%time
# грузим дополнительные ембеддинги названий
name_bert = pd.read_parquet('name_bert768.parquet')
name_bert.rename(columns={'caracs_bert768': 'name_bert768'}, inplace=True)

CPU times: total: 11.6 s
Wall time: 12.1 s


In [5]:
# объединим датасеты с новыми эмбеддингами
train_data = train_data.merge(name_bert, on='variantid')
test_data = test_data.merge(name_bert, on='variantid')

display(train_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460084 entries, 0 to 460083
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          460084 non-null  int64 
 1   name                               460084 non-null  object
 2   categories                         460084 non-null  object
 3   color_parsed                       381142 non-null  object
 4   pic_embeddings_resnet_v1           305631 non-null  object
 5   main_pic_embeddings_resnet_v1      460084 non-null  object
 6   name_bert_64                       460084 non-null  object
 7   characteristic_attributes_mapping  460057 non-null  object
 8   name_bert768                       460084 non-null  object
dtypes: int64(1), object(8)
memory usage: 35.1+ MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38751 entries, 0 to 38750
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   variantid                          38751 non-null  int64 
 1   name                               38751 non-null  object
 2   categories                         38751 non-null  object
 3   color_parsed                       28579 non-null  object
 4   pic_embeddings_resnet_v1           22141 non-null  object
 5   main_pic_embeddings_resnet_v1      38751 non-null  object
 6   name_bert_64                       38751 non-null  object
 7   characteristic_attributes_mapping  38747 non-null  object
 8   name_bert768                       38751 non-null  object
dtypes: int64(1), object(8)
memory usage: 3.0+ MB


None

In [6]:
# удалим дубликаты по колонке variantid
train_data = train_data.drop_duplicates('variantid')
test_data = test_data.drop_duplicates('variantid')

display(train_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457063 entries, 0 to 460083
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          457063 non-null  int64 
 1   name                               457063 non-null  object
 2   categories                         457063 non-null  object
 3   color_parsed                       378652 non-null  object
 4   pic_embeddings_resnet_v1           303467 non-null  object
 5   main_pic_embeddings_resnet_v1      457063 non-null  object
 6   name_bert_64                       457063 non-null  object
 7   characteristic_attributes_mapping  457036 non-null  object
 8   name_bert768                       457063 non-null  object
dtypes: int64(1), object(8)
memory usage: 34.9+ MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35730 entries, 0 to 38749
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   variantid                          35730 non-null  int64 
 1   name                               35730 non-null  object
 2   categories                         35730 non-null  object
 3   color_parsed                       26089 non-null  object
 4   pic_embeddings_resnet_v1           19977 non-null  object
 5   main_pic_embeddings_resnet_v1      35730 non-null  object
 6   name_bert_64                       35730 non-null  object
 7   characteristic_attributes_mapping  35726 non-null  object
 8   name_bert768                       35730 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.7+ MB


None

In [7]:
# функция вывода info и head
def head_info(df):
    """
    Print the ``info()`` summary of a DataFrame and display its first two rows.

    :param df: DataFrame to inspect
    :return: None
    """
    # DataFrame.info() prints the report itself and returns None, so it is called
    # directly; wrapping it in display() only added a stray "None" to the output.
    df.info()
    display(df.head(2))

    
# head_info(train_data)
# head_info(train_pairs)
# head_info(test_data)
# head_info(test_pairs_wo_target)
# head_info(submission_example3)

In [8]:
# изучим информацию по train_data
head_info(train_data)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457063 entries, 0 to 460083
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          457063 non-null  int64 
 1   name                               457063 non-null  object
 2   categories                         457063 non-null  object
 3   color_parsed                       378652 non-null  object
 4   pic_embeddings_resnet_v1           303467 non-null  object
 5   main_pic_embeddings_resnet_v1      457063 non-null  object
 6   name_bert_64                       457063 non-null  object
 7   characteristic_attributes_mapping  457036 non-null  object
 8   name_bert768                       457063 non-null  object
dtypes: int64(1), object(8)
memory usage: 34.9+ MB


None

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_bert768
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{""Номинальный ток, А"":[""10""],""Цвет товара"":[""о...","[-0.52616537, 0.5570387, 0.19487007, -0.363480..."
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...","{""Конструктивные особенности"":[""Магнитная конс...","[-0.64783597, -0.1698648, 0.4192899, -0.225463..."


## Предобработка данных

### *Колонки cat 2, 3, 4 и набор атрибутов*

In [9]:
def new_col(train_data):
    """
    Add parsed attribute, category and flattened picture-embedding columns.

    New columns (added to ``train_data`` in place):
      * ``attributes_set`` - dict decoded from ``characteristic_attributes_mapping``
        (missing values become an empty dict);
      * ``cat2``, ``cat3``, ``cat4`` - values stored under keys '2', '3', '4' of the
        ``categories`` JSON object (None when the key is absent);
      * ``main_pic_embeddings_resnet_v1_new`` - first element of
        ``main_pic_embeddings_resnet_v1``.

    :param train_data: DataFrame with the columns listed above
    :return: the same DataFrame object with the new columns added
    """
    # The attribute mapping is JSON text, so it is decoded with the JSON parser:
    # ast.literal_eval rejects JSON literals (true/false/null) and is much slower.
    train_data['attributes_set'] = (
        train_data['characteristic_attributes_mapping'].fillna('{}').apply(json.loads)
    )

    # Decode the categories JSON once and reuse it for every category level.
    parsed_categories = train_data['categories'].apply(json.loads)
    for level in ('2', '3', '4'):
        train_data[f'cat{level}'] = parsed_categories.apply(lambda cats, key=level: cats.get(key))

    # The main picture embedding is stored as a nested sequence; keep its first element.
    train_data['main_pic_embeddings_resnet_v1_new'] = (
        train_data['main_pic_embeddings_resnet_v1'].apply(lambda emb: emb[0])
    )

    return train_data

In [10]:
%%time
# добавляем новые признаки
train_data = new_col(train_data)

CPU times: total: 1min 45s
Wall time: 1min 51s


In [11]:
# удалим ненужные категории
train_data = train_data[train_data['cat2'] == 'Электроника']
train_data = train_data.drop('cat2', axis=1)

### Группировка категорий

In [12]:
REST_train = 1000   # переменная для замены категорий численностью ниже 1000

In [13]:
%%time
def cat3_grouped(train_data, FOR_REST):
    """
    Add ``cat3_grouped``: ``cat3`` with infrequent categories collapsed into "rest".

    A category keeps its own name only when it occurs more than ``FOR_REST`` times;
    every other row, including rows with a missing ``cat3``, is labelled "rest".
    The number of rows labelled "rest" is printed.

    :param train_data: DataFrame with a ``cat3`` column (modified in place)
    :param FOR_REST: occurrence threshold; counts less than or equal to it go to "rest"
    :return: the same DataFrame object with the ``cat3_grouped`` column added
    """
    cat3_counts = train_data["cat3"].value_counts()
    frequent_categories = cat3_counts.index[cat3_counts > FOR_REST]

    # One mask drives both the printed count and the relabelling, so the two can
    # no longer disagree for a category whose count is exactly FOR_REST.
    keep_mask = train_data["cat3"].isin(frequent_categories)
    print(int((~keep_mask).sum()))

    # Missing cat3 values are absent from value_counts(); the mask sends them to
    # "rest" instead of raising KeyError on lookup.
    train_data["cat3_grouped"] = train_data["cat3"].where(keep_mask, "rest")

    return train_data

CPU times: total: 0 ns
Wall time: 0 ns


In [14]:
%%time
# переименуем категории численностью ниже 1000 в rest
train_data = cat3_grouped(train_data, REST_train)

11076
CPU times: total: 219 ms
Wall time: 211 ms


In [15]:
# Encode the cat3 and cat4 columns as ordinal numbers.
# The encoder is fitted on the training data; a category not seen during fit
# is encoded as 777 (handle_unknown='use_encoded_value', unknown_value=777).
cat_col = ['cat3', 'cat4']
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=777)
encoder.fit(train_data[cat_col])

# Overwrites the string categories in place; cat3_grouped (computed earlier)
# keeps the original category names.
train_data[cat_col] = encoder.transform(train_data[cat_col])    

train_data.head(2)

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_bert768,attributes_set,cat3,cat4,main_pic_embeddings_resnet_v1_new,cat3_grouped
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{""Номинальный ток, А"":[""10""],""Цвет товара"":[""о...","[-0.52616537, 0.5570387, 0.19487007, -0.363480...","{'Номинальный ток, А': ['10'], 'Цвет товара': ...",60.0,210.0,"[0.04603629, 0.18839523, -0.09973055, -0.66368...","Сетевые фильтры, разветвители и удлинители"
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...","{""Конструктивные особенности"":[""Магнитная конс...","[-0.64783597, -0.1698648, 0.4192899, -0.225463...",{'Конструктивные особенности': ['Магнитная кон...,31.0,98.0,"[1.1471839, -0.665361, 0.7745614, 0.26716197, ...",Кабели и переходники


### *Колонка цвет*

In [16]:
# выделим самые популярные цвета и поместим все в матрицу цветов
def color(df):
    """
    Replace ``color_parsed`` with ``clr_vect``, a list of 14 color indicators (0/1).

    The value of ``color_parsed`` is converted to its string form and searched for
    color keywords. Matching is by substring, so a keyword also matches when it
    occurs inside a longer word. The order of the indicators is: black, silver,
    rose, red, white, blue, multicol, green, yellow, gold, purple, orange, brown, grey.

    The input frame is left untouched; a new frame is returned.

    :param df: DataFrame with a ``color_parsed`` column
    :return: copy of ``df`` without ``color_parsed`` and with ``clr_vect`` appended
    """
    # Indicator name -> keywords that switch it on (dict order defines vector order).
    color_keywords = {
        'black': ('черный', 'black'),
        'silver': ('silver', 'серебристый', 'серебряный'),
        'rose': ('pink', 'розовый'),
        'red': ('red', 'красный'),
        'white': ('white', 'белый'),
        'blue': ('blue', 'синий', 'голубой'),
        'multicol': ('разноцветный', 'цветной', 'многоцветный'),
        'green': ('green', 'зеленый'),
        'yellow': ('yellow', 'желтый'),
        'gold': ('gold', 'золотой'),
        'purple': ('purple', 'пурпурный', 'сиреневый', 'фиолетовый'),
        'orange': ('оранжевый',),
        'brown': ('коричневый', 'brown', 'темно-коричневый'),
        'grey': ('серый', 'grey', 'темно-серый'),
    }

    # Convert to text once instead of once per color.
    color_text = df['color_parsed'].astype(str)
    indicator_rows = [
        [int(any(keyword in text for keyword in keywords)) for keywords in color_keywords.values()]
        for text in color_text
    ]

    # Build the result on a new frame so that no temporary per-color columns
    # are written into the caller's DataFrame.
    result = df.drop(columns=['color_parsed'])
    result['clr_vect'] = pd.Series(indicator_rows, index=df.index, dtype=object)

    return result

In [17]:
%%time
# переведем цвета в матрицу
train_data = color(train_data)

CPU times: total: 1min 52s
Wall time: 1min 57s


### *Объединение таблиц*

In [18]:
# объединим данные по парам товаров
def merge_f(pairs, data):
    """
    Attach product features to both sides of every pair.

    Columns of ``data`` get suffix '1' for the first product and '2' for the
    second one; the joins use ``variantid1`` and ``variantid2``. Afterwards
    ``cat31`` is renamed to ``cat3`` and ``cat3_grouped1`` to ``cat3_grouped``.

    :param pairs: DataFrame with ``variantid1`` and ``variantid2`` columns
    :param data: per-product DataFrame with a ``variantid`` column
    :return: merged DataFrame with one row per matched pair
    """
    first_product = data.add_suffix('1')
    second_product = data.add_suffix('2')

    merged = pairs.merge(first_product, on='variantid1')
    merged = merged.merge(second_product, on='variantid2')

    return merged.rename(columns={'cat31': 'cat3', 'cat3_grouped1': 'cat3_grouped'})

In [19]:
%%time
features = merge_f(train_pairs, train_data)

features.head(1)

CPU times: total: 3.69 s
Wall time: 3.94 s


Unnamed: 0,target,variantid1,variantid2,name1,categories1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name_bert7681,attributes_set1,cat3,cat41,main_pic_embeddings_resnet_v1_new1,cat3_grouped,clr_vect1,name2,categories2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2,name_bert7682,attributes_set2,cat32,cat42,main_pic_embeddings_resnet_v1_new2,cat3_grouped2,clr_vect2
0,0.0,51197862,51198054,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...","[-0.4590829, -0.11046589, 0.11674632, -0.17761...","{'Число жил': ['3'], 'Макс. нагрузка, Вт': ['3...",60.0,210.0,"[-0.4304909, -0.49474272, -0.46439183, -0.0609...","Сетевые фильтры, разветвители и удлинители","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",Удлинитель TDM Electric Люкс УЛ05В 1.5 м (SQ13...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",,"[[-0.42941108, -0.5129398, -0.4753536, -0.0677...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","{""Электробезопасность"":[""Заземление""],""Длина к...","[-0.55869406, -0.13428268, 0.0906083, -0.18947...","{'Электробезопасность': ['Заземление'], 'Длина...",60.0,210.0,"[-0.42941108, -0.5129398, -0.4753536, -0.06778...","Сетевые фильтры, разветвители и удлинители","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


### *Похожесть атрибутов*

In [20]:
def calculate_statistical_features(embedding1, embedding2):
    """
    Summarise the difference between two embeddings.

    :param embedding1: first embedding
    :param embedding2: second embedding
    :return: tuple (mean, median, standard deviation) of ``embedding1 - embedding2``
    """
    delta = embedding1 - embedding2
    return tuple(statistic(delta) for statistic in (np.mean, np.median, np.std))

In [21]:
%%time
# Добавляем новые столбцы в датафрейм features с вычисленными статистическими характеристиками разности
# между парами эмбеддингов

features[['name_mean_diff', 'name_median_diff', 'name_std_diff']] = features.progress_apply(
    lambda x: calculate_statistical_features(
        x['name_bert7681'], x['name_bert7682']), axis=1, result_type='expand'
)

features[['ozon_name_mean_diff', 'ozon_name_median_diff', 'ozon_name_std_diff']] = features.progress_apply(
    lambda x: calculate_statistical_features(
        x['name_bert_641'], x['name_bert_642']), axis=1, result_type='expand'
)

features[['main_pic_mean_diff', 'main_pic_median_diff', 'main_pic_std_diff']] = features.progress_apply(
    lambda x: calculate_statistical_features(
        x['main_pic_embeddings_resnet_v1_new1'], x['main_pic_embeddings_resnet_v1_new2']), axis=1, result_type='expand'
)

100%|██████████| 306306/306306 [01:03<00:00, 4789.74it/s]
100%|██████████| 306306/306306 [00:45<00:00, 6686.10it/s]
100%|██████████| 306306/306306 [00:46<00:00, 6564.72it/s]

CPU times: total: 2min 19s
Wall time: 2min 36s





In [22]:
# добавим коэффициент Жаккара
def calculate_jaccard_similarity(text1, text2):
    """
    Jaccard similarity between the word sets of two texts.

    Tokenisation follows the defaults of sklearn's CountVectorizer, which the
    previous implementation used: the text is lower-cased and every run of two or
    more word characters is a token. Working on plain sets gives the same
    intersection/union counts without fitting a vectorizer (the old code fitted
    it twice per pair).

    :param text1: first text; None or a float (e.g. NaN) is treated as empty
    :param text2: second text; None or a float (e.g. NaN) is treated as empty
    :return: |common tokens| / |all tokens|, or 0 when neither text has a token
    """
    import re  # local import so the function does not depend on the imports cell

    def tokens_of(text):
        # Missing values (None / float NaN) contribute no tokens.
        if text is None or isinstance(text, float):
            return set()
        return set(re.findall(r"(?u)\b\w\w+\b", text.lower()))

    tokens1 = tokens_of(text1)
    tokens2 = tokens_of(text2)

    all_tokens = tokens1 | tokens2
    if not all_tokens:
        # No words left to compare (the vectorizer used to raise ValueError here).
        return 0

    return len(tokens1 & tokens2) / len(all_tokens)

In [23]:
%%time
# Добавляем новые признаки в датафрейм features с вычисленным коэффициентом Жаккара между названиями товаров и характеристиками
features['name_jaccard_similarity'] = features.progress_apply(
    lambda x: calculate_jaccard_similarity(x['name1'], x['name2']), axis=1
)

features['attributes_jaccard_similarity'] = features.progress_apply(
    lambda x: calculate_jaccard_similarity(x['characteristic_attributes_mapping1'], x['characteristic_attributes_mapping2']), axis=1
)

100%|██████████| 306306/306306 [09:44<00:00, 524.21it/s]
100%|██████████| 306306/306306 [10:54<00:00, 468.03it/s]

CPU times: total: 19min 32s
Wall time: 20min 38s





In [24]:
# добавим признаки взаимосвязи характеристик
def is_number(value):
    """
    Tell whether ``value`` can be converted with ``float()``.

    :param value: any object, typically an attribute value given as a string
    :return: True if ``float(value)`` succeeds, otherwise False
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric inputs such as None.
        return False
    return True


def compare_attr(set1, set2):
    """
    Compare the attribute dictionaries of two products.

    Both arguments map an attribute name to a list of values.

    :param set1: attributes of the first product
    :param set2: attributes of the second product
    :return: tuple of
        ratio            - number of common keys divided by len(set1) + len(set2),
                           0.0 when both dictionaries are empty;
        d                - number of common keys whose first values are equal;
        common_keys      - number of common keys;
        same_value_keys  - number of common keys whose full value lists are equal;
        mean_value1      - mean of the numeric first values of set1 (NaN if none);
        mean_value2      - mean of the numeric first values of set2 (NaN if none).
    """
    keys1 = set(set1.keys())
    keys2 = set(set2.keys())
    shared_keys = keys1 & keys2

    # Two products without any attributes used to raise ZeroDivisionError here.
    total_keys = len(keys1) + len(keys2)
    ratio = len(shared_keys) / total_keys if total_keys else 0.0

    # Common keys whose first values coincide.
    d = sum(1 for key in shared_keys if set1[key][0] == set2[key][0])

    common_keys = len(shared_keys)

    # Common keys whose complete value lists coincide.
    same_value_keys = sum(1 for key in shared_keys if set1[key] == set2[key])

    def mean_numeric_value(attributes):
        # Mean of the first values that parse as numbers; NaN when there are none
        # (what np.mean of an empty list yields, minus the RuntimeWarning).
        numbers = [float(value[0]) for value in attributes.values() if is_number(value[0])]
        return np.mean(numbers) if numbers else np.nan

    mean_value1 = mean_numeric_value(set1)
    mean_value2 = mean_numeric_value(set2)

    return ratio, d, common_keys, same_value_keys, mean_value1, mean_value2

In [25]:
%%time
# добавим признаки взаимосвязи характеристик
features[['attr_dist', 
          'attr_acc', 
          'common_keys', 
          'same_value_keys', 
          'mean_value1', 
          'mean_value2']] = features[['attributes_set1', 'attributes_set2']].progress_apply(
    lambda x: pd.Series(compare_attr(*x)), axis=1
)

100%|██████████| 306306/306306 [01:33<00:00, 3273.64it/s]

CPU times: total: 1min 33s
Wall time: 1min 33s





In [26]:
# заполним нулями пропуски в mean_value1 и mean_value2
features['mean_value1'] = features['mean_value1'].fillna(0)
features['mean_value2'] = features['mean_value2'].fillna(0)

In [27]:
def manhattan_distance(emb1, emb2):
    """
    Element-wise absolute difference between two embeddings.

    Despite the name, the components are NOT summed: the result of
    ``np.abs(emb1 - emb2)`` is returned as is. The cells that follow compute
    mean/median/std over this result and pass the resulting columns to CatBoost
    as embedding features, so the return value must not be reduced to a scalar.

    :param emb1: first embedding
    :param emb2: second embedding
    :return: ``np.abs(emb1 - emb2)``
    """
    return np.abs(emb1 - emb2)

In [28]:
# Расчет манхэттенского расстояния между признаками name_bert_641 и name_bert_642
features['ozon_bert_manhattan_distance'] = features.progress_apply(
    lambda x: manhattan_distance(x['name_bert_641'], x['name_bert_642']), axis=1
)

# Расчет манхэттенского расстояния между признаками main_pic_embeddings_resnet_v1_new1 и main_pic_embeddings_resnet_v1_new2
features['main_pic_manhattan_distance'] = features.progress_apply(
    lambda x: manhattan_distance(x['main_pic_embeddings_resnet_v1_new1'], x['main_pic_embeddings_resnet_v1_new2']), axis=1
)

# Расчет манхэттенского расстояния между признаками name_embedding1 и name_embedding2
features['name_emb_manhattan_distance'] = features.progress_apply(
    lambda x: manhattan_distance(x['name_bert7681'], x['name_bert7682']), axis=1
)

100%|██████████| 306306/306306 [00:05<00:00, 52515.88it/s]
100%|██████████| 306306/306306 [00:06<00:00, 48005.69it/s]
100%|██████████| 306306/306306 [00:08<00:00, 34398.18it/s]


In [29]:
def calculate_statistics(arr):
    """
    Compute summary statistics of an array.

    :param arr: array of values
    :return: tuple (mean, median, standard deviation)
    """
    return np.mean(arr), np.median(arr), np.std(arr)

In [30]:
%%time
# расчитываем среднее, медиану и стд для ozon_bert_manhattan_distance
features[["mean_ozon_bert_manhattan_distance", 
          "median_ozon_bert_manhattan_distance", 
          "std_ozon_bert_manhattan_distance"]] = (features['ozon_bert_manhattan_distance'].progress_apply(
        lambda x: pd.Series(calculate_statistics(x))))

# расчитываем среднее, медиану и стд для main_pic_manhattan_distance
features[["mean_main_pic_manhattan_distance", 
          "median_main_pic_manhattan_distance", 
          "std_main_pic_manhattan_distance"]] = (features['main_pic_manhattan_distance'].progress_apply(
        lambda x: pd.Series(calculate_statistics(x))))

# расчитываем среднее, медиану и стд для name_emb_manhattan_distance
features[["mean_name_emb_manhattan_distance", 
          "median_name_emb_manhattan_distance", 
          "std_name_emb_manhattan_distance"]] = (features['name_emb_manhattan_distance'].progress_apply(
        lambda x: pd.Series(calculate_statistics(x))))

100%|██████████| 306306/306306 [01:15<00:00, 4035.05it/s]
100%|██████████| 306306/306306 [01:18<00:00, 3892.78it/s]
100%|██████████| 306306/306306 [01:35<00:00, 3199.29it/s]

CPU times: total: 4min 10s
Wall time: 4min 10s





In [31]:
def levenshtein_distance(text1, text2):
    """
    Levenshtein (edit) distance between two texts.

    Thin wrapper around ``edit_distance`` imported from ``nltk.metrics.distance``,
    called with its default arguments.

    :param text1: first text
    :param text2: second text
    :return: edit distance between text1 and text2
    """
    return edit_distance(text1, text2)

In [32]:
%%time
# рассчитаем расстояние Левенштейна между name1 и name2
features[['name_levenshtein_distance']] = (
    features[["name1", "name2"]].progress_apply(
        lambda x: pd.Series(levenshtein_distance(*x)), axis=1))

100%|██████████| 306306/306306 [54:27<00:00, 93.74it/s]  

CPU times: total: 54min 28s
Wall time: 54min 28s





### *Введем расстояния*

In [33]:
def get_pic_features(main_pic_embeddings_1,
                     main_pic_embeddings_2,
                     percentiles: List[int]):
    """
    Percentiles of the pairwise distances between two sets of picture embeddings.

    Every embedding of the first product is compared with every embedding of the
    second one via ``pairwise_distances`` (default metric), which is useful when
    a product has several pictures. If either argument is None, the distance
    matrix is replaced by the single placeholder value -1.

    :param main_pic_embeddings_1: iterable of embeddings of the first product, or None
    :param main_pic_embeddings_2: iterable of embeddings of the second product, or None
    :param percentiles: percentiles to take over the distance matrix
    :return: list with one value per requested percentile
    """
    either_missing = main_pic_embeddings_1 is None or main_pic_embeddings_2 is None

    if either_missing:
        distances = np.array([[-1]])
    else:
        first_pictures = np.array(list(main_pic_embeddings_1))
        second_pictures = np.array(list(main_pic_embeddings_2))
        distances = pairwise_distances(first_pictures, second_pictures)

    pair_features = []
    pair_features.extend(np.percentile(distances, percentiles).tolist())

    return pair_features


def text_dense_distances(ozon_embedding, comp_embedding):
    """
    Euclidean and cosine distances between two dense vectors.

    :param ozon_embedding: first vector, or None
    :param comp_embedding: second vector, or None
    :return: [euclidean, cosine]; [-1, -1] when either vector is None or empty
    """
    no_vector = ozon_embedding is None or comp_embedding is None
    # len() is evaluated only when both vectors are present.
    if no_vector or len(ozon_embedding) == 0 or len(comp_embedding) == 0:
        return [-1, -1]

    return [
        euclidean(ozon_embedding, comp_embedding),
        cosine(ozon_embedding, comp_embedding),
    ]

In [34]:
%%time
get_pic_features_func = partial(get_pic_features, percentiles=[0, 25, 50])

features[["pic_dist_0_perc", "pic_dist_25_perc", "pic_dist_50_perc"]] = (
    features[["pic_embeddings_resnet_v11", "pic_embeddings_resnet_v12"]].progress_apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1
    )
)

features[["main_pic_dist_0_perc", "main_pic_dist_25_perc", "main_pic_dist_50_perc"]] = (
    features[["main_pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v12"]].progress_apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1
    )
)

features[["euclidean_main_pic_dist", "cosine_main_pic_dist"]] = (
    features[["main_pic_embeddings_resnet_v1_new1", "main_pic_embeddings_resnet_v1_new2"]].progress_apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1
    )
)

features[["euclidean_color_dist", "cosine_color_dist"]] = (
    features[["clr_vect1", "clr_vect2"]].progress_apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1
    )
)

features[["euclidean_name_bert_dist", "cosine_name_bert_dist"]] = (
    features[["name_bert_641", "name_bert_642"]].progress_apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1
    )
)

features[["euclidean_name_embedding_dist", "cosine_name_embedding_dist"]] = (
    features[["name_bert7681", "name_bert7682"]].progress_apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1
    )
)

100%|██████████| 306306/306306 [02:58<00:00, 1716.80it/s]
100%|██████████| 306306/306306 [03:22<00:00, 1515.20it/s]
100%|██████████| 306306/306306 [01:18<00:00, 3895.77it/s]
100%|██████████| 306306/306306 [01:20<00:00, 3828.11it/s]
100%|██████████| 306306/306306 [01:22<00:00, 3712.84it/s]
100%|██████████| 306306/306306 [01:42<00:00, 2989.24it/s]

CPU times: total: 12min 9s
Wall time: 12min 4s





###  Удаление ненужных колонок

In [37]:
# drop
def drop_f(features):
    """
    Remove the raw columns that are no longer needed once pair features exist.

    Dropped: the raw name and picture embeddings of both products, the category
    JSON, the parsed attribute dicts, the color vectors and ``cat3_grouped2``.

    :param features: pair-level DataFrame containing all columns listed below
    :return: new DataFrame without those columns
    """
    unused_columns = [
        'name_bert_641',
        'name_bert_642',
        'main_pic_embeddings_resnet_v11',
        'main_pic_embeddings_resnet_v12',
        'main_pic_embeddings_resnet_v1_new1',
        'main_pic_embeddings_resnet_v1_new2',
        'name_bert7681',
        'name_bert7682',
        'categories1',
        'categories2',
        'pic_embeddings_resnet_v11',
        'pic_embeddings_resnet_v12',
        'cat3_grouped2',
        'attributes_set1',
        'attributes_set2',
        'clr_vect1',
        'clr_vect2',
    ]
    return features.drop(columns=unused_columns)

In [38]:
# удалим признаки, которые далее не используются
features = drop_f(features)

In [39]:
# заполним пропуски в characteristic_attributes_mapping1 и characteristic_attributes_mapping2
features[['characteristic_attributes_mapping1', 
          'characteristic_attributes_mapping2']] = features[['characteristic_attributes_mapping1', 
                                                             'characteristic_attributes_mapping2']].fillna('{}')

##  Отделение тестовой выборки

In [40]:
# Hold out 10% of the pairs as a local test set, stratified jointly by the target
# and by the grouped category so both parts keep the same class/category mix.
X_train, X_my_test = train_test_split(features, test_size=0.1, \
                                random_state=RANDOM_STATE, stratify=features[["target", "cat3_grouped"]])

# The target frames keep the pair ids next to the label.
y_train = X_train[["target", "variantid1", "variantid2"]]
X_train = X_train.drop(["target"], axis=1)

y_my_test = X_my_test[["target", "variantid1", "variantid2"]]           
X_my_test = X_my_test.drop(["target"], axis=1)


# Model inputs: every remaining column except the pair ids and the grouped category.
feats = list(X_train.columns) 
feats.remove("variantid1")
feats.remove("variantid2")
feats.remove("cat3_grouped")

X_train[feats].info() # what the model is trained on

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275675 entries, 204167 to 61494
Data columns (total 52 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   name1                                275675 non-null  object 
 1   characteristic_attributes_mapping1   275675 non-null  object 
 2   cat3                                 275675 non-null  float64
 3   cat41                                275675 non-null  float64
 4   name2                                275675 non-null  object 
 5   characteristic_attributes_mapping2   275675 non-null  object 
 6   cat32                                275675 non-null  float64
 7   cat42                                275675 non-null  float64
 8   name_mean_diff                       275675 non-null  float32
 9   name_median_diff                     275675 non-null  float32
 10  name_std_diff                        275675 non-null  float32
 11  ozon_name

## Model

###  CatBoostClassifier

In [41]:
X_train.head(1)

Unnamed: 0,variantid1,variantid2,name1,characteristic_attributes_mapping1,cat3,cat41,cat3_grouped,name2,characteristic_attributes_mapping2,cat32,cat42,name_mean_diff,name_median_diff,name_std_diff,ozon_name_mean_diff,ozon_name_median_diff,ozon_name_std_diff,main_pic_mean_diff,main_pic_median_diff,main_pic_std_diff,name_jaccard_similarity,attributes_jaccard_similarity,attr_dist,attr_acc,common_keys,same_value_keys,mean_value1,mean_value2,ozon_bert_manhattan_distance,main_pic_manhattan_distance,name_emb_manhattan_distance,mean_ozon_bert_manhattan_distance,median_ozon_bert_manhattan_distance,std_ozon_bert_manhattan_distance,mean_main_pic_manhattan_distance,median_main_pic_manhattan_distance,std_main_pic_manhattan_distance,mean_name_emb_manhattan_distance,median_name_emb_manhattan_distance,std_name_emb_manhattan_distance,name_levenshtein_distance,pic_dist_0_perc,pic_dist_25_perc,pic_dist_50_perc,main_pic_dist_0_perc,main_pic_dist_25_perc,main_pic_dist_50_perc,euclidean_main_pic_dist,cosine_main_pic_dist,euclidean_color_dist,cosine_color_dist,euclidean_name_bert_dist,cosine_name_bert_dist,euclidean_name_embedding_dist,cosine_name_embedding_dist
204167,558654336,785126987,"Кабель витая пара UTP 4 пары, чистая медь (BC)...","{""Коннектор 1"":[""RJ-45""],""Размеры, мм"":[""200x2...",31.0,98.0,Кабели и переходники,"Кабель витая пара UTP 4 пары, чистая медь (BC)...","{""Комплектация"":[""Устройство - 1шт""],""Материал...",31.0,98.0,-8.2e-05,-0.000167,0.015456,-0.012098,-0.002774,0.055224,0.0,0.0,0.0,0.904762,0.933333,0.5,20.0,22.0,20.0,522.2,218.2,"[0.011884868, 0.13925996, 0.0013739765, 0.0674...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.001141429, 0.003529057, 0.0015780926, 0.001...",0.044214,0.039085,0.03523,0.0,0.0,0.0,0.012323,0.010445,0.00933,2,0.0,4.052639,8.038209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452269,0.006031,0.428349,0.000438


In [42]:
# Build the CatBoost Pools for training and for evaluation.
y_train2 = y_train['target']
y_my_test2 = y_my_test['target']
# Vector-valued columns (element-wise absolute differences of the embeddings)
# that are passed to CatBoost as embedding features.
embeddin_col = ['ozon_bert_manhattan_distance','main_pic_manhattan_distance','name_emb_manhattan_distance'] 

# The product names and raw attribute strings are passed as text features.
train_pool = Pool(
    data=X_train[feats],
    label=y_train2,
    text_features=['characteristic_attributes_mapping1', 'characteristic_attributes_mapping2', 'name1', 'name2'],
    embedding_features=embeddin_col
)
# Same feature layout for the local hold-out set.
eval_pool = Pool(
    data=X_my_test[feats],
    label=y_my_test2,
    text_features=['characteristic_attributes_mapping1', 'characteristic_attributes_mapping2', 'name1', 'name2'],
    embedding_features=embeddin_col
)

In [43]:
%%time
# Train the model.
model = CatBoostClassifier(random_state=RANDOM_STATE, iterations=2500, learning_rate=0.1) #300

# eval_pool drives both early stopping (50 rounds without improvement) and the
# choice of the best iteration (use_best_model=True).
# NOTE(review): if the same hold-out set is later used to report the final metric,
# that estimate will be optimistic — confirm a separate set is used for it.
model.fit(
    train_pool,
    eval_set=eval_pool,
    plot=True,
    verbose=True,
    use_best_model=True,
    early_stopping_rounds=50,
    metric_period=10
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	learn: 0.6445739	test: 0.6437813	best: 0.6437813 (0)	total: 468ms	remaining: 19m 29s
10:	learn: 0.4727504	test: 0.4697385	best: 0.4697385 (10)	total: 5.29s	remaining: 19m 57s
20:	learn: 0.4425972	test: 0.4395304	best: 0.4395304 (20)	total: 10.3s	remaining: 20m 20s
30:	learn: 0.4314097	test: 0.4287253	best: 0.4287253 (30)	total: 15.1s	remaining: 20m 5s
40:	learn: 0.4232735	test: 0.4207176	best: 0.4207176 (40)	total: 20s	remaining: 19m 59s
50:	learn: 0.4184407	test: 0.4162416	best: 0.4162416 (50)	total: 24.9s	remaining: 19m 56s
60:	learn: 0.4147771	test: 0.4127897	best: 0.4127897 (60)	total: 29.7s	remaining: 19m 47s
70:	learn: 0.4117368	test: 0.4100015	best: 0.4100015 (70)	total: 34.5s	remaining: 19m 39s
80:	learn: 0.4088645	test: 0.4074205	best: 0.4074205 (80)	total: 39.4s	remaining: 19m 35s
90:	learn: 0.4067646	test: 0.4056701	best: 0.4056701 (90)	total: 44.2s	remaining: 19m 30s
100:	learn: 0.4049976	test: 0.4041666	best: 0.4041666 (100)	total: 48.9s	remaining: 19m 21s
110:	learn: 0

890:	learn: 0.3539120	test: 0.3762197	best: 0.3762031 (888)	total: 7m 3s	remaining: 12m 45s
900:	learn: 0.3535437	test: 0.3761036	best: 0.3761036 (900)	total: 7m 8s	remaining: 12m 41s
910:	learn: 0.3531878	test: 0.3760086	best: 0.3760086 (910)	total: 7m 13s	remaining: 12m 36s
920:	learn: 0.3527895	test: 0.3758719	best: 0.3758719 (920)	total: 7m 18s	remaining: 12m 31s
930:	learn: 0.3524201	test: 0.3758254	best: 0.3758254 (930)	total: 7m 23s	remaining: 12m 26s
940:	learn: 0.3520523	test: 0.3756878	best: 0.3756878 (940)	total: 7m 27s	remaining: 12m 22s
950:	learn: 0.3516439	test: 0.3755616	best: 0.3755544 (949)	total: 7m 32s	remaining: 12m 17s
960:	learn: 0.3512945	test: 0.3754333	best: 0.3754333 (960)	total: 7m 37s	remaining: 12m 12s
970:	learn: 0.3509557	test: 0.3753849	best: 0.3753799 (962)	total: 7m 42s	remaining: 12m 7s
980:	learn: 0.3506090	test: 0.3754230	best: 0.3753799 (962)	total: 7m 47s	remaining: 12m 3s
990:	learn: 0.3502728	test: 0.3753594	best: 0.3753594 (990)	total: 7m 51s	

<catboost.core.CatBoostClassifier at 0x1be794ef430>

## *Metric*

In [44]:
# PR-AUC macro metric: category-weighted area under the high-precision part of the PR curve
def pr_auc_macro(
    target_df: pd.DataFrame,
    predictions_df: pd.DataFrame,
    prec_level: float = 0.75,
    cat_column: str = "cat3_grouped"
) -> float:
    """Compute the category-weighted partial PR-AUC of pair predictions.

    The two frames are inner-joined on ``variantid1`` / ``variantid2``; the joined
    frame must expose the columns ``target``, ``scores`` and ``cat_column``.

    For every distinct value of ``cat_column``:

    * the weight is the share of joined rows that belong to the category;
    * the score is the area under the precision-recall curve from recall 0 up to
      the rightmost curve point whose precision is at least ``prec_level``
      (all points up to that one are integrated, including any in between
      where precision dips below the level);
    * the score is 0 when the category has no positive targets, when at most one
      curve point reaches ``prec_level``, or when the area cannot be computed
      (``auc`` raises ``ValueError`` or returns NaN).

    Args:
        target_df: frame with the pair ids and the ground-truth ``target``.
        predictions_df: frame with the pair ids and the predicted ``scores``.
        prec_level: minimal precision a curve point must have to be counted.
        cat_column: name of the column holding the category of each pair.

    Returns:
        Weighted average of the per-category scores.
    """
    df = target_df.merge(predictions_df, on=["variantid1", "variantid2"])

    # Plain arrays: the boolean masks below are positional and must not depend
    # on the index labels of the merged frame.
    y_true = df["target"].to_numpy()
    y_pred = df["scores"].to_numpy()
    categories = df[cat_column].to_numpy()

    unique_cats, counts = np.unique(categories, return_counts=True)

    weights = []
    pr_aucs = []

    for category, count in zip(unique_cats, counts):
        # Exactly one weight and one score are appended per category, so the two
        # lists can never get out of step (the NaN case used to skip the score).
        weights.append(count / len(categories))

        in_category = categories == category
        y_true_cat = y_true[in_category]
        y_pred_cat = y_pred[in_category]

        score = 0
        # a category without any positive target scores 0
        if y_true_cat.sum() != 0:
            precision, recall, _ = precision_recall_curve(y_true_cat, y_pred_cat)

            # reverse so that recall runs in ascending order (left to right)
            precision = precision[::-1]
            recall = recall[::-1]

            # curve points whose precision reaches the required level
            good_idx = np.where(precision >= prec_level)[0]

            # A single such point is just the recall=0 end of the curve: the
            # curve never rises above the required precision, so the score stays 0.
            if len(good_idx) > 1:
                stop = good_idx[-1] + 1
                try:
                    area = auc(recall[:stop], precision[:stop])
                except ValueError:
                    area = 0
                if not np.isnan(area):
                    score = area

        pr_aucs.append(score)

    return np.average(pr_aucs, weights=weights)

## Расчет pr_auc на обучающей и на нашей тестовой выборке

In [45]:
%%time
# Metrics on the training split.
# The positive-class probability is stored next to the features because
# pr_auc_macro reads it from the "scores" column; hard labels come from predict().
X_train["scores"] = model.predict_proba(train_pool)[:, 1]  # for cat
pred_tr = model.predict(train_pool)

pr_auc_macro_metr = pr_auc_macro(y_train, X_train)
display('PROCtrain:', pr_auc_macro_metr)

# AUC is computed from the probabilities, the remaining metrics from the hard labels
for metric_label, metric_fn, estimate in (
    ('AUC:', roc_auc_score, X_train["scores"]),
    ('Accuracy:', acc, pred_tr),
    ('F1:', f1_score, pred_tr),
    ('Precision:', precision_score, pred_tr),
):
    print(metric_label, metric_fn(y_train['target'], estimate))

'PROCtrain:'

0.7517883245096232

AUC: 0.9341463199589175
Accuracy: 0.8668033009884828
F1: 0.8492645700516012
Precision: 0.845927379784102
CPU times: total: 14min 7s
Wall time: 42.2 s


In [46]:
%%time
# Metrics on our own hold-out split.
# The positive-class probability is stored next to the features because
# pr_auc_macro reads it from the "scores" column; hard labels come from predict().
X_my_test["scores"] = model.predict_proba(eval_pool)[:, 1]  # for cat
pred_ts = model.predict(eval_pool)

pr_auc_macro_metr = pr_auc_macro(y_my_test, X_my_test)
display('PROCeval:', pr_auc_macro_metr)

# AUC is computed from the probabilities, the remaining metrics from the hard labels
for metric_label, metric_fn, estimate in (
    ('AUC:', roc_auc_score, X_my_test["scores"]),
    ('Accuracy:', acc, pred_ts),
    ('F1:', f1_score, pred_ts),
    ('Precision:', precision_score, pred_ts),
):
    print(metric_label, metric_fn(y_my_test['target'], estimate))

'PROCeval:'

0.660094681924286

AUC: 0.9116826808205767
Accuracy: 0.8421533740328425
F1: 0.8217511520737327
Precision: 0.8167827042872847
CPU times: total: 1min 33s
Wall time: 5.02 s


In [47]:
# Inspect the feature importances of the fitted model; prettified=True is passed
# so the result is shown as a table with "Feature Id" and "Importances" columns.
fi = model.get_feature_importance(prettified=True)
fi

Unnamed: 0,Feature Id,Importances
0,name2,15.606899
1,name1,12.751022
2,name_jaccard_similarity,8.675418
3,characteristic_attributes_mapping2,6.655387
4,name_emb_manhattan_distance,6.098602
5,characteristic_attributes_mapping1,5.847714
6,attr_dist,5.100949
7,name_levenshtein_distance,5.020778
8,ozon_bert_manhattan_distance,3.077709
9,attributes_jaccard_similarity,2.116221


## Финальное обучение на всех размеченных данных

In [48]:
# Whole labelled set: the target column becomes the label, everything else the inputs.
y_all = features["target"]
X_all = features.drop(columns=["target"])

# Same Pool layout as used for training: only the columns listed in `feats`,
# with the text and embedding columns declared explicitly.
pool_all = Pool(
    data=X_all[feats],
    label=y_all,
    text_features=[
        'characteristic_attributes_mapping1',
        'characteristic_attributes_mapping2',
        'name1',
        'name2',
    ],
    embedding_features=embeddin_col,
)

In [49]:
# Refit the model on the whole labelled set.
# No eval_set is passed here, so the validation-driven options are left out:
# CatBoost switches use_best_model off in that case (it prints a warning saying so)
# and early stopping has no validation metric to monitor.
model.fit(
    pool_all,
    plot=True,
    verbose=True,
    metric_period=10
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.6439947	total: 430ms	remaining: 17m 53s
10:	learn: 0.4727322	total: 5.35s	remaining: 20m 11s
20:	learn: 0.4417225	total: 10.4s	remaining: 20m 29s
30:	learn: 0.4292237	total: 15.4s	remaining: 20m 29s
40:	learn: 0.4218881	total: 20.3s	remaining: 20m 18s
50:	learn: 0.4176330	total: 25.1s	remaining: 20m 3s
60:	learn: 0.4133479	total: 29.7s	remaining: 19m 49s
70:	learn: 0.4105857	total: 34.5s	remaining: 19m 39s
80:	learn: 0.4078973	total: 39.3s	remaining: 19m 34s
90:	learn: 0.4061246	total: 44.1s	remaining: 19m 27s
100:	learn: 0.4045260	total: 48.9s	remaining: 19m 20s
110:	learn: 0.4030569	total: 53.7s	remaining: 19m 14s
120:	learn: 0.4017401	total: 58.3s	remaining: 19m 6s
130:	learn: 0.4003393	total: 1m 3s	remaining: 19m 1s
140:	learn: 0.3987428	total: 1m 7s	remaining: 18m 56s
150:	learn: 0.3974103	total: 1m 12s	remaining: 18m 50s
160:	learn: 0.3961225	total: 1m 17s	remaining: 18m 45s
170:	learn: 0.3949854	total: 1m 22s	remaining: 18m 38s
180:	learn: 0.3938653	total: 1m 26s	rem

1500:	learn: 0.3373456	total: 11m 53s	remaining: 7m 54s
1510:	learn: 0.3371041	total: 11m 58s	remaining: 7m 49s
1520:	learn: 0.3368404	total: 12m 2s	remaining: 7m 45s
1530:	learn: 0.3365817	total: 12m 7s	remaining: 7m 40s
1540:	learn: 0.3363250	total: 12m 11s	remaining: 7m 35s
1550:	learn: 0.3360719	total: 12m 16s	remaining: 7m 30s
1560:	learn: 0.3358150	total: 12m 21s	remaining: 7m 25s
1570:	learn: 0.3355407	total: 12m 25s	remaining: 7m 21s
1580:	learn: 0.3352680	total: 12m 30s	remaining: 7m 16s
1590:	learn: 0.3350041	total: 12m 35s	remaining: 7m 11s
1600:	learn: 0.3347301	total: 12m 40s	remaining: 7m 6s
1610:	learn: 0.3344755	total: 12m 45s	remaining: 7m 2s
1620:	learn: 0.3341319	total: 12m 50s	remaining: 6m 57s
1630:	learn: 0.3338773	total: 12m 54s	remaining: 6m 52s
1640:	learn: 0.3335962	total: 12m 59s	remaining: 6m 47s
1650:	learn: 0.3333276	total: 13m 4s	remaining: 6m 43s
1660:	learn: 0.3330973	total: 13m 8s	remaining: 6m 38s
1670:	learn: 0.3328370	total: 13m 13s	remaining: 6m 33

<catboost.core.CatBoostClassifier at 0x1be794ef430>

## Submission. Расчет на финальной тестовой

### *Предобработка тестовой выборки*

In [50]:
%%time

# Second argument of cat3_grouped for the test data
# (presumably the rare-category threshold; confirm against cat3_grouped).
REST_test = 50

# Item-level preparation of the test items with the project helpers, in this exact order;
# the already fitted `encoder` is only applied here, not refitted.
test_data = new_col(test_data)
test_data = cat3_grouped(test_data, REST_test)
test_data[cat_col] = encoder.transform(test_data[cat_col])
test_data = color(test_data)

# Join the pair list with the item data
features_test = merge_f(test_pairs_wo_target, test_data)

# compare_attr is evaluated once per pair; its result is spread over six columns
attr_feature_cols = [
    'attr_dist',
    'attr_acc',
    'common_keys',
    'same_value_keys',
    'mean_value1',
    'mean_value2',
]
features_test[attr_feature_cols] = (
    features_test[['attributes_set1', 'attributes_set2']]
    .progress_apply(lambda pair: pd.Series(compare_attr(*pair)), axis=1)
)

# Missing mean values are replaced with 0
mean_value_cols = ['mean_value1', 'mean_value2']
features_test[mean_value_cols] = features_test[mean_value_cols].fillna(0)

449


100%|██████████| 18084/18084 [00:04<00:00, 3739.18it/s]

CPU times: total: 19.9 s
Wall time: 19.9 s





In [51]:
# Keep only element 0 of each main-picture embedding cell
# (new column -> source column).
first_embedding_columns = {
    'main_pic_embeddings_resnet_v1_new1': 'main_pic_embeddings_resnet_v11',
    'main_pic_embeddings_resnet_v1_new2': 'main_pic_embeddings_resnet_v12',
}
for new_col_name, source_col_name in first_embedding_columns.items():
    features_test[new_col_name] = features_test[source_col_name].apply(lambda x: x[0])

In [52]:
%%time
# Add statistical features of the difference between the two items' embeddings
# to features_test. Each spec is (output columns, left embedding, right embedding):
#   1. product-name embeddings,
#   2. product-name embeddings from ozon,
#   3. main-picture embeddings.
embedding_diff_specs = [
    (
        ['name_mean_diff', 'name_median_diff', 'name_std_diff'],
        'name_bert7681', 'name_bert7682',
    ),
    (
        ['ozon_name_mean_diff', 'ozon_name_median_diff', 'ozon_name_std_diff'],
        'name_bert_641', 'name_bert_642',
    ),
    (
        ['main_pic_mean_diff', 'main_pic_median_diff', 'main_pic_std_diff'],
        'main_pic_embeddings_resnet_v1_new1', 'main_pic_embeddings_resnet_v1_new2',
    ),
]

for stat_cols, left_col, right_col in embedding_diff_specs:
    # progress_apply runs eagerly, so the lambda always sees the current spec
    features_test[stat_cols] = features_test.progress_apply(
        lambda row: calculate_statistical_features(row[left_col], row[right_col]),
        axis=1,
        result_type='expand',
    )

100%|██████████| 18084/18084 [00:03<00:00, 5072.19it/s]
100%|██████████| 18084/18084 [00:02<00:00, 6828.38it/s]
100%|██████████| 18084/18084 [00:02<00:00, 6764.32it/s]

CPU times: total: 9.03 s
Wall time: 8.93 s





In [53]:
%%time
# Add Jaccard-similarity columns to features_test:
# one between the product names, one between the product attributes.
jaccard_specs = [
    ('name_jaccard_similarity', 'name1', 'name2'),
    (
        'attributes_jaccard_similarity',
        'characteristic_attributes_mapping1',
        'characteristic_attributes_mapping2',
    ),
]

for similarity_col, left_col, right_col in jaccard_specs:
    # progress_apply runs eagerly, so the lambda always sees the current spec
    features_test[similarity_col] = features_test.progress_apply(
        lambda row: calculate_jaccard_similarity(row[left_col], row[right_col]),
        axis=1,
    )

100%|██████████| 18084/18084 [00:29<00:00, 604.30it/s]
100%|██████████| 18084/18084 [00:36<00:00, 499.34it/s]

CPU times: total: 1min 6s
Wall time: 1min 6s





In [54]:
%%time
# Pairwise distance features. Each spec is (output columns, input column pair, helper):
# the helper receives the two cell values of a row and its result is spread
# over the output columns.
distance_specs = [
    (
        ["pic_dist_0_perc", "pic_dist_25_perc", "pic_dist_50_perc"],
        ["pic_embeddings_resnet_v11", "pic_embeddings_resnet_v12"],
        get_pic_features_func,
    ),
    (
        ["main_pic_dist_0_perc", "main_pic_dist_25_perc", "main_pic_dist_50_perc"],
        ["main_pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v12"],
        get_pic_features_func,
    ),
    (
        ["euclidean_main_pic_dist", "cosine_main_pic_dist"],
        ["main_pic_embeddings_resnet_v1_new1", "main_pic_embeddings_resnet_v1_new2"],
        text_dense_distances,
    ),
    (
        ["euclidean_color_dist", "cosine_color_dist"],
        ["clr_vect1", "clr_vect2"],
        text_dense_distances,
    ),
    (
        ["euclidean_name_bert_dist", "cosine_name_bert_dist"],
        ["name_bert_641", "name_bert_642"],
        text_dense_distances,
    ),
    (
        ["euclidean_name_embedding_dist", "cosine_name_embedding_dist"],
        ["name_bert7681", "name_bert7682"],
        text_dense_distances,
    ),
]

for output_cols, input_cols, distance_fn in distance_specs:
    # progress_apply runs eagerly, so the lambda always sees the current helper
    features_test[output_cols] = features_test[input_cols].progress_apply(
        lambda pair: pd.Series(distance_fn(*pair)), axis=1
    )


100%|██████████| 18084/18084 [00:09<00:00, 1962.37it/s]
100%|██████████| 18084/18084 [00:11<00:00, 1519.89it/s]
100%|██████████| 18084/18084 [00:04<00:00, 3990.33it/s]
100%|██████████| 18084/18084 [00:04<00:00, 3912.60it/s]
100%|██████████| 18084/18084 [00:06<00:00, 2823.28it/s]
100%|██████████| 18084/18084 [00:06<00:00, 2917.60it/s]

CPU times: total: 43.3 s
Wall time: 43 s





In [55]:
# Manhattan distance between the two items of every pair in features_test.
# Each spec is (output column, left embedding column, right embedding column):
#   - name_bert_641 vs name_bert_642,
#   - main_pic_embeddings_resnet_v1_new1 vs main_pic_embeddings_resnet_v1_new2,
#   - name_bert7681 vs name_bert7682.
manhattan_specs = [
    ('ozon_bert_manhattan_distance', 'name_bert_641', 'name_bert_642'),
    (
        'main_pic_manhattan_distance',
        'main_pic_embeddings_resnet_v1_new1',
        'main_pic_embeddings_resnet_v1_new2',
    ),
    ('name_emb_manhattan_distance', 'name_bert7681', 'name_bert7682'),
]

for distance_col, left_col, right_col in manhattan_specs:
    # progress_apply runs eagerly, so the lambda always sees the current spec
    features_test[distance_col] = features_test.progress_apply(
        lambda row: manhattan_distance(row[left_col], row[right_col]), axis=1
    )

100%|██████████| 18084/18084 [00:00<00:00, 32296.88it/s]
100%|██████████| 18084/18084 [00:00<00:00, 30587.97it/s]
100%|██████████| 18084/18084 [00:00<00:00, 23779.09it/s]


In [56]:
%%time
# Mean, median and std columns derived from each Manhattan-distance column.
# NOTE(review): calculate_statistics is called on every single cell value of the
# distance column; confirm this mirrors how the training features were built.
manhattan_distance_cols = [
    'ozon_bert_manhattan_distance',
    'main_pic_manhattan_distance',
    'name_emb_manhattan_distance',
]

for distance_col in manhattan_distance_cols:
    stat_cols = [
        "mean_" + distance_col,
        "median_" + distance_col,
        "std_" + distance_col,
    ]
    features_test[stat_cols] = features_test[distance_col].progress_apply(
        lambda value: pd.Series(calculate_statistics(value))
    )

100%|██████████| 18084/18084 [00:04<00:00, 4173.18it/s]
100%|██████████| 18084/18084 [00:04<00:00, 4105.39it/s]
100%|██████████| 18084/18084 [00:05<00:00, 3182.74it/s]

CPU times: total: 14.5 s
Wall time: 14.5 s





In [57]:
%%time
# Levenshtein distance between name1 and name2 of every pair
name_pairs = features_test[["name1", "name2"]]
features_test[["name_levenshtein_distance"]] = name_pairs.progress_apply(
    lambda pair: pd.Series(levenshtein_distance(pair["name1"], pair["name2"])),
    axis=1,
)

100%|██████████| 18084/18084 [02:40<00:00, 112.77it/s]

CPU times: total: 2min 40s
Wall time: 2min 40s





In [58]:
# Run the project's drop_f helper first, then fill the gaps in
# characteristic_attributes_mapping1 / characteristic_attributes_mapping2 with '{}'.
features_test = drop_f(features_test)

attribute_mapping_cols = [
    'characteristic_attributes_mapping1',
    'characteristic_attributes_mapping2',
]
features_test[attribute_mapping_cols] = features_test[attribute_mapping_cols].fillna('{}')

In [59]:
# Compare the column layout of the training features with the prepared test features.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
X_train.info()
features_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275675 entries, 204167 to 61494
Data columns (total 56 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   variantid1                           275675 non-null  int64  
 1   variantid2                           275675 non-null  int64  
 2   name1                                275675 non-null  object 
 3   characteristic_attributes_mapping1   275675 non-null  object 
 4   cat3                                 275675 non-null  float64
 5   cat41                                275675 non-null  float64
 6   cat3_grouped                         275675 non-null  object 
 7   name2                                275675 non-null  object 
 8   characteristic_attributes_mapping2   275675 non-null  object 
 9   cat32                                275675 non-null  float64
 10  cat42                                275675 non-null  float64
 11  name_mean

None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18084 entries, 0 to 18083
Data columns (total 58 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   variantid1                           18084 non-null  int64  
 1   variantid2                           18084 non-null  int64  
 2   cat3_grouped                         18084 non-null  object 
 3   name1                                18084 non-null  object 
 4   characteristic_attributes_mapping1   18084 non-null  object 
 5   cat21                                18084 non-null  object 
 6   cat3                                 18084 non-null  float64
 7   cat41                                18084 non-null  float64
 8   cat3_grouped                         18084 non-null  object 
 9   name2                                18084 non-null  object 
 10  characteristic_attributes_mapping2   18084 non-null  object 
 11  cat22                       

None

In [60]:
# Build the submission frame: pair ids plus the predicted probability of a match.
# Only the two id columns are copied; copying the whole feature frame (embeddings
# included) just to drop everything but three columns is wasted memory and time.
submission_example = features_test[["variantid1", "variantid2"]].copy()
submission_example["target"] = model.predict_proba(features_test[feats])[:, 1]
submission_example.head(3)

Unnamed: 0,variantid1,variantid2,target
0,52076340,290590137,0.253759
1,64525522,204128919,0.412551
2,77243372,479860557,0.590537


In [61]:
# Keep one prediction row per (variantid1, variantid2) pair present in features_test:
# fully identical prediction rows are dropped, then the result is inner-joined
# with the distinct id pairs.
pair_keys = ["variantid1", "variantid2"]
unique_pairs = features_test[pair_keys].drop_duplicates(pair_keys)

s = submission_example.drop_duplicates().merge(unique_pairs, on=pair_keys)

s.head(3)

Unnamed: 0,variantid1,variantid2,target
0,52076340,290590137,0.253759
1,64525522,204128919,0.412551
2,77243372,479860557,0.590537


In [62]:
# Sanity check: number of rows whose (variantid1, variantid2) pair repeats an earlier row
features_test.duplicated(["variantid1", "variantid2"]).sum()

0

In [63]:
# Sanity check: smallest predicted value in the submission
s.target.min()

0.004406284395753667

In [64]:
# Sanity check: largest predicted value in the submission
s.target.max()

0.9936165022287068

In [65]:
# Disabled on purpose: extra de-duplication by pair id, kept for reference only.
# s = s.drop_duplicates(["variantid1", "variantid2"])

In [66]:
# Write the submission file; index=False keeps the row index out of the CSV
s.to_csv("submission_Cat_LEVI_NEW_FEATURES.csv", index=False)

In [67]:
# Final check of the submission frame: row count, columns, dtypes and non-null counts
s.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18084 entries, 0 to 18083
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   variantid1  18084 non-null  int64  
 1   variantid2  18084 non-null  int64  
 2   target      18084 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 565.1 KB
