## Загрузка данных

### Импорты

In [1]:
!pip install catboost -q
!pip install shap -q

In [2]:
import json
import ast
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from scipy.spatial.distance import cosine, euclidean
from sklearn.metrics import pairwise_distances, roc_curve, precision_recall_curve
from sklearn.metrics import  auc, accuracy_score as acc, roc_auc_score, f1_score, precision_score
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

from tqdm import tqdm
import warnings

# настройки
tqdm.pandas()
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

# константы
RANDOM_STATE = 42

### Read/Head/Info

In [3]:
try:
    train_data = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/train_data.parquet')
    train_pairs = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/train_pairs.parquet')
    test_pairs_wo_target = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/test_pairs_wo_target.parquet')
    test_data = pd.read_parquet('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/test_data.parquet')
    submission_example3 = pd.read_csv('C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!!OzonHack3/submission_example3.csv')
except:    
    train_data = pd.read_parquet('train_data.parquet', engine='auto')
    train_pairs = pd.read_parquet('train_pairs.parquet', engine='auto')
    test_data = pd.read_parquet('test_data.parquet', engine='auto')
    test_pairs_wo_target = pd.read_parquet('test_pairs_wo_target.parquet', engine='auto')
    submission_example3 = pd.read_csv('submission_example.csv')

In [4]:
%%time
# Грузим дополнительные ембеддинги названий и характеристик
name_embs = pd.read_parquet('name_bert768.parquet')

CPU times: total: 42 s
Wall time: 2.98 s


In [5]:
%%time
embedding_columns = [str(i) for i in range(768)]

name_embs['name_embedding'] = name_embs[embedding_columns].apply(lambda row: np.array(row.values), axis=1)
embs_df = name_embs[['variantid', 'name_embedding']]

embs_df.head(1)

CPU times: total: 5.59 s
Wall time: 5.58 s


Unnamed: 0,variantid,name_embedding
0,51195767,"[-0.52616537, 0.5570387, 0.19487007, -0.363480..."


In [6]:
display(train_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457063 entries, 0 to 457062
Data columns (total 8 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          457063 non-null  int64 
 1   name                               457063 non-null  object
 2   categories                         457063 non-null  object
 3   color_parsed                       378652 non-null  object
 4   pic_embeddings_resnet_v1           303467 non-null  object
 5   main_pic_embeddings_resnet_v1      457063 non-null  object
 6   name_bert_64                       457063 non-null  object
 7   characteristic_attributes_mapping  457036 non-null  object
dtypes: int64(1), object(7)
memory usage: 27.9+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35730 entries, 0 to 35729
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   variantid                          35730 non-null  int64 
 1   name                               35730 non-null  object
 2   categories                         35730 non-null  object
 3   color_parsed                       26089 non-null  object
 4   pic_embeddings_resnet_v1           19977 non-null  object
 5   main_pic_embeddings_resnet_v1      35730 non-null  object
 6   name_bert_64                       35730 non-null  object
 7   characteristic_attributes_mapping  35726 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.2+ MB


None

In [7]:
train_data = train_data.merge(embs_df, on='variantid')
test_data = test_data.merge(embs_df, on='variantid')

display(train_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460084 entries, 0 to 460083
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          460084 non-null  int64 
 1   name                               460084 non-null  object
 2   categories                         460084 non-null  object
 3   color_parsed                       381142 non-null  object
 4   pic_embeddings_resnet_v1           305631 non-null  object
 5   main_pic_embeddings_resnet_v1      460084 non-null  object
 6   name_bert_64                       460084 non-null  object
 7   characteristic_attributes_mapping  460057 non-null  object
 8   name_embedding                     460084 non-null  object
dtypes: int64(1), object(8)
memory usage: 35.1+ MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38751 entries, 0 to 38750
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   variantid                          38751 non-null  int64 
 1   name                               38751 non-null  object
 2   categories                         38751 non-null  object
 3   color_parsed                       28579 non-null  object
 4   pic_embeddings_resnet_v1           22141 non-null  object
 5   main_pic_embeddings_resnet_v1      38751 non-null  object
 6   name_bert_64                       38751 non-null  object
 7   characteristic_attributes_mapping  38747 non-null  object
 8   name_embedding                     38751 non-null  object
dtypes: int64(1), object(8)
memory usage: 3.0+ MB


None

In [8]:
train_data = train_data.drop_duplicates("variantid")
test_data = test_data.drop_duplicates("variantid")

display(train_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457063 entries, 0 to 460083
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          457063 non-null  int64 
 1   name                               457063 non-null  object
 2   categories                         457063 non-null  object
 3   color_parsed                       378652 non-null  object
 4   pic_embeddings_resnet_v1           303467 non-null  object
 5   main_pic_embeddings_resnet_v1      457063 non-null  object
 6   name_bert_64                       457063 non-null  object
 7   characteristic_attributes_mapping  457036 non-null  object
 8   name_embedding                     457063 non-null  object
dtypes: int64(1), object(8)
memory usage: 34.9+ MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35730 entries, 0 to 38749
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   variantid                          35730 non-null  int64 
 1   name                               35730 non-null  object
 2   categories                         35730 non-null  object
 3   color_parsed                       26089 non-null  object
 4   pic_embeddings_resnet_v1           19977 non-null  object
 5   main_pic_embeddings_resnet_v1      35730 non-null  object
 6   name_bert_64                       35730 non-null  object
 7   characteristic_attributes_mapping  35726 non-null  object
 8   name_embedding                     35730 non-null  object
dtypes: int64(1), object(8)
memory usage: 2.7+ MB


None

In [9]:
def head_info(df):
    display(df.info())
    display(df.head(2))

    
# head_info(train_data)
# head_info(train_pairs)
# head_info(test_data)
# head_info(test_pairs_wo_target)
# head_info(submission_example3)

In [10]:
head_info(train_data)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 457063 entries, 0 to 460083
Data columns (total 9 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   variantid                          457063 non-null  int64 
 1   name                               457063 non-null  object
 2   categories                         457063 non-null  object
 3   color_parsed                       378652 non-null  object
 4   pic_embeddings_resnet_v1           303467 non-null  object
 5   main_pic_embeddings_resnet_v1      457063 non-null  object
 6   name_bert_64                       457063 non-null  object
 7   characteristic_attributes_mapping  457036 non-null  object
 8   name_embedding                     457063 non-null  object
dtypes: int64(1), object(8)
memory usage: 34.9+ MB


None

Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_embedding
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{""Номинальный ток, А"":[""10""],""Цвет товара"":[""о...","[-0.52616537, 0.5570387, 0.19487007, -0.363480..."
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...","{""Конструктивные особенности"":[""Магнитная конс...","[-0.64783597, -0.1698648, 0.4192899, -0.225463..."


## Предобработка данных

### Колонки cat3 и 4 и набор атрибутов

In [11]:
def new_col(train_data):


#  переводим текст категорий в словарь
#    train_data['st'] = train_data.characteristic_attributes_mapping.fillna('{}').apply(ast.literal_eval)
   
    train_data['attributes_set'] = train_data.characteristic_attributes_mapping.fillna('{}').apply(ast.literal_eval)
     
    train_data['cat3'] = train_data['categories'].apply(lambda x: json.loads(x).get('3'))  
    train_data['cat4'] = train_data['categories'].apply(lambda x: json.loads(x).get('4'))  
    
    train_data['main_pic_embeddings_resnet_v1_new'] = train_data['main_pic_embeddings_resnet_v1'].apply(lambda x: x[0])  # Переведем двумрные эмбеддинги в одномерные
    
    return train_data

In [12]:
%%time
train_data = new_col(train_data)

CPU times: total: 1min 37s
Wall time: 1min 37s


### Группировка категорий

In [13]:
REST_train = 1000   # меньше скольки, считаем категорию маленькую

In [14]:
%%time
def cat3_grouped(train_data, FOR_REST):
    cat3_counts = train_data["cat3"].value_counts().to_dict()

    cntr = 0
    for cat3 in cat3_counts:
        if cat3_counts[cat3] < FOR_REST:
            cntr += cat3_counts[cat3]   
    print(cntr)

    train_data["cat3_grouped"] = train_data["cat3"].apply(lambda x: x if cat3_counts[x] > FOR_REST else "rest")
    #train_data["cat3_grouped_num"]=train_data["cat3_grouped"]
    return train_data

CPU times: total: 0 ns
Wall time: 0 ns


In [15]:
%%time
train_data = cat3_grouped(train_data, REST_train)

cat_col = ['cat3', 'cat4'] # "cat3_grouped_num"
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=777)
encoder.fit(train_data[cat_col])

train_data[cat_col] = encoder.transform(train_data[cat_col])    

train_data.head(2)

11296
CPU times: total: 1.09 s
Wall time: 1.09 s


Unnamed: 0,variantid,name,categories,color_parsed,pic_embeddings_resnet_v1,main_pic_embeddings_resnet_v1,name_bert_64,characteristic_attributes_mapping,name_embedding,attributes_set,cat3,cat4,main_pic_embeddings_resnet_v1_new,cat3_grouped
0,51195767,"Удлинитель Партнер-Электро ПВС 2х0,75 ГОСТ,6A,...","{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",[оранжевый],,"[[0.04603629, 0.18839523, -0.09973055, -0.6636...","[-0.47045058, 0.67237014, 0.48984158, -0.54485...","{""Номинальный ток, А"":[""10""],""Цвет товара"":[""о...","[-0.52616537, 0.5570387, 0.19487007, -0.363480...","{'Номинальный ток, А': ['10'], 'Цвет товара': ...",93.0,284.0,"[0.04603629, 0.18839523, -0.09973055, -0.66368...","Сетевые фильтры, разветвители и удлинители"
1,53565809,Магнитный кабель USB 2.0 A (m) - USB Type-C (m...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Кабели ...",[красный],"[[0.26863545, -0.3130674, 0.29023397, 0.073978...","[[1.1471839, -0.665361, 0.7745614, 0.26716197,...","[-0.6575592, 0.6522429, 0.5426037, -0.54347897...","{""Конструктивные особенности"":[""Магнитная конс...","[-0.64783597, -0.1698648, 0.4192899, -0.225463...",{'Конструктивные особенности': ['Магнитная кон...,48.0,128.0,"[1.1471839, -0.665361, 0.7745614, 0.26716197, ...",Кабели и переходники


### Колонка цвет

In [16]:
def color(df):
       
    df['black'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('черный' in x) or ('black') in x else 0)
    df['silver'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('silver'in x) or ('серебристый' in x) or ('серебряный' in x) else 0)
    df['rose'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('pink' in x) or ('розовый'  in x) else 0)
    df['red'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('red' in x) or ('красный' in x) else 0)
    df['white'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('white' in x) or ('белый' in x) else 0)
    df['blue'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('blue' in x) or ('синий' in x) or ('голубой' in x)  else 0)
    df['multicol'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('разноцветный' in x) or ('цветной' in x) or ('многоцветный' in x) else 0)
    df['green'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('green' in x) or ('зеленый' in x) else 0)
    df['yellow'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('yellow' in x) or ('желтый' in x) else 0)
    df['gold'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('gold' in x) or ('золотой' in x) else 0)
    df['purple'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('purple' in x) or ('пурпурный' in x)  or ('сиреневый' in x) or ('фиолетовый' in x) else 0)
    df['orange'] = df['color_parsed'].astype(str).apply(lambda x: 1 if 'оранжевый' in x else 0)
    df['brown'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('коричневый' in x) or ('brown' in x) or ('темно-коричневый' in x) else 0)
    df['grey'] = df['color_parsed'].astype(str).apply(lambda x: 1 if ('серый' in x) or ('grey' in x) or ('темно-серый' in x) else 0)
    
    name_color = ['black', 
                  'silver', 
                  'rose', 
                  'red', 
                  'white', 
                  'blue', 
                  'multicol', 
                  'green', 
                  'yellow', 
                  'gold', 
                  'purple', 
                  'orange', 
                  'brown', 
                  'grey'] 
    
    df['clr_vect'] = df[name_color].values.tolist()
    df = df.drop(name_color, axis=1)
    df = df.drop(['color_parsed'], axis=1)
     
    return df

In [17]:
%%time
train_data = color(train_data)

CPU times: total: 1min 50s
Wall time: 1min 50s


### Объединение таблиц

In [18]:
def merge_f(pairs, data): 
    features = pairs.merge(data.add_suffix('1'), on="variantid1").merge(data.add_suffix('2'), on="variantid2")
    features.rename(columns={'cat31': 'cat3', 'cat3_grouped1': 'cat3_grouped'}, inplace=True)
    return features

In [19]:
%%time
features = merge_f(train_pairs, train_data)

features.head(1)

CPU times: total: 2.91 s
Wall time: 2.91 s


Unnamed: 0,target,variantid1,variantid2,name1,categories1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name_embedding1,attributes_set1,cat3,cat41,main_pic_embeddings_resnet_v1_new1,cat3_grouped,clr_vect1,name2,categories2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2,name_embedding2,attributes_set2,cat32,cat42,main_pic_embeddings_resnet_v1_new2,cat3_grouped2,clr_vect2
0,0.0,51197862,51198054,Удлинитель TDM Electric Люкс УЛ05В 5 м (SQ1303...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",,"[[-0.4304909, -0.49474272, -0.46439183, -0.060...","[-0.5104684, 0.56158644, 0.58873796, -0.529718...","{""Число жил"":[""3""],""Макс. нагрузка, Вт"":[""3500...","[-0.4590829, -0.11046589, 0.11674632, -0.17761...","{'Число жил': ['3'], 'Макс. нагрузка, Вт': ['3...",93.0,284.0,"[-0.4304909, -0.49474272, -0.46439183, -0.0609...","Сетевые фильтры, разветвители и удлинители","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",Удлинитель TDM Electric Люкс УЛ05В 1.5 м (SQ13...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Сетевые...",,"[[-0.42941108, -0.5129398, -0.4753536, -0.0677...","[-0.455473, 0.58157134, 0.5870387, -0.5325003,...","{""Электробезопасность"":[""Заземление""],""Длина к...","[-0.55869406, -0.13428268, 0.0906083, -0.18947...","{'Электробезопасность': ['Заземление'], 'Длина...",93.0,284.0,"[-0.42941108, -0.5129398, -0.4753536, -0.06778...","Сетевые фильтры, разветвители и удлинители","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


### Похожесть атрибутов

In [20]:
def calculate_statistical_features(embedding1, embedding2):
    
    # Вычисляем разность между двумя эмбеддингами
    diff = embedding1 - embedding2
    
    # Вычисляем среднее значение разности
    mean = np.mean(diff)
    
    # Вычисляем медиану разности
    median = np.median(diff)
    
    # Вычисляем стандартное отклонение разности
    std = np.std(diff)
    
    # Возвращаем вычисленные значения
    return mean, median, std

In [21]:
%%time
# Добавляем новые столбцы в датафрейм features с вычисленными статистическими характеристиками разности 
# между эмбеддингами названий товаров
features[['name_mean_diff', 'name_median_diff', 'name_std_diff']] = features.apply(
    lambda x: calculate_statistical_features(
        x['name_embedding1'], x['name_embedding2']), axis=1, result_type='expand'
)

# Добавляем новые столбцы в датафрейм features с вычисленными статистическими характеристиками разности 
# между эмбеддингами названий товаров от ozon
features[['ozon_name_mean_diff', 'ozon_name_median_diff', 'ozon_name_std_diff']] = features.apply(
    lambda x: calculate_statistical_features(
        x['name_bert_641'], x['name_bert_642']), axis=1, result_type='expand'
)

# Добавляем новые столбцы в датафрейм features с вычисленными статистическими характеристиками разности 
# между эмбеддингами картинок товаров
features[['main_pic_mean_diff', 'main_pic_median_diff', 'main_pic_std_diff']] = features.apply(
    lambda x: calculate_statistical_features(
        x['main_pic_embeddings_resnet_v1_new1'], x['main_pic_embeddings_resnet_v1_new2']), axis=1, result_type='expand'
)

CPU times: total: 2min 7s
Wall time: 2min 7s


In [22]:
def calculate_jaccard_similarity(text1, text2):
    # Заменяем значения None на пустые строки
    text1 = '' if text1 is None else text1
    text2 = '' if text2 is None else text2
    
    # Заменяем значения типа float на пустые строки
    text1 = '' if isinstance(text1, float) else text1
    text2 = '' if isinstance(text2, float) else text2
    
    # Инициализируем векторизатор с бинарными значениями
    vectorizer = CountVectorizer(binary=True)

    try:
        # Преобразуем тексты в векторы
        X = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Если после предобработки текстов не осталось слов для анализа,
        # возвращаем значение 0
        return 0
    
    # Преобразуем тексты в векторы
    X = vectorizer.fit_transform([text1, text2])
    
    # Вычисляем пересечение между векторами
    intersection = X[0].multiply(X[1]).sum()
    
    # Вычисляем объединение между векторами
    union = X.sum() - intersection
    
    # Возвращаем коэффициент Жаккара
    return intersection / union

In [23]:
%%time
# Добавляем новый столбец в датафрейм features с вычисленным коэффициентом Жаккара между названиями товаров
features['name_jaccard_similarity'] = features.apply(
    lambda x: calculate_jaccard_similarity(x['name1'], x['name2']), axis=1
)

# Добавляем новый столбец в датафрейм features с вычисленным коэффициентом Жаккара между атрибутами товаров
features['attributes_jaccard_similarity'] = features.apply(
    lambda x: calculate_jaccard_similarity(x['characteristic_attributes_mapping1'], x['characteristic_attributes_mapping2']), axis=1
)

CPU times: total: 17min 53s
Wall time: 17min 53s


In [24]:
def is_number(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

def compare_attr(set1, set2):
    keys1 = set(set1.keys())
    keys2 = set(set2.keys())
    
    # Отношение общих ключей к сумме ключей
    ratio = len(keys2 & keys1) / (len(keys2) + len(keys1))
    
    # Количество одинаковых значений в общих ключах
    d = 0
    for i in keys2 & keys1:
        if set1[i][0] == set2[i][0]:
            d += 1
    
    # Количество общих ключей
    common_keys = len(keys1 & keys2)
    
    # Количество ключей с одинаковыми значениями
    same_value_keys = sum(1 for key in keys1 & keys2 if set1[key] == set2[key])
    
    # Среднее значение числовых характеристик для первого товара
    mean_value1 = np.mean([float(value[0]) for key, value in set1.items() if is_number(value[0])])
    
    # Среднее значение числовых характеристик для второго товара
    mean_value2 = np.mean([float(value[0]) for key, value in set2.items() if is_number(value[0])])
    
    return ratio, d, common_keys, same_value_keys, mean_value1, mean_value2

In [25]:
%%time
features[['attr_dist', 
          'attr_acc', 
          'common_keys', 
          'same_value_keys', 
          'mean_value1', 
          'mean_value2']] = features[['attributes_set1', 'attributes_set2']].apply(
    lambda x: pd.Series(compare_attr(*x)), axis=1
)

CPU times: total: 1min 30s
Wall time: 1min 30s


In [26]:
def manhattan_distance(emb1, emb2):
    """
    Функция для расчета манхэттенского расстояния между двумя одномерными эмбеддингами в строке датафрейма
    :return: манхэттенское расстояние между эмбеддингами в строке
    """
    return np.abs(emb1 - emb2)

In [27]:
# Расчет манхэттенского расстояния между признаками name_bert_641 и name_bert_642
features['ozon_bert_manhattan_distance'] = features.apply(
    lambda x: manhattan_distance(x['name_bert_641'], x['name_bert_642']), axis=1
)

# Расчет манхэттенского расстояния между признаками main_pic_embeddings_resnet_v1_new1 и main_pic_embeddings_resnet_v1_new2
features['main_pic_manhattan_distance'] = features.apply(
    lambda x: manhattan_distance(x['main_pic_embeddings_resnet_v1_new1'], x['main_pic_embeddings_resnet_v1_new2']), axis=1
)

# Расчет манхэттенского расстояния между признаками name_embedding1 и name_embedding2
features['name_emb_manhattan_distance'] = features.apply(
    lambda x: manhattan_distance(x['name_embedding1'], x['name_embedding2']), axis=1
)

In [28]:
def calculate_statistics(arr):
    """
    Функция для расчета статистических мер для массива
    :param arr: массив значений
    :return: кортеж из среднего значения, медианы и стандартного отклонения
    """
    mean = np.mean(arr)
    median = np.median(arr)
    std = np.std(arr)
    
    return mean, median, std

In [29]:
%%time
# расчитываем среднее, медиану и стд для ozon_bert_manhattan_distance
features[["mean_ozon_bert_manhattan_distance", 
          "median_ozon_bert_manhattan_distance", 
          "std_ozon_bert_manhattan_distance"]] = (features['ozon_bert_manhattan_distance'].apply(
        lambda x: pd.Series(calculate_statistics(x))))

# расчитываем среднее, медиану и стд для main_pic_manhattan_distance
features[["mean_main_pic_manhattan_distance", 
          "median_main_pic_manhattan_distance", 
          "std_main_pic_manhattan_distance"]] = (features['main_pic_manhattan_distance'].apply(
        lambda x: pd.Series(calculate_statistics(x))))

# расчитываем среднее, медиану и стд для name_emb_manhattan_distance
features[["mean_name_emb_manhattan_distance", 
          "median_name_emb_manhattan_distance", 
          "std_name_emb_manhattan_distance"]] = (features['name_emb_manhattan_distance'].apply(
        lambda x: pd.Series(calculate_statistics(x))))

CPU times: total: 4min 2s
Wall time: 4min 2s


In [30]:
from nltk.metrics.distance import edit_distance

def levenshtein_distance(text1, text2):
    """
    Функция для расчета расстояния Левенштейна между двумя текстами
    :param text1: первый текст
    :param text2: второй текст
    :return: расстояние Левенштейна между text1 и text2
    """
    return edit_distance(text1, text2)

In [31]:
%%time
# расчитаем расстояния Левенштейна для name1 и name2
features[["name_levenshtein_distance"]] = (
    features[["name1", "name2"]].progress_apply(
        lambda x: pd.Series(levenshtein_distance(*x)), axis=1))

100%|██████████| 306540/306540 [54:52<00:00, 93.10it/s]  

CPU times: total: 54min 54s
Wall time: 54min 53s





In [32]:
features.isna().sum()

target                                     0
variantid1                                 0
variantid2                                 0
name1                                      0
categories1                                0
pic_embeddings_resnet_v11              91004
main_pic_embeddings_resnet_v11             0
name_bert_641                              0
characteristic_attributes_mapping1        17
name_embedding1                            0
attributes_set1                            0
cat3                                       0
cat41                                      0
main_pic_embeddings_resnet_v1_new1         0
cat3_grouped                               0
clr_vect1                                  0
name2                                      0
categories2                                0
pic_embeddings_resnet_v12              95047
main_pic_embeddings_resnet_v12             0
name_bert_642                              0
characteristic_attributes_mapping2        13
name_embed

In [33]:
features['mean_value1'] = features['mean_value1'].fillna(0)
features['mean_value2'] = features['mean_value2'].fillna(0)

### Введем расстояния 

In [34]:
def get_pic_features(main_pic_embeddings_1,
                     main_pic_embeddings_2,
                     percentiles: List[int]):
    """Calculate distances percentiles for 
    pairwise pic distances. Percentiles are useful 
    when product has several pictures.
    """
    
    if main_pic_embeddings_1 is not None and main_pic_embeddings_2 is not None:
        main_pic_embeddings_1 = np.array([x for x in main_pic_embeddings_1])
        main_pic_embeddings_2 = np.array([x for x in main_pic_embeddings_2])
        
        dist_m = pairwise_distances(
            main_pic_embeddings_1, main_pic_embeddings_2
        )
    else:
        dist_m = np.array([[-1]])

    pair_features = []
    pair_features += np.percentile(dist_m, percentiles).tolist()

    return pair_features


def text_dense_distances(ozon_embedding, comp_embedding):
    """Calculate Euclidean and Cosine distances between
    ozon_embedding and comp_embedding.
    """
    pair_features = []
    if ozon_embedding is None or comp_embedding is None:
        pair_features = [-1, -1]
    
    elif len(ozon_embedding) == 0 or len(comp_embedding) == 0:
        pair_features = [-1, -1]
    
    else:
        pair_features.append(
            euclidean(ozon_embedding, comp_embedding)
        )
        cosine_value = cosine(ozon_embedding, comp_embedding)
        
        pair_features.append(cosine_value)

    return pair_features

In [35]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306540 entries, 0 to 306539
Data columns (total 59 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   target                               306540 non-null  float64
 1   variantid1                           306540 non-null  int64  
 2   variantid2                           306540 non-null  int64  
 3   name1                                306540 non-null  object 
 4   categories1                          306540 non-null  object 
 5   pic_embeddings_resnet_v11            215536 non-null  object 
 6   main_pic_embeddings_resnet_v11       306540 non-null  object 
 7   name_bert_641                        306540 non-null  object 
 8   characteristic_attributes_mapping1   306523 non-null  object 
 9   name_embedding1                      306540 non-null  object 
 10  attributes_set1                      306540 non-null  object 
 11  cat3         

In [36]:
%%time
get_pic_features_func = partial(get_pic_features, percentiles=[0, 25, 50])

features[["pic_dist_0_perc", "pic_dist_25_perc", "pic_dist_50_perc"]] = (
    features[["pic_embeddings_resnet_v11", "pic_embeddings_resnet_v12"]].apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1))

features[["main_pic_dist_0_perc", "main_pic_dist_25_perc", "main_pic_dist_50_perc"]] = (
    features[["main_pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v12"]].apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1))

features[["euclidean_main_pic_dist", "cosine_main_pic_dist"]] = (
    features[["main_pic_embeddings_resnet_v1_new1", "main_pic_embeddings_resnet_v1_new2"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))

features[["euclidean_color_dist", "cosine_color_dist"]] = (
    features[["clr_vect1", "clr_vect2"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))

features[["euclidean_name_bert_dist", "cosine_name_bert_dist"]] = (
    features[["name_bert_641", "name_bert_642"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))

features[["euclidean_name_embedding_dist", "cosine_name_embedding_dist"]] = (
    features[["name_embedding1", "name_embedding2"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))


numeric_pic = ["pic_dist_0_perc", 
               "pic_dist_25_perc", 
               "pic_dist_50_perc", 
               'main_pic_dist_0_perc', 
               'main_pic_dist_25_perc', 
               'main_pic_dist_50_perc', 
               'euclidean_main_pic_dist', 
               'cosine_main_pic_dist']  # ['cat3','cat4','cat3_grouped']

CPU times: total: 11min 35s
Wall time: 11min 35s


###  Удаление ненужных колонок

In [37]:
# drop

def drop_f(features):
    features = features.drop([
        'name_bert_641', 
        'name_bert_642', 
        'main_pic_embeddings_resnet_v11', 
        'main_pic_embeddings_resnet_v12',
        'main_pic_embeddings_resnet_v1_new1', 
        'main_pic_embeddings_resnet_v1_new2',
        'name_embedding1', 
        'name_embedding2',
        'categories1',
        'categories2',
        'pic_embeddings_resnet_v11',
        'pic_embeddings_resnet_v12',       
        'cat3_grouped2',
        'attributes_set1', 
        'attributes_set2', 
        'clr_vect1', 
        'clr_vect2'], axis=1)  
    
    return features

In [38]:
features = drop_f(features)

In [39]:
features[['characteristic_attributes_mapping1', 
          'characteristic_attributes_mapping2']] = features[['characteristic_attributes_mapping1', 
                                                             'characteristic_attributes_mapping2']].fillna('{}')

## Загрузка данных

In [40]:
%%time
#features.to_csv("features_Catb26.csv") #, index=False

CPU times: total: 0 ns
Wall time: 0 ns


In [41]:
%%time
#features=pd.read_csv("features_Catb26n.csv",index_col=0)
#head_info(features)

CPU times: total: 0 ns
Wall time: 0 ns


In [42]:
# features = features.drop([
#     'pic_dist_25_perc', 
#     'cat42', 
#     'pic_dist_50_perc', 
#     'main_pic_median_diff',
#     'cat41', 
#     'main_pic_mean_diff',
#     'ozon_name_mean_diff', 
#     'ozon_name_median_diff',
#     'name_median_diff',
#     'name_mean_diff'], axis=1) 

###  Отделение тестовой выборки

In [43]:
X_train, X_my_test = train_test_split(features, test_size=0.1, \
                                random_state=RANDOM_STATE, stratify=features[["target", "cat3_grouped"]])

y_train = X_train[["target", "variantid1", "variantid2"]]                      #[["target", "variantid1", "variantid2"]]
X_train = X_train.drop(["target"], axis=1)

y_my_test = X_my_test[["target", "variantid1", "variantid2"]]                  #[["target", "variantid1", "variantid2"]]                 
X_my_test = X_my_test.drop(["target"], axis=1)


feats = list(X_train.columns) 
feats.remove("variantid1")
feats.remove("variantid2")
feats.remove("cat3_grouped")

#feats.remove("cat3_grouped_num1")
#feats.remove("cat3_grouped_num2")

# feats.remove("name2")
# feats.remove("name1")
# feats.remove("characteristic_attributes_mapping1")
# feats.remove("characteristic_attributes_mapping2")
X_train[feats].info() # на чем обучать

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275886 entries, 94531 to 139533
Data columns (total 52 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   name1                                275886 non-null  object 
 1   characteristic_attributes_mapping1   275886 non-null  object 
 2   cat3                                 275886 non-null  float64
 3   cat41                                275886 non-null  float64
 4   name2                                275886 non-null  object 
 5   characteristic_attributes_mapping2   275886 non-null  object 
 6   cat32                                275886 non-null  float64
 7   cat42                                275886 non-null  float64
 8   name_mean_diff                       275886 non-null  float32
 9   name_median_diff                     275886 non-null  float32
 10  name_std_diff                        275886 non-null  float32
 11  ozon_name

## Model

###  CatBoostClassifier

In [44]:
#X_train.drop(columns=['name_bert_642','name_bert_641','main_pic_embeddings_resnet_v11','main_pic_embeddings_resnet_v12'],axis=1)
#X_my_test.drop(columns=['name_bert_642','name_bert_641','main_pic_embeddings_resnet_v11','main_pic_embeddings_resnet_v12'],axis=1)

In [45]:
X_train.head(1)

Unnamed: 0,variantid1,variantid2,name1,characteristic_attributes_mapping1,cat3,cat41,cat3_grouped,name2,characteristic_attributes_mapping2,cat32,cat42,name_mean_diff,name_median_diff,name_std_diff,ozon_name_mean_diff,ozon_name_median_diff,ozon_name_std_diff,main_pic_mean_diff,main_pic_median_diff,main_pic_std_diff,name_jaccard_similarity,attributes_jaccard_similarity,attr_dist,attr_acc,common_keys,same_value_keys,mean_value1,mean_value2,ozon_bert_manhattan_distance,main_pic_manhattan_distance,name_emb_manhattan_distance,mean_ozon_bert_manhattan_distance,median_ozon_bert_manhattan_distance,std_ozon_bert_manhattan_distance,mean_main_pic_manhattan_distance,median_main_pic_manhattan_distance,std_main_pic_manhattan_distance,mean_name_emb_manhattan_distance,median_name_emb_manhattan_distance,std_name_emb_manhattan_distance,name_levenshtein_distance,pic_dist_0_perc,pic_dist_25_perc,pic_dist_50_perc,main_pic_dist_0_perc,main_pic_dist_25_perc,main_pic_dist_50_perc,euclidean_main_pic_dist,cosine_main_pic_dist,euclidean_color_dist,cosine_color_dist,euclidean_name_bert_dist,cosine_name_bert_dist,euclidean_name_embedding_dist,cosine_name_embedding_dist
94531,815516420,822097168,Системный блок ЮКОМС 900-9400-202 (AMD A6-9400...,"{""Тип видеокарты"":[""Встроенная""],""Видеокарта"":...",53.0,288.0,Компьютер,Системный блок ЮКОМС 9400-222 (AMD A6-9400 (3....,"{""ОС (краткое название)"":[""Windows 10 Pro""],""Б...",53.0,288.0,-0.0005,-0.003655,0.105085,0.001404,-0.002702,0.030857,0.0,0.0,0.0,0.72,0.858333,0.487179,30.0,38.0,30.0,1411.036364,1563.04,"[0.013211727, 0.008614302, 0.036072254, 0.0141...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.045237303, 0.075727, 0.087270856, 0.0516289...",0.024609,0.022335,0.018668,0.0,0.0,0.0,0.074207,0.05861,0.074406,12,-1.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.292893,0.247108,0.001791,2.91222,0.021454


In [46]:
y_train2 = y_train['target']
y_my_test2 = y_my_test['target']
embeddin_col = ['ozon_bert_manhattan_distance','main_pic_manhattan_distance','name_emb_manhattan_distance'] 

train_pool = Pool(
    data=X_train[feats],
    label=y_train2,
    text_features=['characteristic_attributes_mapping1', 'characteristic_attributes_mapping2', 'name1', 'name2'],
    # cat_features=['cat3_grouped'],
    embedding_features=embeddin_col
)
eval_pool = Pool(
    data=X_my_test[feats],
    label=y_my_test2,
    text_features=['characteristic_attributes_mapping1', 'characteristic_attributes_mapping2', 'name1', 'name2'],
    # cat_features=['cat3_grouped'],
    embedding_features=embeddin_col
)

In [47]:
%%time
model = CatBoostClassifier(random_state=RANDOM_STATE, iterations=2500, learning_rate=0.1) #300

model.fit(
    train_pool,
    eval_set=eval_pool,
    plot=True,
    verbose=True,
    use_best_model=True,
    early_stopping_rounds=50,
    metric_period=10
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	learn: 0.6450229	test: 0.6441363	best: 0.6441363 (0)	total: 562ms	remaining: 23m 24s
10:	learn: 0.4715398	test: 0.4667523	best: 0.4667523 (10)	total: 6.09s	remaining: 22m 59s
20:	learn: 0.4411359	test: 0.4362335	best: 0.4362335 (20)	total: 11.6s	remaining: 22m 50s
30:	learn: 0.4294471	test: 0.4248741	best: 0.4248741 (30)	total: 17.1s	remaining: 22m 39s
40:	learn: 0.4223549	test: 0.4182839	best: 0.4182839 (40)	total: 22.5s	remaining: 22m 29s
50:	learn: 0.4172674	test: 0.4133678	best: 0.4133678 (50)	total: 28s	remaining: 22m 23s
60:	learn: 0.4135226	test: 0.4101133	best: 0.4101133 (60)	total: 33.3s	remaining: 22m 13s
70:	learn: 0.4108072	test: 0.4077108	best: 0.4077108 (70)	total: 38.7s	remaining: 22m 3s
80:	learn: 0.4083961	test: 0.4055204	best: 0.4055204 (80)	total: 44.1s	remaining: 21m 58s
90:	learn: 0.4066345	test: 0.4038400	best: 0.4038400 (90)	total: 49.5s	remaining: 21m 51s
100:	learn: 0.4050440	test: 0.4025309	best: 0.4025309 (100)	total: 54.8s	remaining: 21m 41s
110:	learn: 0

890:	learn: 0.3540092	test: 0.3740469	best: 0.3740423 (888)	total: 7m 59s	remaining: 14m 25s
900:	learn: 0.3536646	test: 0.3740474	best: 0.3740174 (892)	total: 8m 4s	remaining: 14m 20s
910:	learn: 0.3533220	test: 0.3739181	best: 0.3739181 (910)	total: 8m 10s	remaining: 14m 14s
920:	learn: 0.3529408	test: 0.3738653	best: 0.3738653 (920)	total: 8m 15s	remaining: 14m 8s
930:	learn: 0.3525720	test: 0.3737616	best: 0.3737554 (929)	total: 8m 20s	remaining: 14m 3s
940:	learn: 0.3522145	test: 0.3736995	best: 0.3736995 (940)	total: 8m 25s	remaining: 13m 58s
950:	learn: 0.3517853	test: 0.3735756	best: 0.3735756 (950)	total: 8m 31s	remaining: 13m 52s
960:	learn: 0.3514122	test: 0.3734497	best: 0.3734497 (960)	total: 8m 36s	remaining: 13m 47s
970:	learn: 0.3510820	test: 0.3733397	best: 0.3733397 (970)	total: 8m 42s	remaining: 13m 42s
980:	learn: 0.3507394	test: 0.3732750	best: 0.3732750 (980)	total: 8m 47s	remaining: 13m 37s
990:	learn: 0.3503902	test: 0.3731541	best: 0.3731541 (990)	total: 8m 52s

1760:	learn: 0.3264329	test: 0.3694530	best: 0.3694530 (1760)	total: 15m 41s	remaining: 6m 34s
1770:	learn: 0.3261533	test: 0.3693914	best: 0.3693852 (1766)	total: 15m 46s	remaining: 6m 29s
1780:	learn: 0.3258956	test: 0.3693622	best: 0.3693351 (1777)	total: 15m 51s	remaining: 6m 24s
1790:	learn: 0.3256049	test: 0.3693761	best: 0.3693351 (1777)	total: 15m 56s	remaining: 6m 18s
1800:	learn: 0.3253258	test: 0.3693566	best: 0.3693351 (1777)	total: 16m 1s	remaining: 6m 13s
1810:	learn: 0.3250529	test: 0.3693145	best: 0.3693143 (1808)	total: 16m 7s	remaining: 6m 8s
1820:	learn: 0.3247634	test: 0.3692829	best: 0.3692702 (1816)	total: 16m 12s	remaining: 6m 2s
1830:	learn: 0.3245001	test: 0.3692724	best: 0.3692602 (1829)	total: 16m 18s	remaining: 5m 57s
1840:	learn: 0.3242286	test: 0.3692507	best: 0.3692468 (1837)	total: 16m 23s	remaining: 5m 52s
1850:	learn: 0.3239642	test: 0.3692141	best: 0.3692111 (1849)	total: 16m 28s	remaining: 5m 46s
1860:	learn: 0.3237210	test: 0.3691419	best: 0.3691401

<catboost.core.CatBoostClassifier at 0x1f5321d7070>

In [48]:
#     metric_period=50,  iterations=2500, learning_rate=0.15)


###  Параметры RandomizedSearchCV


In [49]:
# %%time

# n_iter = 1  # сколько комбинаций проверит
# params = {'depth': [8, 9, 14, 20, 60],
#          'iterations': [800, 1000],
#          'learning_rate': [0.05, 0.1],
#          'l2_leaf_reg': [1, 2]}

# grid_c = RandomizedSearchCV(model_cb, params, cv=2, verbose=2, n_jobs=-1, random_state=RANDOM_STATE, n_iter=n_iter, scoring='f1')

# grid_c.fit(X_train[feats], y_train ) # verbose=100

# grid_c.best_params_

###  Сохранение модели
!Убедиться, что далее используется лучшая модель

In [50]:
model = model     #  grid_c.best_estimator_    #model_cb

In [51]:
# path ='C:/Users/zoika/OneDrive/Рабочий стол/YP ds54/!!hackathon_files_for_participants_ozon/ozonCat.cbm'
# model.save_model(path)

## Metric

In [52]:
def pr_auc_macro(
    target_df: pd.DataFrame,
    predictions_df: pd.DataFrame,
    prec_level: float = 0.75,
    cat_column: str = "cat3_grouped"
) -> float:

    df = target_df.merge(predictions_df, on=["variantid1", "variantid2"])

    y_true = df["target"]
    y_pred = df["scores"]
    categories = df[cat_column]

    weights = []
    pr_aucs = []

    unique_cats, counts = np.unique(categories, return_counts=True)

    # calculate metric for each big category
    for i, category in enumerate(unique_cats):
        # take just a certain category
        cat_idx = np.where(categories == category)[0]
        y_pred_cat = y_pred[cat_idx]
        y_true_cat = y_true[cat_idx]

        # if there is no matches in the category then PRAUC=0
        if sum(y_true_cat) == 0:
            pr_aucs.append(0)
            weights.append(counts[i] / len(categories))
            continue
        
        # get coordinates (x, y) for (recall, precision) of PR-curve
        y, x, _ = precision_recall_curve(y_true_cat, y_pred_cat)
        
        # reverse the lists so that x's are in ascending order (left to right)
        y = y[::-1]
        x = x[::-1]
        
        # get indices for x-coordinate (recall) where y-coordinate (precision) 
        # is higher than precision level (75% for our task)
        good_idx = np.where(y >= prec_level)[0]
        
        # if there are more than one such x's (at least one is always there, 
        # it's x=0 (recall=0)) we get a grid from x=0, to the rightest x 
        # with acceptable precision
        if len(good_idx) > 1:
            gt_prec_level_idx = np.arange(0, good_idx[-1] + 1)
        # if there is only one such x, then we have zeros in the top scores 
        # and the curve simply goes down sharply at x=0 and does not rise 
        # above the required precision: PRAUC=0
        else:
            pr_aucs.append(0)
            weights.append(counts[i] / len(categories))
            continue
        
        # calculate category weight anyway
        weights.append(counts[i] / len(categories))
        # calculate PRAUC for all points where the rightest x 
        # still has required precision 
        try:
            pr_auc_prec_level = auc(x[gt_prec_level_idx], y[gt_prec_level_idx])
            if not np.isnan(pr_auc_prec_level):
                pr_aucs.append(pr_auc_prec_level)
        except ValueError:
            pr_aucs.append(0)
            
    return np.average(pr_aucs, weights=weights)

##  Расчет pr_auc на нашей тестовой

In [53]:
%%time
X_train["scores"] = model.predict_proba(train_pool)[:, 1] # for cat
pred_tr = model.predict(train_pool)

pr_auc_macro_metr = pr_auc_macro(y_train, X_train)
display('PROCtrain:', pr_auc_macro_metr)

print('AUC:', roc_auc_score(y_train['target'], X_train["scores"]))
print('Accuracy:', acc(y_train['target'], pred_tr))
print('F1:', f1_score(y_train['target'], pred_tr))
print('Precision:', precision_score(y_train['target'], pred_tr))

'PROCtrain:'

0.7931623718964207

AUC: 0.9433378315959932
Accuracy: 0.878602031273787
F1: 0.8626329907224359
Precision: 0.8598399058077071
CPU times: total: 14min 7s
Wall time: 42.8 s


- PROC:      0.7710663102683721
- AUC:       0.9387756112382087
- Accuracy:  0.8731070079670589
- F1:        0.8564587023551794
- Precision: 0.8534283402927021

In [54]:
%%time
X_my_test["scores"] = model.predict_proba(eval_pool)[:, 1] #for cat
pred_ts = model.predict(eval_pool)

pr_auc_macro_metr = pr_auc_macro(y_my_test, X_my_test)
display('PROCeval:', pr_auc_macro_metr)

print('AUC:', roc_auc_score(y_my_test['target'], X_my_test["scores"]))
print('Accuracy:', acc(y_my_test['target'], pred_ts))
print('F1:', f1_score(y_my_test['target'], pred_ts))
print('Precision:', precision_score(y_my_test['target'], pred_ts))

'PROCeval:'

0.6684127865923386

AUC: 0.9139295020658907
Accuracy: 0.8422391857506362
F1: 0.8224017627616599
Precision: 0.8156322843822844
CPU times: total: 1min 31s
Wall time: 4.85 s


- PROC: 0.6572550797676896
- AUC: 0.9119688787813636
- Accuracy: 0.8398251451686566
- F1: 0.8200806156101136
- Precision: 0.8115752828546562

In [55]:
fi = model.get_feature_importance(prettified=True)
fi

Unnamed: 0,Feature Id,Importances
0,name2,15.136199
1,name1,14.26064
2,name_jaccard_similarity,7.828576
3,characteristic_attributes_mapping1,7.248652
4,characteristic_attributes_mapping2,6.76603
5,name_emb_manhattan_distance,6.130306
6,name_levenshtein_distance,4.485204
7,attr_dist,4.292355
8,ozon_bert_manhattan_distance,2.781874
9,attributes_jaccard_similarity,2.337548


## ДООБУЧЕНИЕ НА ВСЕЙ

In [56]:
# model = CatBoostClassifier(iterations=2500, learning_rate=0.125)

In [58]:
X_all = features.drop(["target"], axis=1)
y_all = features["target"]

pool_all = Pool(
    data=X_all[feats],
    label=y_all,
    text_features=['characteristic_attributes_mapping1', 'characteristic_attributes_mapping2', 'name1', 'name2'],
    embedding_features=embeddin_col
)

In [59]:
# del train_data

In [60]:
model.fit(
    pool_all,
    plot=True,
    verbose=True,
    use_best_model=True,
    early_stopping_rounds=40,
    metric_period=10
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 0.6453880	total: 472ms	remaining: 19m 40s
10:	learn: 0.4722811	total: 6.07s	remaining: 22m 54s
20:	learn: 0.4414090	total: 11.7s	remaining: 23m
30:	learn: 0.4294168	total: 17.2s	remaining: 22m 51s
40:	learn: 0.4221991	total: 22.5s	remaining: 22m 30s
50:	learn: 0.4171749	total: 28s	remaining: 22m 24s
60:	learn: 0.4134826	total: 33.3s	remaining: 22m 12s
70:	learn: 0.4106904	total: 38.5s	remaining: 21m 57s
80:	learn: 0.4079761	total: 43.9s	remaining: 21m 49s
90:	learn: 0.4062056	total: 49.2s	remaining: 21m 42s
100:	learn: 0.4046303	total: 54.6s	remaining: 21m 36s
110:	learn: 0.4030092	total: 60s	remaining: 21m 30s
120:	learn: 0.4016222	total: 1m 5s	remaining: 21m 23s
130:	learn: 0.4001032	total: 1m 10s	remaining: 21m 18s
140:	learn: 0.3987830	total: 1m 16s	remaining: 21m 13s
150:	learn: 0.3973715	total: 1m 21s	remaining: 21m 6s
160:	learn: 0.3959720	total: 1m 26s	remaining: 20m 59s
170:	learn: 0.3948859	total: 1m 31s	remaining: 20m 52s
180:	learn: 0.3937194	total: 1m 37s	remaini

1500:	learn: 0.3373810	total: 13m 23s	remaining: 8m 54s
1510:	learn: 0.3371125	total: 13m 28s	remaining: 8m 49s
1520:	learn: 0.3368441	total: 13m 33s	remaining: 8m 43s
1530:	learn: 0.3365636	total: 13m 39s	remaining: 8m 38s
1540:	learn: 0.3362718	total: 13m 44s	remaining: 8m 33s
1550:	learn: 0.3360151	total: 13m 49s	remaining: 8m 27s
1560:	learn: 0.3357825	total: 13m 55s	remaining: 8m 22s
1570:	learn: 0.3355044	total: 14m	remaining: 8m 17s
1580:	learn: 0.3352154	total: 14m 5s	remaining: 8m 11s
1590:	learn: 0.3349506	total: 14m 11s	remaining: 8m 6s
1600:	learn: 0.3346809	total: 14m 16s	remaining: 8m 1s
1610:	learn: 0.3344188	total: 14m 21s	remaining: 7m 55s
1620:	learn: 0.3341611	total: 14m 27s	remaining: 7m 50s
1630:	learn: 0.3339079	total: 14m 32s	remaining: 7m 45s
1640:	learn: 0.3336519	total: 14m 38s	remaining: 7m 39s
1650:	learn: 0.3333966	total: 14m 43s	remaining: 7m 34s
1660:	learn: 0.3331079	total: 14m 48s	remaining: 7m 28s
1670:	learn: 0.3328057	total: 14m 54s	remaining: 7m 23s

<catboost.core.CatBoostClassifier at 0x1f5321d7070>

In [62]:
# X_all["scores"] = model.predict_proba(pool_all)[:, 1] # for cat
# pred_ts = model.predict(pool_all)

# pr_auc_macro_metr = pr_auc_macro(features,X_all)
# display('PROCall:', pr_auc_macro_metr)

# print('AUC:', roc_auc_score(y_all, X_all ))
# print('Accuracy:', acc(y_all['target'], pred_ts ))
# print('F1:', f1_score(y_all['target'], pred_ts ))
# print('Precision:', precision_score(y_all['target'], pred_ts))

###  Метрика на дообученной

## Submission. Расчет на финальной тестовой

### Предобработка

In [63]:
%%time
# test_pairs_wo_target = pd.read_parquet('test_pairs_wo_target.parquet')
# test_data = pd.read_parquet('test_data.parquet')

REST_test = 50

test_data = new_col(test_data)
test_data = cat3_grouped(test_data, REST_test)
test_data[cat_col] = encoder.transform(test_data[cat_col]) 
test_data = color(test_data)

# Объединение таблиц
features_test = merge_f(test_pairs_wo_target, test_data)


features_test[['attr_dist', 
               'attr_acc', 
               'common_keys', 
               'same_value_keys', 
               'mean_value1', 
               'mean_value2']] = features_test[['attributes_set1', 'attributes_set2']].apply(
    lambda x: pd.Series(compare_attr(*x)), axis=1
)

features_test['mean_value1'] = features_test['mean_value1'].fillna(0)
features_test['mean_value2'] = features_test['mean_value2'].fillna(0)

449
CPU times: total: 17.8 s
Wall time: 17.8 s


In [64]:
features_test.head(1)

Unnamed: 0,variantid1,variantid2,cat3_grouped,name1,categories1,pic_embeddings_resnet_v11,main_pic_embeddings_resnet_v11,name_bert_641,characteristic_attributes_mapping1,name_embedding1,attributes_set1,cat3,cat41,main_pic_embeddings_resnet_v1_new1,cat3_grouped.1,clr_vect1,name2,categories2,pic_embeddings_resnet_v12,main_pic_embeddings_resnet_v12,name_bert_642,characteristic_attributes_mapping2,name_embedding2,attributes_set2,cat32,cat42,main_pic_embeddings_resnet_v1_new2,cat3_grouped2,clr_vect2,attr_dist,attr_acc,common_keys,same_value_keys,mean_value1,mean_value2
0,52076340,290590137,Батарейки и аккумуляторы,Батарейка AAA щелочная Perfeo LR03/10BL Super ...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...","[[0.15417035, 0.41160947, 0.2213532, -0.019731...","[[0.04763528, -0.20136409, 0.29605597, 0.26453...","[-0.28437558, 0.60909724, 0.5972025, -0.523296...","{""Напряжение, В"":[""1.5""],""Бренд"":[""Perfeo""],""Т...","[-0.68162066, 0.0034779932, 0.14610538, -0.138...","{'Напряжение, В': ['1.5'], 'Бренд': ['Perfeo']...",17.0,45.0,"[0.04763528, -0.20136409, 0.29605597, 0.264536...",Батарейки и аккумуляторы,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",Батарейка AAA щелочная Perfeo LR03/2BL mini Su...,"{""1"": ""EPG"", ""2"": ""Электроника"", ""3"": ""Батарей...","[[-0.025554053, 0.012488857, 0.43989864, -0.10...","[[0.06223978, -0.16145544, 0.26409012, 0.24271...","[-0.3380968, 0.6156224, 0.6428071, -0.57499236...","{""Форм-фактор батареи"":[""AAA""],""Химический тип...","[-0.47604635, -0.15201007, 0.23725557, -0.0434...","{'Форм-фактор батареи': ['AAA'], 'Химический т...",17.0,45.0,"[0.06223978, -0.16145544, 0.26409012, 0.242712...",Батарейки и аккумуляторы,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",0.5,9.0,10.0,9.0,5.75,2.75


In [65]:
features_test['main_pic_embeddings_resnet_v1_new1'] = features_test['main_pic_embeddings_resnet_v11'].apply(lambda x: x[0])
features_test['main_pic_embeddings_resnet_v1_new2'] = features_test['main_pic_embeddings_resnet_v12'].apply(lambda x: x[0])

In [66]:
%%time
# Добавляем новые столбцы в датафрейм features_test с вычисленными статистическими характеристиками разности 
# между эмбеддингами названий товаров
features_test[['name_mean_diff', 'name_median_diff', 'name_std_diff']] = features_test.apply(
    lambda x: calculate_statistical_features(
        x['name_embedding1'], x['name_embedding2']), axis=1, result_type='expand'
)

# Добавляем новые столбцы в датафрейм features_test с вычисленными статистическими характеристиками разности 
# между эмбеддингами названий товаров от ozon
features_test[['ozon_name_mean_diff', 'ozon_name_median_diff', 'ozon_name_std_diff']] = features_test.apply(
    lambda x: calculate_statistical_features(
        x['name_bert_641'], x['name_bert_642']), axis=1, result_type='expand'
)

# Добавляем новые столбцы в датафрейм features_test с вычисленными статистическими характеристиками разности 
# между эмбеддингами картинок товаров
features_test[['main_pic_mean_diff', 'main_pic_median_diff', 'main_pic_std_diff']] = features_test.apply(
    lambda x: calculate_statistical_features(
        x['main_pic_embeddings_resnet_v1_new1'], x['main_pic_embeddings_resnet_v1_new2']), axis=1, result_type='expand'
)

CPU times: total: 8.53 s
Wall time: 8.53 s


In [67]:
%%time
# Добавляем новый столбец в датафрейм features_test с вычисленным коэффициентом Жаккара между названиями товаров
features_test['name_jaccard_similarity'] = features_test.apply(
    lambda x: calculate_jaccard_similarity(x['name1'], x['name2']), axis=1
)

# Добавляем новый столбец в датафрейм features_test с вычисленным коэффициентом Жаккара между атрибутами товаров
features_test['attributes_jaccard_similarity'] = features_test.apply(
    lambda x: calculate_jaccard_similarity(x['characteristic_attributes_mapping1'], x['characteristic_attributes_mapping2']), axis=1
)

CPU times: total: 1min 2s
Wall time: 1min 2s


In [68]:
%%time
features_test[["pic_dist_0_perc", "pic_dist_25_perc", "pic_dist_50_perc"]] = (
    features_test[["pic_embeddings_resnet_v11", "pic_embeddings_resnet_v12"]].apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1))

features_test[["main_pic_dist_0_perc", "main_pic_dist_25_perc", "main_pic_dist_50_perc"]] = (
    features_test[["main_pic_embeddings_resnet_v11", "main_pic_embeddings_resnet_v12"]].apply(
        lambda x: pd.Series(get_pic_features_func(*x)), axis=1))

features_test[["euclidean_main_pic_dist", "cosine_main_pic_dist"]] = (
    features_test[["main_pic_embeddings_resnet_v1_new1", "main_pic_embeddings_resnet_v1_new2"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))

features_test[["euclidean_color_dist", "cosine_color_dist"]] = (
    features_test[["clr_vect1", "clr_vect2"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))

features_test[["euclidean_name_bert_dist", "cosine_name_bert_dist"]] = (
    features_test[["name_bert_641", "name_bert_642"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))

features_test[["euclidean_name_embedding_dist", "cosine_name_embedding_dist"]] = (
    features_test[["name_embedding1", "name_embedding2"]].apply(
        lambda x: pd.Series(text_dense_distances(*x)), axis=1))


CPU times: total: 40.9 s
Wall time: 40.9 s


In [69]:
# Расчет features_test манхэттенского расстояния между признаками name_bert_641 и name_bert_642
features_test['ozon_bert_manhattan_distance'] = features_test.apply(
    lambda x: manhattan_distance(x['name_bert_641'], x['name_bert_642']), axis=1
)

# Расчет features_test манхэттенского расстояния между признаками main_pic_embeddings_resnet_v1_new1 и main_pic_embeddings_resnet_v1_new2
features_test['main_pic_manhattan_distance'] = features_test.apply(
    lambda x: manhattan_distance(x['main_pic_embeddings_resnet_v1_new1'], x['main_pic_embeddings_resnet_v1_new2']), axis=1
)

# Расчет features_test манхэттенского расстояния между признаками name_embedding1 и name_embedding2
features_test['name_emb_manhattan_distance'] = features_test.apply(
    lambda x: manhattan_distance(x['name_embedding1'], x['name_embedding2']), axis=1
)

In [70]:
%%time
# расчитываем среднее, медиану и стд для ozon_bert_manhattan_distance
features_test[["mean_ozon_bert_manhattan_distance", 
          "median_ozon_bert_manhattan_distance", 
          "std_ozon_bert_manhattan_distance"]] = (features_test['ozon_bert_manhattan_distance'].apply(
        lambda x: pd.Series(calculate_statistics(x))))

# расчитываем среднее, медиану и стд для main_pic_manhattan_distance
features_test[["mean_main_pic_manhattan_distance", 
          "median_main_pic_manhattan_distance", 
          "std_main_pic_manhattan_distance"]] = (features_test['main_pic_manhattan_distance'].apply(
        lambda x: pd.Series(calculate_statistics(x))))

# расчитываем среднее, медиану и стд для name_emb_manhattan_distance
features_test[["mean_name_emb_manhattan_distance", 
          "median_name_emb_manhattan_distance", 
          "std_name_emb_manhattan_distance"]] = (features_test['name_emb_manhattan_distance'].apply(
        lambda x: pd.Series(calculate_statistics(x))))

CPU times: total: 13.8 s
Wall time: 13.8 s


In [71]:
%%time
# расчитаем расстояния Левенштейна для name1 и name2
features_test[["name_levenshtein_distance"]] = (
    features_test[["name1", "name2"]].progress_apply(
        lambda x: pd.Series(levenshtein_distance(*x)), axis=1))

100%|██████████| 18084/18084 [02:39<00:00, 113.20it/s]

CPU times: total: 2min 39s
Wall time: 2min 39s





In [72]:
features_test = drop_f(features_test)
features_test[['characteristic_attributes_mapping1', 
               'characteristic_attributes_mapping2']] = features_test[['characteristic_attributes_mapping1', 
                                                                       'characteristic_attributes_mapping2']].fillna('{}')

In [73]:
display(X_train.info())
display(features_test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275886 entries, 94531 to 139533
Data columns (total 56 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   variantid1                           275886 non-null  int64  
 1   variantid2                           275886 non-null  int64  
 2   name1                                275886 non-null  object 
 3   characteristic_attributes_mapping1   275886 non-null  object 
 4   cat3                                 275886 non-null  float64
 5   cat41                                275886 non-null  float64
 6   cat3_grouped                         275886 non-null  object 
 7   name2                                275886 non-null  object 
 8   characteristic_attributes_mapping2   275886 non-null  object 
 9   cat32                                275886 non-null  float64
 10  cat42                                275886 non-null  float64
 11  name_mean

None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18084 entries, 0 to 18083
Data columns (total 56 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   variantid1                           18084 non-null  int64  
 1   variantid2                           18084 non-null  int64  
 2   cat3_grouped                         18084 non-null  object 
 3   name1                                18084 non-null  object 
 4   characteristic_attributes_mapping1   18084 non-null  object 
 5   cat3                                 18084 non-null  float64
 6   cat41                                18084 non-null  float64
 7   cat3_grouped                         18084 non-null  object 
 8   name2                                18084 non-null  object 
 9   characteristic_attributes_mapping2   18084 non-null  object 
 10  cat32                                18084 non-null  float64
 11  cat42                       

None

In [74]:
submission_example = features_test.copy()

submission_example["target"] = model.predict_proba(features_test[feats])[:, 1]
submission_example = submission_example[["variantid1", "variantid2", "target"]]
submission_example.head(3)

Unnamed: 0,variantid1,variantid2,target
0,52076340,290590137,0.236128
1,64525522,204128919,0.323596
2,77243372,479860557,0.614722


In [75]:
s = submission_example.drop_duplicates().merge(
      features_test[["variantid1", "variantid2"]].drop_duplicates(["variantid1", "variantid2"]),
      on=["variantid1", "variantid2"]
)

s.head(3)

Unnamed: 0,variantid1,variantid2,target
0,52076340,290590137,0.236128
1,64525522,204128919,0.323596
2,77243372,479860557,0.614722


In [76]:
features_test.duplicated(["variantid1", "variantid2"]).sum()

0

In [77]:
s.target.min()

0.003946305318749987

In [78]:
s.target.max()

0.9908497396704322

In [79]:
# s = s.drop_duplicates(["variantid1", "variantid2"])

In [80]:
s.to_csv("submission_Cat_LEVI_NEW_FEATURES.csv", index=False)

In [81]:
s.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18084 entries, 0 to 18083
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   variantid1  18084 non-null  int64  
 1   variantid2  18084 non-null  int64  
 2   target      18084 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 565.1 KB
