# Imports

In [6]:
import ast
import gc
import pickle
import os
import random
import re

import optuna
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from optuna.trial import TrialState
from sklearn.metrics import f1_score
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

from data_preparation import (
    get_embeddings, 
    make_features_transformers,
    make_features_cnn,
    train_valid_test_split
)
from inference import (
    predict,
    predict_sample,
    test_model
)
from training import (
    ad_hoc_train,
    create_labels, 
    get_loaders,
    train_model,
    train_parametrized
)

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["PYTHONHASHSEED"] = str(SEED)


# Data Preprocessing

In [2]:
products = pd.read_parquet('train.parquet')

## Some Statistics

In [3]:
products.sample(3)

Unnamed: 0,product_id,category_id,sale,shop_id,shop_title,rating,text_fields
56718,1159722,14233,False,25374,COMFY wear,5.0,"{""title"": ""Хлопковые женские трусы, подарок на..."
81326,1105829,11826,False,26591,Ежевика,5.0,"{""title"": ""Коврик силиконовый для раскатки тес..."
38250,443799,13297,False,14885,МультиДом,5.0,"{""title"": ""Эко-мешочек сетка 22х25см"", ""descri..."


In [4]:
products.shape

(91120, 7)

We have no missing values in our data

In [5]:
products.isnull().sum()

product_id     0
category_id    0
sale           0
shop_id        0
shop_title     0
rating         0
text_fields    0
dtype: int64

Overall we have 874 different categories.

Some of them are with small number of samples in them.

Classes are dibalanced.

In [6]:
products.category_id.value_counts()

11937    6590
14922    3709
13651    1463
13143    1460
12980    1222
         ... 
12808       2
12901       1
11549       1
11875       1
12836       1
Name: category_id, Length: 874, dtype: int64

## Text Fields

Let's unpack fileds to an appropriate way.

In [7]:
ast.literal_eval(products.iloc[0]['text_fields'])

{'title': 'Зарядный кабель Borofone BX1 Lightning для айфон, 1м',
 'description': '<p><span style="background-color: transparent; color: rgb(0, 26, 52);">Зарядный кабель Borofone BX1 подходит для зарядки всех гаджетов и аксессуаров с разъемом </span>Lightning.</p><p><span style="color: rgb(0, 26, 52); background-color: transparent;">Поддерживает быструю зарядку.</span></p><p><span style="color: rgb(0, 26, 52); background-color: transparent;">Подходит для передачи данных.</span></p>',
 'attributes': ['Длина: 1м',
  'Разъем: Lightning',
  'Подерживает быструю зарядку',
  'Максимальный ток: 2.0А',
  'Для зарядки и синхронизации данных',
  'Вес: 22 г.'],
 'custom_characteristics': {},
 'defined_characteristics': {'Цвет': ['Черный', 'Белый']},
 'filters': {'Цвет': ['Белый', 'Черный']}}

I will take `title`, `description`, and concatenated `attributes` field.

In [8]:
def extract_text(text_field: str):
    content_dict = ast.literal_eval(text_field)
    title = content_dict['title']
    description = re.sub("<[^\>]*>", '', content_dict['description'])
    attributes = '. '.join(content_dict['attributes'])

    return title, description, attributes

extract_text(products.iloc[0]['text_fields'])

('Зарядный кабель Borofone BX1 Lightning для айфон, 1м',
 'Зарядный кабель Borofone BX1 подходит для зарядки всех гаджетов и аксессуаров с разъемом Lightning.Поддерживает быструю зарядку.Подходит для передачи данных.',
 'Длина: 1м. Разъем: Lightning. Подерживает быструю зарядку. Максимальный ток: 2.0А. Для зарядки и синхронизации данных. Вес: 22 г.')

Let's extract needed text fields.

In [9]:
titles, desciptions, attributes = zip(*products['text_fields'].apply(extract_text))
text_data = pd.DataFrame({'title': titles, 'description': desciptions, 'attributes': attributes})
text_data.head()

Unnamed: 0,title,description,attributes
0,Зарядный кабель Borofone BX1 Lightning для айф...,Зарядный кабель Borofone BX1 подходит для заря...,Длина: 1м. Разъем: Lightning. Подерживает быст...
1,Трусы Sela,Трусы-слипы из эластичного бесшовного трикотаж...,
2,"Гуашь ""ЮНЫЙ ВОЛШЕБНИК"", 12 цветов по 35 мл, БО...",Гуашь высшего качества ЮНЛАНДИЯ поможет создат...,
3,Колба для кальяна Крафт (разные цвета),Универсальная колба для кальяна Craft подходит...,Материал: стекло. Внутренний диаметр: 45 мм . ...
4,"Пижама женская, однотонная с шортами",Лёгкая ткань! Комфортная посадка! Идеальная дл...,


Very few products does not contain `description`.

About of one quarter of products does not contain `attributes`.

In [10]:
(text_data == '').sum()

title              0
description     1538
attributes     26131
dtype: int64

## Dataset Creation

Let's combine columns which will be needed for model traing.

I will not take take `shop_id` as it is ordinally encoded. Ordinal encoding does not suit such categorical data as shop identifier. Instead of `shop_id` I want to use embedded `shop_title`.

Binary flag `sale` is not very helpful in prediction across 874 classes.

In [11]:
dataset = pd.concat(
    [
        products[['product_id', 'category_id', 'shop_title', 'rating']].reset_index(drop=True), 
        text_data.reset_index(drop=True)
    ], 
    axis=1
)
dataset.head()

Unnamed: 0,product_id,category_id,shop_title,rating,title,description,attributes
0,325286,12171,Aksik,5.0,Зарядный кабель Borofone BX1 Lightning для айф...,Зарядный кабель Borofone BX1 подходит для заря...,Длина: 1м. Разъем: Lightning. Подерживает быст...
1,888134,14233,Sela,5.0,Трусы Sela,Трусы-слипы из эластичного бесшовного трикотаж...,
2,1267173,13429,ЮНЛАНДИЯ канцтовары,5.0,"Гуашь ""ЮНЫЙ ВОЛШЕБНИК"", 12 цветов по 35 мл, БО...",Гуашь высшего качества ЮНЛАНДИЯ поможет создат...,
3,1416943,2789,вася-nicotine,4.0,Колба для кальяна Крафт (разные цвета),Универсальная колба для кальяна Craft подходит...,Материал: стекло. Внутренний диаметр: 45 мм . ...
4,1058275,12834,Lim Market,4.6,"Пижама женская, однотонная с шортами",Лёгкая ткань! Комфортная посадка! Идеальная дл...,


## Train Test Split

I will divide dataset into `stratified` train valid test split to keep classes balance. I will perform division once and use it for evaluation of each model.

There are some categories which have very few products in them.

In [12]:
categories_frequences = dataset['category_id'].value_counts()
categories_frequences

11937    6590
14922    3709
13651    1463
13143    1460
12980    1222
         ... 
12808       2
12901       1
11549       1
11875       1
12836       1
Name: category_id, Length: 874, dtype: int64

I want to have at least two instances of every `product_id` in train set and one in each valid and test. There some `product ids` which has less products than needed. I will keep these products only in train set

In [23]:
rare_categories = categories_frequences < 4
number_of_rare_categories = rare_categories.sum()
number_of_rare_categories

18

I will apply train, valid, test split on frequent categories.

In [24]:
frequent_categories_ids = categories_frequences[~rare_categories].index
dataset_fr = dataset[dataset['category_id'].isin(frequent_categories_ids)]

In [37]:
dataset_fr = train_valid_test_split(
    dataset_fr, 
    val_size=0.1, 
    test_size=0.1,
    stratify_col='category_id'
)

Now I have stratified split of the data for train, validation and test right in the table.

In [38]:
dataset_fr.head(1)

Unnamed: 0,product_id,category_id,shop_title,rating,title,description,attributes,train,valid,test
0,325286,12171,Aksik,5.0,Зарядный кабель Borofone BX1 Lightning для айф...,Зарядный кабель Borofone BX1 подходит для заря...,Длина: 1м. Разъем: Lightning. Подерживает быст...,True,False,False


Let's now add labels for rare `category ids`.

In [35]:
rare_categories_ids = categories_frequences[rare_categories].index
dataset_rare = dataset[dataset['category_id'].isin(rare_categories_ids)]

size = dataset_rare.shape[0]
train = [True] * size
valid = [False] * size
test = [False] * size

rare_separation = pd.DataFrame({'train': train, 'valid': valid, 'test': test})
dataset_rare = pd.concat(
    [
        dataset_rare.reset_index(drop=True), 
        rare_separation.reset_index(drop=True)
    ], 
    axis=1
)
dataset_rare.sample(1)

Unnamed: 0,product_id,category_id,shop_title,rating,title,description,attributes,train,valid,test
18,831567,11549,SOFIA MUM,4.5,Толстовка утепленная для беременных и кормящих...,Невероятно теплая и уютная толстовка выполнена...,на каждый день. большие размеры. для беременны...,True,False,False


Now we can assemble all the data back.

In [40]:
dataset_splitted = pd.concat([
    dataset_fr.reset_index(drop=True),
    dataset_rare.reset_index(drop=True)
])

Now we have orginal shape of splitted data.

In [41]:
dataset_splitted.shape

(91120, 10)

Let's now save this dataset for future experements to have the same data separation for all the models.

In [43]:
dataset_splitted['attributes'].fillna('', inplace=True)
dataset_splitted['description'].fillna('', inplace=True)
dataset_splitted.to_csv('dataset.csv', index=False)

# Baseline: FastText + MLP

## Data

In [2]:
dataset = pd.read_csv('dataset.csv')
dataset.head(1)

Unnamed: 0,product_id,category_id,shop_title,rating,title,description,attributes,train,valid,test
0,325286,12171,Aksik,5.0,Зарядный кабель Borofone BX1 Lightning для айф...,Зарядный кабель Borofone BX1 подходит для заря...,Длина: 1м. Разъем: Lightning. Подерживает быст...,True,False,False


### Embeddings

Let's now create embeddings for products' titles, descriptions and attributes.

In [8]:
title_emb = get_embeddings(dataset, 'title', file_name='title')
description_emb = get_embeddings(dataset, 'description', file_name='description')
attributes_emb = get_embeddings(dataset, 'attributes', file_name='attributes')

100%|██████████| 91120/91120 [14:49<00:00, 102.48it/s]
100%|██████████| 91120/91120 [1:15:15<00:00, 20.18it/s]
100%|██████████| 91120/91120 [22:18<00:00, 68.07it/s] 


In [3]:
title_emb = np.load('./embeddings/title.npy')
description_emb = np.load('./embeddings/description.npy')
attributes_emb = np.load('./embeddings/attributes.npy')

I will train baseline model on concatenated embeddings of three text columns.

In [4]:
emb = np.concatenate([title_emb, description_emb, attributes_emb], axis=1)

### Loaders

Here I convert category ids into indexes.

In [5]:
labels, index_to_id, id_to_index = create_labels(dataset.category_id)

In [6]:
train_loader, valid_loader, test_loader = get_loaders(dataset, emb, 'category_id', batch_size=1024)

## Model

I will use simlpe two layer FCNN as baseline model.

In [7]:
input_size = 900
hid_size = 100
num_classes = len(index_to_id)

model = nn.Sequential(
    nn.Linear(input_size, hid_size),
    nn.BatchNorm1d(hid_size),
    nn.ReLU(),
    nn.Linear(hid_size, hid_size),
    nn.BatchNorm1d(hid_size),
    nn.ReLU(),
    nn.Linear(hid_size, num_classes)
)

## Training

I train the model with use of early stopping technique. I save checkpoint which shows the best result in validation set.

In [8]:
acc_train, acc_test = train_model(
    model, 
    'baseline', 
    train_loader, 
    valid_loader, 
    num_epochs=30, 
    print_res=True
)


Epoch: 1
Loader: train. f1 score: 0.2349
Loader: valid. f1 score: 0.3322

Epoch: 2
Loader: train. f1 score: 0.4266
Loader: valid. f1 score: 0.5024

Epoch: 3
Loader: train. f1 score: 0.5706
Loader: valid. f1 score: 0.6276

Epoch: 4
Loader: train. f1 score: 0.6778
Loader: valid. f1 score: 0.7033

Epoch: 5
Loader: train. f1 score: 0.7455
Loader: valid. f1 score: 0.7462

Epoch: 6
Loader: train. f1 score: 0.7893
Loader: valid. f1 score: 0.7706

Epoch: 7
Loader: train. f1 score: 0.821
Loader: valid. f1 score: 0.7836

Epoch: 8
Loader: train. f1 score: 0.8502
Loader: valid. f1 score: 0.7951

Epoch: 9
Loader: train. f1 score: 0.8586
Loader: valid. f1 score: 0.7977

Epoch: 10
Loader: train. f1 score: 0.8615
Loader: valid. f1 score: 0.7986

Epoch: 11
Loader: train. f1 score: 0.8645
Loader: valid. f1 score: 0.8007

Epoch: 12
Loader: train. f1 score: 0.8676
Loader: valid. f1 score: 0.8014

Epoch: 13
Loader: train. f1 score: 0.8696
Loader: valid. f1 score: 0.8025

Epoch: 14
Loader: train. f1 score:

As we can see the model is being overfit quite a lot.

## Testing

Let's finally count performance of baseline model on test set.

In [9]:
y_true, y_pred = test_model(model, test_loader)

f1_score(y_pred, y_true, average='weighted')

0.8303200669101575

Also I decided to test the performance of each column separately. I trained model separately on `titles`, `descriptions` and `attributes`.

I deleted sells with training and so forth. Here the results

Title:

In [14]:
y_true, y_pred = test_model(model, test_loader)

f1_score(y_pred, y_true, average='weighted')

0.8038047745902039

Description:

In [19]:
y_true, y_pred = test_model(model, test_loader)

f1_score(y_pred, y_true, average='weighted')

0.7110509801380898

Attributes:

In [23]:
y_true, y_pred = test_model(model, test_loader)

f1_score(y_pred, y_true, average='weighted')

0.7181879317233768

As we can observe `title` column is most informative. It gives almost the same result as all the three columns combined: **0.803** vs **0.83**.

# Bert + MLP

In [2]:
dataset = pd.read_csv('dataset.csv')

## Embeddings

I will take sentence embeddings from `sberbank-ai/ruRoberta-large` model.

In [None]:
max_len = max(dataset['title'].apply(lambda s: len(s.split())))
title_embeddings_bert = make_features_transformers(dataset, 'title', max_len, 'title_embeddings_bert')

dataset['description'].fillna('', inplace=True)
max_len = int(dataset['description'].apply(lambda s: len(s.split())).mean())
description_embeddings_bert = make_features_transformers(dataset, 'description', max_len, 'description_embeddings_bert')

dataset['attributes'].fillna('', inplace=True)
max_len = max(dataset['attributes'].apply(lambda s: len(s.split())))
attributes_embeddings_bert = make_features_transformers(dataset, 'attributes', max_len, 'attributes_embeddings_bert')

## Training

Let's try only title embeddings.

In [3]:
emb = np.load('./embeddings/title_embeddings_bert.npy')
emb = emb.astype(np.double)

ad_hoc_train(emb, 'bert_mlp_titile')


Epoch: 1
Loader: train. f1 score: 0.2262
Loader: valid. f1 score: 0.3213

Epoch: 2
Loader: train. f1 score: 0.4145
Loader: valid. f1 score: 0.4932

Epoch: 3
Loader: train. f1 score: 0.5557
Loader: valid. f1 score: 0.6103

Epoch: 4
Loader: train. f1 score: 0.6556
Loader: valid. f1 score: 0.6759

Epoch: 5
Loader: train. f1 score: 0.7202
Loader: valid. f1 score: 0.7138

Epoch: 6
Loader: train. f1 score: 0.7664
Loader: valid. f1 score: 0.7493

Epoch: 7
Loader: train. f1 score: 0.7969
Loader: valid. f1 score: 0.7627

Epoch: 8
Loader: train. f1 score: 0.8324
Loader: valid. f1 score: 0.7822

Epoch: 9
Loader: train. f1 score: 0.8404
Loader: valid. f1 score: 0.7838

Epoch: 10
Loader: train. f1 score: 0.8443
Loader: valid. f1 score: 0.7854

Epoch: 11
Loader: train. f1 score: 0.8468
Loader: valid. f1 score: 0.7884

Epoch: 12
Loader: train. f1 score: 0.8498
Loader: valid. f1 score: 0.7891

Epoch: 13
Loader: train. f1 score: 0.8521
Loader: valid. f1 score: 0.7889

Epoch: 14
Loader: train. f1 score

In [4]:
title_emb = np.load('./embeddings/title_embeddings_bert.npy')
description_emb = np.load('./embeddings/description_embeddings_bert.npy')
attributes_emb = np.load('./embeddings/attributes_embeddings_bert.npy')
emb = np.concatenate([title_emb, description_emb, attributes_emb], axis=1)
emb = emb.astype(np.double)

ad_hoc_train(emb, 'bert_mlp_titile_description_attributes')


Epoch: 1
Loader: train. f1 score: 0.2188
Loader: valid. f1 score: 0.3084

Epoch: 2
Loader: train. f1 score: 0.4104
Loader: valid. f1 score: 0.4872

Epoch: 3
Loader: train. f1 score: 0.5632
Loader: valid. f1 score: 0.6096

Epoch: 4
Loader: train. f1 score: 0.675
Loader: valid. f1 score: 0.6857

Epoch: 5
Loader: train. f1 score: 0.7478
Loader: valid. f1 score: 0.7378

Epoch: 6
Loader: train. f1 score: 0.8016
Loader: valid. f1 score: 0.7695

Epoch: 7
Loader: train. f1 score: 0.8368
Loader: valid. f1 score: 0.7845

Epoch: 8
Loader: train. f1 score: 0.8744
Loader: valid. f1 score: 0.808

Epoch: 9
Loader: train. f1 score: 0.8837
Loader: valid. f1 score: 0.812

Epoch: 10
Loader: train. f1 score: 0.8879
Loader: valid. f1 score: 0.8123

Epoch: 11
Loader: train. f1 score: 0.8919
Loader: valid. f1 score: 0.812

Epoch: 12
Loader: train. f1 score: 0.8952
Loader: valid. f1 score: 0.8121

Epoch: 13
Loader: train. f1 score: 0.8998
Loader: valid. f1 score: 0.8149

Epoch: 14
Loader: train. f1 score: 0.

## Results

This result of `0.839` is a bit better than baseline with score `0.83`.

# Working with images

At first I want to try pretrained on ImageNet models from **torchvision**. I chose the best one in terms of accuracy and computational power that it needs. I will usw `resnet50` to extract feature maps from the images and use them for future classification.

In [2]:
dataset = pd.read_csv('./dataset.csv')

## Generating feature maps

In [None]:
images_embeddings =  make_features_cnn(dataset, 'product_id', './images/train', 'resnet50_embeddings')

## Testing

In [7]:
img_emb = np.load("./embeddings/images_embeddings.npy")
img_emb = img_emb.astype(np.double)
ad_hoc_train(img_emb, 'images_only')


Epoch: 1
Loader: train. f1 score: 0.1466
Loader: valid. f1 score: 0.1965

Epoch: 2
Loader: train. f1 score: 0.2556
Loader: valid. f1 score: 0.2924

Epoch: 3
Loader: train. f1 score: 0.3336
Loader: valid. f1 score: 0.3542

Epoch: 4
Loader: train. f1 score: 0.3922
Loader: valid. f1 score: 0.3923

Epoch: 5
Loader: train. f1 score: 0.4406
Loader: valid. f1 score: 0.4231

Epoch: 6
Loader: train. f1 score: 0.477
Loader: valid. f1 score: 0.4424

Epoch: 7
Loader: train. f1 score: 0.5094
Loader: valid. f1 score: 0.4575

Epoch: 8
Loader: train. f1 score: 0.5467
Loader: valid. f1 score: 0.467

Epoch: 9
Loader: train. f1 score: 0.5563
Loader: valid. f1 score: 0.4698

Epoch: 10
Loader: train. f1 score: 0.5623
Loader: valid. f1 score: 0.4732

Epoch: 11
Loader: train. f1 score: 0.5655
Loader: valid. f1 score: 0.4737

Epoch: 12
Loader: train. f1 score: 0.5688
Loader: valid. f1 score: 0.475

Epoch: 13
Loader: train. f1 score: 0.5724
Loader: valid. f1 score: 0.4764

Epoch: 14
Loader: train. f1 score: 0

As we can see just images embeddings from not fine tuned model are not very informative in predictions but they can probably help in combination with other features.

# Wikipedia2Vec

Here I decided to generate embeddings from `Wikipedia2Vec` model trained on Russian Wikipedia.

In [2]:
dataset = pd.read_csv("dataset.csv")

In [None]:
title_emb_wiki = get_embeddings(dataset, 'title', file_name='title_wiki', use_wiki2vec=True, use_fasttext=False)
description_emb_wiki = get_embeddings(dataset, 'description', file_name='description_wiki', use_wiki2vec=True, use_fasttext=False)
attributes_emb_wiki = get_embeddings(dataset, 'attributes', file_name='attributes_wiki', use_wiki2vec=True, use_fasttext=False)

In [17]:
title_emb_wiki = np.load('./embeddings/title_wiki.npy')
description_emb_wiki = np.load('./embeddings/description_wiki.npy')
attributes_emb_wiki = np.load('./embeddings/attributes_wiki.npy')

Let's evaluate title embeddings of wiki2vec.

In [23]:
ad_hoc_train(title_emb_wiki, 'title_wiki', num_epochs=30)


Epoch: 1
Loader: train. f1 score: 0.2326
Loader: valid. f1 score: 0.3287

Epoch: 2
Loader: train. f1 score: 0.4038
Loader: valid. f1 score: 0.479

Epoch: 3
Loader: train. f1 score: 0.5382
Loader: valid. f1 score: 0.5868

Epoch: 4
Loader: train. f1 score: 0.6252
Loader: valid. f1 score: 0.6521

Epoch: 5
Loader: train. f1 score: 0.6828
Loader: valid. f1 score: 0.6934

Epoch: 6
Loader: train. f1 score: 0.7243
Loader: valid. f1 score: 0.7164

Epoch: 7
Loader: train. f1 score: 0.753
Loader: valid. f1 score: 0.7348

Epoch: 8
Loader: train. f1 score: 0.7775
Loader: valid. f1 score: 0.7438

Epoch: 9
Loader: train. f1 score: 0.7816
Loader: valid. f1 score: 0.7467

Epoch: 10
Loader: train. f1 score: 0.7847
Loader: valid. f1 score: 0.7469

Epoch: 11
Loader: train. f1 score: 0.7869
Loader: valid. f1 score: 0.7489

Epoch: 12
Loader: train. f1 score: 0.7887
Loader: valid. f1 score: 0.7521

Epoch: 13
Loader: train. f1 score: 0.7908
Loader: valid. f1 score: 0.7513

Epoch: 14
Loader: train. f1 score: 

And finally have a look at combined wiki2vec embeddings from title, description and attributes.

In [22]:
emb = np.concatenate([title_emb_wiki, description_emb_wiki, attributes_emb_wiki], axis=1)
ad_hoc_train(emb, 'wiki', num_epochs=30)


Epoch: 1
Loader: train. f1 score: 0.2385
Loader: valid. f1 score: 0.3276

Epoch: 2
Loader: train. f1 score: 0.422
Loader: valid. f1 score: 0.4997

Epoch: 3
Loader: train. f1 score: 0.5653
Loader: valid. f1 score: 0.6087

Epoch: 4
Loader: train. f1 score: 0.6664
Loader: valid. f1 score: 0.6825

Epoch: 5
Loader: train. f1 score: 0.732
Loader: valid. f1 score: 0.7174

Epoch: 6
Loader: train. f1 score: 0.7779
Loader: valid. f1 score: 0.7518

Epoch: 7
Loader: train. f1 score: 0.8102
Loader: valid. f1 score: 0.7647

Epoch: 8
Loader: train. f1 score: 0.842
Loader: valid. f1 score: 0.7795

Epoch: 9
Loader: train. f1 score: 0.8488
Loader: valid. f1 score: 0.7817

Epoch: 10
Loader: train. f1 score: 0.8518
Loader: valid. f1 score: 0.7829

Epoch: 11
Loader: train. f1 score: 0.8553
Loader: valid. f1 score: 0.7836

Epoch: 12
Loader: train. f1 score: 0.858
Loader: valid. f1 score: 0.7846

Epoch: 13
Loader: train. f1 score: 0.8602
Loader: valid. f1 score: 0.7859

Epoch: 14
Loader: train. f1 score: 0.

# Choosing best architecture and hyperparameters with `optuna`

Here I want to choose best combination of embddings and MLP architecture for this task.

I will take all generated text and images embeddings and use optuna to find the best configuration.

## Data

In [None]:
cols = ['title', 'description', 'attributes']
emb_types = ['fasttext', 'wiki', 'bert']

emb = [np.load(f'./embeddings/{col}_{type}.npy') for col in cols for type in emb_types]
emb.append(np.load('./embeddings/images.npy'))
emb = np.concatenate(emb, axis=1)
gc.collect()

dataset = pd.read_csv('./dataset.csv')
labels, index_to_id, id_to_index = create_labels(dataset.category_id)
train_loader, valid_loader, test_loader = get_loaders(dataset, emb, 'category_id', batch_size=1024)

## Model architecture

I will try different architectures from **0** to **3** `hidden layers`, from **10** to **1000** `neurons in these layers`, `batchnorm` or `dropout` to use.

In [None]:
input_size = emb.shape[1]
emb = None
gc.collect()

num_classes = len(index_to_id)


def define_model(trial):
    n_layers = trial.suggest_int("num_layers", 1, 3)
    layers = []
    in_features = input_size

    if n_layers != 0:
        model_type = trial.suggest_categorical("model_type", ["batchnorm", "dropout"])

        for i in range(n_layers):
            out_features = trial.suggest_int(f"num_neurons_layer_{i + 1}", 10, 1000)
            layers.append(nn.Linear(in_features, out_features))

            if model_type == "batchnorm":
                layers.append(nn.BatchNorm1d(out_features))

            layers.append(nn.ReLU())

            if model_type == "dropout":
                p = trial.suggest_float(f"dropout_layer_{i + 1}", 0.2, 0.7)
                layers.append(nn.Dropout(p))

            in_features = out_features
        
    layers.append(nn.Linear(in_features, num_classes))

    return nn.Sequential(*layers)

## Training method

Also I will choose between four different `optimizers`, `learning rate` and `momentum` for them, and `step size` and `gamma` for `scheduler`.

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def objective(trial):

    model = define_model(trial)
    model = model.to(device)
    model.double()

    criterion = nn.CrossEntropyLoss()

    optimizer_name = trial.suggest_categorical("optimizer", ["AdamW", "SGD", "Adam", "RMSprop"])

    if optimizer_name == "SGD":
        lr = trial.suggest_float("lr", 0.1, 0.9, log=True)
        momentum = trial.suggest_float("momentum", 0.5, 0.999, log=True)
        optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, momentum=momentum)
    else:
        lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
        optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    step_size = trial.suggest_int("scheduler_step_size", 5, 20)
    gamma = trial.suggest_float("scheduler_gamma", 1e-2, 0.9, log=True)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    loaders = {"train": train_loader, "valid": valid_loader}
    f1 = {"train": [], "valid": []}
    best_f1 = 0

    num_epochs = 30

    for epoch in tqdm(range(num_epochs)):
        for k, dataloader in loaders.items():
            epoch_preds = []
            epoch_ys = []

            for x_batch, y_batch in dataloader:

                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)

                if k == "train":
                    model.train()
                    optimizer.zero_grad()
                    outp = model(x_batch)
                else:
                    model.eval()
                    with torch.no_grad():
                        outp = model(x_batch)

                preds = outp.argmax(-1)
                epoch_preds += preds.tolist()
                epoch_ys += y_batch.tolist()

                if k == "train":
                    loss = criterion(outp, y_batch)
                    loss.backward()
                    optimizer.step()

            if k == 'train':
                scheduler.step()

            f1_epoch = f1_score(epoch_ys, epoch_preds, average='weighted')
            f1[k].append(f1_epoch)
            
            if k == 'valid' and f1['valid'][-1] > best_f1:
                best_f1 = f1['valid'][-1]

            if k == 'valid':
                trial.report(f1_epoch, epoch)
                
                if trial.should_prune():
                    raise optuna.exceptions.TrialPruned()

    return best_f1

## Search

In [None]:
n_trials = 150

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=n_trials)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

In [2]:
print("\nBest trial:")
trial = study.best_trial
print("  F1 Score: ", trial.value)
print("\n  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


Best trial:
  F1 Score:  0.8746192054507418

  Params: 
    num_layers: 1
    model_type: batchnorm
    num_neurons_layer_1: 980
    optimizer: AdamW
    lr: 0.0006445724677322522
    scheduler_step_size: 7
    scheduler_gamma: 0.32015736506815595


# Training the best model

I've got 

```
Train f1 score: 0.9999

Valid f1 score: 0.8761

Test f1 score: 0.883
```

for the best parameters obtained from optuna.

Now I will train the best configuration on the whole dataset.

In [None]:
cols = ['title', 'description', 'attributes']
emb_types = ['fasttext', 'wiki', 'bert']

emb = [np.load(f'./embeddings/{col}_{type}.npy') for col in cols for type in emb_types]
emb.append(np.load('./embeddings/images.npy'))
emb = np.concatenate(emb, axis=1)
input_size = emb.shape[1]
gc.collect()

dataset = pd.read_csv('./dataset.csv')
labels, index_to_id, id_to_index = create_labels(dataset['category_id'])

dataloader = DataLoader(
    list(zip(emb, labels)),
    batch_size=1024, 
    num_workers=4,
    shuffle=True, 
    drop_last=True
)

emb = None
gc.collect()

I will use these dictionaries for predictions in the future.

In [4]:
with open('index_to_id.pkl', 'wb') as f:
    pickle.dump(index_to_id, f)

with open('id_to_index.pkl', 'wb') as f:
    pickle.dump(id_to_index, f)

Best model architecture found by optuna.

In [5]:
hidden_size = 980
num_classes = len(index_to_id)


model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.BatchNorm1d(hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, num_classes)
)  

The best model is trained on the whole table with the best params found by **optuna** and saved as `best_params.pt`

In [6]:
train_parametrized(
    model,
    model_name='best_params',
    dataloader=dataloader,
    num_epochs=15,
    optimizer=torch.optim.AdamW,
    lr=0.0006445724677322522,
    step_size=7,
    gamma=0.32015736506815595
)


Train f1 score: 0.9999


# Test data prediction

In [2]:
test_df = pd.read_parquet('./test.parquet')
test_df.head(1)

Unnamed: 0,product_id,sale,shop_id,shop_title,rating,text_fields
1,1997646,False,22758,Sky_Electronics,5.0,"{""title"": ""Светодиодная лента Smart led Strip ..."


## Preprocess

Let's extract `title`, `description` and `attributes` from the `text_fields` column.

In [3]:
def extract_text(text_field: str):
    content_dict = ast.literal_eval(text_field)
    title = content_dict['title']
    description = re.sub("<[^\>]*>", '', content_dict['description'])
    attributes = '. '.join(content_dict['attributes'])

    return title, description, attributes


titles, desciptions, attributes = zip(*test_df['text_fields'].apply(extract_text))
text_data = pd.DataFrame({'title': titles, 'description': desciptions, 'attributes': attributes})
text_data.head(1)

Unnamed: 0,title,description,attributes
0,"Светодиодная лента Smart led Strip Light, с пу...","Светодиодная лента LED, 5 м, RGB (Цветная) вла...","Легкость управления с пульта, а так же смартфо..."


Let' concatenate `title`, `description` and `attributes` with the corresponding `product_id`.

In [4]:
test_df = pd.concat(
    [
        test_df[['product_id']].reset_index(drop=True), 
        text_data.reset_index(drop=True)
    ], 
    axis=1
)
test_df.head(1)

Unnamed: 0,product_id,title,description,attributes
0,1997646,"Светодиодная лента Smart led Strip Light, с пу...","Светодиодная лента LED, 5 м, RGB (Цветная) вла...","Легкость управления с пульта, а так же смартфо..."


## Embeddings

### Fasttext

In [13]:
title_emb = get_embeddings(
    df=test_df,
    column_name='title', 
    file_name='title_fasttext', 
    folder='embeddings_test', 
    use_fasttext=True, 
    use_wiki2vec=False
)
description_emb = get_embeddings(
    df=test_df, 
    column_name='description', 
    file_name='description_fasttext', 
    folder='embeddings_test', 
    use_fasttext=True, 
    use_wiki2vec=False
)
attributes_emb = get_embeddings(
    df=test_df, 
    column_name='attributes',
    file_name='attributes_fasttext', 
    folder='embeddings_test', 
    use_fasttext=True, 
    use_wiki2vec=False
)

100%|██████████| 16860/16860 [03:40<00:00, 76.38it/s] 
100%|██████████| 16860/16860 [15:41<00:00, 17.91it/s]
100%|██████████| 16860/16860 [05:05<00:00, 55.12it/s] 


### Wiki2Vec

In [None]:
title_emb = get_embeddings(
    df=test_df,
    column_name='title', 
    file_name='title_wiki', 
    folder='embeddings_test', 
    use_fasttext=False, 
    use_wiki2vec=True
)
description_emb = get_embeddings(
    df=test_df, 
    column_name='description', 
    file_name='description_wiki', 
    folder='embeddings_test', 
    use_fasttext=False, 
    use_wiki2vec=True
)
attributes_emb = get_embeddings(
    df=test_df, 
    column_name='attributes',
    file_name='attributes_wiki', 
    folder='embeddings_test', 
    use_fasttext=False, 
    use_wiki2vec=True
)

### Bert

In [None]:
max_len = max(test_df['title'].apply(lambda s: len(s.split())))
title_embeddings_bert = make_features_transformers(
    test_df, 
    'title', 
    max_len, 
    'title_bert', 
    folder='embeddings_test'
)

test_df['description'].fillna('', inplace=True)
max_len = int(test_df['description'].apply(lambda s: len(s.split())).mean())
description_embeddings_bert = make_features_transformers(
    test_df, 
    'description', 
    max_len, 
    'description_bert', 
    folder='embeddings_test'
)

test_df['attributes'].fillna('', inplace=True)
max_len = max(test_df['attributes'].apply(lambda s: len(s.split())))
attributes_embeddings_bert = make_features_transformers(
    test_df, 
    'attributes', 
    max_len, 
    'attributes_bert', 
    folder='embeddings_test'
)

### Images

In [12]:
images_embeddings =  make_features_cnn(
    df=test_df, 
    id_column='product_id',
    images_directory='./images/test', 
    filename_to_save='images', 
    directory_to_save='./embeddings_test'
)

100%|██████████| 16860/16860 [17:31<00:00, 16.03it/s]


## Dataloader

In [7]:
cols = ['title', 'description', 'attributes']
emb_types = ['fasttext', 'wiki', 'bert']

emb = [np.load(f'./embeddings_test/{col}_{type}.npy') for col in cols for type in emb_types]
emb.append(np.load('./embeddings_test/images.npy'))
emb = np.concatenate(emb, axis=1)
input_size = emb.shape[1]

dataloader = DataLoader(
    emb,
    batch_size=1024, 
    num_workers=4,
    shuffle=False, 
    drop_last=False
)

## Prediction

In [None]:
with open('./index_to_id.pkl', 'rb') as f:
    index_to_id = pickle.load(f)

hidden_size = 980
num_classes = len(index_to_id)

model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.BatchNorm1d(hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, num_classes)
)
model.load_state_dict(torch.load('./models/best_params.pt'))
model.double()
model.eval()

In [25]:
predictions = predict(model, dataloader, index_to_id)
predicted_df = pd.DataFrame({'product_id': test_df['product_id'], 'predicted_category_id': predictions})
predicted_df

Unnamed: 0,product_id,predicted_category_id
0,1997646,13083
1,927375,14922
2,1921513,2803
3,1668662,12524
4,1467778,13887
...,...,...
16855,1914264,11645
16856,1310569,12357
16857,978095,13651
16858,797547,2740


In [26]:
predicted_df.to_parquet('result.parquet')

# Directions for further Research

To increase the performane of the model in the future these techniques can be applied:
* Feature Engineering: create dublicates of rare product categories, use more columns of the data, fine tune embeddings models on data.
* Solve problems with CatBoost. It requires a lot of memory to perform well on these task.
* Try to use [CLIP from OpenAI](https://habr.com/ru/amp/post/539312/).