### Imports

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"
DEVICE='cuda'

In [3]:
import pandas as pd
from ast import literal_eval
import re
import json
import numpy as np
from tqdm.notebook import tqdm

In [4]:
import torch
import torch.nn as nn

In [5]:
from collections import Counter

In [6]:
import scipy
from scipy.sparse import dok_matrix

In [7]:
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer

In [8]:
%matplotlib inline
import matplotlib.pyplot as plt

In [9]:
from catalyst.utils import set_global_seed, get_device


set_global_seed(42)
device = get_device()

### Data reading and preprocessing

In [10]:
data = pd.read_excel('/storage1/ryazantsev/tenderhack/DataSet_EKB_200000.xlsx')

In [11]:
data.columns

Index(['Идентификатор СТЕ', 'Наименование СТЕ', 'Категория', 'Описание',
       'Характеристики СТЕ', 'Регионы поставки',
       'Кол-во заключенных контрактов', 'Поставщики', 'Страна происхождения',
       'Другая продукция в контрактах', 'Просмотры', 'Идентификатор КПГЗ',
       'Код КПГЗ', 'Модель', 'Цена'],
      dtype='object')

In [12]:
def trim_invalid_items(x):
    if x is np.nan:
        return '[]'
    else:
        rindex = x.rfind('}')
        return x[:rindex+1]+']'

In [13]:
data['Другая продукция в контрактах']=data['Другая продукция в контрактах'].map(trim_invalid_items)

### Feature extraction

#### Category encoding

In [14]:
data['Категория'].nunique()

3810

In [15]:
assert not data['Категория'].isna().sum()
category_hasher = HashingVectorizer(n_features=10)
category_features = category_hasher.fit_transform(data['Категория'])

#### Region encoding

In [16]:
data['Регионы поставки'].iloc[-2]

'[{"Name":"Москва"}]'

In [17]:
reg_feature_allregions = []
reg_feature_nregions = []
reg_counter = Counter()
for z in data['Регионы поставки'].values:
    regions = [x['Name'] for x in json.loads(z)] 
    reg_counter.update(regions)
    reg_feature_allregions.append('Все регионы' in regions)
    #reg_feature_moscow.append('Москва' in regions)
    reg_feature_nregions.append(len(regions))

In [18]:
allregions =   [' '.join([re.sub(' ', '_', x['Name']) for x in json.loads(z)]) for z in data['Регионы поставки'].values]

In [19]:
region_vectorizer = CountVectorizer(lowercase='False')
region_features = region_vectorizer.fit_transform(allregions)

#### Supplier encodingm

In [20]:
allsupliers = [' '.join([str(x['SupplierId']) for x in json.loads(z)]) for z in data['Поставщики'].values]
supplier_vectorizer = HashingVectorizer(n_features=32, token_pattern='[0..9]*')
supplier_features = supplier_vectorizer.fit_transform(allsupliers)

#### Country encoding

In [21]:
data['Страна происхождения'].fillna('Не указано', inplace=True)

In [22]:
country_vectorizer = HashingVectorizer(n_features=10)
country_features = country_vectorizer.fit_transform(data['Страна происхождения'])

#### kpgz identifier encoding

In [23]:
data['Идентификатор КПГЗ'].isna().sum()

24

In [24]:
data['Идентификатор КПГЗ'].fillna(value=-1, inplace=True)

In [25]:
data['Идентификатор КПГЗ'].nunique()

3415

In [26]:
kpgz_identifier_vectorizer = HashingVectorizer(n_features=8)
kpgz_identifier_features = kpgz_identifier_vectorizer.fit_transform([str(x) for x in data['Идентификатор КПГЗ'].values])

In [27]:
kpgz_identifier_features

<200000x8 sparse matrix of type '<class 'numpy.float64'>'
	with 199976 stored elements in Compressed Sparse Row format>

#### kpgz code encoding

In [28]:
data['Код КПГЗ'].fillna(value=-1, inplace=True)

In [29]:
data['Код КПГЗ'].nunique()

3415

### model encoding

Nope lol

### Price handling

In [30]:
def str_to_obj(s: str):
    if pd.notna(s):
        try:
            return eval(s)
        except SyntaxError:
            return eval(s.rsplit("},", 1)[0] + "}]")
    return s

In [31]:
def aggregate_prices(x):
    if type(x) is float:
        return -1,-1,-1,-1,0
    else:
        costs = [z['Cost'] for z in x]
        return np.log10(np.min(costs)), np.log10(np.max(costs)), np.log10(np.mean(costs)), np.log10(np.median(costs)), len(costs)

In [32]:
price_objects = list(map(str_to_obj, data['Цена']))

In [33]:
price_features = np.array([aggregate_prices(x)  for x in price_objects])

### Aggregating features together

In [34]:
sparse_features = scipy.sparse.hstack([category_features, region_features, supplier_features, country_features, kpgz_identifier_features])

In [35]:
data.columns

Index(['Идентификатор СТЕ', 'Наименование СТЕ', 'Категория', 'Описание',
       'Характеристики СТЕ', 'Регионы поставки',
       'Кол-во заключенных контрактов', 'Поставщики', 'Страна происхождения',
       'Другая продукция в контрактах', 'Просмотры', 'Идентификатор КПГЗ',
       'Код КПГЗ', 'Модель', 'Цена'],
      dtype='object')

In [36]:
#these ones are exctracted with tinyBERT by global avg pooling
name_embeddings = np.load('/storage1/nerusskikh/tenderhack/Names_Embeddings.npy')

In [37]:
dense_features = np.hstack([data[['Кол-во заключенных контрактов', 'Просмотры']].fillna(-1).values, 
                            np.array(reg_feature_allregions)[:,None],
                            np.array(reg_feature_nregions)[:,None],
                            price_features,
                            name_embeddings
                           ])
dense_features_divisor = dense_features.max(0)
dense_features /= dense_features_divisor[None,:]

### Target construction

In [38]:
sku_numerator = {}

In [39]:
for i, index_ in enumerate(data['Идентификатор СТЕ'].values):
    sku_numerator[index_]=i

In [40]:
cooccurence_matrix = dok_matrix((200000, 200000))
offset_index = 200000
invisible_sku = []

Здесь для каждого товара строятся рейтинги сопутствующих товаров. Основная идея для рейтинга - как много товарных единиц в среднем покупают на контракт с СТЕ. Для того, чтобы исключить перекос рейтинга в пользу массовых, но дешевых товаров (в основном это расходные материалы) всем соgутствующим товарам, которые проходят в среднем более чем через половину контрактов для товара, присваивается максимальный рейтинг.

In [41]:
for sku_id_, ncontracts, other_sku in tqdm(data[['Идентификатор СТЕ', 'Кол-во заключенных контрактов', 'Другая продукция в контрактах']].itertuples(index=False)):
    other_sku = json.loads(other_sku)
    i = sku_numerator[sku_id_]
    for other_sku_ in other_sku:
        other_sku_id = other_sku_['OtherSkuId']
        if ncontracts>=1:
            quantity = other_sku_['Quantity']/ncontracts
            j = sku_numerator.get(other_sku_id)
            #if j and quantity>0.3:
            if j:
                #cooccurence_matrix[i,j] = quantity
                cooccurence_matrix[i,j] = min(quantity, 0.5)/0.5
#             else:
#                 sku_numerator[other_sku_id] = offset_index
#                 offset_index+=1
#                 invisible_sku.append(other_sku_)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [42]:
import scipy.sparse as sparse

In [43]:
cooccurence_csr = cooccurence_matrix.tocsr()
sparse_features = sparse_features.tocsr()

In [44]:
from torch.utils.data import Dataset, DataLoader

In [45]:
# class skuDataset(Dataset):
#     def __init__(self, cmatrix):
#         self.cmatrix = cmatrix
#         self.rows_, self.cols_ = self.cmatrix.nonzero()
#         #add some negative samples
#         n = self.rows_.shape[0]
#         self.rows_ = np.concatenate([self.rows_, np.random.randint(0, self.cmatrix.shape[0], n)])
#         self.cols_ = np.concatenate([self.cols_, np.random.randint(0, self.cmatrix.shape[0], n)])
#     def __len__(self):
#         return self.rows_.shape[0]
#     def __getitem__(self, index):
#         i = self.rows_[index]
#         j = self.cols_[index]
#         val = self.cmatrix[i,j]
#         return {'sku':i, 'other_sku':j, 'targets':val.astype(np.float32)}


#dataloader = DataLoader(dataset, batch_size=2048, shuffle=True)

In [46]:
### Wrapping the data in pytorch dataloader, model setup and training

In [47]:
def getFeatures(index):
    dense_ = dense_features[index]
    sparse_ = np.array(sparse_features[index].todense()).squeeze()
    return np.concatenate([sparse_, dense_])
    #return sparse_, dense_
    

class skuDatasetWithFeatures(Dataset):
    def __init__(self, cmatrix, n_negative=30):
        self.cmatrix = cmatrix
        self.n = cmatrix.shape[1]
        self.n_negative = n_negative
        
    def __len__(self):
        return self.cmatrix.shape[0]
    def __getitem__(self, index):
        row = self.cmatrix[index]
        i, j = row.nonzero()
        i = np.concatenate([i, np.zeros(self.n_negative,)])
        j = np.concatenate([j, np.random.randint(0, self.n, self.n_negative)])
        val = np.array(row[i,j]).flatten()
        i+=index
        query_features = np.vstack([getFeatures(index) for _ in range(len(j))])
        key_features = np.vstack([getFeatures(j_) for j_ in j])
        
        return {'sku':i.flatten(), 'sku_features': query_features, 'other_sku':j.flatten(), 
                'other_sku_features':key_features, 'targets':val.astype(np.float32)}
    
def collate_fn(
    batch
):
    sku = torch.LongTensor(np.concatenate([b["sku"] for b in batch]))
    other_sku =  torch.LongTensor(np.concatenate([b["other_sku"] for b in batch]))
    targets = torch.Tensor(np.concatenate([b["targets"] for b in batch]))
    sku_features = torch.Tensor(np.vstack([b["sku_features"] for b in batch]))
    other_sku_features = torch.Tensor(np.vstack([b["other_sku_features"] for b in batch]))
    return {"sku": sku, 'sku_features':sku_features, "other_sku": other_sku, 
            'other_sku_features':other_sku_features, "targets": targets}

In [48]:
dataset = skuDatasetWithFeatures(cooccurence_csr)

In [49]:
dataloader = DataLoader(dataset, batch_size=1, shuffle=True, collate_fn=collate_fn)

In [50]:
class MLPEncoder(nn.Module):
    def __init__(self, nfeatures=745, layer_sizes:list=[64,16]):
        super().__init__()
        self.module_list = []
        self.module_list.append(nn.Linear(nfeatures, layer_sizes[0]))
        for input_size, output_size in zip(layer_sizes, layer_sizes[1:]):
            self.module_list.append(nn.ReLU())
            self.module_list.append(nn.Linear(input_size, output_size))
        self.model = nn.Sequential(*self.module_list)
    def forward(self, x):
        return self.model(x)
        
        

class myNСF(nn.Module):
    def __init__(
        self, nfeatures=745, layer_sizes:list=[64, 16]
    ):
        super().__init__()
 
        self.user_encoder = MLPEncoder(nfeatures, layer_sizes)
        self.item_encoder =  MLPEncoder(nfeatures, layer_sizes)
 

    def forward(
        self, user_features: torch.Tensor, item_features: torch.Tensor
    ) -> torch.Tensor:
        user_embeddings = self.user_encoder(user_features)
        item_embeddings = self.item_encoder(item_features)
        products = (user_embeddings*item_embeddings).sum(-1)
        return products

In [51]:
from catalyst.contrib.nn import RAdam


model = myNСF(745, (128, 32))
optimizer = RAdam(model.parameters(), lr=1e-2)
criterion = nn.BCEWithLogitsLoss()
#criterion = nn.MSELoss()

In [52]:
from catalyst.dl import SupervisedRunner

class RecSysRunner(SupervisedRunner):
    def handle_batch(self, batch):
        logits = self.model(batch["sku_features"], batch["other_sku_features"])
        sorted_indeces = torch.argsort(logits, descending=True)
        self.batch["targets"] = batch["targets"][None, sorted_indeces]
        self.batch["logits"] = logits[None, sorted_indeces]

runner = RecSysRunner()

In [53]:
from catalyst import dl

 
 
callbacks = [
    dl.NDCGCallback("logits", "targets", [1,3,5]),
    dl.MAPCallback("logits", "targets", [1,3,5]),
    dl.MRRCallback("logits", "targets", [1,3,5]),
    dl.HitrateCallback("logits", "targets", [1,3,5]),
    dl.OptimizerCallback("loss", accumulation_steps=64),
    dl.CheckpointCallback(loader_key='train',metric_key='ndcg', minimize=False, use_runner_logdir=True)
    #dl.BatchOverfitCallback(train=10, valid=1)
]

In [54]:
##WORKAROUNDS
train_dataloader = dataloader
valid_dataloader = dataloader

In [None]:
%%capture
from pathlib import Path
from datetime import datetime

runner.train(
    model=model,
    optimizer=optimizer,
    loaders={"train": train_dataloader, 
             #"valid": valid_dataloader
            },
    criterion=criterion,
    callbacks=callbacks,
    logdir=Path("logs") / ('mlp_'+datetime.now().strftime("%Y%m%d-%H%M%S")),
    num_epochs=10,
    verbose=True,
)

#### Embeddings inference and serialization

In [55]:
class skuInferenceDatasetWithFeatures(Dataset):
    def __init__(self, cmatrix):
        self.cmatrix = cmatrix
        self.n = cmatrix.shape[1]
        
    def __len__(self):
        return self.cmatrix.shape[0]
    def __getitem__(self, index):
        query_features =  np.float32(getFeatures(index))      
        return {'sku_features': query_features}

In [56]:
inference_dataset = skuInferenceDatasetWithFeatures(cooccurence_csr)
inference_dataloader = DataLoader(inference_dataset, batch_size=200, shuffle=False)

In [57]:
model.load_state_dict(torch.load('/storage1/nerusskikh/tenderhack/logs/mlp_20210905-032436/best.pth', map_location='cpu')['model_state_dict'])

<All keys matched successfully>

In [58]:
embeddings_query = []
embeddings_key = []
for batch in tqdm(inference_dataloader):
    batch_embs_query = model.user_encoder(batch['sku_features'])
    batch_embs_key = model.item_encoder(batch['sku_features'])
    embeddings_query.append(batch_embs_query.cpu().detach().numpy())
    embeddings_key.append(batch_embs_key.cpu().detach().numpy())

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [59]:
embeddings_query = np.vstack(embeddings_query)
embeddings_key = np.vstack(embeddings_key)

In [60]:
import faiss
index = faiss.IndexFlatIP(embeddings_key.shape[1])
index.add(embeddings_key)

In [61]:
def get_recommendations(query_index, k=6):
    d, neighbour_indexes = index.search(embeddings_query[query_index:query_index+1], k)
    neighbour_indexes = list(neighbour_indexes.flatten())
    if neighbour_indexes[0]!=query_index:
        neighbour_indexes = [query_index]+neighbour_indexes[:-1]
    return data.iloc[neighbour_indexes]

In [62]:
#у этого товара нет сопутки, поэтому классическая коллаборативная фильтрация не работает
get_recommendations(2, k=7)

Unnamed: 0,Идентификатор СТЕ,Наименование СТЕ,Категория,Описание,Характеристики СТЕ,Регионы поставки,Кол-во заключенных контрактов,Поставщики,Страна происхождения,Другая продукция в контрактах,Просмотры,Идентификатор КПГЗ,Код КПГЗ,Модель,Цена
2,34172198,Елка пристенно-угловая Графская премиум 210 см...,Предметы внутреннего интерьера,,"[{""Name"":""Цвет"",""Id"":340085190,""Value"":""Зелены...","[{""Name"":""Москва""},{""Name"":""Все регионы""}]",,"[{""SupplierId"":1348988,""Name"":""ИП Чепало Оксан...",Не указано,[],5,68093.0,01.20.08,Елка пристенно-угловая Графская премиум 210 см.,
76663,23151701,"Гирлянда ""Бахрома""",Предметы внутреннего интерьера,,"[{""Name"":""Вид гирлянды"",""Id"":-318490441,""Value...","[{""Name"":""Москва""},{""Name"":""Московская""}]",9.0,"[{""SupplierId"":1796556,""Name"":""ООО \""ТМЦ\"""",""I...",Не указано,"[{""OtherSkuId"":34264045,""OtherSkuName"":""Бант н...",2,68093.0,01.20.08,Гирлянда уличная,
197544,31420292,"Плакат ""Поздравляем с Днём победы!"": Формат А2",Канцелярские журналы,Наглядно-оформительский плакат &quot;Поздравля...,"[{""Name"":""Вид продукции"",""Id"":-681544431,""Valu...","[{""Name"":""Москва""},{""Name"":""Московская""},{""Nam...",,"[{""SupplierId"":1263059,""Name"":""ООО \""УЧМАГ\"""",...",РОССИЯ,[],0,12104674.0,01.15.02.05.03,-,
140803,34411746,"Гирлянда ""Бахрома"" с насадками ""Ёлки""",Предметы внутреннего интерьера,,"[{""Name"":""Вид гирлянды"",""Id"":350448099,""Value""...","[{""Name"":""Москва""}]",3.0,"[{""SupplierId"":2291514,""Name"":""ООО \""СИМА ТЕНД...",КИТАЙ,"[{""OtherSkuId"":34409993,""OtherSkuName"":""Бант п...",0,68093.0,01.20.08,Гирлянда,
123315,20680221,"Гирлянда ""Нить""",Предметы внутреннего интерьера,,"[{""Name"":""Вид продукции"",""Id"":-495980289,""Valu...","[{""Name"":""Москва""}]",4.0,"[{""SupplierId"":1258018,""Name"":""ООО \""КОМПУС-М\...",Не указано,"[{""OtherSkuId"":34333319,""OtherSkuName"":""БАХРОМ...",1,68093.0,01.20.08,Гирлянда,
140697,34241928,"Гирлянда ""Бахрома""",Предметы внутреннего интерьера,,"[{""Name"":""Вид насадки"",""Id"":342393214,""Value"":...","[{""Name"":""Москва""}]",4.0,"[{""SupplierId"":2291514,""Name"":""ООО \""СИМА ТЕНД...",КИТАЙ,"[{""OtherSkuId"":34234191,""OtherSkuName"":""Аромас...",0,68093.0,01.20.08,Гирлянда,
197625,34241915,"Гирлянда ""Нить""",Предметы внутреннего интерьера,,"[{""Name"":""Вид продукции"",""Id"":342392908,""Value...","[{""Name"":""Москва""}]",3.0,"[{""SupplierId"":2291514,""Name"":""ООО \""СИМА ТЕНД...",КИТАЙ,"[{""OtherSkuId"":34234191,""OtherSkuName"":""Аромас...",1,68093.0,01.20.08,Гирлянда,
