In [7]:
!pip install implicit pymorphy3



In [8]:
# from google.colab import drive
# drive.mount("/content/drive")

In [9]:
from gensim.models.word2vec import Word2Vec
import joblib
import implicit
from itertools import combinations, product
import numpy as np
import pandas as pd
import pickle
import pymorphy3
import re
import os
from scipy.sparse import csr_matrix, save_npz, coo_array
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from tqdm import tqdm
tqdm.pandas()

parser = pymorphy3.MorphAnalyzer()


input_folder = '/kaggle/input/dataset/'
folder = './'
train_file_name='cosmetic_train.tsv'
val_file_name='cosmetic_val.tsv'
output_name = 'jetfork_result.csv'

def get_noun(words: list[str]) -> str:
    result = []
    for word in words:
        parsings = [word for parsing in parser.parse(word)[:3] if parsing.tag.POS == 'NOUN']
        if len(parsings)>0:
            result.append(word)
    
    if len(result)==0:
        return words[0]
    return result[0]


def preparation_data(dataset):
    dataset['processed_name'] = dataset['name']\
    .str.lower()\
    .apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^a-zа-я -]', '', x)).strip())

    dataset['category_first_word'] = dataset['processed_name'].str.split(' ')\
                                                .progress_apply(lambda x: x[0])

    dataset['category_noun'] = dataset['processed_name'].str.split(' ')\
                                                .progress_apply(lambda x: get_noun(x[:3]))
        
    return dataset


def get_products_short_name_map(train, val):
    products_name = pd.concat([train[['item_id', 'category_noun']], val[['item_id', 'category_noun']]]).drop_duplicates()
    products_name.columns = ['target', 'category_noun'] 
    products_name.target = products_name.target.astype(str)
    
    return products_name

In [10]:
train = pd.read_csv(os.path.join(input_folder, train_file_name), sep="\t")
val = pd.read_csv(os.path.join(input_folder, val_file_name), sep="\t")

In [11]:
train["local_date"] = pd.to_datetime(train["local_date"])
val["local_date"] = pd.to_datetime(val["local_date"])

# word2vec model

In [12]:
train = preparation_data(train)
val = preparation_data(val)

100%|██████████| 223908/223908 [00:00<00:00, 545662.89it/s]
100%|██████████| 223908/223908 [04:22<00:00, 853.16it/s] 
100%|██████████| 51650/51650 [00:00<00:00, 421127.96it/s]
100%|██████████| 51650/51650 [01:00<00:00, 852.90it/s] 


In [13]:
train['category_noun'].value_counts()[0:50]

category_noun
скраб               40971
крем                12007
обертывание         11680
масло               11513
крем-баттер         11504
гель                10719
коробка             10406
маска                9133
бальзам              8630
сыворотка            6360
соль                 5843
лосьон               5795
спрей                5109
шампунь              4987
пакет                3407
крем-суфле           3286
штаны                3174
кондиционер          3060
щетка                2862
сумка-шоппер         2659
тоник                2350
пенка                2221
эмульсия             2218
антицеллюлитная      2105
крем-корсет          2014
маска-сорбет         1968
крем-гель            1967
мусс                 1881
маска-лифтинг        1879
плампер              1765
помада               1748
флюид                1662
карандаш             1646
сс-крем              1251
массажер             1244
криомасло            1172
молочко              1132
сыворотка-сияние     112

In [14]:
val['category_noun'].value_counts()[0:50]

category_noun
скраб               9236
крем                2801
обертывание         2668
коробка             2655
масло               2590
крем-баттер         2578
гель                2543
маска               2154
бальзам             1957
сыворотка           1472
соль                1405
лосьон              1392
шампунь             1164
спрей               1138
пакет                788
штаны                784
крем-суфле           730
сумка-шоппер         699
кондиционер          686
щетка                681
тоник                533
эмульсия             523
антицеллюлитная      506
пенка                501
крем-корсет          498
крем-гель            455
маска-сорбет         447
маска-лифтинг        424
мусс                 422
плампер              393
карандаш             366
флюид                366
помада               355
сс-крем              282
криомасло            278
сыворотка-сияние     274
массажер             267
молочко              254
гель-пена            243
тонер      

In [15]:
products_name = get_products_short_name_map(train, val)
products_name.rename(columns = {'target':'item_id'}).to_csv(folder + 'item_id_categ_map.csv', sep=';', index=None)

In [16]:
train_grouped = train.groupby('receipt_id')['category_noun'].apply(list).reset_index()

model = Word2Vec(
    train_grouped.category_noun.tolist(), vector_size=64, 
    sg=1, epochs=50, negative=10, min_count=1, 
    window=3, seed=42, workers=8
)

X = model.wv.key_to_index.keys()
len(X)

model.save(folder + "word2vec.model")

# Cosmetic

## Create third dataset

In [17]:
train = train.drop(['processed_name', 'category_first_word', 'category_noun'], axis=1)

In [18]:
train.shape

(223908, 7)

In [19]:
# удаление чеков с 1м товаром
train = train[train.groupby(["receipt_id"])["item_id"].transform(lambda x: x.nunique() > 1)] \
  .reset_index(drop=True)

train.shape

(223906, 7)

In [20]:
unique_items = train["item_id"]
unique_items.nunique()

667

In [21]:
# сборка таргета из товаров, которые встречаются более 1го раза на датасете
hight_support_items = train["item_id"].value_counts().where(lambda x: x > 1).dropna().index

target = train[train["item_id"].isin(hight_support_items)].groupby(["receipt_id"])["item_id"].apply(lambda x: x.sample(1).iloc[0]) \
  .to_dict()

train["target"] = train.apply(lambda x: int(target.get(x["receipt_id"], 0) == x["item_id"]), axis=1)
del target

In [22]:
train["receipt_id"].nunique(), train["target"].sum()

(68282, 68283)

In [23]:
train["has_unique_item"] = ~train["item_id"].isin(hight_support_items)
train["has_unique_item"] = train.groupby(["receipt_id"])["has_unique_item"].transform("max")

In [24]:
#  разделение трейна на 2 выборки по идентификатору чека
val_receipts = train["receipt_id"].drop_duplicates().sample(frac=.2, random_state=42).tolist()

valid = train[(train["receipt_id"].isin(val_receipts)) & (~train["has_unique_item"])]
train = train[(~train["receipt_id"].isin(val_receipts)) | (train["has_unique_item"])]

del val_receipts

In [25]:
# все товары, которые не встречаются на обучении (не в таргете)
lost_items = unique_items[~unique_items.isin(train.loc[train["target"] != 1, "item_id"])].drop_duplicates()
lost_items = valid.loc[valid["item_id"].isin(lost_items),
                                ["receipt_id", "item_id"]].groupby(["item_id"])["receipt_id"] \
  .apply(lambda x: x.sample(1).values) \
  .explode() \
  .tolist()

In [26]:
# перенос всех уникальных товаров с валидации на обучение
train = pd.concat([train,
                   valid[valid["receipt_id"].isin(lost_items)]], axis=0) \
  .reset_index(drop=True)

valid = valid[~valid["receipt_id"].isin(lost_items)].reset_index(drop=True)

In [27]:
valid.shape, train.shape

((44726, 9), (179180, 9))

In [28]:
assert train.loc[train["target"] != 1, "item_id"].nunique() >= (unique_items.nunique() * .95) # поправка максимум на 5% потерянных товаров
assert train["receipt_id"].isin(valid["receipt_id"]).sum() == 0
assert valid["receipt_id"].isin(train["receipt_id"]).sum() == 0

In [29]:
train

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,target,has_unique_item
0,356645110209741,14109912085,200151,2023-02-14 19:29:05,Коробка подарочная A MILLION KISSES,0.0,1,0,False
1,356645110209741,14118224448,200222,2023-02-15 16:55:40,"Крем-баттер для тела Франжипани-Монои, 200 мл",1290.0,1,1,False
2,356645110209741,14118224448,200521,2023-02-15 16:55:40,"Скраб для тела SAKURA, 250 гр",1390.0,1,0,False
3,356645110209741,14118224448,200282,2023-02-15 16:55:40,"Массажное масло от растяжек MIRACLE OIL,150 мл",1290.0,1,0,False
4,356645110209741,14118224448,200136,2023-02-15 16:55:40,"Консилер Liquid camouflage, тон light 10мл.",1390.0,1,0,False
...,...,...,...,...,...,...,...,...,...
179175,352398089986709,9246926843,200365,2021-09-12 21:54:20,"Обертывание-гель ""Уменьшение объемов"", 200 мл.",990.0,1,1,False
179176,352398089986709,9246926843,200388,2021-09-12 21:54:20,Пакет ПВД,0.0,1,0,False
179177,352398089986709,9267569402,200388,2021-09-14 22:39:36,Пакет ПВД,0.0,1,1,False
179178,352398089986709,9267569402,200536,2021-09-14 22:39:36,Скраб для тела водорослевый детокс ALGAE SCRUB...,990.0,1,0,False


In [30]:
# выделение целевого товара в отдельный столбец
train.loc[train["target"] == 1, "target"] =\
  train.loc[train["target"] == 1, "item_id"]

train["target"] = train.groupby(["receipt_id"])["target"].transform("max")
train = train[train["item_id"] != train["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)



valid.loc[valid["target"] == 1, "target"] =\
  valid.loc[valid["target"] == 1, "item_id"]

valid["target"] = valid.groupby(["receipt_id"])["target"].transform("max")
valid = valid[valid["item_id"] != valid["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)

In [31]:
train.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,target
5955,352398083991747,11587454058,200008,2022-05-27 15:11:57,"Английская соль для ванны ""MAGNESIUM SPA SALT""...",990.0,1,200659
77405,352398090018898,11671426787,200278,2022-06-04 22:31:15,"Массажер (гуаша), без ручки для лица",0.0,2,200323
43876,356645110489244,12411703702,200070,2022-08-16 12:27:36,"Гель для душа Candy Raspberry, 100 мл",0.0,1,200065
34381,352398088070463,11217500578,200555,2022-04-18 15:30:23,"Скраб лайм-имбирь, 250 г (товар)",1190.0,1,200024
114134,356645110800358,10187946109,200226,2021-12-22 19:13:53,"Крем-корсет черная икра, 200 мл (товар)",1390.0,1,200530


## Prepare matrix

In [32]:
# агрегация чеков
train_mtx = train.groupby(["receipt_id", "item_id"])["quantity"].min() \
    .reset_index()
train_mtx["quantity"] = 1
train_mtx = train_mtx.drop_duplicates() \
    .reset_index(drop=True)

receipt_items = train_mtx.groupby(["receipt_id"])["item_id"].apply(lambda x: tuple(set(x))).to_dict()
train_mtx["items"] = train_mtx["receipt_id"].map(receipt_items.get)

# удаление чеков из 1го товара и удаление одинаковых чеков
train_mtx = train_mtx[train_mtx["items"].apply(len) > 1] \
  .drop_duplicates(subset=["items", "item_id"]) \
  .reset_index(drop=True)

# преобразование типов данных
train_mtx["receipt_cat"] = train_mtx["receipt_id"].astype("category").cat.codes
train_mtx["item_cat"] = train_mtx["item_id"].astype("category").cat.codes

In [33]:
# словари для обращения к эмбедам ALS
receipt_2idx = train_mtx.drop_duplicates(subset=["receipt_cat"]) \
  .set_index("items")["receipt_cat"].to_dict()

item_2idx = train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_id")["item_cat"].to_dict()
idx_2item = train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_cat")["item_id"].to_dict()

In [34]:
# построение матрицы чек-товар
sparse_receipt_item = csr_matrix((train_mtx["quantity"].astype(float),
                                 (train_mtx["receipt_cat"], train_mtx["item_cat"])))

In [35]:
save_npz(folder + "sparse_matrix.npz", sparse_receipt_item)

with open(folder + "receipt_2idx.pkl", "wb") as f:
    pickle.dump(receipt_2idx, f)

with open(folder + "item_2idx.pkl", "wb") as f:
    pickle.dump(item_2idx, f)

with open(folder + "idx_2item.pkl", "wb") as f:
    pickle.dump(idx_2item, f)

## Fit and tune ALS

In [36]:
!pip install scikit-optimize



In [37]:
def recommend_to_receipt(receipt_cat, sparse_user_item,
                         receipt_vecs, item_vecs, idx_2item, num_items=5):

    receipt_interactions = sparse_user_item[receipt_cat, :].toarray()

    receipt_interactions = receipt_interactions.reshape(-1) + 1
    receipt_interactions[receipt_interactions > 1] = 0

    rec_vector = receipt_vecs[receipt_cat, :].dot(item_vecs.T).toarray()

    recommend_vector = (receipt_interactions * rec_vector)[0]

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    result = []

    for idx in set(item_idx):
        result.append((idx_2item[idx], recommend_vector[idx]))

    return result

In [38]:
def recommend_to_items(items_cat, item_norms, item_vecs, idx_2item, num_items=5):

    scores = item_vecs.dot(item_vecs[items_cat].T).T  / item_norms.reshape(1, -1)
    top_idx = np.argpartition(scores, -num_items, axis=1)[:, -(num_items+1):]
    scores = np.array([scores[idx, row] for idx, row in enumerate(top_idx)])
    scores = scores / item_norms[items_cat].reshape(-1, 1)
    result = []
    for i in sorted(zip(top_idx.reshape(-1), scores.reshape(-1)), key=lambda x: -x[1]):
        if i[0] not in items_cat and idx_2item[i[0]] not in [j[0] for j in result]:
            result.append((idx_2item[i[0]], i[1]))

    return result[:num_items]

In [39]:
valid_agg = valid.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

valid_agg["receipt_cat"] = valid_agg["item_id"].map(receipt_2idx.get)
valid_agg["item_cat"] = valid_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

In [40]:
valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat
10611,356645110691534,13339145181,"(200640, 200227, 200070, 200203, 200495, 20049...",200128,,"[619, 223, 68, 200, 481, 484, 273, 25]"
547,352398085964585,9645306637,"(200065, 200389, 200080, 200469, 200501, 20005...",200294,,"[64, 378, 78, 456, 486, 58, 556]"
12067,356645110747963,13488461040,"(200620,)",200668,,[600]
2890,352398089986709,9951635424,"(200353, 200547, 200651, 200268, 200588, 20014...",200059,,"[346, 531, 630, 263, 570, 141, 515, 247, 217]"
9014,356645110622299,11532275551,"(200423,)",200495,,[411]


### Select parameters

In [41]:
num_recs = 10
alpha_val = 15

param_grid = {
  "factors": [10, 20, 50],
  "regularization": [0.01, 0.1],
  "iterations": [10, 20]
}

all_param_combinations = list(product(*param_grid.values()))
result = []

#TODO: убрать
for params in tqdm(all_param_combinations[:1]):
    factors, regularization, iterations = params
    params = {
      "factors": factors,
      "regularization": regularization,
      "iterations": iterations,
      "calculate_training_loss": False,
      "random_state": 42
    }
    model = implicit.als.AlternatingLeastSquares(**params)
    model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=False)

    receipt_vecs = model.user_factors
    item_vecs = model.item_factors

    receipt_vecs_csr = csr_matrix(receipt_vecs)
    item_vecs_csr = csr_matrix(item_vecs)

    item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

    valid_agg_copy = valid_agg.copy()
    valid_agg_copy["preds"] = valid_agg_copy.apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

    hit = ((valid_agg_copy.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100)
    result.append((hit, params))

100%|██████████| 1/1 [02:49<00:00, 169.49s/it]


In [42]:
result = pd.DataFrame(result, columns=["hit@10", "params"]) \
  .sort_values("hit@10", ascending=False)
result

Unnamed: 0,hit@10,params
0,27.480021,"{'factors': 10, 'regularization': 0.01, 'itera..."


### Fit final model

In [43]:
best_params = result["params"].iloc[0].copy()
best_params["calculate_training_loss"] = True
print(best_params)

{'factors': 10, 'regularization': 0.01, 'iterations': 10, 'calculate_training_loss': True, 'random_state': 42}


In [44]:
model = implicit.als.AlternatingLeastSquares(**best_params)
model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=True)

  0%|          | 0/10 [00:00<?, ?it/s]

In [45]:
joblib.dump({"model": model, "params": best_params, "alpha_val": alpha_val},
            folder + "candidate_model.joblib")

['./candidate_model.joblib']

In [46]:
# извлечение эмбедов из ALS
receipt_vecs = model.user_factors
item_vecs = model.item_factors

receipt_vecs_csr = csr_matrix(receipt_vecs)
item_vecs_csr = csr_matrix(item_vecs)

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

### Generate candidates

In [47]:
valid_agg["preds"] = valid_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

valid_agg = valid_agg[valid_agg["preds"].apply(len) > 0].reset_index(drop=True)

100%|██████████| 13639/13639 [00:04<00:00, 2735.60it/s]


In [48]:
print("accuracy: ", ((valid_agg["preds"].apply(lambda x: x[0][0]) == valid_agg["target"]).mean() * 100))
print("hit@10: ", ((valid_agg.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100))

accuracy:  7.991788254270841
hit@10:  27.480020529364324


In [49]:
valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat,preds
13492,356645111685238,13369196932,"(200345,)",200112,,[338],"[(200336, 0.91740996), (200332, 0.9061887), (2..."
7275,356645110252402,9683965703,"(200478,)",200009,,[464],"[(200490, 0.94000685), (200220, 0.8644388), (2..."
7926,356645110489244,12580038779,"(200311,)",200511,,[304],"[(200218, 0.9730467), (200097, 0.96410483), (2..."
7538,356645110252402,13721158162,"(200184, 200243, 200355, 200302)",200520,10867.0,"[181, 239, 348, 296]","[(200266, 0.5812240242958069), (200267, 0.6157..."
2937,352398089986709,10409929437,"(200606,)",200623,,[587],"[(200089, 0.9555767), (200286, 0.9274138), (20..."


In [50]:
valid_agg = valid_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

valid_agg = pd.concat([valid_agg,
                                pd.DataFrame(valid_agg["preds"].tolist(), columns=["candidate", "als_score"])],
                              axis=1) \
  .drop(["preds"], axis=1)

In [51]:
valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
94512,352398090018898,11469036040,200099,200206,200528,0.961339
146254,356645110248293,9644270837,200377,200263,200007,0.811853
4284,352398083991747,11036657312,200275,200610,200591,0.774737
242862,356645110691534,13015332948,200266,200156,200162,0.96425
129909,356645110209741,14451700829,200631,200608,200435,0.954821


In [52]:
val_agg = val.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x))}) \
  .reset_index()

val_agg["receipt_cat"] = val_agg["item_id"].map(receipt_2idx.get)
val_agg["item_cat"] = val_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

val_agg["preds"] = val_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

val_agg = val_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

val_agg = pd.concat([val_agg, pd.DataFrame(val_agg["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

100%|██████████| 22761/22761 [00:07<00:00, 2846.57it/s]


In [53]:
val_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,candidate,als_score
507127,356645111685238,11203919891,200412,200006.0,0.897685
201026,356645110209741,10841107845,200252,200281.0,0.945254
363180,356645110623479,14458844030,200220,200285.0,0.940232
132953,352398090000896,13397313040,200252,200171.0,0.924828
182275,352398090018898,14472644683,200062,200570.0,0.977958


In [54]:
val_agg = val_agg.dropna(subset=["candidate"]) \
  .astype({"candidate": int})

In [55]:
train.to_csv(folder + "train.csv", index=False)
valid_agg.to_csv(folder + "valid_agg_raw.csv", index=False)
val_agg.to_csv(folder + "val_agg_raw.csv", index=False)

In [56]:
assert valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

## Calculate pair features

### By products

In [57]:
le = LabelEncoder()
train["item_id_enc"] = le.fit_transform(train["item_id"])

checks = train.groupby(["receipt_id"])["item_id_enc"].apply(lambda x: list(set(x))).tolist()
pairs = []

for check in checks:
    if len(check) == 2:
        pairs.append(check)
    else:
        pairs += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs = pd.DataFrame(pairs, columns=["item_1", "item_2"]) \
  .drop_duplicates(subset=["item_1", "item_2"])
pairs = pairs[pairs["item_1"] != pairs["item_2"]].reset_index(drop=True)

mtx = train[["receipt_id", "item_id_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["item_id_enc"])),
                shape=(train["receipt_id"].nunique(), train["item_id_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_res = []

for idx, row in tqdm(pairs.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_res.append((idx, (m == 2).sum()))

del mtx

pairs_res = pd.DataFrame(pairs_res, columns=[0, "both"]).set_index(0)
item_receipts = train.groupby(["item_id"])["receipt_id"].nunique().to_dict()

pairs = pd.concat([pairs, pairs_res[~pairs_res.index.duplicated()]], axis=1)

del pairs_res

pairs["item_id"] = le.inverse_transform(pairs["item_1"])
pairs["candidate"] = le.inverse_transform(pairs["item_2"])

unique_receipts = train["receipt_id"].nunique()

pairs["left"] = pairs["item_id"].map(item_receipts.get)
pairs["right"] = pairs["candidate"].map(item_receipts.get)

pairs["left_frac"] = pairs["left"] / unique_receipts
pairs["right_frac"] = pairs["right"] / unique_receipts
pairs["both_left_frac"] = pairs["both"] / pairs["left"]
pairs["both_right_frac"] = pairs["both"] / pairs["right"]

pairs = pairs.drop(["item_1", "item_2"], axis=1)

pairs = pd.concat([pairs.copy().rename(columns={"item_id": "candidate", "candidate": "item_id"}), pairs], axis=0) \
  .drop_duplicates(subset=["candidate", "item_id"]) \
  .reset_index(drop=True)

32731it [00:41, 787.41it/s]


In [58]:
pairs.to_csv(folder + "pairs.csv", index=False)

In [59]:
pairs.sample(5)

Unnamed: 0,both,candidate,item_id,left,right,left_frac,right_frac,both_left_frac,both_right_frac
23269,1,200162,200012,39,224,0.000714,0.004099,0.025641,0.004464
25721,1,200319,200128,12,272,0.00022,0.004978,0.083333,0.003676
16822,2,200260,200386,215,428,0.003935,0.007833,0.009302,0.004673
35575,3,200454,200189,666,75,0.012188,0.001373,0.004505,0.04
44370,3,200521,200046,597,1138,0.010925,0.020826,0.005025,0.002636


In [60]:
assert pairs.duplicated(subset=["item_id", "candidate"]).sum() == 0

In [61]:
quantity_total_hist_device = train.groupby(["device_id", "item_id"])["quantity"].sum().rename("quantity_total_hist_device").reset_index()
quantity_total_hist = train.groupby(["item_id"])["quantity"].sum().rename("quantity_total_hist").reset_index()

quantity_total_hist_device.to_csv(folder + "quantity_total_hist_device.csv", index=False)
quantity_total_hist.to_csv(folder + "quantity_total_hist.csv", index=False)

In [62]:
valid_agg.sample(1)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
91904,352398090018898,10877617920,200059,200610,200574,0.39353


In [64]:
valid_agg = valid_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left")

val_agg = val_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left")

In [65]:
assert valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

### By product type

In [66]:
cat_model = Word2Vec.load(folder + "word2vec.model")

item2category = pd.read_csv(folder + "item_id_categ_map.csv", sep=";")
item2category.sample(2)

Unnamed: 0,item_id,category_noun
425,200165,коробка
162,200300,мицелярная


In [67]:
train = train.merge(item2category, on=["item_id"], how="left")

In [68]:
le = LabelEncoder()
train["category_noun_enc"] = le.fit_transform(train["category_noun"])

checks = train.groupby(["receipt_id"])["category_noun_enc"].apply(lambda x: list(set(x))).tolist()
pairs_cat = []

for check in checks:
    if len(check) == 2:
        pairs_cat.append(check)
    else:
        pairs_cat += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs_cat = pd.DataFrame(pairs_cat, columns=["category_1", "category_2"]) \
  .drop_duplicates(subset=["category_1", "category_2"])
pairs_cat = pairs_cat[pairs_cat["category_1"] != pairs_cat["category_2"]].reset_index(drop=True)

mtx = train[["receipt_id", "category_noun_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["category_noun_enc"])),
                shape=(train["receipt_id"].nunique(), train["category_noun_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_cat_res = []

for idx, row in tqdm(pairs_cat.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_cat_res.append((idx, (m == 2).sum()))

del mtx

pairs_cat_res = pd.DataFrame(pairs_cat_res, columns=[0, "both"]).set_index(0)
cat_receipts = train.groupby(["category_noun"])["receipt_id"].nunique().to_dict()

pairs_cat = pd.concat([pairs_cat, pairs_cat_res], axis=1)

del pairs_cat_res

pairs_cat["category_noun"] = le.inverse_transform(pairs_cat["category_1"])
pairs_cat["category_noun_candidate"] = le.inverse_transform(pairs_cat["category_2"])


pairs_cat["left"] = pairs_cat["category_noun"].map(cat_receipts.get)
pairs_cat["right"] = pairs_cat["category_noun_candidate"].map(cat_receipts.get)

pairs_cat["cat_both_left_frac"] = pairs_cat["both"] / pairs_cat["left"]
pairs_cat["cat_both_right_frac"] = pairs_cat["both"] / pairs_cat["right"]

pairs_cat = pairs_cat.drop(["category_1", "category_2", "left", "right"], axis=1)

pairs_cat = pd.concat([pairs_cat.copy().rename(columns={"category_noun": "category_noun_candidate", "category_noun_candidate": "category_noun"}), pairs_cat], axis=0) \
  .drop_duplicates(subset=["category_noun_candidate", "category_noun"]) \
  .reset_index(drop=True)

2391it [00:01, 1588.32it/s]


In [69]:
pairs_cat.to_csv(folder + "pairs_categories.csv", index=False)

In [70]:
pairs_cat.sample(2)

Unnamed: 0,both,category_noun_candidate,category_noun,cat_both_left_frac,cat_both_right_frac
2593,48,бронзатор,гель,0.008902,0.128342
2778,36,пенка-баланс,гель,0.006677,0.060708


In [71]:
valid_agg = valid_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

val_agg = val_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

In [72]:
valid_agg.shape

(310770, 20)

In [73]:
assert valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

In [74]:
cat2idx = cat_model.wv.key_to_index
sim = [cat_model.wv.get_vector(i).reshape(1, -1) for i in tqdm(cat2idx.keys())]
sim = np.concatenate(sim, axis=0)
sim = cosine_similarity(sim, sim)

100%|██████████| 86/86 [00:00<00:00, 154811.22it/s]


In [75]:
valid_agg["w2v_sim"] = valid_agg.progress_apply(lambda x:
                                          sim[cat2idx[x["category_noun"]],
                                              cat2idx[x["category_noun_candidate"]]],
                                          axis=1
                                          )

val_agg["w2v_sim"] = val_agg.progress_apply(lambda x:
                                          sim[cat2idx[x["category_noun"]],
                                              cat2idx[x["category_noun_candidate"]]],
                                          axis=1
                                          )

100%|██████████| 310770/310770 [00:06<00:00, 47853.64it/s]
100%|██████████| 516344/516344 [00:10<00:00, 48345.23it/s]


In [76]:
valid_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)
val_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

In [77]:
valid_agg.to_csv(folder + "valid_agg_features.csv", index=False)
val_agg.to_csv(folder + "val_agg_features.csv", index=False)

## Fit classifier

In [78]:
valid_agg["y"] = (valid_agg["target"] == valid_agg["candidate"]).astype(int)

In [79]:
valid_agg = valid_agg.drop(["target"], axis=1)

In [80]:
non_features = ["device_id", "receipt_id", "item_id", "candidate"]

### Select parameters

In [81]:
from sklearn.model_selection import train_test_split

In [82]:
x_train, x_test = train_test_split(valid_agg,
                                   test_size=0.1,
                                   stratify=valid_agg["y"],
                                   random_state=42)

In [None]:
class_weights = compute_class_weight("balanced", classes=np.unique(valid_agg["y"]), y=valid_agg["y"])

param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
}

best_score = 0
best_params = {}

for n_estimators in tqdm(param_grid["n_estimators"], position=0, leave=False):
    for max_depth in tqdm(param_grid["max_depth"], position=1, leave=False):
        for min_samples_split in tqdm(param_grid["min_samples_split"], position=2, leave=False):
            for min_samples_leaf in tqdm(param_grid["min_samples_leaf"], position=3, leave=False):
                rf = RandomForestClassifier(
                  n_estimators=n_estimators,
                  max_depth=max_depth,
                  min_samples_split=min_samples_split,
                  min_samples_leaf=min_samples_leaf,
                  class_weight=dict(enumerate(class_weights))
                )
                rf.fit(x_train.drop(non_features+["y"], axis=1).fillna(0),
                     x_train["y"])

                y_pred = rf.predict(x_test.drop(non_features+["y"], axis=1).fillna(0))

                f1 = f1_score(x_test["y"], y_pred)

                if f1 > best_score:
                    best_score = f1
                    best_params = {
                      "n_estimators": n_estimators,
                      "max_depth": max_depth,
                      "min_samples_split": min_samples_split,
                      "min_samples_leaf": min_samples_leaf,
                    }

print("Лучшие параметры: ", best_params)
print("Лучший F1-скор: ", best_score)

### Fit final model

In [84]:
best_params["class_weight"] = dict(enumerate(class_weights))
model = RandomForestClassifier(**best_params)

In [85]:
%%time
model.fit(valid_agg.drop(non_features+["y"], axis=1).fillna(0),
          valid_agg["y"])

CPU times: user 17.7 s, sys: 70.7 ms, total: 17.8 s
Wall time: 17.8 s


In [86]:
joblib.dump({"model": model, "params": best_params},
            folder + "classifier_model.joblib")

['./classifier_model.joblib']

In [87]:
y_pred = model.predict(val_agg.drop(non_features, axis=1).fillna(0))
y_proba = model.predict_proba(val_agg.drop(non_features, axis=1).fillna(0))[:, 1]

In [88]:
val_agg["proba"] = y_proba

In [89]:
result = val_agg.sort_values("proba", ascending=False).drop_duplicates(subset=['receipt_id'])

In [90]:
result

Unnamed: 0,device_id,receipt_id,item_id,candidate,als_score,both_x,left,right,left_frac,right_frac,both_left_frac,both_right_frac,quantity_total_hist_device,quantity_total_hist,both_y,cat_both_left_frac,cat_both_right_frac,w2v_sim,proba
163352,352398090018898,12181431181,200645,200133,0.991311,348.0,882.0,746.0,0.016141,0.013652,0.394558,0.466488,75.0,773,851.0,0.315887,0.497661,0.444779,0.905607
269782,356645110250653,12253942610,200133,200645,0.991311,348.0,882.0,746.0,0.016141,0.013652,0.394558,0.466488,40.0,938,851.0,0.315887,0.497661,0.444779,0.905607
282379,356645110252402,9648123527,200133,200645,0.991311,348.0,882.0,746.0,0.016141,0.013652,0.394558,0.466488,44.0,938,851.0,0.315887,0.497661,0.444779,0.905607
231542,356645110236140,13409215619,200133,200645,0.991311,348.0,882.0,746.0,0.016141,0.013652,0.394558,0.466488,30.0,938,851.0,0.315887,0.497661,0.444779,0.905607
412994,356645110714724,9308594732,200133,200645,0.991311,348.0,882.0,746.0,0.016141,0.013652,0.394558,0.466488,38.0,938,851.0,0.315887,0.497661,0.444779,0.905607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
448814,356645110747963,10977503613,200222,200558,0.800322,14.0,1928.0,32.0,0.035284,0.000586,0.007261,0.437500,,34,2502.0,0.421496,0.132563,0.555194,0.127003
238249,356645110237411,9793068963,200222,200558,0.800322,14.0,1928.0,32.0,0.035284,0.000586,0.007261,0.437500,,34,2502.0,0.421496,0.132563,0.555194,0.127003
12380,352398083991747,13345062749,200222,200558,0.800322,14.0,1928.0,32.0,0.035284,0.000586,0.007261,0.437500,,34,2502.0,0.421496,0.132563,0.555194,0.127003
353657,356645110623479,10150376779,200222,200558,0.800322,14.0,1928.0,32.0,0.035284,0.000586,0.007261,0.437500,,34,2502.0,0.421496,0.132563,0.555194,0.127003


#### Проверяем не потеряли ли чего и записываем результат

In [91]:
predict = pd.read_csv(os.path.join(input_folder, val_file_name), sep="\t")

In [92]:
empty = set(predict["receipt_id"].tolist()) - set(result['receipt_id'].tolist()) 
len(empty)

1

In [93]:
the_most_popular = train["item_id"].value_counts().reset_index()['item_id'][0]
probability_the_most_popular = 0.98

empty = predict[predict["receipt_id"].isin(empty)]
empty[["candidate", "proba"]] = the_most_popular, probability_the_most_popular

result = pd.concat([result, empty])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty[["candidate", "proba"]] = the_most_popular, probability_the_most_popular
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty[["candidate", "proba"]] = the_most_popular, probability_the_most_popular


In [94]:
empty

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,candidate,proba
22224,352398089986709,9157375844,200365,2021-09-03 22:07:56,"Обертывание-гель ""Уменьшение объемов"", 200 мл.",990.0,2,200221,0.98


In [95]:
scoring = result[["receipt_id", "candidate", "proba"]]
scoring.to_csv(folder + output_name, index=False)

In [96]:
scoring

Unnamed: 0,receipt_id,candidate,proba
163352,12181431181,200133,0.905607
269782,12253942610,200645,0.905607
282379,9648123527,200645,0.905607
231542,13409215619,200645,0.905607
412994,9308594732,200645,0.905607
...,...,...,...
238249,9793068963,200558,0.127003
12380,13345062749,200558,0.127003
353657,10150376779,200558,0.127003
75299,13372198307,200558,0.127003
