In [2]:
!pip install implicit pymorphy3

Collecting implicit
  Downloading implicit-0.7.1-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pymorphy3
  Downloading pymorphy3-1.2.1-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting dawg-python>=0.7.1 (from pymorphy3)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting docopt-ng>=0.6 (from pymorphy3)
  Downloading docopt_ng-0.9.0-py3-none-any.whl (16 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg-python, docopt-ng, pymorphy3, im

In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

In [3]:
from gensim.models.word2vec import Word2Vec
import joblib
import implicit
from itertools import combinations, product
import numpy as np
import pandas as pd
import pickle
import pymorphy3
import re
import os
from scipy.sparse import csr_matrix, save_npz, coo_array
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

from tqdm import tqdm
# Register tqdm's pandas integration so `.progress_apply` is available below.
tqdm.pandas()

# Shared morphological analyzer used by get_noun for part-of-speech lookups.
parser = pymorphy3.MorphAnalyzer()


# Locations of the input data and of every artifact this notebook writes.
input_folder = '/kaggle/input/gardening/'
folder = './'
train_file_name='gardening_train.tsv'
val_file_name='gardening_test.tsv'
output_name = 'jetfork_gardening_result.csv'

def get_noun(words: list[str]) -> str:
    """Return the first word that can be read as a noun.

    A word qualifies when any of its first three parses produced by the
    module-level ``parser`` carries the ``NOUN`` part-of-speech tag.

    Args:
        words: Candidate tokens, checked in order.

    Returns:
        The first qualifying word; ``words[0]`` when none qualifies;
        an empty string when ``words`` is empty.
    """
    if not words:
        # Nothing to choose from: return an empty category instead of raising IndexError.
        return ''

    for word in words:
        # Only the three leading parses are inspected.
        if any(parsing.tag.POS == 'NOUN' for parsing in parser.parse(word)[:3]):
            return word

    # No noun found: fall back to the leading word.
    return words[0]


def preparation_data(dataset):
    """Add normalized-name and category columns to ``dataset``.

    The frame is modified in place and also returned. Three columns are added:

    * ``processed_name`` - lower-cased ``name`` with every character other than
      Latin/Cyrillic letters, spaces and hyphens removed and whitespace collapsed;
    * ``category_first_word`` - the first token of ``processed_name``;
    * ``category_noun`` - ``get_noun`` applied to the first three tokens.

    Args:
        dataset: DataFrame with a string column ``name``.

    Returns:
        The same DataFrame object with the three columns added.
    """
    # 'ё' lies outside the 'а-я' range, so it is listed explicitly; otherwise it
    # would be deleted from the name.
    dataset['processed_name'] = dataset['name']\
    .str.lower()\
    .apply(lambda x: re.sub(r'\s+', ' ', re.sub(r'[^a-zа-яё -]', '', x)).strip())

    # Tokenize once and reuse the tokens for both derived columns.
    name_tokens = dataset['processed_name'].str.split(' ')

    dataset['category_first_word'] = name_tokens.progress_apply(lambda x: x[0])

    dataset['category_noun'] = name_tokens.progress_apply(lambda x: get_noun(x[:3]))

    return dataset


def get_products_short_name_map(train, val):
    """Build the item -> category lookup from both datasets.

    Stacks the ``item_id`` / ``category_noun`` columns of ``train`` and ``val``,
    removes duplicate rows, renames ``item_id`` to ``target`` and converts it
    to strings. The index of the surviving rows is kept as is.
    """
    wanted = ['item_id', 'category_noun']
    stacked = pd.concat([train[wanted], val[wanted]])

    mapping = stacked.drop_duplicates()
    mapping = mapping.rename(columns={'item_id': 'target'})
    mapping['target'] = mapping['target'].astype(str)

    return mapping

In [5]:
train = pd.read_csv(os.path.join(input_folder, train_file_name), sep="\t")
val = pd.read_csv(os.path.join(input_folder, val_file_name), sep="\t")

In [6]:
val.shape

(23473, 7)

In [7]:
train.shape

(79258, 7)

In [10]:
train["local_date"] = pd.to_datetime(train["local_date"])
val["local_date"] = pd.to_datetime(val["local_date"])

# word2vec model

In [11]:
train = preparation_data(train)
val = preparation_data(val)

100%|██████████| 79258/79258 [00:00<00:00, 747544.17it/s]
100%|██████████| 79258/79258 [00:58<00:00, 1345.46it/s]
100%|██████████| 23473/23473 [00:00<00:00, 688741.89it/s]
100%|██████████| 23473/23473 [00:17<00:00, 1346.17it/s]


In [12]:
train['category_noun'].value_counts()[0:50]

category_noun
семена          20707
удобрение        5599
грунт            4272
рассада          3091
луковица         2486
земляника        2278
корневище        1264
фунгицид         1238
роза             1103
инсектицид       1085
малина            916
гортензия         828
яблоня            774
торф              654
картофель         653
лук-севок         600
смородина         581
клубень           543
горшок            522
астильба          515
туя               507
дренаж            503
клематис          500
пион              500
стимулятор        483
хоста             456
петуния           455
можжевельник      451
сирень            400
флокс             367
бегония           356
голубика          353
лилейник          340
спирея            339
крыжовник         332
барбарис          322
пеларгония        321
жимолость         317
ежевика           300
субстрат          276
виола             268
бархатцы          267
ель               266
груша             260
регулятор         

In [13]:
val['category_noun'].value_counts()[0:50]

category_noun
семена          6615
удобрение       1627
грунт           1206
рассада         1118
луковица         885
земляника        747
корневище        453
роза             337
фунгицид         333
инсектицид       289
малина           255
гортензия        248
клубень          208
картофель        186
яблоня           176
торф             172
горшок           159
туя              151
лук-севок        149
астильба         145
смородина        145
клематис         142
дренаж           137
стимулятор       133
можжевельник     133
флокс            125
пион             124
сирень           121
петуния          116
хоста            108
голубика         103
бегония           97
бархатцы          94
жимолость         88
пеларгония        84
регулятор         82
спирея            82
крыжовник         80
барбарис          80
виола             76
опора             76
ель               72
подставка         68
лилейник          64
груша             62
лаванда           62
ежевика           61

In [14]:
products_name = get_products_short_name_map(train, val)
products_name.rename(columns = {'target':'item_id'}).to_csv(folder + 'item_id_categ_map.csv', sep=';', index=None)

In [15]:
train_grouped = train.groupby('receipt_id')['category_noun'].apply(list).reset_index()

model = Word2Vec(
    train_grouped.category_noun.tolist(), vector_size=64, 
    sg=1, epochs=50, negative=10, min_count=1, 
    window=3, seed=42, workers=8
)

X = model.wv.key_to_index.keys()
len(X)

model.save(folder + "word2vec.model")

## Create third dataset

In [16]:
train = train.drop(['processed_name', 'category_first_word', 'category_noun'], axis=1)

In [17]:
train.shape

(79258, 7)

In [18]:
# Drop receipts that contain a single distinct item
train = train[train.groupby(["receipt_id"])["item_id"].transform("nunique").gt(1)] \
  .reset_index(drop=True)

train.shape

(79240, 7)

In [19]:
unique_items = train["item_id"]
unique_items.nunique()

12765

In [20]:
# Build the target from items that occur more than once in the dataset
item_counts = train["item_id"].value_counts()
hight_support_items = item_counts[item_counts > 1].index
del item_counts

# One randomly chosen high-support item per receipt becomes that receipt's target.
# A seeded generator keeps the choice reproducible across kernel restarts.
target_rng = np.random.RandomState(42)
target = train[train["item_id"].isin(hight_support_items)].groupby(["receipt_id"])["item_id"] \
  .apply(lambda x: x.sample(1, random_state=target_rng).iloc[0]) \
  .to_dict()
del target_rng

# Flag rows whose item is the receipt's target (1) or not (0).
# Receipts without a target map to 0, matching no flag unless item_id is 0.
train["target"] = train["receipt_id"].map(target).fillna(0).eq(train["item_id"]).astype(int)
del target

In [21]:
train["receipt_id"].nunique(), train["target"].sum()

(9162, 9104)

In [22]:
train["has_unique_item"] = ~train["item_id"].isin(hight_support_items)
train["has_unique_item"] = train.groupby(["receipt_id"])["has_unique_item"].transform("max")

In [23]:
# Split train into two samples by receipt id:
# 20% of the distinct receipt ids are drawn (seeded) as validation candidates.
val_receipts = train["receipt_id"].drop_duplicates().sample(frac=.2, random_state=42).tolist()

# A sampled receipt goes to validation only if has_unique_item is False for it;
# sampled receipts flagged with has_unique_item stay in train.
valid = train[(train["receipt_id"].isin(val_receipts)) & (~train["has_unique_item"])]
train = train[(~train["receipt_id"].isin(val_receipts)) | (train["has_unique_item"])]

del val_receipts

In [24]:
# All items that do not occur in train as a non-target row
lost_items = unique_items[~unique_items.isin(train.loc[train["target"] != 1, "item_id"])].drop_duplicates()
# For every such item pick one validation receipt containing it; from here on
# `lost_items` holds receipt ids. The sampling is seeded so the train/valid
# composition is reproducible.
lost_items = valid.loc[valid["item_id"].isin(lost_items),
                                ["receipt_id", "item_id"]].groupby(["item_id"])["receipt_id"] \
  .apply(lambda x: x.sample(1, random_state=42).values) \
  .explode() \
  .tolist()

In [25]:
# перенос всех уникальных товаров с валидации на обучение
train = pd.concat([train,
                   valid[valid["receipt_id"].isin(lost_items)]], axis=0) \
  .reset_index(drop=True)

valid = valid[~valid["receipt_id"].isin(lost_items)].reset_index(drop=True)

In [26]:
valid.shape, train.shape

((8790, 9), (70450, 9))

In [27]:
assert train.loc[train["target"] != 1, "item_id"].nunique() >= (unique_items.nunique() * .95) # поправка максимум на 5% потерянных товаров
assert train["receipt_id"].isin(valid["receipt_id"]).sum() == 0
assert valid["receipt_id"].isin(train["receipt_id"]).sum() == 0

In [28]:
train

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,target,has_unique_item
0,352398089623724,10428823398,301936,2022-01-20 14:46:51,Грунт для садовых роз Робин Грин прессованный ...,299.0,1.0,0,True
1,352398089623724,10428823398,301941,2022-01-20 14:46:51,Грунт для томатов и перцев Фаско Малышок 25л,269.0,3.0,0,True
2,352398089623724,10428823398,302857,2022-01-20 14:46:51,Инсектицид Август Табу для картофеля от колора...,159.0,1.0,0,True
3,352398089623724,10428823398,304301,2022-01-20 14:46:51,"Краска для садовых деревьев ФАСКО ведро 2,5кг",159.0,2.0,0,True
4,352398089623724,10428823398,308178,2022-01-20 14:46:51,Семена Tim/цветы астра Башня белая (пионовидна...,21.0,1.0,0,True
...,...,...,...,...,...,...,...,...,...
70445,356645115075857,15674436306,305447,2023-07-14 13:11:56,Магнолия Леонард Мессел v2 Lav,499.0,1.0,0,False
70446,356645115075857,15674436306,305449,2023-07-14 13:11:56,Магнолия Сьюзан v2 Lav,699.0,1.0,0,False
70447,356645115075857,15674436306,313282,2023-07-14 13:11:56,Цеанотус Делиля Генри Дефузе v2 Lav,699.0,1.0,0,False
70448,356645115075857,15674436306,302990,2023-07-14 13:11:56,Ирис Блю Кинг v2 Tim,349.0,1.0,0,False


In [29]:
# Move the target item into its own column:
# rows flagged with target == 1 get their item_id written into "target" ...
train.loc[train["target"] == 1, "target"] =\
  train.loc[train["target"] == 1, "item_id"]

# ... the per-receipt maximum spreads that value to every row of the receipt,
# and the target row itself is then removed from the receipt's items.
train["target"] = train.groupby(["receipt_id"])["target"].transform("max")
train = train[train["item_id"] != train["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)



# The same three steps for the validation sample.
valid.loc[valid["target"] == 1, "target"] =\
  valid.loc[valid["target"] == 1, "item_id"]

valid["target"] = valid.groupby(["receipt_id"])["target"].transform("max")
valid = valid[valid["item_id"] != valid["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)

In [30]:
train.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,target
5583,352398089695367,12943665532,305239,2022-10-09 11:43:49,Луковица НВМ ифейон Ролф Фиедлер 10 шт/уп,249.0,1.0,303891
50841,356645110729649,14292094524,308957,2023-03-06 13:02:05,Семена Тимирязевский питомник редис Глориет F1...,89.0,1.0,309643
54172,356645110893239,14359765649,302949,2023-03-13 16:02:16,Инсектицид ЗАС® Гром-2 от почвенных мушек и му...,16.0,3.0,313005
51121,356645110729649,14545695069,307122,2023-04-01 12:08:32,Рассада Перец Маврас d11 Tim,89.0,5.0,307064
39235,352398089709218,10929771135,309260,2022-03-17 15:46:46,Семена арбуз Сахарная Молния F1 6шт Уральский ...,55.0,1.0,312121


## Prepare matrix

In [31]:
# Aggregate receipts: one row per (receipt, item); quantity is reduced to a binary flag
train_mtx = train.groupby(["receipt_id", "item_id"])["quantity"].min() \
    .reset_index()
train_mtx["quantity"] = 1
train_mtx = train_mtx.drop_duplicates() \
    .reset_index(drop=True)

# Attach to every row the tuple of all distinct items of its receipt.
# NOTE(review): tuple(set(x)) does not guarantee a canonical element order, so two
# receipts with the same items could in principle yield different tuples. Consider
# tuple(sorted(set(x))), changed together with every place that builds keys
# looked up in receipt_2idx.
receipt_items = train_mtx.groupby(["receipt_id"])["item_id"].apply(lambda x: tuple(set(x))).to_dict()
train_mtx["items"] = train_mtx["receipt_id"].map(receipt_items.get)

# Drop receipts with a single item and drop identical receipts
train_mtx = train_mtx[train_mtx["items"].apply(len) > 1] \
  .drop_duplicates(subset=["items", "item_id"]) \
  .reset_index(drop=True)

# Convert ids to contiguous integer codes used as matrix row / column indices
train_mtx["receipt_cat"] = train_mtx["receipt_id"].astype("category").cat.codes
train_mtx["item_cat"] = train_mtx["item_id"].astype("category").cat.codes

In [32]:
# словари для обращения к эмбедам ALS
receipt_2idx = train_mtx.drop_duplicates(subset=["receipt_cat"]) \
  .set_index("items")["receipt_cat"].to_dict()

item_2idx = train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_id")["item_cat"].to_dict()
idx_2item = train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_cat")["item_id"].to_dict()

In [33]:
# построение матрицы чек-товар
sparse_receipt_item = csr_matrix((train_mtx["quantity"].astype(float),
                                 (train_mtx["receipt_cat"], train_mtx["item_cat"])))

In [34]:
save_npz(folder + "sparse_matrix.npz", sparse_receipt_item)

with open(folder + "receipt_2idx.pkl", "wb") as f:
    pickle.dump(receipt_2idx, f)

with open(folder + "item_2idx.pkl", "wb") as f:
    pickle.dump(item_2idx, f)

with open(folder + "idx_2item.pkl", "wb") as f:
    pickle.dump(idx_2item, f)

## Fit and tune ALS

In [35]:
!pip install scikit-optimize



In [36]:
def recommend_to_receipt(receipt_cat, sparse_user_item,
                         receipt_vecs, item_vecs, idx_2item, num_items=5):
    """Recommend items for a receipt that has a row in the interaction matrix.

    Every item is scored by the dot product of the receipt's factor vector
    with the item's factor vector. Items already present in the receipt are
    never returned.

    Args:
        receipt_cat: Row index of the receipt.
        sparse_user_item: Sparse receipt x item interaction matrix.
        receipt_vecs: Sparse matrix of receipt factors, one row per receipt.
        item_vecs: Sparse matrix of item factors, one row per item.
        idx_2item: Mapping from item column index to item id.
        num_items: Maximum number of recommendations.

    Returns:
        List of ``(item_id, score)`` tuples ordered by descending score, so
        the first element is the best recommendation. May hold fewer than
        ``num_items`` entries when the receipt already contains almost all items.
    """
    purchased = sparse_user_item[receipt_cat, :].toarray().reshape(-1) > 0

    scores = receipt_vecs[receipt_cat, :].dot(item_vecs.T).toarray().reshape(-1)

    # Purchased items are pushed to -inf rather than 0, so they can never
    # outrank an unseen item whose score happens to be negative.
    scores = np.where(purchased, -np.inf, scores)

    ranked_idx = np.argsort(scores)[::-1][:num_items]

    # Keep the ranking order: callers read the first element as the top-1.
    return [(idx_2item[idx], scores[idx]) for idx in ranked_idx if not purchased[idx]]

In [37]:
def recommend_to_items(items_cat, item_norms, item_vecs, idx_2item, num_items=5):
    """Recommend items similar to a basket of known items.

    Each candidate is scored by its highest cosine similarity to any of the
    basket items; the basket items themselves are excluded.

    Args:
        items_cat: Indices (rows of ``item_vecs``) of the basket items.
        item_norms: 1-D array with the L2 norm of every row of ``item_vecs``.
        item_vecs: Dense 2-D array of item factors, one row per item.
        idx_2item: Mapping from item row index to item id.
        num_items: Maximum number of recommendations.

    Returns:
        List of ``(item_id, score)`` tuples ordered by descending score. Empty
        when the basket is empty; shorter than ``num_items`` when the catalogue
        has too few other items.
    """
    basket_idx = np.asarray(list(items_cat), dtype=np.intp)
    if basket_idx.size == 0 or num_items <= 0:
        return []

    # Cosine similarity of every basket item (rows) to every item (columns).
    sims = item_vecs[basket_idx].dot(item_vecs.T) / item_norms.reshape(1, -1)
    sims = sims / item_norms[basket_idx].reshape(-1, 1)

    # Best similarity of each candidate to any basket item.
    best = sims.max(axis=0)
    # Undefined scores (e.g. a zero-norm vector) and the basket itself are never recommended.
    best = np.where(np.isfinite(best), best, -np.inf)
    best[basket_idx] = -np.inf

    # Select the exact top-k, then order those k by score; k is capped so that
    # a catalogue smaller than num_items does not raise.
    top_n = min(num_items, best.shape[0])
    top_idx = np.argpartition(best, -top_n)[-top_n:]
    top_idx = top_idx[np.argsort(best[top_idx])[::-1]]

    return [(idx_2item[idx], best[idx]) for idx in top_idx if np.isfinite(best[idx])]

In [38]:
valid_agg = valid.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

valid_agg["receipt_cat"] = valid_agg["item_id"].map(receipt_2idx.get)
valid_agg["item_cat"] = valid_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

In [39]:
valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat
435,352398089703369,11709235210,"(307471,)",300564,,[6503]
1018,356645110729649,15462914968,"(310679, 308805, 308359, 310665, 304301, 30241...",313012,,"[9538, 7727, 7290, 9524, 3735, 2106, 9655, 922..."
210,352398089695367,15586345478,"(303310, 306799, 301680, 306803, 301975, 30611...",311986,,"[2877, 5914, 1460, 5917, 1720, 5325, 1950]"
795,352398089709218,15446438600,"(307512, 305809, 307516, 301455)",307484,,"[6538, 5076, 6542, 1261]"
665,352398089707527,13131873257,"(300900, 300901, 301903)",302413,,"[781, 782, 1648]"


### Select parameters

In [None]:
num_recs = 10
alpha_val = 15

param_grid = {
  "factors": [10, 20, 50],
  "regularization": [0.01, 0.1],
  "iterations": [10, 20]
}

all_param_combinations = list(product(*param_grid.values()))
result = []

#TODO: убрать
for params in tqdm(all_param_combinations):
    factors, regularization, iterations = params
    params = {
      "factors": factors,
      "regularization": regularization,
      "iterations": iterations,
      "calculate_training_loss": False,
      "random_state": 42
    }
    model = implicit.als.AlternatingLeastSquares(**params)
    model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=False)

    receipt_vecs = model.user_factors
    item_vecs = model.item_factors

    receipt_vecs_csr = csr_matrix(receipt_vecs)
    item_vecs_csr = csr_matrix(item_vecs)

    item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

    valid_agg_copy = valid_agg.copy()
    valid_agg_copy["preds"] = valid_agg_copy.apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

    hit = ((valid_agg_copy.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100)
    result.append((hit, params))

 33%|███▎      | 4/12 [12:45<27:00, 202.61s/it]

In [None]:
result = pd.DataFrame(result, columns=["hit@10", "params"]) \
  .sort_values("hit@10", ascending=False)
result

### Fit final model

In [None]:
best_params = result["params"].iloc[0].copy()
best_params["calculate_training_loss"] = True
print(best_params)

In [None]:
model = implicit.als.AlternatingLeastSquares(**best_params)
model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=True)

In [None]:
joblib.dump({"model": model, "params": best_params, "alpha_val": alpha_val},
            folder + "candidate_model.joblib")

In [None]:
# извлечение эмбедов из ALS
receipt_vecs = model.user_factors
item_vecs = model.item_factors

receipt_vecs_csr = csr_matrix(receipt_vecs)
item_vecs_csr = csr_matrix(item_vecs)

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

### Generate candidates

In [None]:
valid_agg["preds"] = valid_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

valid_agg = valid_agg[valid_agg["preds"].apply(len) > 0].reset_index(drop=True)

In [None]:
print("accuracy: ", ((valid_agg["preds"].apply(lambda x: x[0][0]) == valid_agg["target"]).mean() * 100))
print("hit@10: ", ((valid_agg.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100))

In [None]:
valid_agg.sample(5)

In [None]:
valid_agg = valid_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

valid_agg = pd.concat([valid_agg,
                                pd.DataFrame(valid_agg["preds"].tolist(), columns=["candidate", "als_score"])],
                              axis=1) \
  .drop(["preds"], axis=1)

In [None]:
valid_agg.sample(5)

In [None]:
val_agg = val.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x))}) \
  .reset_index()

val_agg["receipt_cat"] = val_agg["item_id"].map(receipt_2idx.get)
val_agg["item_cat"] = val_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

val_agg["preds"] = val_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

val_agg = val_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

val_agg = pd.concat([val_agg, pd.DataFrame(val_agg["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

In [None]:
val_agg.sample(5)

In [None]:
val_agg = val_agg.dropna(subset=["candidate"]) \
  .astype({"candidate": int})

In [None]:
train.to_csv(folder + "train.csv", index=False)
valid_agg.to_csv(folder + "valid_agg_raw.csv", index=False)
val_agg.to_csv(folder + "val_agg_raw.csv", index=False)

In [None]:
assert valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

## Calculate pair features

### By products

In [None]:
le = LabelEncoder()
train["item_id_enc"] = le.fit_transform(train["item_id"])

checks = train.groupby(["receipt_id"])["item_id_enc"].apply(lambda x: list(set(x))).tolist()
pairs = []

for check in checks:
    if len(check) == 2:
        pairs.append(check)
    else:
        pairs += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs = pd.DataFrame(pairs, columns=["item_1", "item_2"]) \
  .drop_duplicates(subset=["item_1", "item_2"])
pairs = pairs[pairs["item_1"] != pairs["item_2"]].reset_index(drop=True)

mtx = train[["receipt_id", "item_id_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["item_id_enc"])),
                shape=(train["receipt_id"].nunique(), train["item_id_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_res = []

for idx, row in tqdm(pairs.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_res.append((idx, (m == 2).sum()))

del mtx

pairs_res = pd.DataFrame(pairs_res, columns=[0, "both"]).set_index(0)
item_receipts = train.groupby(["item_id"])["receipt_id"].nunique().to_dict()

pairs = pd.concat([pairs, pairs_res[~pairs_res.index.duplicated()]], axis=1)

del pairs_res

pairs["item_id"] = le.inverse_transform(pairs["item_1"])
pairs["candidate"] = le.inverse_transform(pairs["item_2"])

unique_receipts = train["receipt_id"].nunique()

pairs["left"] = pairs["item_id"].map(item_receipts.get)
pairs["right"] = pairs["candidate"].map(item_receipts.get)

pairs["left_frac"] = pairs["left"] / unique_receipts
pairs["right_frac"] = pairs["right"] / unique_receipts
pairs["both_left_frac"] = pairs["both"] / pairs["left"]
pairs["both_right_frac"] = pairs["both"] / pairs["right"]

pairs = pairs.drop(["item_1", "item_2"], axis=1)

# Add the reverse direction of every pair so that a lookup by (item_id, candidate)
# succeeds regardless of the order in which the pair was first generated.
# "left*" statistics describe item_id and "right*" describe candidate, so they
# have to be swapped together with the ids; otherwise the mirrored rows would
# carry the statistics of the wrong item.
pairs = pd.concat([pairs.rename(columns={"item_id": "candidate", "candidate": "item_id",
                                         "left": "right", "right": "left",
                                         "left_frac": "right_frac", "right_frac": "left_frac",
                                         "both_left_frac": "both_right_frac",
                                         "both_right_frac": "both_left_frac"})[pairs.columns],
                   pairs], axis=0) \
  .drop_duplicates(subset=["candidate", "item_id"]) \
  .reset_index(drop=True)

In [None]:
pairs.to_csv(folder + "pairs.csv", index=False)

In [None]:
pairs.sample(5)

In [None]:
assert pairs.duplicated(subset=["item_id", "candidate"]).sum() == 0

In [None]:
quantity_total_hist_device = train.groupby(["device_id", "item_id"])["quantity"].sum().rename("quantity_total_hist_device").reset_index()
quantity_total_hist = train.groupby(["item_id"])["quantity"].sum().rename("quantity_total_hist").reset_index()

quantity_total_hist_device.to_csv(folder + "quantity_total_hist_device.csv", index=False)
quantity_total_hist.to_csv(folder + "quantity_total_hist.csv", index=False)

In [None]:
valid_agg.sample(1)

In [None]:
valid_agg = valid_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left")

val_agg = val_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left")

In [None]:
assert valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

### By product type

In [None]:
cat_model = Word2Vec.load(folder + "word2vec.model")

item2category = pd.read_csv(folder + "item_id_categ_map.csv", sep=";")
item2category.sample(2)

In [None]:
train = train.merge(item2category, on=["item_id"], how="left")

In [None]:
le = LabelEncoder()
train["category_noun_enc"] = le.fit_transform(train["category_noun"])

checks = train.groupby(["receipt_id"])["category_noun_enc"].apply(lambda x: list(set(x))).tolist()
pairs_cat = []

for check in checks:
    if len(check) == 2:
        pairs_cat.append(check)
    else:
        pairs_cat += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs_cat = pd.DataFrame(pairs_cat, columns=["category_1", "category_2"]) \
  .drop_duplicates(subset=["category_1", "category_2"])
pairs_cat = pairs_cat[pairs_cat["category_1"] != pairs_cat["category_2"]].reset_index(drop=True)

mtx = train[["receipt_id", "category_noun_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["category_noun_enc"])),
                shape=(train["receipt_id"].nunique(), train["category_noun_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_cat_res = []

for idx, row in tqdm(pairs_cat.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_cat_res.append((idx, (m == 2).sum()))

del mtx

pairs_cat_res = pd.DataFrame(pairs_cat_res, columns=[0, "both"]).set_index(0)
cat_receipts = train.groupby(["category_noun"])["receipt_id"].nunique().to_dict()

pairs_cat = pd.concat([pairs_cat, pairs_cat_res], axis=1)

del pairs_cat_res

pairs_cat["category_noun"] = le.inverse_transform(pairs_cat["category_1"])
pairs_cat["category_noun_candidate"] = le.inverse_transform(pairs_cat["category_2"])


pairs_cat["left"] = pairs_cat["category_noun"].map(cat_receipts.get)
pairs_cat["right"] = pairs_cat["category_noun_candidate"].map(cat_receipts.get)

pairs_cat["cat_both_left_frac"] = pairs_cat["both"] / pairs_cat["left"]
pairs_cat["cat_both_right_frac"] = pairs_cat["both"] / pairs_cat["right"]

pairs_cat = pairs_cat.drop(["category_1", "category_2", "left", "right"], axis=1)

# Add the reverse direction of every category pair. The "left" fraction is relative
# to category_noun and the "right" fraction to category_noun_candidate, so both are
# swapped together with the category columns to stay correct in the mirrored rows.
pairs_cat = pd.concat([pairs_cat.rename(columns={"category_noun": "category_noun_candidate",
                                                 "category_noun_candidate": "category_noun",
                                                 "cat_both_left_frac": "cat_both_right_frac",
                                                 "cat_both_right_frac": "cat_both_left_frac"})[pairs_cat.columns],
                       pairs_cat], axis=0) \
  .drop_duplicates(subset=["category_noun_candidate", "category_noun"]) \
  .reset_index(drop=True)

In [None]:
pairs_cat.to_csv(folder + "pairs_categories.csv", index=False)

In [None]:
pairs_cat.sample(2)

In [None]:
valid_agg = valid_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

val_agg = val_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

In [None]:
valid_agg.shape

In [None]:
assert valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

In [None]:
cat2idx = cat_model.wv.key_to_index
sim = [cat_model.wv.get_vector(i).reshape(1, -1) for i in tqdm(cat2idx.keys())]
sim = np.concatenate(sim, axis=0)
sim = cosine_similarity(sim, sim)

In [None]:
def w2v_pair_similarity(frame):
    """Cosine similarity between the categories of item and candidate.

    Looks up both categories of every row in ``cat2idx`` and reads the value
    from the precomputed ``sim`` matrix. Rows where either category is missing
    or absent from the Word2Vec vocabulary get NaN instead of raising KeyError.

    Args:
        frame: DataFrame with ``category_noun`` and ``category_noun_candidate``.

    Returns:
        Float Series aligned with ``frame.index``.
    """
    left_idx = frame["category_noun"].map(lambda c: cat2idx.get(c, np.nan))
    right_idx = frame["category_noun_candidate"].map(lambda c: cat2idx.get(c, np.nan))
    known = left_idx.notna() & right_idx.notna()

    similarity = pd.Series(np.nan, index=frame.index, dtype="float64")
    similarity.loc[known] = sim[left_idx[known].astype(int).to_numpy(),
                                right_idx[known].astype(int).to_numpy()]
    return similarity


valid_agg["w2v_sim"] = w2v_pair_similarity(valid_agg)

val_agg["w2v_sim"] = w2v_pair_similarity(val_agg)

In [None]:
valid_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)
val_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

In [None]:
valid_agg.to_csv(folder + "valid_agg_features.csv", index=False)
val_agg.to_csv(folder + "val_agg_features.csv", index=False)

## Fit classifier

In [None]:
valid_agg["y"] = (valid_agg["target"] == valid_agg["candidate"]).astype(int)

In [None]:
valid_agg = valid_agg.drop(["target"], axis=1)

In [None]:
non_features = ["device_id", "receipt_id", "item_id", "candidate"]

### Select parameters

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_test = train_test_split(valid_agg,
                                   test_size=0.1,
                                   stratify=valid_agg["y"],
                                   random_state=42)

In [None]:
# Balanced class weights; index i of the array is the weight of class label i.
class_weights = compute_class_weight("balanced", classes=np.unique(valid_agg["y"]), y=valid_agg["y"])

param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
}

best_score = 0
best_params = {}

# The feature matrices do not depend on the hyper-parameters, so build them once.
x_train_features = x_train.drop(non_features+["y"], axis=1).fillna(0)
x_test_features = x_test.drop(non_features+["y"], axis=1).fillna(0)

# Exhaustive search over the grid, in the same order as nested loops over param_grid.
for values in tqdm(list(product(*param_grid.values()))):
    candidate_params = dict(zip(param_grid.keys(), values))

    # A fixed random_state makes the comparison between parameter sets reproducible.
    rf = RandomForestClassifier(
      **candidate_params,
      class_weight=dict(enumerate(class_weights)),
      random_state=42
    )
    rf.fit(x_train_features, x_train["y"])

    y_pred = rf.predict(x_test_features)

    f1 = f1_score(x_test["y"], y_pred)

    if f1 > best_score:
        best_score = f1
        # random_state is stored with the winning parameters so that a model
        # built from best_params is reproducible as well.
        best_params = {**candidate_params, "random_state": 42}

del x_train_features, x_test_features

print("Лучшие параметры: ", best_params)
print("Лучший F1-скор: ", best_score)

### Fit final model

In [None]:
best_params["class_weight"] = dict(enumerate(class_weights))
model = RandomForestClassifier(**best_params)

In [None]:
%%time
model.fit(valid_agg.drop(non_features+["y"], axis=1).fillna(0),
          valid_agg["y"])

In [None]:
joblib.dump({"model": model, "params": best_params},
            folder + "classifier_model.joblib")

In [None]:
y_pred = model.predict(val_agg.drop(non_features, axis=1).fillna(0))
y_proba = model.predict_proba(val_agg.drop(non_features, axis=1).fillna(0))[:, 1]

In [None]:
val_agg["proba"] = y_proba

In [None]:
result = val_agg.sort_values("proba", ascending=False).drop_duplicates(subset=['receipt_id'])

In [None]:
result

#### Проверяем не потеряли ли чего и записываем результат

In [None]:
predict = pd.read_csv(os.path.join(input_folder, val_file_name), sep="\t")

In [None]:
empty = set(predict["receipt_id"].tolist()) - set(result['receipt_id'].tolist()) 
len(empty)

In [None]:
# Fallback for receipts that received no candidate: predict the most frequent item.
# value_counts() is sorted by descending frequency, so its first index label is the
# most popular item id on every pandas version.
the_most_popular = train["item_id"].value_counts().index[0]
probability_the_most_popular = 0.98

# One fallback row per missing receipt (the raw file has one row per purchased item).
# Missing receipts are derived from `result` directly, so re-running the cell
# does not append the fallback rows twice.
empty = predict[~predict["receipt_id"].isin(result["receipt_id"])] \
  .drop_duplicates(subset=["receipt_id"]) \
  .assign(candidate=the_most_popular, proba=probability_the_most_popular)

result = pd.concat([result, empty])

In [None]:
empty

In [None]:
scoring = result[["receipt_id", "candidate", "proba"]]
scoring.to_csv(folder + output_name, index=False)

In [None]:
scoring