In [1]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install implicit



In [126]:
from gensim.models.word2vec import Word2Vec
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import implicit
from itertools import product
tqdm.pandas()

In [4]:
cosmetic_train = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_train.tsv", sep="\t")
cosmetic_val = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_val.tsv", sep="\t")
cosmetic_target = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_val_target.tsv", sep="\t") \
    .drop_duplicates() \
    .reset_index(drop=True)

# super_train = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_train.tsv", sep="\t")
# super_val = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_val.tsv", sep="\t")
# super_target = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_val_target.tsv", sep="\t") \
#     .drop_duplicates() \
#     .reset_index(drop=True)

In [5]:
cosmetic_train["local_date"] = pd.to_datetime(cosmetic_train["local_date"])
cosmetic_val["local_date"] = pd.to_datetime(cosmetic_val["local_date"])

# super_train["local_date"] = pd.to_datetime(super_train["local_date"])
# super_val["local_date"] = pd.to_datetime(super_val["local_date"])

# Cosmetic

## Create third dataset

In [6]:
cosmetic_train.shape

(223908, 7)

In [7]:
cosmetic_train = cosmetic_train[cosmetic_train.groupby(["receipt_id"])["item_id"].transform(lambda x: x.nunique() > 1)] \
  .reset_index(drop=True)

cosmetic_train.shape

(223906, 7)

In [8]:
unique_items = cosmetic_train["item_id"]
unique_items.nunique()

667

In [9]:
hight_support_items = cosmetic_train["item_id"].value_counts().where(lambda x: x > 1).dropna().index

target = cosmetic_train[cosmetic_train["item_id"].isin(hight_support_items)].groupby(["receipt_id"])["item_id"].apply(lambda x: x.sample(1).iloc[0]) \
  .to_dict()

cosmetic_train["target"] = cosmetic_train.apply(lambda x: int(target.get(x["receipt_id"], 0) == x["item_id"]), axis=1)
del target

In [10]:
cosmetic_train["receipt_id"].nunique(),  cosmetic_train["target"].sum()

(68282, 68283)

In [11]:
cosmetic_train["has_unique_item"] = ~cosmetic_train["item_id"].isin(hight_support_items)
cosmetic_train["has_unique_item"] = cosmetic_train.groupby(["receipt_id"])["has_unique_item"].transform("max")

In [12]:
val_receipts = cosmetic_train["receipt_id"].drop_duplicates().sample(frac=.2).tolist()
cosmetic_valid = cosmetic_train[(cosmetic_train["receipt_id"].isin(val_receipts)) & (~cosmetic_train["has_unique_item"])]
cosmetic_train = cosmetic_train[(~cosmetic_train["receipt_id"].isin(val_receipts)) | (cosmetic_train["has_unique_item"])]

del val_receipts

In [13]:
lost_items = unique_items[~unique_items.isin(cosmetic_train.loc[cosmetic_train["target"] != 1, "item_id"])].drop_duplicates()
lost_items = cosmetic_valid.loc[cosmetic_valid["item_id"].isin(lost_items),
                                ["receipt_id", "item_id"]].groupby(["item_id"])["receipt_id"] \
  .apply(lambda x: x.sample(1).values) \
  .explode() \
  .tolist()

In [14]:
cosmetic_train = pd.concat([cosmetic_train, cosmetic_valid[cosmetic_valid["receipt_id"].isin(lost_items)]], axis=0).reset_index(drop=True)
cosmetic_valid = cosmetic_valid[~cosmetic_valid["receipt_id"].isin(lost_items)].reset_index(drop=True)

In [15]:
cosmetic_valid.shape, cosmetic_train.shape

((44659, 9), (179247, 9))

In [16]:
assert cosmetic_train.loc[cosmetic_train["target"] != 1, "item_id"].nunique() >= (unique_items.nunique() - 10) # поправка максимум на 10 потерянных товаров
assert cosmetic_train["receipt_id"].isin(cosmetic_valid["receipt_id"]).sum() == 0
assert cosmetic_valid["receipt_id"].isin(cosmetic_train["receipt_id"]).sum() == 0

In [17]:
cosmetic_train.loc[cosmetic_train["target"] == 1, "target"] =\
  cosmetic_train.loc[cosmetic_train["target"] == 1, "item_id"]

cosmetic_valid.loc[cosmetic_valid["target"] == 1, "target"] =\
  cosmetic_valid.loc[cosmetic_valid["target"] == 1, "item_id"]

In [18]:
cosmetic_train["target"] = cosmetic_train.groupby(["receipt_id"])["target"].transform("max")
cosmetic_valid["target"] = cosmetic_valid.groupby(["receipt_id"])["target"].transform("max")

In [19]:
cosmetic_train = cosmetic_train[cosmetic_train["item_id"] != cosmetic_train["target"]].reset_index(drop=True)
cosmetic_valid = cosmetic_valid[cosmetic_valid["item_id"] != cosmetic_valid["target"]].reset_index(drop=True)

In [20]:
cosmetic_train.drop(["has_unique_item"], axis=1, inplace=True)
cosmetic_valid.drop(["has_unique_item"], axis=1, inplace=True)

In [21]:
cosmetic_train.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,target
62985,352398090000896,9917715655,200268,2021-11-23 11:55:13,"Масло для душа нежное ""SHOWER BODY OIL"", 300 мл",690.0,1,200080
96323,356645110691534,14195096017,200217,2023-02-23 17:02:38,Крем мужской для лица и тела увлажняющий КЕДР ...,700.0,1,200431
49017,356645110489244,15674814406,200302,2023-07-14 13:33:56,"Молочная ванна HONEY MILK BATH LETIQUE, 300 мл",447.0,1,200613
49060,356645110489244,15724052340,200197,2023-07-18 20:51:30,"Крем для рук SUNNY MELON, 30мл",145.0,5,200196
111376,356645110747963,13730825794,200253,2022-12-31 14:36:49,Маска для волос ревитализирующая BRAZILIAN CAR...,1290.0,1,200076


## Prepare matrix

In [22]:
cosmetic_train_mtx = cosmetic_train.groupby(["receipt_id", "item_id"])["quantity"].min() \
    .reset_index()

cosmetic_train_mtx["quantity"] = 1
cosmetic_train_mtx = cosmetic_train_mtx.drop_duplicates() \
    .reset_index(drop=True)

receipt_items = cosmetic_train_mtx.groupby(["receipt_id"])["item_id"].apply(lambda x: tuple(set(x))).to_dict()
cosmetic_train_mtx["items"] = cosmetic_train_mtx["receipt_id"].map(receipt_items.get)

cosmetic_train_mtx = cosmetic_train_mtx[cosmetic_train_mtx["items"].apply(len) > 1] \
  .reset_index(drop=True)

cosmetic_train_mtx = cosmetic_train_mtx.drop_duplicates(subset=["items", "item_id"]) \
  .reset_index(drop=True)

cosmetic_train_mtx["receipt_id"] = cosmetic_train_mtx["receipt_id"].astype("category")
cosmetic_train_mtx["item_id"] = cosmetic_train_mtx["item_id"].astype("category")

cosmetic_train_mtx["receipt_cat"] = cosmetic_train_mtx["receipt_id"].cat.codes
cosmetic_train_mtx["item_cat"] = cosmetic_train_mtx["item_id"].cat.codes

In [23]:
receipt_2idx = cosmetic_train_mtx.drop_duplicates(subset=["receipt_cat"]) \
  .set_index("items")["receipt_cat"].to_dict()

item_2idx = cosmetic_train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_id")["item_cat"].to_dict()
idx_2item = cosmetic_train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_cat")["item_id"].to_dict()

In [24]:
sparse_item_user = csr_matrix((cosmetic_train_mtx["quantity"].astype(float), (cosmetic_train_mtx["item_cat"], cosmetic_train_mtx["receipt_cat"])))
sparse_receipt_item = csr_matrix((cosmetic_train_mtx["quantity"].astype(float), (cosmetic_train_mtx["receipt_cat"], cosmetic_train_mtx["item_cat"])))

## Fit ALS

In [25]:
!pip install scikit-optimize



In [26]:
def recommend_to_receipt(receipt_cat, sparse_user_item,
                         receipt_vecs, item_vecs, idx_2item, num_items=5):

    receipt_interactions = sparse_user_item[receipt_cat, :].toarray()

    receipt_interactions = receipt_interactions.reshape(-1) + 1
    receipt_interactions[receipt_interactions > 1] = 0

    rec_vector = receipt_vecs[receipt_cat, :].dot(item_vecs.T).toarray()

    recommend_vector = (receipt_interactions * rec_vector)[0]

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    result = []

    for idx in item_idx:
      result.append((idx_2item[idx], recommend_vector[idx]))

    return result

In [28]:
def recommend_to_items(items_cat, item_norms, item_vecs, idx_2item, num_items=5):

    scores = item_vecs.dot(item_vecs[items_cat].T).T  / item_norms.reshape(1, -1)
    top_idx = np.argpartition(scores, -num_items, axis=1)[:, -(num_items+1):]
    scores = np.array([scores[idx, row] for idx, row in enumerate(top_idx)])
    scores = scores / item_norms[items_cat].reshape(-1, 1)
    result = [(idx_2item[i[0]], i[1]) for i in sorted(zip(top_idx.reshape(-1), scores.reshape(-1)), key=lambda x: -x[1])
              if i[0] not in items_cat][:num_items]

    return result

In [29]:
cosmetic_valid_agg = cosmetic_valid.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

cosmetic_valid_agg["receipt_cat"] = cosmetic_valid_agg["item_id"].map(receipt_2idx.get)
cosmetic_valid_agg["item_cat"] = cosmetic_valid_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

In [30]:
cosmetic_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat
11704,356645110743749,16198767342,"(200291, 200266, 200332, 200018, 200051)",200058,,"[286, 262, 326, 18, 51]"
7911,356645110489244,12065762504,"(200594, 200220)",200051,,"[574, 217]"
153,352398083991747,10694224548,"(200574,)",200559,,[555]
6309,356645110237411,15875042091,"(200497,)",200236,,[481]
11193,356645110714724,13627731080,"(200067,)",200051,,[67]


In [32]:
num_recs = 10
alpha_val = 15

param_grid = {
  "factors": [10, 20, 50],
  "alpha": [1.0, 5.0],
  "regularization": [0.01, 0.1],
  "iterations": [10, 20, 50]
}

all_param_combinations = list(product(*param_grid.values()))
result = []

for params in tqdm(all_param_combinations):
  factors, alpha, regularization, iterations = params
  params = {
      "factors": factors,
      "alpha": alpha,
      "regularization": regularization,
      "iterations": iterations,
      "calculate_training_loss": False,
      "random_state": 42
  }
  model = implicit.als.AlternatingLeastSquares(**params)
  model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=False)

  receipt_vecs = model.user_factors
  item_vecs = model.item_factors

  receipt_vecs_csr = csr_matrix(receipt_vecs)
  item_vecs_csr = csr_matrix(item_vecs)

  item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

  cosmetic_valid_agg_copy = cosmetic_valid_agg.copy()
  cosmetic_valid_agg_copy["preds"] = cosmetic_valid_agg_copy.apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

  hit = ((cosmetic_valid_agg_copy.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100)
  result.append((hit, params))

100%|██████████| 36/36 [20:48<00:00, 34.68s/it]


In [33]:
result = pd.DataFrame(result, columns=["hit@10", "params"]) \
  .sort_values("hit@10", ascending=False)
result

Unnamed: 0,hit@10,params
23,29.490656,"{'factors': 20, 'alpha': 5.0, 'regularization'..."
20,29.483327,"{'factors': 20, 'alpha': 5.0, 'regularization'..."
7,29.278124,"{'factors': 10, 'alpha': 5.0, 'regularization'..."
11,29.263466,"{'factors': 10, 'alpha': 5.0, 'regularization'..."
8,29.248809,"{'factors': 10, 'alpha': 5.0, 'regularization'..."
14,28.992305,"{'factors': 20, 'alpha': 1.0, 'regularization'..."
17,28.948333,"{'factors': 20, 'alpha': 1.0, 'regularization'..."
10,28.889703,"{'factors': 10, 'alpha': 5.0, 'regularization'..."
1,28.838402,"{'factors': 10, 'alpha': 1.0, 'regularization'..."
5,28.79443,"{'factors': 10, 'alpha': 1.0, 'regularization'..."


In [34]:
best_params = result["params"][0].copy()
best_params["calculate_training_loss"] = True
print(best_params)

{'factors': 10, 'alpha': 1.0, 'regularization': 0.01, 'iterations': 10, 'calculate_training_loss': True, 'random_state': 42}


In [35]:
model = implicit.als.AlternatingLeastSquares(**best_params)
model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=True)

  0%|          | 0/10 [00:00<?, ?it/s]

In [36]:
receipt_vecs = model.user_factors
item_vecs = model.item_factors

receipt_vecs_csr = csr_matrix(receipt_vecs)
item_vecs_csr = csr_matrix(item_vecs)

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

## Create predictions

In [37]:
cosmetic_valid_agg["preds"] = cosmetic_valid_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

cosmetic_valid_agg = cosmetic_valid_agg[cosmetic_valid_agg["preds"].apply(len) > 0].reset_index(drop=True)

100%|██████████| 13645/13645 [00:04<00:00, 2789.45it/s]


In [38]:
cosmetic_valid_agg["first_item"] = cosmetic_valid_agg["preds"].apply(lambda x: x[0][0])
cosmetic_valid_agg["first_score"] = cosmetic_valid_agg["preds"].apply(lambda x: x[0][1])

In [39]:
print("accuracy: %2.f" % ((cosmetic_valid_agg["first_item"] == cosmetic_valid_agg["target"]).mean() * 100))
print("hit@10: %2.f" % ((cosmetic_valid_agg.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100))

accuracy: 10
hit@10: 29


In [40]:
cosmetic_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat,preds,first_item,first_score
6975,356645110250653,12103010073,"(200002, 200391, 200588, 200049, 200018, 200282)",200485,,"[2, 380, 568, 49, 18, 277]","[(200121, 0.9759795), (200339, 0.96293145), (2...",200121,0.97598
1320,352398085964585,14301494280,"(200466, 200107)",200420,,"[452, 105]","[(200138, 0.9856448), (200115, 0.98382956), (2...",200138,0.985645
9252,356645110623479,10202320220,"(200232, 200547, 200013, 200166)",200336,,"[229, 529, 13, 164]","[(200525, 0.98021525), (200345, 0.9396033), (2...",200525,0.980215
4020,352398090018898,10913183858,"(200088,)",200332,,[87],"[(200361, 0.8438148), (200655, 0.83668596), (2...",200361,0.843815
97,352398083991747,10020330921,"(200189, 200263)",200013,744.0,"[187, 259]","[(200099, 0.5417131185531616), (200588, 0.5158...",200099,0.541713


In [41]:
cosmetic_valid_agg.drop(["first_item", "first_score", "receipt_cat", "item_cat"], axis=1, inplace=True)
cosmetic_valid_agg = cosmetic_valid_agg.explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

cosmetic_valid_agg = pd.concat([cosmetic_valid_agg, pd.DataFrame(cosmetic_valid_agg["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

In [42]:
cosmetic_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
188741,356645110489244,13835323979,200598,200243,200630,0.989444
1660,352398083991747,9648212823,200591,200582,200582,0.914811
8488,352398083991747,13559420783,200268,200012,200150,0.934947
216701,356645110623479,13637425616,200566,200161,200154,0.974719
233576,356645110691534,9648564374,200347,200013,200651,0.540913


In [43]:
cosmetic_val_agg = cosmetic_val.merge(cosmetic_target.rename(columns={"item_id": "target"}), on=["receipt_id"], how="left") \
  .drop("name", axis=1)

cosmetic_val_agg = cosmetic_val_agg.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

cosmetic_val_agg["receipt_cat"] = cosmetic_val_agg["item_id"].map(receipt_2idx.get)
cosmetic_val_agg["item_cat"] = cosmetic_val_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

cosmetic_val_agg["preds"] = cosmetic_val_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

cosmetic_val_agg.drop(["receipt_cat", "item_cat"], axis=1, inplace=True)
cosmetic_val_agg = cosmetic_val_agg.explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

cosmetic_val_agg = pd.concat([cosmetic_val_agg, pd.DataFrame(cosmetic_val_agg["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

100%|██████████| 22761/22761 [00:05<00:00, 3839.82it/s]


In [44]:
cosmetic_val_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
283063,356645110252402,9659505790,200263,200594,200257.0,0.911208
345603,356645110622299,11573735524,200099,200220,200089.0,0.903423
495549,356645110824226,13427644237,200570,200076,200305.0,0.705311
181788,352398090018898,14368221005,200136,200666,200098.0,0.96191
5649,352398083991747,10505608342,200588,200043,200174.0,0.880332


In [45]:
cosmetic_val_agg = cosmetic_val_agg.dropna(subset=["candidate"]) \
  .astype({"candidate": int})

## Calculate pair features

In [48]:
le = LabelEncoder()
cosmetic_train["item_id_enc"] = le.fit_transform(cosmetic_train["item_id"])

pairs = [pair for pair in product(cosmetic_train["item_id_enc"].unique(), cosmetic_train["item_id_enc"].unique())]
pairs = pd.DataFrame(pairs, columns=["item_1", "item_2"])
pairs = pairs[pairs["item_1"] != pairs["item_2"]].reset_index(drop=True)

mtx = cosmetic_train.pivot_table(index="receipt_id", columns="item_id_enc",
                                 values="quantity", aggfunc="min") \
    .fillna(0) \
    .values

for idx, row in tqdm(pairs.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs.loc[idx, "both"] = (m == 2).sum()
    pairs.loc[idx, "left"] = mtx[:, row["item_1"]].sum()
    pairs.loc[idx, "right"] = mtx[:, row["item_2"]].sum()

pairs["left_frac"] = pairs["left"] / mtx.shape[0]
pairs["right_frac"] = pairs["right"] / mtx.shape[0]
pairs["both_left_frac"] = pairs["both"] / pairs["left"]
pairs["both_right_frac"] = pairs["both"] / pairs["right"]
pairs["item_id"] = le.inverse_transform(pairs["item_1"])
pairs["candidate"] = le.inverse_transform(pairs["item_2"])
pairs = pairs.drop(["item_1", "item_2"], axis=1)

438906it [07:22, 991.40it/s] 


In [49]:
pairs.to_csv("pairs_cosmetic.csv", index=False)

In [50]:
pairs.sample(5)

Unnamed: 0,both,left,right,left_frac,right_frac,both_left_frac,both_right_frac,item_id,candidate
381138,0.0,15.0,8.0,0.000275,0.000146,0.0,0.0,200079,200321
85652,15.0,332.0,110.0,0.006076,0.002013,0.045181,0.136364,200300,200149
310829,2.0,3.0,47.0,5.5e-05,0.00086,0.666667,0.042553,200401,200294
79841,5.0,133.0,66.0,0.002434,0.001208,0.037594,0.075758,200396,200611
727,142.0,2026.0,684.0,0.037081,0.012519,0.070089,0.207602,200222,200574


In [51]:
assert pairs.duplicated(subset=["item_id", "candidate"]).sum() == 0

In [60]:
quantity_total_hist_device = cosmetic_train.groupby(["device_id", "item_id"])["quantity"].sum().rename("quantity_total_hist_device").reset_index()
quantity_total_hist = cosmetic_train.groupby(["item_id"])["quantity"].sum().rename("quantity_total_hist").reset_index()

In [85]:
timebased = cosmetic_train.groupby(["local_date", "device_id", "item_id"])["quantity"].sum() \
  .reset_index(level=-1) \
  .reset_index(level=-1) \
  .sort_index()
timebased.sample(1)

Unnamed: 0_level_0,device_id,item_id,quantity
local_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-07-19 16:02:22,356645110747963,200049,1


In [66]:
cosmetic_valid_agg.sample(1)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
172694,356645110252402,11539035337,200613,200174,200582,0.918204


In [67]:
cosmetic_valid_agg = cosmetic_valid_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left")

cosmetic_val_agg = cosmetic_val_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left")

In [82]:
cosmetic_valid_agg = cosmetic_valid_agg.merge(cosmetic_valid[["receipt_id", "local_date"]].drop_duplicates(),
                                              on=["receipt_id"], how="left")
cosmetic_val_agg = cosmetic_val_agg.merge(cosmetic_val[["receipt_id", "local_date"]].drop_duplicates(),
                                          on=["receipt_id"], how="left")

In [87]:
cosmetic_valid_agg["last_3h_cand_sales"] = cosmetic_valid_agg.progress_apply(
    lambda x:
    timebased.loc[(timebased["item_id"] == x["target"]) & (timebased["device_id"] == x["device_id"]),
                  "quantity"].last("3h").iloc[:-1].sum(),
    axis=1)

100%|██████████| 310140/310140 [06:49<00:00, 756.95it/s]


In [88]:
cosmetic_val_agg["last_3h_cand_sales"] = cosmetic_val_agg.progress_apply(
    lambda x:
    timebased.loc[(timebased["item_id"] == x["target"]) & (timebased["device_id"] == x["device_id"]),
                  "quantity"].last("3h").iloc[:-1].sum(),
    axis=1)

100%|██████████| 516490/516490 [10:57<00:00, 785.96it/s]


In [93]:
cosmetic_val_agg["last_3h_cand_sales"].max(), cosmetic_valid_agg["last_3h_cand_sales"].max()

(8, 8)

In [95]:
item2category = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_item_id_categ_map.csv", sep=";")
item2category.sample(2)

Unnamed: 0,item_id,category_noun
498,200019,криомасло
283,200000,cc-крем


In [99]:
cosmetic_train = cosmetic_train.merge(item2category, on=["item_id"], how="left")

In [109]:
le = LabelEncoder()
cosmetic_train["category_noun_enc"] = le.fit_transform(cosmetic_train["category_noun"])

pairs_cat = [pair for pair in product(cosmetic_train["category_noun_enc"].unique(), cosmetic_train["category_noun_enc"].unique())]
pairs_cat = pd.DataFrame(pairs_cat, columns=["category_1", "category_2"])
pairs_cat = pairs_cat[pairs_cat["category_1"] != pairs_cat["category_2"]].reset_index(drop=True)

mtx = cosmetic_train.pivot_table(index="receipt_id", columns="category_noun_enc",
                                 values="quantity", aggfunc="min") \
    .fillna(0) \
    .values

for idx, row in tqdm(pairs_cat.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_cat.loc[idx, "cat_both"] = (m == 2).sum()
    pairs_cat.loc[idx, "left"] = mtx[:, row["category_1"]].sum()
    pairs_cat.loc[idx, "right"] = mtx[:, row["category_2"]].sum()

pairs_cat["cat_both_left_frac"] = pairs_cat["cat_both"] / pairs_cat["left"]
pairs_cat["cat_both_right_frac"] = pairs_cat["cat_both"] / pairs_cat["right"]
pairs_cat["category_noun"] = le.inverse_transform(pairs_cat["category_1"])
pairs_cat["category_noun_candidate"] = le.inverse_transform(pairs_cat["category_2"])
pairs_cat = pairs_cat.drop(["category_1", "category_2", "left", "right"], axis=1)

7140it [00:12, 582.21it/s]


In [110]:
pairs_cat.to_csv("pairs_cosmetic_categories.csv", index=False)

In [111]:
pairs_cat.sample(2)

Unnamed: 0,cat_both,cat_both_left_frac,cat_both_right_frac,category_noun,category_noun_candidate
2312,31.0,0.056364,0.093373,тушь,мицелярная
5964,214.0,0.473451,0.033178,саше,коробка


In [118]:
cosmetic_val_agg = cosmetic_val_agg.merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left")

cosmetic_valid_agg = cosmetic_valid_agg.merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left")

In [117]:
cosmetic_valid_agg = cosmetic_valid_agg.merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left") \
  .drop(["category_noun", "category_noun_candidate"], axis=1)
cosmetic_val_agg = cosmetic_val_agg.merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left") \
  .drop(["category_noun", "category_noun_candidate"], axis=1)

In [120]:
cat_model_cosmetic = Word2Vec.load("/content/drive/MyDrive/vseros/cosmetic_word2vec.model")

In [132]:
cosmetic_valid_agg["w2v_sim"] = cosmetic_valid_agg.progress_apply(lambda x:
  cosine_similarity(cat_model_cosmetic.wv.get_vector(x["category_noun"]).reshape(1, -1),
                    cat_model_cosmetic.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0], axis=1)

100%|██████████| 310140/310140 [02:27<00:00, 2108.07it/s]


In [133]:
cosmetic_val_agg["w2v_sim"] = cosmetic_val_agg.progress_apply(lambda x:
  cosine_similarity(cat_model_cosmetic.wv.get_vector(x["category_noun"]).reshape(1, -1),
                    cat_model_cosmetic.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0], axis=1)

100%|██████████| 516490/516490 [03:19<00:00, 2583.21it/s]


In [134]:
cosmetic_val_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)
cosmetic_valid_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

In [135]:
cosmetic_valid_agg.isna().mean()

device_id                     0.000000
receipt_id                    0.000000
item_id                       0.000000
target                        0.000000
candidate                     0.000000
als_score                     0.000000
both                          0.000064
left                          0.000064
right                         0.000064
left_frac                     0.000064
right_frac                    0.000064
both_left_frac                0.000064
both_right_frac               0.000064
quantity_total_hist_device    0.159086
quantity_total_hist           0.000000
local_date                    0.000000
last_3h_cand_sales            0.000000
cat_both                      0.089282
cat_both_left_frac            0.089282
cat_both_right_frac           0.089282
w2v_sim                       0.000000
dtype: float64

In [136]:
cosmetic_val_agg.isna().mean()

device_id                     0.000000
receipt_id                    0.000000
item_id                       0.000000
target                        0.000000
candidate                     0.000000
als_score                     0.000000
both                          0.000136
left                          0.000136
right                         0.000136
left_frac                     0.000136
right_frac                    0.000136
both_left_frac                0.000136
both_right_frac               0.000136
quantity_total_hist_device    0.156677
quantity_total_hist           0.000000
local_date                    0.000000
last_3h_cand_sales            0.000000
cat_both                      0.090662
cat_both_left_frac            0.090662
cat_both_right_frac           0.090662
w2v_sim                       0.000000
dtype: float64

In [137]:
cosmetic_valid_agg.to_csv("cosmetic_valid_agg.csv", index=False)
cosmetic_val_agg.to_csv("cosmetic_val_agg.csv", index=False)

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

In [140]:
cosmetic_valid_agg["y"] = (cosmetic_valid_agg["target"] == cosmetic_valid_agg["candidate"]).astype(int)
cosmetic_val_agg["y"] = (cosmetic_val_agg["target"] == cosmetic_val_agg["candidate"]).astype(int)

In [142]:
cosmetic_valid_agg.sample(5).columns

Index(['device_id', 'receipt_id', 'item_id', 'target', 'candidate',
       'als_score', 'both', 'left', 'right', 'left_frac', 'right_frac',
       'both_left_frac', 'both_right_frac', 'quantity_total_hist_device',
       'quantity_total_hist', 'local_date', 'last_3h_cand_sales', 'cat_both',
       'cat_both_left_frac', 'cat_both_right_frac', 'w2v_sim', 'y'],
      dtype='object')

In [150]:
class_weights = compute_class_weight("balanced", classes=np.unique(cosmetic_valid_agg["y"]), y=cosmetic_valid_agg["y"])

param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

best_score = 0
best_params = {}

for n_estimators in tqdm(param_grid["n_estimators"], position=0, leave=False):
    for max_depth in tqdm(param_grid["max_depth"], position=1, leave=False):
        for min_samples_split in tqdm(param_grid["min_samples_split"], position=2, leave=False):
            for min_samples_leaf in tqdm(param_grid["min_samples_leaf"], position=3, leave=False):
                  rf = RandomForestClassifier(
                      n_estimators=n_estimators,
                      max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf,
                      class_weight=dict(enumerate(class_weights))
                  )
                  rf.fit(cosmetic_valid_agg.drop(["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"], axis=1).fillna(0),
                          cosmetic_valid_agg["y"])

                  y_pred = rf.predict(cosmetic_val_agg.drop(["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"], axis=1).fillna(0))

                  f1 = f1_score(cosmetic_val_agg["y"], y_pred)

                  if f1 > best_score:
                      best_score = f1
                      best_params = {
                          "n_estimators": n_estimators,
                          "max_depth": max_depth,
                          "min_samples_split": min_samples_split,
                          "min_samples_leaf": min_samples_leaf,
                      }

print("Лучшие параметры: ", best_params)
print("Лучший F1-скор: ", best_score)


Лучшие параметры:  {'n_estimators': 50, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 4}
Лучший F1-скор:  0.2615733019679441


In [156]:
model = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_split=best_params["min_samples_split"],
    min_samples_leaf=best_params["min_samples_leaf"],
    class_weight=dict(enumerate(class_weights))
)

In [157]:
%%time
model.fit(cosmetic_valid_agg.drop(["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"], axis=1).fillna(0),
       cosmetic_valid_agg["y"])

CPU times: user 34 s, sys: 5.85 ms, total: 34 s
Wall time: 37.5 s


In [158]:
y_pred = model.predict(cosmetic_val_agg.drop(["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"], axis=1).fillna(0))
y_proba = model.predict_proba(cosmetic_val_agg.drop(["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"], axis=1).fillna(0))

In [159]:
print(classification_report(cosmetic_val_agg["y"], y_pred))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96    496735
           1       0.20      0.38      0.26     19755

    accuracy                           0.92    516490
   macro avg       0.59      0.66      0.61    516490
weighted avg       0.94      0.92      0.93    516490



In [160]:
cosmetic_val_agg["proba"] = y_proba[:, 1]

In [161]:
result = cosmetic_val_agg.sort_values("proba", ascending=False) \
  .groupby(["receipt_id", "target"], sort=False)["candidate"].first() \
  .reset_index()

In [162]:
(result["candidate"] == result["target"]).mean() * 100

13.36115992970123