In [1]:
!pip install implicit



In [2]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import gc
from gensim.models.word2vec import Word2Vec
import joblib
import implicit
from itertools import combinations, product
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix, save_npz, coo_array
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
tqdm.pandas()



In [4]:
super_train = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_train.tsv", sep="\t")
super_val = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_val.tsv", sep="\t")
super_target = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_val_target.tsv", sep="\t") \
    .drop_duplicates() \
    .reset_index(drop=True)

In [5]:
super_train["local_date"] = pd.to_datetime(super_train["local_date"])
super_val["local_date"] = pd.to_datetime(super_val["local_date"])

# Super

## Create third dataset

In [6]:
super_train.shape

(696787, 9)

In [7]:
# удаление чеков с 1м товаром
super_train = super_train[super_train.groupby(["receipt_id"])["item_id"].transform(lambda x: x.nunique() > 1)] \
  .reset_index(drop=True)

super_train.shape

(687053, 9)

In [8]:
unique_items = super_train["item_id"]
unique_items.nunique()

16220

In [10]:
# сборка таргета из товаров, которые встречаются более 1го раза на датасете
hight_support_items = super_train["item_id"].value_counts().where(lambda x: x > 1).dropna().index

target = super_train[super_train["item_id"].isin(hight_support_items)].groupby(["receipt_id"])["item_id"].apply(lambda x: x.sample(1).iloc[0]) \
  .to_dict()

super_train["target"] = super_train.apply(lambda x: int(target.get(x["receipt_id"], 0) == x["item_id"]), axis=1)
del target

In [11]:
super_train["receipt_id"].nunique(), super_train["target"].sum()

(159672, 168855)

In [12]:
super_train["has_unique_item"] = ~super_train["item_id"].isin(hight_support_items)
super_train["has_unique_item"] = super_train.groupby(["receipt_id"])["has_unique_item"].transform("max")

In [13]:
#  разделение трейна на 2 выборки по идентификатору чека
val_receipts = super_train["receipt_id"].drop_duplicates().sample(frac=.2, random_state=42).tolist()

super_valid = super_train[(super_train["receipt_id"].isin(val_receipts)) & (~super_train["has_unique_item"])]
super_train = super_train[(~super_train["receipt_id"].isin(val_receipts)) | (super_train["has_unique_item"])]

del val_receipts

In [14]:
# все товары, которые не встречаются на обучении (не в таргете)
lost_items = unique_items[~unique_items.isin(super_train.loc[super_train["target"] != 1, "item_id"])].drop_duplicates()
lost_items = super_valid.loc[super_valid["item_id"].isin(lost_items),
                             ["receipt_id", "item_id"]].groupby(["item_id"])["receipt_id"] \
  .apply(lambda x: x.sample(1).values) \
  .explode() \
  .tolist()

In [15]:
# перенос всех уникальных товаров с валидации на обучение
super_train = pd.concat([super_train,
                         super_valid[super_valid["receipt_id"].isin(lost_items)]], axis=0) \
  .reset_index(drop=True)

super_valid = super_valid[~super_valid["receipt_id"].isin(lost_items)].reset_index(drop=True)

In [16]:
super_valid.shape, super_train.shape

((130729, 11), (556324, 11))

In [17]:
assert super_train.loc[super_train["target"] != 1, "item_id"].nunique() >= (unique_items.nunique() * .95) # поправка максимум на 5% потерянных товаров
assert super_train["receipt_id"].isin(super_valid["receipt_id"]).sum() == 0
assert super_valid["receipt_id"].isin(super_train["receipt_id"]).sum() == 0

In [18]:
# выделение целевого товара в отдельный столбец
super_train.loc[super_train["target"] == 1, "target"] =\
  super_train.loc[super_train["target"] == 1, "item_id"]

super_train["target"] = super_train.groupby(["receipt_id"])["target"].transform("max")
super_train = super_train[super_train["item_id"] != super_train["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)



super_valid.loc[super_valid["target"] == 1, "target"] =\
  super_valid.loc[super_valid["target"] == 1, "item_id"]

super_valid["target"] = super_valid.groupby(["receipt_id"])["target"].transform("max")
super_valid = super_valid[super_valid["item_id"] != super_valid["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)

In [19]:
super_train.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,server_date,local_date,name,price,quantity,my_ckecker,target
393278,352398082091853,12090553628,111052,2022-07-15 16:50:13,2022-07-15 16:50:12,Пиво Жигулёвское традиционное 4.5% 0.45л ст.бу...,65.0,3.0,,111561
199171,352398082091853,9308358412,111033,2021-09-18 17:57:02,2021-09-18 17:57:01,Пиво Живое барное 4% 1.42л ПЭТ /Брянск/,107.0,1.0,,109194
176280,352398080037759,12403093923,108196,2022-08-15 14:52:13,2022-08-15 14:52:11,Морож.Юнилевер Русь Джемка 57гр эскимо кубничн...,51.0,1.0,,111461
134748,352398081864565,13766922708,112705,2023-01-06 15:01:07,2023-01-06 15:01:04,Сайра Sun Feel 250гр в с/с ж/б,159.9,1.0,,106724
236131,352398080037759,12771307136,100327,2022-09-21 10:01:59,2022-09-21 10:01:57,Батон нарезной 300гр /Кунья х/з/,36.0,1.0,,101430


## Prepare matrix

In [20]:
# агрегация чеков
super_train_mtx = super_train.groupby(["receipt_id", "item_id"])["quantity"].min() \
    .reset_index()
super_train_mtx["quantity"] = 1
super_train_mtx = super_train_mtx.drop_duplicates() \
    .reset_index(drop=True)

receipt_items = super_train_mtx.groupby(["receipt_id"])["item_id"].apply(lambda x: tuple(set(x))).to_dict()
super_train_mtx["items"] = super_train_mtx["receipt_id"].map(receipt_items.get)

# удаление чеков из 1го товара и удаление одинаковых чеков
super_train_mtx = super_train_mtx[super_train_mtx["items"].apply(len) > 1] \
  .drop_duplicates(subset=["items", "item_id"]) \
  .reset_index(drop=True)

# преобразование типов данных
super_train_mtx["receipt_cat"] = super_train_mtx["receipt_id"].astype("category").cat.codes
super_train_mtx["item_cat"] = super_train_mtx["item_id"].astype("category").cat.codes

In [21]:
# словари для обращения к эмбедам ALS
receipt_2idx = super_train_mtx.drop_duplicates(subset=["receipt_cat"]) \
  .set_index("items")["receipt_cat"].to_dict()

item_2idx = super_train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_id")["item_cat"].to_dict()
idx_2item = super_train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_cat")["item_id"].to_dict()

In [22]:
# построение матрицы чек-товар
sparse_receipt_item = csr_matrix((super_train_mtx["quantity"].astype(float),
                                 (super_train_mtx["receipt_cat"], super_train_mtx["item_cat"])))

In [23]:
save_npz("/content/drive/MyDrive/1.0-super/sparse_receipt_item.npz", sparse_receipt_item)

with open("/content/drive/MyDrive/1.0-super/receipt_2idx.pkl", "wb") as f:
    pickle.dump(receipt_2idx, f)

with open("/content/drive/MyDrive/1.0-super/item_2idx.pkl", "wb") as f:
    pickle.dump(item_2idx, f)

with open("/content/drive/MyDrive/1.0-super/idx_2item.pkl", "wb") as f:
    pickle.dump(idx_2item, f)

## Fit and tune ALS

In [24]:
!pip install scikit-optimize



In [25]:
def recommend_to_receipt(receipt_cat, sparse_user_item,
                         receipt_vecs, item_vecs, idx_2item, num_items=5):

    receipt_interactions = sparse_user_item[receipt_cat, :].toarray()

    receipt_interactions = receipt_interactions.reshape(-1) + 1
    receipt_interactions[receipt_interactions > 1] = 0

    rec_vector = receipt_vecs[receipt_cat, :].dot(item_vecs.T).toarray()

    recommend_vector = (receipt_interactions * rec_vector)[0]

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    result = []

    for idx in item_idx:
      result.append((idx_2item[idx], recommend_vector[idx]))

    return result

In [27]:
def recommend_to_items(items_cat, item_norms, item_vecs, idx_2item, num_items=5):

    scores = item_vecs.dot(item_vecs[items_cat].T).T  / item_norms.reshape(1, -1)
    top_idx = np.argpartition(scores, -num_items, axis=1)[:, -(num_items+1):]
    scores = np.array([scores[idx, row] for idx, row in enumerate(top_idx)])
    scores = scores / item_norms[items_cat].reshape(-1, 1)
    result = []
    for i in sorted(zip(top_idx.reshape(-1), scores.reshape(-1)), key=lambda x: -x[1]):
      if i[0] not in items_cat and idx_2item[i[0]] not in [j[0] for j in result]:
        result.append((idx_2item[i[0]], i[1]))

    return result[:num_items]

In [53]:
super_valid_agg = super_valid.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

super_valid_agg["receipt_cat"] = super_valid_agg["item_id"].map(receipt_2idx.get)
super_valid_agg["item_cat"] = super_valid_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

In [29]:
super_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat
4870,352398080037759,15366341221,"(108821, 106535)",108037,,"[7870, 5827]"
19009,352398080462536,10383068468,"(102748, 114479)",107082,,"[2418, 12953]"
17736,352398080458112,9412057482,"(100522, 115866, 101525, 115887)",115878,,"[457, 14182, 1348, 14202]"
21248,352398080462627,15672152632,"(107337, 107737, 115887)",109784,,"[6576, 6938, 14202]"
3827,352398080037759,14072529650,"(101525,)",111072,,[1348]


### Select parameters

In [30]:
num_recs = 10
alpha_val = 15

param_grid = {
  "factors": [10, 20, 50],
  "regularization": [0.01, 0.1],
  "iterations": [10, 20]
}

all_param_combinations = list(product(*param_grid.values()))
result = []

for params in tqdm(all_param_combinations):
  factors, regularization, iterations = params
  params = {
      "factors": factors,
      "regularization": regularization,
      "iterations": iterations,
      "calculate_training_loss": False,
      "random_state": 42
  }
  model = implicit.als.AlternatingLeastSquares(**params)
  model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=False)

  receipt_vecs = model.user_factors
  item_vecs = model.item_factors

  receipt_vecs_csr = csr_matrix(receipt_vecs)
  item_vecs_csr = csr_matrix(item_vecs)

  item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

  super_valid_agg_copy = super_valid_agg.copy()
  super_valid_agg_copy["preds"] = super_valid_agg_copy.apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

  hit = ((super_valid_agg_copy.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100)
  result.append((hit, params))

100%|██████████| 12/12 [31:00<00:00, 155.08s/it]


In [31]:
result = pd.DataFrame(result, columns=["hit@10", "params"]) \
  .sort_values("hit@10", ascending=False)
result

Unnamed: 0,hit@10,params
8,2.308412,"{'factors': 50, 'regularization': 0.01, 'itera..."
1,2.218138,"{'factors': 10, 'regularization': 0.01, 'itera..."
5,2.208466,"{'factors': 20, 'regularization': 0.01, 'itera..."
3,2.073057,"{'factors': 10, 'regularization': 0.1, 'iterat..."
7,2.050488,"{'factors': 20, 'regularization': 0.1, 'iterat..."
4,2.015024,"{'factors': 20, 'regularization': 0.01, 'itera..."
6,1.944095,"{'factors': 20, 'regularization': 0.1, 'iterat..."
2,1.857046,"{'factors': 10, 'regularization': 0.1, 'iterat..."
0,1.821582,"{'factors': 10, 'regularization': 0.01, 'itera..."
11,1.805462,"{'factors': 50, 'regularization': 0.1, 'iterat..."


### Fit final model

In [49]:
best_params = result["params"].iloc[0].copy()
best_params["calculate_training_loss"] = True
print(best_params)

{'factors': 50, 'regularization': 0.01, 'iterations': 10, 'calculate_training_loss': True, 'random_state': 42}


In [50]:
model = implicit.als.AlternatingLeastSquares(**best_params)
model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=True)

  0%|          | 0/10 [00:00<?, ?it/s]

In [51]:
joblib.dump({"model": model, "params": best_params, "alpha_val": alpha_val},
            "/content/drive/MyDrive/1.0-super/candidate_model.joblib")

['/content/drive/MyDrive/1.0-super/candidate_model.joblib']

In [52]:
# извлечение эмбедов из ALS
receipt_vecs = model.user_factors
item_vecs = model.item_factors

receipt_vecs_csr = csr_matrix(receipt_vecs)
item_vecs_csr = csr_matrix(item_vecs)

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

### Generate candidates

In [54]:
super_valid_agg["preds"] = super_valid_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

super_valid_agg = super_valid_agg[super_valid_agg["preds"].apply(len) > 0].reset_index(drop=True)

100%|██████████| 31017/31017 [01:57<00:00, 265.09it/s]


In [55]:
print("accuracy: ", ((super_valid_agg["preds"].apply(lambda x: x[0][0]) == super_valid_agg["target"]).mean() * 100))
print("hit@10: ", ((super_valid_agg.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100))

accuracy:  0.5870967741935484
hit@10:  2.3096774193548386


In [56]:
super_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat,preds
26771,352398081535645,14290749140,"(115881, 103835, 115870, 107708, 114846, 109247)",106874,,"[14196, 3404, 14186, 6912, 13281, 8263]","[(101235, 0.8940072), (115130, 0.89280415), (1..."
28018,352398081864565,14907972577,"(106284, 106988, 116142, 104118, 114583, 108952)",109456,,"[5603, 6244, 14428, 3667, 13044, 7996]","[(110259, 0.9316215), (111299, 0.9300972), (11..."
7138,352398080098538,12812760952,"(114261,)",111245,,[12758],"[(102842, 0.73439187), (110163, 0.72900116), (..."
10645,352398080098702,14867481741,"(111472, 101457, 116441)",111125,,"[10278, 1285, 14693]","[(100823, 0.88231415), (104572, 0.88217115), (..."
20461,352398080462627,13418254749,"(108848, 109601, 116850, 107726)",111479,,"[7897, 8565, 15062, 6928]","[(105317, 0.93222547), (102631, 0.92390084), (..."


In [57]:
super_valid_agg = super_valid_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

super_valid_agg = pd.concat([super_valid_agg,
                             pd.DataFrame(super_valid_agg["preds"].tolist(), columns=["candidate", "als_score"])],
                            axis=1) \
  .drop(["preds"], axis=1)

In [58]:
super_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
24908,352398080037759,9968033575,108817,114614,111019,0.674131
120730,352398080037759,13714558412,105552,108973,101511,0.937175
937718,352398082091853,13059998602,115869,106497,113680,0.936381
52623,352398080037759,11115101835,100323,112442,104209,0.879099
739594,352398080550058,14784652354,104741,116837,105114,0.919557


In [59]:
super_val_agg = super_val.merge(super_target.rename(columns={"item_id": "target"}), on=["receipt_id"], how="left") \
  .drop("name", axis=1)

super_val_agg = super_val_agg.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

super_val_agg["receipt_cat"] = super_val_agg["item_id"].map(receipt_2idx.get)
super_val_agg["item_cat"] = super_val_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

super_val_agg["preds"] = super_val_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

super_val_agg = super_val_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

super_val_agg = pd.concat([super_val_agg, pd.DataFrame(super_val_agg["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

100%|██████████| 53226/53226 [03:47<00:00, 234.41it/s]


In [60]:
super_val_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
652958,352398080124383,11975452085,100553,102199,102300.0,0.94214
971609,352398080458112,9816002160,100329,100619,100783.0,0.929416
758004,352398080391545,13246664554,107559,112738,112736.0,0.977882
388148,352398080098538,11857878637,112128,101631,114626.0,0.863032
1547550,352398081864565,15705075802,108051,107543,105246.0,0.986392


In [61]:
super_val_agg = super_val_agg.dropna(subset=["candidate"]) \
  .astype({"candidate": int})

In [62]:
super_train.to_csv("/content/drive/MyDrive/1.0-super/train.csv", index=False)
super_valid_agg.to_csv("/content/drive/MyDrive/1.0-super/valid_agg_raw.csv", index=False)
super_val_agg.to_csv("/content/drive/MyDrive/1.0-super/val_agg_raw.csv", index=False)

In [63]:
assert super_valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert super_val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

## Calculate pair features

In [64]:
gc.collect()

0

### By products

In [77]:
le = LabelEncoder()
super_train["item_id_enc"] = le.fit_transform(super_train["item_id"])

checks = super_train.groupby(["receipt_id"])["item_id_enc"].apply(lambda x: list(set(x))).tolist()
pairs = []

for check in checks:
  if len(check) == 2:
    pairs.append(check)
  else:
    pairs += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs = pd.DataFrame(pairs, columns=["item_1", "item_2"]) \
  .drop_duplicates(subset=["item_1", "item_2"])
pairs = pairs[pairs["item_1"] != pairs["item_2"]].reset_index(drop=True)
print(pairs.shape[0])

mtx = super_train[["receipt_id", "item_id_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["item_id_enc"])),
                shape=(super_train["receipt_id"].nunique(), super_train["item_id_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_res = []

for idx, row in tqdm(pairs.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_res.append((idx, (m == 2).sum()))

del mtx

pairs_res = pd.DataFrame(pairs_res, columns=[0, "both"]).set_index(0)
item_receipts = super_train.groupby(["item_id"])["receipt_id"].nunique().to_dict()

pairs = pd.concat([pairs, pairs_res[~pairs_res.index.duplicated()]], axis=1)

del pairs_res

pairs["item_id"] = le.inverse_transform(pairs["item_1"])
pairs["candidate"] = le.inverse_transform(pairs["item_2"])

unique_receipts = super_train["receipt_id"].nunique()

pairs["left"] = pairs["item_id"].map(item_receipts.get)
pairs["right"] = pairs["candidate"].map(item_receipts.get)

pairs["left_frac"] = pairs["left"] / unique_receipts
pairs["right_frac"] = pairs["right"] / unique_receipts
pairs["both_left_frac"] = pairs["both"] / pairs["left"]
pairs["both_right_frac"] = pairs["both"] / pairs["right"]

pairs = pairs.drop(["item_1", "item_2"], axis=1)

pairs = pd.concat([pairs.copy().rename(columns={"item_id": "candidate", "candidate": "item_id"}), pairs], axis=0) \
  .drop_duplicates(subset=["candidate", "item_id"]) \
  .reset_index(drop=True)

In [79]:
pairs.to_csv("/content/drive/MyDrive/1.0-super/pairs.csv", index=False)

In [80]:
pairs.sample(5)

Unnamed: 0,both,candidate,item_id,left,right,left_frac,right_frac,both_left_frac,both_right_frac
231648,5,101413,113736,236,691,0.001834,0.005371,0.021186,0.007236
844949,1,111060,101430,175,203,0.00136,0.001578,0.005714,0.004926
387896,1,113137,109137,63,2,0.00049,1.6e-05,0.015873,0.5
88981,1,110664,113749,3,61,2.3e-05,0.000474,0.333333,0.016393
1033169,1,100350,111703,223,6,0.001733,4.7e-05,0.004484,0.166667


In [81]:
assert pairs.duplicated(subset=["item_id", "candidate"]).sum() == 0

In [82]:
quantity_total_hist_device = super_train.groupby(["device_id", "item_id"])["quantity"].sum().rename("quantity_total_hist_device").reset_index()
quantity_total_hist = super_train.groupby(["item_id"])["quantity"].sum().rename("quantity_total_hist").reset_index()

quantity_total_hist_device.to_csv("/content/drive/MyDrive/1.0-super/quantity_total_hist_device.csv", index=False)
quantity_total_hist.to_csv("/content/drive/MyDrive/1.0-super/quantity_total_hist.csv", index=False)

In [83]:
super_valid_agg = super_valid_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left") \
  .merge(super_valid[["receipt_id", "local_date"]].drop_duplicates(), on=["receipt_id"], how="left")

super_val_agg = super_val_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left") \
  .merge(super_valid[["receipt_id", "local_date"]].drop_duplicates(), on=["receipt_id"], how="left")

In [84]:
assert super_valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert super_val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

### By product type

In [85]:
cat_model_super = Word2Vec.load("/content/drive/MyDrive/vseros/supermarket_word2vec.model")

item2category = pd.read_csv("/content/drive/MyDrive/vseros/super_item_id_categ_map.csv", sep=";")
item2category.sample(2)

Unnamed: 0,item_id,category_noun
15438,106343,крышка
7129,115534,тушка


In [86]:
super_train = super_train.merge(item2category, on=["item_id"], how="left")

In [87]:
le = LabelEncoder()
super_train["category_noun_enc"] = le.fit_transform(super_train["category_noun"])

checks = super_train.groupby(["receipt_id"])["category_noun_enc"].apply(lambda x: list(set(x))).tolist()
pairs_cat = []

for check in checks:
  if len(check) == 2:
    pairs_cat.append(check)
  else:
    pairs_cat += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs_cat = pd.DataFrame(pairs_cat, columns=["category_1", "category_2"]) \
  .drop_duplicates(subset=["category_1", "category_2"])
pairs_cat = pairs_cat[pairs_cat["category_1"] != pairs_cat["category_2"]].reset_index(drop=True)

mtx = super_train[["receipt_id", "category_noun_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["category_noun_enc"])),
                shape=(super_train["receipt_id"].nunique(), super_train["category_noun_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_cat_res = []

for idx, row in tqdm(pairs_cat.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_cat_res.append((idx, (m == 2).sum()))

del mtx

pairs_cat_res = pd.DataFrame(pairs_cat_res, columns=[0, "both"]).set_index(0)
cat_receipts = super_train.groupby(["category_noun"])["receipt_id"].nunique().to_dict()

pairs_cat = pd.concat([pairs_cat, pairs_cat_res], axis=1)

del pairs_cat_res

pairs_cat["category_noun"] = le.inverse_transform(pairs_cat["category_1"])
pairs_cat["category_noun_candidate"] = le.inverse_transform(pairs_cat["category_2"])


pairs_cat["left"] = pairs_cat["category_noun"].map(cat_receipts.get)
pairs_cat["right"] = pairs_cat["category_noun_candidate"].map(cat_receipts.get)

pairs_cat["cat_both_left_frac"] = pairs_cat["both"] / pairs_cat["left"]
pairs_cat["cat_both_right_frac"] = pairs_cat["both"] / pairs_cat["right"]

pairs_cat = pairs_cat.drop(["category_1", "category_2", "left", "right"], axis=1)

pairs_cat = pd.concat([pairs_cat.copy().rename(columns={"category_noun": "category_noun_candidate", "category_noun_candidate": "category_noun"}), pairs_cat], axis=0) \
  .drop_duplicates(subset=["category_noun_candidate", "category_noun"]) \
  .reset_index(drop=True)

94966it [11:52, 133.35it/s]


In [88]:
pairs_cat.to_csv("/content/drive/MyDrive/1.0-super/pairs_categories.csv", index=False)

In [89]:
pairs_cat.sample(2)

Unnamed: 0,both,category_noun_candidate,category_noun,cat_both_left_frac,cat_both_right_frac
15462,6,вода,игрушка,0.000878,0.04918
69148,1,сода,пюре,0.003077,0.002033


In [90]:
super_valid_agg = super_valid_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

super_val_agg = super_val_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

In [91]:
assert super_valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert super_val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

In [104]:
vocab = cat_model_super.wv.key_to_index
mask = super_val_agg["category_noun"].isin(vocab) & super_val_agg["category_noun_candidate"].isin(vocab)

In [105]:
super_valid_agg["w2v_sim"] = super_valid_agg.progress_apply(lambda x:
                                          cosine_similarity(cat_model_super.wv.get_vector(x["category_noun"]).reshape(1, -1),
                                                            cat_model_super.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0],
                                          axis=1
                                          )

super_val_agg.loc[mask, "w2v_sim"] = super_val_agg[mask].progress_apply(lambda x:
                                          cosine_similarity(cat_model_super.wv.get_vector(x["category_noun"]).reshape(1, -1),
                                                            cat_model_super.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0],
                                          axis=1
                                          )

100%|██████████| 1682750/1682750 [11:33<00:00, 2427.24it/s]


In [106]:
super_valid_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)
super_val_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

In [107]:
super_valid_agg.to_csv("/content/drive/MyDrive/1.0-super/valid_agg_features.csv", index=False)
super_val_agg.to_csv("/content/drive/MyDrive/1.0-super/valid_agg_features.csv", index=False)

## Fit classifier

In [108]:
super_valid_agg["y"] = (super_valid_agg["target"] == super_valid_agg["candidate"]).astype(int)
super_val_agg["y"] = (super_val_agg["target"] == super_val_agg["candidate"]).astype(int)

In [109]:
non_features = ["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"]

### Select parameters

In [110]:
%%time
class_weights = compute_class_weight("balanced", classes=np.unique(super_valid_agg["y"]), y=super_valid_agg["y"])

param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
}

best_score = 0
best_params = {}

for n_estimators in tqdm(param_grid["n_estimators"], position=0, leave=False):
    for max_depth in tqdm(param_grid["max_depth"], position=1, leave=False):
        for min_samples_split in tqdm(param_grid["min_samples_split"], position=2, leave=False):
            for min_samples_leaf in tqdm(param_grid["min_samples_leaf"], position=3, leave=False):
                  rf = RandomForestClassifier(
                      n_estimators=n_estimators,
                      max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf,
                      class_weight=dict(enumerate(class_weights))
                  )
                  rf.fit(super_valid_agg.drop(non_features, axis=1).fillna(0),
                          super_valid_agg["y"])

                  y_pred = rf.predict(super_valid_agg.drop(non_features, axis=1).fillna(0))

                  f1 = f1_score(super_valid_agg["y"], y_pred)

                  if f1 > best_score:
                      best_score = f1
                      best_params = {
                          "n_estimators": n_estimators,
                          "max_depth": max_depth,
                          "min_samples_split": min_samples_split,
                          "min_samples_leaf": min_samples_leaf,
                      }

print("Лучшие параметры: ", best_params)
print("Лучший F1-скор: ", best_score)

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎      | 1/3 [00:53<01:47, 53.70s/it][A[A[A


 67%|██████▋   | 2/3 [01:37<00:47, 47.61s/it][A[A[A


100%|██████████| 3/3 [02:24<00:00, 47.39s/it][A[A[A


                                             [A[A[A

 50%|█████     | 1/2 [02:24<02:24, 144.18s/it][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎      | 1/3 [00:44<01:28, 44.14s/it][A[A[A


 67%|██████▋   | 2/3 [01:30<00:45, 45.62s/it][A[A[A


100%|██████████| 3/3 [02:13<00:00, 44.44s/it][A[A[A


                                             [A[A[A

100%|██████████| 2/2 [04:38<00:00, 138.10s/it][A[A

                                              [A[A
 50%|█████     | 1/2 [04:38<04:38, 278.04s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎     

Лучшие параметры:  {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 2}
Лучший F1-скор:  0.06025779789432255
CPU times: user 33min 48s, sys: 5.47 s, total: 33min 53s
Wall time: 34min 1s




### Fit final model

In [111]:
best_params["class_weight"] = dict(enumerate(class_weights))
model = RandomForestClassifier(**best_params)

In [112]:
%%time
model.fit(super_valid_agg.drop(non_features, axis=1).fillna(0),
          super_valid_agg["y"])

CPU times: user 2min 7s, sys: 181 ms, total: 2min 7s
Wall time: 2min 7s


In [113]:
joblib.dump({"model": model, "params": best_params},
            "/content/drive/MyDrive/1.0-super/classifier_model.joblib")

['/content/drive/MyDrive/1.0-super/classifier_model.joblib']

In [114]:
y_pred = model.predict(super_val_agg.drop(non_features, axis=1).fillna(0))
y_proba = model.predict_proba(super_val_agg.drop(non_features, axis=1).fillna(0))[:, 1]

In [115]:
print(classification_report(super_val_agg["y"], y_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97   1680681
           1       0.02      0.80      0.05      2599

    accuracy                           0.95   1683280
   macro avg       0.51      0.88      0.51   1683280
weighted avg       1.00      0.95      0.97   1683280



In [116]:
super_val_agg["proba"] = y_proba

In [117]:
result = super_val_agg.sort_values("proba", ascending=False) \
  .groupby(["receipt_id", "target"], sort=False)["candidate"].first() \
  .reset_index()

In [118]:
(result["candidate"] == result["target"]).mean() * 100

1.1243996609850269

## Generate final predictions

In [119]:
predict = pd.read_csv("/content/drive/MyDrive/vseros/supermarket_val.tsv", sep="\t")
predict["local_date"] = pd.to_datetime(predict["local_date"])

In [120]:
predict["receipt_id"].nunique()

53226

In [121]:
predict = predict.merge(super_target.rename(columns={"item_id": "target"}), on=["receipt_id"], how="left") \
  .drop("name", axis=1)

predict = predict.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

predict["receipt_cat"] = predict["item_id"].map(receipt_2idx.get)
predict["item_cat"] = predict["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

predict["preds"] = predict.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

predict = predict \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

predict = pd.concat([predict, pd.DataFrame(predict["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

100%|██████████| 53226/53226 [03:11<00:00, 277.86it/s]


In [122]:
predict = predict.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left") \
  .merge(super_val[["receipt_id", "local_date"]].drop_duplicates(), on=["receipt_id"], how="left")

In [123]:
predict = predict \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

In [125]:
mask = predict["category_noun"].isin(vocab) & predict["category_noun_candidate"].isin(vocab)
mask = mask * predict["candidate"].notna()

  mask = mask * predict["candidate"].notna()


In [126]:
predict.loc[mask, "w2v_sim"] = predict[mask].progress_apply(lambda x:
                                          cosine_similarity(cat_model_super.wv.get_vector(x["category_noun"]).reshape(1, -1),
                                                            cat_model_super.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0],
                                          axis=1
                                          )
predict.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

100%|██████████| 1682750/1682750 [11:16<00:00, 2485.86it/s]


In [127]:
predict["proba"] = (model.predict_proba(predict.drop([i for i in non_features if i != "y"], axis=1).fillna(0))[:, 1] * 100).round(2)

In [128]:
predict_result = predict.sort_values("proba", ascending=False) \
  .groupby(["receipt_id", "target"], sort=False).agg({"candidate": "first",
                                                      "proba": "first"}) \
  .reset_index()

In [129]:
predict_result["candidate"] = predict_result["candidate"].fillna(-1).astype(int)

In [130]:
predict_result.to_csv("/content/drive/MyDrive/1.0-super/predict.csv", index=False)