In [1]:
!pip install implicit



In [2]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from gensim.models.word2vec import Word2Vec
import joblib
import implicit
from itertools import combinations, product
import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix, save_npz, coo_array
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
tqdm.pandas()



In [4]:
cosmetic_train = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_train.tsv", sep="\t")
cosmetic_val = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_val.tsv", sep="\t")
cosmetic_target = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_val_target.tsv", sep="\t") \
    .drop_duplicates() \
    .reset_index(drop=True)

In [5]:
cosmetic_train["local_date"] = pd.to_datetime(cosmetic_train["local_date"])
cosmetic_val["local_date"] = pd.to_datetime(cosmetic_val["local_date"])

# Cosmetic

## Create third dataset

In [6]:
cosmetic_train.shape

(223908, 7)

In [7]:
# удаление чеков с 1м товаром
cosmetic_train = cosmetic_train[cosmetic_train.groupby(["receipt_id"])["item_id"].transform(lambda x: x.nunique() > 1)] \
  .reset_index(drop=True)

cosmetic_train.shape

(223906, 7)

In [8]:
unique_items = cosmetic_train["item_id"]
unique_items.nunique()

667

In [9]:
# сборка таргета из товаров, которые встречаются более 1го раза на датасете
hight_support_items = cosmetic_train["item_id"].value_counts().where(lambda x: x > 1).dropna().index

target = cosmetic_train[cosmetic_train["item_id"].isin(hight_support_items)].groupby(["receipt_id"])["item_id"].apply(lambda x: x.sample(1).iloc[0]) \
  .to_dict()

cosmetic_train["target"] = cosmetic_train.apply(lambda x: int(target.get(x["receipt_id"], 0) == x["item_id"]), axis=1)
del target

In [10]:
cosmetic_train["receipt_id"].nunique(),  cosmetic_train["target"].sum()

(68282, 68283)

In [11]:
cosmetic_train["has_unique_item"] = ~cosmetic_train["item_id"].isin(hight_support_items)
cosmetic_train["has_unique_item"] = cosmetic_train.groupby(["receipt_id"])["has_unique_item"].transform("max")

In [12]:
#  разделение трейна на 2 выборки по идентификатору чека
val_receipts = cosmetic_train["receipt_id"].drop_duplicates().sample(frac=.2, random_state=42).tolist()

cosmetic_valid = cosmetic_train[(cosmetic_train["receipt_id"].isin(val_receipts)) & (~cosmetic_train["has_unique_item"])]
cosmetic_train = cosmetic_train[(~cosmetic_train["receipt_id"].isin(val_receipts)) | (cosmetic_train["has_unique_item"])]

del val_receipts

In [13]:
# все товары, которые не встречаются на обучении (не в таргете)
lost_items = unique_items[~unique_items.isin(cosmetic_train.loc[cosmetic_train["target"] != 1, "item_id"])].drop_duplicates()
lost_items = cosmetic_valid.loc[cosmetic_valid["item_id"].isin(lost_items),
                                ["receipt_id", "item_id"]].groupby(["item_id"])["receipt_id"] \
  .apply(lambda x: x.sample(1).values) \
  .explode() \
  .tolist()

In [14]:
# перенос всех уникальных товаров с валидации на обучение
cosmetic_train = pd.concat([cosmetic_train,
                            cosmetic_valid[cosmetic_valid["receipt_id"].isin(lost_items)]], axis=0) \
  .reset_index(drop=True)

cosmetic_valid = cosmetic_valid[~cosmetic_valid["receipt_id"].isin(lost_items)].reset_index(drop=True)

In [15]:
cosmetic_valid.shape, cosmetic_train.shape

((44744, 9), (179162, 9))

In [16]:
assert cosmetic_train.loc[cosmetic_train["target"] != 1, "item_id"].nunique() >= (unique_items.nunique() * .95) # поправка максимум на 5% потерянных товаров
assert cosmetic_train["receipt_id"].isin(cosmetic_valid["receipt_id"]).sum() == 0
assert cosmetic_valid["receipt_id"].isin(cosmetic_train["receipt_id"]).sum() == 0

In [17]:
# выделение целевого товара в отдельный столбец
cosmetic_train.loc[cosmetic_train["target"] == 1, "target"] =\
  cosmetic_train.loc[cosmetic_train["target"] == 1, "item_id"]

cosmetic_train["target"] = cosmetic_train.groupby(["receipt_id"])["target"].transform("max")
cosmetic_train = cosmetic_train[cosmetic_train["item_id"] != cosmetic_train["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)



cosmetic_valid.loc[cosmetic_valid["target"] == 1, "target"] =\
  cosmetic_valid.loc[cosmetic_valid["target"] == 1, "item_id"]

cosmetic_valid["target"] = cosmetic_valid.groupby(["receipt_id"])["target"].transform("max")
cosmetic_valid = cosmetic_valid[cosmetic_valid["item_id"] != cosmetic_valid["target"]] \
  .reset_index(drop=True) \
  .drop(["has_unique_item"], axis=1)

In [18]:
cosmetic_train.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,local_date,name,price,quantity,target
60785,356645110622299,10864974748,200404,2022-03-10 15:12:03,"Пенка для умывания с авокадо NEW, 150мл",890.0,1,200144
75899,356645110652080,15458089928,200551,2023-06-24 20:55:59,"Скраб кофе, 250 г (товар)",644.0,1,200366
18613,352398085964585,12646699457,200171,2022-09-08 14:12:00,Коробка подарочная Letique Blooming Glades,0.0,1,200378
10921,356645110237411,16088529840,200543,2023-08-20 18:51:45,"Скраб какао, 250 г (товар)",297.0,1,200668
78183,352398090018898,12030875490,200353,2022-07-09 20:52:04,"Обертывание для тела VINOTHERAPY, 200 мл",1490.0,1,200192


## Prepare matrix

In [19]:
# агрегация чеков
cosmetic_train_mtx = cosmetic_train.groupby(["receipt_id", "item_id"])["quantity"].min() \
    .reset_index()
cosmetic_train_mtx["quantity"] = 1
cosmetic_train_mtx = cosmetic_train_mtx.drop_duplicates() \
    .reset_index(drop=True)

receipt_items = cosmetic_train_mtx.groupby(["receipt_id"])["item_id"].apply(lambda x: tuple(set(x))).to_dict()
cosmetic_train_mtx["items"] = cosmetic_train_mtx["receipt_id"].map(receipt_items.get)

# удаление чеков из 1го товара и удаление одинаковых чеков
cosmetic_train_mtx = cosmetic_train_mtx[cosmetic_train_mtx["items"].apply(len) > 1] \
  .drop_duplicates(subset=["items", "item_id"]) \
  .reset_index(drop=True)

# преобразование типов данных
cosmetic_train_mtx["receipt_cat"] = cosmetic_train_mtx["receipt_id"].astype("category").cat.codes
cosmetic_train_mtx["item_cat"] = cosmetic_train_mtx["item_id"].astype("category").cat.codes

In [20]:
# словари для обращения к эмбедам ALS
receipt_2idx = cosmetic_train_mtx.drop_duplicates(subset=["receipt_cat"]) \
  .set_index("items")["receipt_cat"].to_dict()

item_2idx = cosmetic_train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_id")["item_cat"].to_dict()
idx_2item = cosmetic_train_mtx.drop_duplicates(subset=["item_id"]) \
  .set_index("item_cat")["item_id"].to_dict()

In [21]:
# построение матрицы чек-товар
sparse_receipt_item = csr_matrix((cosmetic_train_mtx["quantity"].astype(float),
                                 (cosmetic_train_mtx["receipt_cat"], cosmetic_train_mtx["item_cat"])))

In [22]:
save_npz("/content/drive/MyDrive/1.0-cosmetic/sparse_receipt_item.npz", sparse_receipt_item)

with open("/content/drive/MyDrive/1.0-cosmetic/receipt_2idx.pkl", "wb") as f:
    pickle.dump(receipt_2idx, f)

with open("/content/drive/MyDrive/1.0-cosmetic/item_2idx.pkl", "wb") as f:
    pickle.dump(item_2idx, f)

with open("/content/drive/MyDrive/1.0-cosmetic/idx_2item.pkl", "wb") as f:
    pickle.dump(idx_2item, f)

## Fit and tune ALS

In [23]:
!pip install scikit-optimize



In [24]:
def recommend_to_receipt(receipt_cat, sparse_user_item,
                         receipt_vecs, item_vecs, idx_2item, num_items=5):

    receipt_interactions = sparse_user_item[receipt_cat, :].toarray()

    receipt_interactions = receipt_interactions.reshape(-1) + 1
    receipt_interactions[receipt_interactions > 1] = 0

    rec_vector = receipt_vecs[receipt_cat, :].dot(item_vecs.T).toarray()

    recommend_vector = (receipt_interactions * rec_vector)[0]

    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    result = []

    for idx in set(item_idx):
      result.append((idx_2item[idx], recommend_vector[idx]))

    return result

In [25]:
def recommend_to_items(items_cat, item_norms, item_vecs, idx_2item, num_items=5):

    scores = item_vecs.dot(item_vecs[items_cat].T).T  / item_norms.reshape(1, -1)
    top_idx = np.argpartition(scores, -num_items, axis=1)[:, -(num_items+1):]
    scores = np.array([scores[idx, row] for idx, row in enumerate(top_idx)])
    scores = scores / item_norms[items_cat].reshape(-1, 1)
    result = []
    for i in sorted(zip(top_idx.reshape(-1), scores.reshape(-1)), key=lambda x: -x[1]):
      if i[0] not in items_cat and idx_2item[i[0]] not in [j[0] for j in result]:
        result.append((idx_2item[i[0]], i[1]))

    return result[:num_items]

In [26]:
cosmetic_valid_agg = cosmetic_valid.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

cosmetic_valid_agg["receipt_cat"] = cosmetic_valid_agg["item_id"].map(receipt_2idx.get)
cosmetic_valid_agg["item_cat"] = cosmetic_valid_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

In [27]:
cosmetic_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat
9705,356645110652080,12192977749,"(200099, 200404)",200626,2576.0,"[97, 393]"
2466,352398088914876,13679690449,"(200200, 200161, 200483, 200511)",200598,,"[197, 158, 470, 497]"
13095,356645110824226,13681730889,"(200098,)",200631,,[96]
2642,352398089967352,9966541833,"(200645, 200252, 200133, 200574)",200582,,"[625, 247, 130, 558]"
12602,356645110800358,13624954772,"(200161, 200430)",200429,,"[158, 418]"


### Select parameters

In [28]:
num_recs = 10
alpha_val = 15

param_grid = {
  "factors": [10, 20, 50],
  "regularization": [0.01, 0.1],
  "iterations": [10, 20]
}

all_param_combinations = list(product(*param_grid.values()))
result = []

for params in tqdm(all_param_combinations):
  factors, regularization, iterations = params
  params = {
      "factors": factors,
      "regularization": regularization,
      "iterations": iterations,
      "calculate_training_loss": False,
      "random_state": 42
  }
  model = implicit.als.AlternatingLeastSquares(**params)
  model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=False)

  receipt_vecs = model.user_factors
  item_vecs = model.item_factors

  receipt_vecs_csr = csr_matrix(receipt_vecs)
  item_vecs_csr = csr_matrix(item_vecs)

  item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

  cosmetic_valid_agg_copy = cosmetic_valid_agg.copy()
  cosmetic_valid_agg_copy["preds"] = cosmetic_valid_agg_copy.apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

  hit = ((cosmetic_valid_agg_copy.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100)
  result.append((hit, params))

100%|██████████| 12/12 [04:53<00:00, 24.48s/it]


In [29]:
result = pd.DataFrame(result, columns=["hit@10", "params"]) \
  .sort_values("hit@10", ascending=False)
result

Unnamed: 0,hit@10,params
1,28.906479,"{'factors': 10, 'regularization': 0.01, 'itera..."
5,28.840516,"{'factors': 20, 'regularization': 0.01, 'itera..."
7,28.818528,"{'factors': 20, 'regularization': 0.1, 'iterat..."
3,28.730578,"{'factors': 10, 'regularization': 0.1, 'iterat..."
0,28.210202,"{'factors': 10, 'regularization': 0.01, 'itera..."
2,28.078276,"{'factors': 10, 'regularization': 0.1, 'iterat..."
4,27.887716,"{'factors': 20, 'regularization': 0.01, 'itera..."
6,27.858399,"{'factors': 20, 'regularization': 0.1, 'iterat..."
11,26.729698,"{'factors': 50, 'regularization': 0.1, 'iterat..."
9,26.502492,"{'factors': 50, 'regularization': 0.01, 'itera..."


### Fit final model

In [30]:
best_params = result["params"].iloc[0].copy()
best_params["calculate_training_loss"] = True
print(best_params)

{'factors': 10, 'regularization': 0.01, 'iterations': 20, 'calculate_training_loss': True, 'random_state': 42}


In [31]:
model = implicit.als.AlternatingLeastSquares(**best_params)
model.fit((sparse_receipt_item * alpha_val).astype("double"), show_progress=True)

  0%|          | 0/20 [00:00<?, ?it/s]

In [32]:
joblib.dump({"model": model, "params": best_params, "alpha_val": alpha_val},
            "/content/drive/MyDrive/1.0-cosmetic/candidate_model.joblib")

['/content/drive/MyDrive/1.0-cosmetic/candidate_model.joblib']

In [33]:
# извлечение эмбедов из ALS
receipt_vecs = model.user_factors
item_vecs = model.item_factors

receipt_vecs_csr = csr_matrix(receipt_vecs)
item_vecs_csr = csr_matrix(item_vecs)

item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

### Generate candidates

In [34]:
cosmetic_valid_agg["preds"] = cosmetic_valid_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

cosmetic_valid_agg = cosmetic_valid_agg[cosmetic_valid_agg["preds"].apply(len) > 0].reset_index(drop=True)

100%|██████████| 13644/13644 [00:10<00:00, 1303.25it/s]


In [35]:
print("accuracy: ", ((cosmetic_valid_agg["preds"].apply(lambda x: x[0][0]) == cosmetic_valid_agg["target"]).mean() * 100))
print("hit@10: ", ((cosmetic_valid_agg.apply(lambda x: x["target"] in [i[0] for i in x["preds"]], axis=1)).mean() * 100))

accuracy:  9.315450014658458
hit@10:  28.90647903840516


In [36]:
cosmetic_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,receipt_cat,item_cat,preds
12265,356645110747963,14431084061,"(200391,)",200154,,[380],"[(200589, 0.94088095), (200246, 0.8881504), (2..."
1281,352398085964585,14267069584,"(200183, 200220, 200221, 200062, 200063)",200067,,"[180, 216, 217, 62, 63]","[(200058, 0.9986763), (200276, 0.99287474), (2..."
1151,352398085964585,13715920260,"(200220, 200222)",200221,4528.0,"[216, 218]","[(200559, 0.3674946129322052), (200067, 0.3540..."
11209,356645110714724,13696530507,"(200161, 200547, 200259, 200009, 200332, 20014...",200535,,"[158, 532, 254, 9, 325, 141, 298, 207, 51, 175]","[(200485, 0.96618605), (200073, 0.95139116), (..."
1779,352398088070463,12081396410,"(200286,)",200275,,[280],"[(200606, 0.9677569), (200623, 0.94389176), (2..."


In [37]:
cosmetic_valid_agg = cosmetic_valid_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

cosmetic_valid_agg = pd.concat([cosmetic_valid_agg,
                                pd.DataFrame(cosmetic_valid_agg["preds"].tolist(), columns=["candidate", "als_score"])],
                              axis=1) \
  .drop(["preds"], axis=1)

In [38]:
cosmetic_valid_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
293788,356645110800358,16133329740,200008,200051,200301,0.97705
8751,352398083991747,14335990409,200232,200051,200525,0.950369
90343,352398090018898,10655416563,200610,200002,200372,0.822582
202084,356645110583194,15480832223,200345,200651,200651,0.670774
67344,352398089986709,10482281183,200275,200345,200292,0.782745


In [39]:
cosmetic_val_agg = cosmetic_val.merge(cosmetic_target.rename(columns={"item_id": "target"}), on=["receipt_id"], how="left") \
  .drop("name", axis=1)

cosmetic_val_agg = cosmetic_val_agg.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

cosmetic_val_agg["receipt_cat"] = cosmetic_val_agg["item_id"].map(receipt_2idx.get)
cosmetic_val_agg["item_cat"] = cosmetic_val_agg["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

cosmetic_val_agg["preds"] = cosmetic_val_agg.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

cosmetic_val_agg = cosmetic_val_agg \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

cosmetic_val_agg = pd.concat([cosmetic_val_agg, pd.DataFrame(cosmetic_val_agg["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

100%|██████████| 22761/22761 [00:10<00:00, 2093.51it/s]


In [40]:
cosmetic_val_agg.sample(5)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
458380,356645110747963,13309500792,200009,200268,200428.0,0.935756
318648,356645110489244,14558478760,200049,200530,200051.0,0.616613
153340,352398090018898,10821576201,200478,200193,200486.0,0.977588
307927,356645110489244,12998889328,200605,200620,200234.0,0.646654
390309,356645110691534,9849209063,200588,200514,200345.0,0.96071


In [41]:
cosmetic_val_agg = cosmetic_val_agg.dropna(subset=["candidate"]) \
  .astype({"candidate": int})

In [42]:
cosmetic_train.to_csv("/content/drive/MyDrive/1.0-cosmetic/train.csv", index=False)
cosmetic_valid_agg.to_csv("/content/drive/MyDrive/1.0-cosmetic/valid_agg_raw.csv", index=False)
cosmetic_val_agg.to_csv("/content/drive/MyDrive/1.0-cosmetic/val_agg_raw.csv", index=False)

In [43]:
assert cosmetic_valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert cosmetic_val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

## Calculate pair features

### By products

In [44]:
le = LabelEncoder()
cosmetic_train["item_id_enc"] = le.fit_transform(cosmetic_train["item_id"])

checks = cosmetic_train.groupby(["receipt_id"])["item_id_enc"].apply(lambda x: list(set(x))).tolist()
pairs = []

for check in checks:
  if len(check) == 2:
    pairs.append(check)
  else:
    pairs += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs = pd.DataFrame(pairs, columns=["item_1", "item_2"]) \
  .drop_duplicates(subset=["item_1", "item_2"])
pairs = pairs[pairs["item_1"] != pairs["item_2"]].reset_index(drop=True)

mtx = cosmetic_train[["receipt_id", "item_id_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["item_id_enc"])),
                shape=(cosmetic_train["receipt_id"].nunique(), cosmetic_train["item_id_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_res = []

for idx, row in tqdm(pairs.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_res.append((idx, (m == 2).sum()))

del mtx

pairs_res = pd.DataFrame(pairs_res, columns=[0, "both"]).set_index(0)
item_receipts = cosmetic_train.groupby(["item_id"])["receipt_id"].nunique().to_dict()

pairs = pd.concat([pairs, pairs_res], axis=1)

del pairs_res

pairs["item_id"] = le.inverse_transform(pairs["item_1"])
pairs["candidate"] = le.inverse_transform(pairs["item_2"])

unique_receipts = cosmetic_train["receipt_id"].nunique()

pairs["left"] = pairs["item_id"].map(item_receipts.get)
pairs["right"] = pairs["candidate"].map(item_receipts.get)

pairs["left_frac"] = pairs["left"] / unique_receipts
pairs["right_frac"] = pairs["right"] / unique_receipts
pairs["both_left_frac"] = pairs["both"] / pairs["left"]
pairs["both_right_frac"] = pairs["both"] / pairs["right"]

pairs = pairs.drop(["item_1", "item_2"], axis=1)

pairs = pd.concat([pairs.copy().rename(columns={"item_id": "candidate", "candidate": "item_id"}), pairs], axis=0) \
  .drop_duplicates(subset=["candidate", "item_id"]) \
  .reset_index(drop=True)

32710it [01:17, 423.36it/s]


In [45]:
pairs.to_csv("/content/drive/MyDrive/1.0-cosmetic/pairs.csv", index=False)

In [46]:
pairs.sample(5)

Unnamed: 0,both,candidate,item_id,left,right,left_frac,right_frac,both_left_frac,both_right_frac
31443,2,200195,200570,151,694,0.002764,0.012702,0.013245,0.002882
16290,3,200200,200355,805,324,0.014733,0.00593,0.003727,0.009259
23078,3,200163,200253,195,332,0.003569,0.006076,0.015385,0.009036
46831,4,200564,200259,385,222,0.007046,0.004063,0.01039,0.018018
53608,3,200305,200332,1000,237,0.018302,0.004338,0.003,0.012658


In [47]:
assert pairs.duplicated(subset=["item_id", "candidate"]).sum() == 0

In [48]:
quantity_total_hist_device = cosmetic_train.groupby(["device_id", "item_id"])["quantity"].sum().rename("quantity_total_hist_device").reset_index()
quantity_total_hist = cosmetic_train.groupby(["item_id"])["quantity"].sum().rename("quantity_total_hist").reset_index()

quantity_total_hist_device.to_csv("/content/drive/MyDrive/1.0-cosmetic/quantity_total_hist_device.csv", index=False)
quantity_total_hist.to_csv("/content/drive/MyDrive/1.0-cosmetic/quantity_total_hist.csv", index=False)

In [49]:
cosmetic_valid_agg.sample(1)

Unnamed: 0,device_id,receipt_id,item_id,target,candidate,als_score
94825,352398090018898,11517950503,200433,200002,200136,0.942686


In [50]:
cosmetic_valid_agg = cosmetic_valid_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left") \
  .merge(cosmetic_valid[["receipt_id", "local_date"]].drop_duplicates(), on=["receipt_id"], how="left")

cosmetic_val_agg = cosmetic_val_agg.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left") \
  .merge(cosmetic_val[["receipt_id", "local_date"]].drop_duplicates(), on=["receipt_id"], how="left")

In [51]:
assert cosmetic_valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert cosmetic_val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

### By product type

In [52]:
cat_model_cosmetic = Word2Vec.load("/content/drive/MyDrive/vseros/cosmetic_word2vec.model")

item2category = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_item_id_categ_map.csv", sep=";")
item2category.sample(2)

Unnamed: 0,item_id,category_noun
444,200082,гель
498,200019,криомасло


In [53]:
cosmetic_train = cosmetic_train.merge(item2category, on=["item_id"], how="left")

In [54]:
le = LabelEncoder()
cosmetic_train["category_noun_enc"] = le.fit_transform(cosmetic_train["category_noun"])

checks = cosmetic_train.groupby(["receipt_id"])["category_noun_enc"].apply(lambda x: list(set(x))).tolist()
pairs_cat = []

for check in checks:
  if len(check) == 2:
    pairs_cat.append(check)
  else:
    pairs_cat += [list(set(sublist)) for sublist in combinations(check, 2)]

del checks

pairs_cat = pd.DataFrame(pairs_cat, columns=["category_1", "category_2"]) \
  .drop_duplicates(subset=["category_1", "category_2"])
pairs_cat = pairs_cat[pairs_cat["category_1"] != pairs_cat["category_2"]].reset_index(drop=True)

mtx = cosmetic_train[["receipt_id", "category_noun_enc"]].drop_duplicates()
mtx = coo_array((np.ones(mtx.shape[0]),
                 (mtx["receipt_id"].astype("category").cat.codes, mtx["category_noun_enc"])),
                shape=(cosmetic_train["receipt_id"].nunique(), cosmetic_train["category_noun_enc"].nunique())) \
                .tocsr().astype(np.int8).toarray()

pairs_cat_res = []

for idx, row in tqdm(pairs_cat.iterrows()):
    m = mtx[:, row.values].sum(axis=1)
    pairs_cat_res.append((idx, (m == 2).sum()))

del mtx

pairs_cat_res = pd.DataFrame(pairs_cat_res, columns=[0, "both"]).set_index(0)
cat_receipts = cosmetic_train.groupby(["category_noun"])["receipt_id"].nunique().to_dict()

pairs_cat = pd.concat([pairs_cat, pairs_cat_res], axis=1)

del pairs_cat_res

pairs_cat["category_noun"] = le.inverse_transform(pairs_cat["category_1"])
pairs_cat["category_noun_candidate"] = le.inverse_transform(pairs_cat["category_2"])


pairs_cat["left"] = pairs_cat["category_noun"].map(cat_receipts.get)
pairs_cat["right"] = pairs_cat["category_noun_candidate"].map(cat_receipts.get)

pairs_cat["cat_both_left_frac"] = pairs_cat["both"] / pairs_cat["left"]
pairs_cat["cat_both_right_frac"] = pairs_cat["both"] / pairs_cat["right"]

pairs_cat = pairs_cat.drop(["category_1", "category_2", "left", "right"], axis=1)

pairs_cat = pd.concat([pairs_cat.copy().rename(columns={"category_noun": "category_noun_candidate", "category_noun_candidate": "category_noun"}), pairs_cat], axis=0) \
  .drop_duplicates(subset=["category_noun_candidate", "category_noun"]) \
  .reset_index(drop=True)

2408it [00:02, 1181.39it/s]


In [55]:
pairs_cat.to_csv("/content/drive/MyDrive/1.0-cosmetic/pairs_categories.csv", index=False)

In [56]:
pairs_cat.sample(2)

Unnamed: 0,both,category_noun_candidate,category_noun,cat_both_left_frac,cat_both_right_frac
1812,61,коробка,карандаш,0.00998,0.079118
3856,1,тушь,саше,0.003436,0.001961


In [57]:
cosmetic_valid_agg = cosmetic_valid_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

cosmetic_val_agg = cosmetic_val_agg \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

In [58]:
cosmetic_valid_agg.shape

(310909, 21)

In [59]:
assert cosmetic_valid_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0
assert cosmetic_val_agg.duplicated(subset=["receipt_id", "item_id", "candidate"]).sum() == 0

In [60]:
cosmetic_valid_agg["w2v_sim"] = cosmetic_valid_agg.progress_apply(lambda x:
                                          cosine_similarity(cat_model_cosmetic.wv.get_vector(x["category_noun"]).reshape(1, -1),
                                                            cat_model_cosmetic.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0],
                                          axis=1
                                          )

cosmetic_val_agg["w2v_sim"] = cosmetic_val_agg.progress_apply(lambda x:
                                          cosine_similarity(cat_model_cosmetic.wv.get_vector(x["category_noun"]).reshape(1, -1),
                                                            cat_model_cosmetic.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0],
                                          axis=1
                                          )

100%|██████████| 310909/310909 [02:07<00:00, 2445.39it/s]
100%|██████████| 516349/516349 [03:23<00:00, 2535.62it/s]


In [61]:
cosmetic_valid_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)
cosmetic_val_agg.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

In [62]:
cosmetic_valid_agg.to_csv("/content/drive/MyDrive/1.0-cosmetic/valid_agg_features.csv", index=False)
cosmetic_val_agg.to_csv("/content/drive/MyDrive/1.0-cosmetic/val_agg_features.csv", index=False)

## Fit classifier

In [63]:
cosmetic_valid_agg["y"] = (cosmetic_valid_agg["target"] == cosmetic_valid_agg["candidate"]).astype(int)
cosmetic_val_agg["y"] = (cosmetic_val_agg["target"] == cosmetic_val_agg["candidate"]).astype(int)

In [64]:
non_features = ["device_id", "receipt_id", "item_id", "target", "candidate", "y", "local_date"]

### Select parameters

In [65]:
class_weights = compute_class_weight("balanced", classes=np.unique(cosmetic_valid_agg["y"]), y=cosmetic_valid_agg["y"])

param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
}

best_score = 0
best_params = {}

for n_estimators in tqdm(param_grid["n_estimators"], position=0, leave=False):
    for max_depth in tqdm(param_grid["max_depth"], position=1, leave=False):
        for min_samples_split in tqdm(param_grid["min_samples_split"], position=2, leave=False):
            for min_samples_leaf in tqdm(param_grid["min_samples_leaf"], position=3, leave=False):
                  rf = RandomForestClassifier(
                      n_estimators=n_estimators,
                      max_depth=max_depth,
                      min_samples_split=min_samples_split,
                      min_samples_leaf=min_samples_leaf,
                      class_weight=dict(enumerate(class_weights))
                  )
                  rf.fit(cosmetic_valid_agg.drop(non_features, axis=1).fillna(0),
                         cosmetic_valid_agg["y"])

                  y_pred = rf.predict(cosmetic_val_agg.drop(non_features, axis=1).fillna(0))

                  f1 = f1_score(cosmetic_val_agg["y"], y_pred)

                  if f1 > best_score:
                      best_score = f1
                      best_params = {
                          "n_estimators": n_estimators,
                          "max_depth": max_depth,
                          "min_samples_split": min_samples_split,
                          "min_samples_leaf": min_samples_leaf,
                      }

print("Лучшие параметры: ", best_params)
print("Лучший F1-скор: ", best_score)

  0%|          | 0/2 [00:00<?, ?it/s]
  0%|          | 0/3 [00:00<?, ?it/s][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎      | 1/3 [00:42<01:24, 42.49s/it][A[A[A


 67%|██████▋   | 2/3 [01:22<00:40, 40.88s/it][A[A[A


100%|██████████| 3/3 [01:59<00:00, 39.38s/it][A[A[A


                                             [A[A[A

 50%|█████     | 1/2 [01:59<01:59, 119.85s/it][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎      | 1/3 [00:42<01:24, 42.22s/it][A[A[A


 67%|██████▋   | 2/3 [01:21<00:40, 40.78s/it][A[A[A


100%|██████████| 3/3 [02:01<00:00, 39.97s/it][A[A[A


                                             [A[A[A

100%|██████████| 2/2 [04:00<00:00, 120.55s/it][A[A

                                              [A[A
 33%|███▎      | 1/3 [04:00<08:01, 240.90s/it][A

  0%|          | 0/2 [00:00<?, ?it/s][A[A


  0%|          | 0/3 [00:00<?, ?it/s][A[A[A


 33%|███▎     

Лучшие параметры:  {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 4}
Лучший F1-скор:  0.2289002557544757




### Fit final model

In [66]:
best_params["class_weight"] = dict(enumerate(class_weights))
model = RandomForestClassifier(**best_params)

In [67]:
%%time
model.fit(cosmetic_valid_agg.drop(non_features, axis=1).fillna(0),
          cosmetic_valid_agg["y"])

CPU times: user 32.4 s, sys: 28.3 ms, total: 32.4 s
Wall time: 32.6 s


In [68]:
joblib.dump({"model": model, "params": best_params},
            "/content/drive/MyDrive/1.0-cosmetic/classifier_model.joblib")

['/content/drive/MyDrive/1.0-cosmetic/classifier_model.joblib']

In [69]:
y_pred = model.predict(cosmetic_val_agg.drop(non_features, axis=1).fillna(0))
y_proba = model.predict_proba(cosmetic_val_agg.drop(non_features, axis=1).fillna(0))[:, 1]

In [70]:
print(classification_report(cosmetic_val_agg["y"], y_pred))

              precision    recall  f1-score   support

           0       0.98      0.95      0.96    500837
           1       0.17      0.33      0.23     15512

    accuracy                           0.93    516349
   macro avg       0.58      0.64      0.60    516349
weighted avg       0.95      0.93      0.94    516349



In [71]:
cosmetic_val_agg["proba"] = y_proba

In [72]:
result = cosmetic_val_agg.sort_values("proba", ascending=False) \
  .groupby(["receipt_id", "target"], sort=False)["candidate"].first() \
  .reset_index()

In [73]:
(result["candidate"] == result["target"]).mean() * 100

13.51054481546573

## Generate final predictions

In [74]:
predict = pd.read_csv("/content/drive/MyDrive/vseros/cosmetic_val.tsv", sep="\t")
predict["local_date"] = pd.to_datetime(predict["local_date"])

In [75]:
predict["receipt_id"].nunique()

22761

In [76]:
predict = predict.merge(cosmetic_target.rename(columns={"item_id": "target"}), on=["receipt_id"], how="left") \
  .drop("name", axis=1)

predict = predict.groupby(["device_id", "receipt_id"]).agg({"item_id": lambda x: tuple(set(x)), "target": "max"}) \
  .reset_index()

predict["receipt_cat"] = predict["item_id"].map(receipt_2idx.get)
predict["item_cat"] = predict["item_id"].apply(lambda x: [item_2idx.get(i) for i in x if i in item_2idx])

predict["preds"] = predict.progress_apply(
    lambda x:
    recommend_to_receipt(int(x["receipt_cat"]), sparse_receipt_item, receipt_vecs_csr, item_vecs_csr, idx_2item, num_recs)
    if not np.isnan(x["receipt_cat"])
    else recommend_to_items(x["item_cat"], item_norms, item_vecs, idx_2item, num_recs), axis=1)

predict = predict \
  .drop(["receipt_cat", "item_cat"], axis=1) \
  .explode("preds") \
  .explode("item_id") \
  .reset_index(drop=True)

predict = pd.concat([predict, pd.DataFrame(predict["preds"].tolist(), columns=["candidate", "als_score"])], axis=1) \
  .drop(["preds"], axis=1)

100%|██████████| 22761/22761 [00:05<00:00, 3834.42it/s]


In [77]:
predict = predict.merge(pairs, on=["item_id", "candidate"], how="left") \
  .merge(quantity_total_hist_device.rename(columns={"item_id": "candidate"}), on=["device_id", "candidate"], how="left") \
  .merge(quantity_total_hist.rename(columns={"item_id": "candidate"}), on=["candidate"], how="left") \
  .merge(cosmetic_val[["receipt_id", "local_date"]].drop_duplicates(), on=["receipt_id"], how="left")

In [78]:
predict = predict \
  .merge(item2category, on=["item_id"], how="left") \
  .merge(item2category.rename(columns={"item_id": "candidate", "category_noun": "category_noun_candidate"}), on=["candidate"], how="left") \
  .merge(pairs_cat, on=["category_noun", "category_noun_candidate"], how="left")

In [79]:
predict.loc[predict["candidate"].notna(), "w2v_sim"] = predict[predict["candidate"].notna()].progress_apply(lambda x:
                                          cosine_similarity(cat_model_cosmetic.wv.get_vector(x["category_noun"]).reshape(1, -1),
                                                            cat_model_cosmetic.wv.get_vector(x["category_noun_candidate"]).reshape(1, -1))[0, 0],
                                          axis=1
                                          )
predict.drop(["category_noun", "category_noun_candidate"], axis=1, inplace=True)

100%|██████████| 516349/516349 [03:24<00:00, 2528.68it/s]


In [80]:
predict["proba"] = (model.predict_proba(predict.drop([i for i in non_features if i != "y"], axis=1).fillna(0))[:, 1] * 100).round(2)

In [81]:
predict_result = predict.sort_values("proba", ascending=False) \
  .groupby(["receipt_id", "target"], sort=False).agg({"candidate": "first",
                                                      "proba": "first"}) \
  .reset_index()

In [82]:
predict_result["candidate"] = predict_result["candidate"].fillna(-1).astype(int)

In [83]:
predict_result.to_csv("/content/drive/MyDrive/1.0-cosmetic/predict.csv", index=False)