In [1]:
!pip install mlxtend



In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
import pandas as pd
import re
import seaborn as sns

In [3]:
# Every table is a tab-separated file in the same Kaggle input directory.
cosmetic_train, cosmetic_val = [
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "/kaggle/input/vseros-data/cosmetic_train.tsv",
        "/kaggle/input/vseros-data/cosmetic_val.tsv",
    )
]
# Validation targets: exact duplicate rows are removed and the index renumbered.
cosmetic_target = (
    pd.read_csv("/kaggle/input/vseros-data/cosmetic_val_target.tsv", sep="\t")
    .drop_duplicates()
    .reset_index(drop=True)
)

super_train, super_val = [
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "/kaggle/input/vseros-data/supermarket_train.tsv",
        "/kaggle/input/vseros-data/supermarket_val.tsv",
    )
]
super_target = (
    pd.read_csv("/kaggle/input/vseros-data/supermarket_val_target.tsv", sep="\t")
    .drop_duplicates()
    .reset_index(drop=True)
)

# Cosmetic

In [4]:
# Collapse every training receipt to the list of its distinct item ids,
# then keep only receipts that contain more than one distinct item.
cosmetic_train_checks = (
    cosmetic_train
    .groupby(["receipt_id"])["item_id"]
    .apply(lambda items: list(set(items.tolist())))
    .loc[lambda baskets: baskets.apply(len) > 1]
    .tolist()
)

In [5]:
%%time
cosmetic_te = TransactionEncoder()
te_ary = cosmetic_te.fit(cosmetic_train_checks).transform(cosmetic_train_checks)
df_cosmetic = pd.DataFrame(te_ary, columns=cosmetic_te.columns_)

CPU times: user 88.2 ms, sys: 20.9 ms, total: 109 ms
Wall time: 108 ms


In [6]:
%%time
cosmetic_itemsets = apriori(df_cosmetic, min_support=1e-3, use_colnames=True, max_len=2)
cosmetic_itemsets = cosmetic_itemsets[cosmetic_itemsets["itemsets"].apply(len) > 1]
cosmetic_itemsets["first_item"] = cosmetic_itemsets["itemsets"].apply(lambda x: list(x)[0])
cosmetic_itemsets["second_item"] = cosmetic_itemsets["itemsets"].apply(lambda x: list(x)[1])
cosmetic_itemsets.drop("itemsets", axis=1, inplace=True)

CPU times: user 15.8 s, sys: 2.31 s, total: 18.1 s
Wall time: 18.1 s


In [52]:
# For every validation row, look up the pairs that contain its item and expose
# the other member of the pair as `support_item`. A pair stores its members in
# arbitrary order, so the lookup is done once per orientation and stacked.
cosmetic_res = pd.concat(
    [
        cosmetic_val.merge(
            cosmetic_itemsets.rename(columns={anchor_col: "item_id",
                                              partner_col: "support_item"}),
            on=["item_id"], how="left"
        )[["receipt_id", "support", "support_item"]]
        for anchor_col, partner_col in (
            ("first_item", "second_item"),
            ("second_item", "first_item"),
        )
    ],
    axis=0,
)

In [53]:
# Top-1 recommendation: per receipt keep the candidate with the highest support.
cosmetic_best = (
    cosmetic_res
    .sort_values("support", ascending=False)
    .drop_duplicates(subset=["receipt_id"])
    .drop(columns="support")
)
# Attach the recommendation to every target row; receipts without any
# candidate get the placeholder -1.
# NOTE: this rebinds `cosmetic_target`, so the cell is not meant to be re-run
# without reloading the target frame first.
cosmetic_target = (
    cosmetic_best
    .merge(cosmetic_target, on=["receipt_id"], how="right")
    .fillna({"support_item": -1})
    .astype({"support_item": int})
)

In [54]:
# Row-wise check: does the single recommended item equal the target item_id?
cosmetic_top1_hit = cosmetic_target["support_item"] == cosmetic_target["item_id"]
cosmetic_target["accuracy_1"] = cosmetic_top1_hit
print("accuracy valid: %.2f" % (100 * cosmetic_target["accuracy_1"].mean()))

accuracy valid: 12.75


In [56]:
# Top-5 recommendations: per receipt keep the five distinct candidates with the
# highest support. Implemented with drop_duplicates + groupby().head() instead
# of groupby().apply(): same rows, no per-group Python call, and no reliance on
# the grouping column being passed into the applied function.
cosmetic_top5 = (
    cosmetic_res
    .sort_values("support", ascending=False)
    # After sorting, the first occurrence of a (receipt, candidate) pair is the
    # one with its highest support.
    .drop_duplicates(subset=["receipt_id", "support_item"])
    .groupby("receipt_id")
    .head(5)
    .reset_index(drop=True)
    .drop(columns="support")
)
# `cosmetic_target` is expected to already hold the top-1 `support_item`
# column, so the merge suffixes the new candidates as `support_item_x`.
# Receipts without any candidate get the placeholder -1.
cosmetic_target = (
    cosmetic_top5
    .merge(cosmetic_target, on=["receipt_id"], how="right")
    .fillna({"support_item_x": -1})
    .astype({"support_item_x": int})
)

In [74]:
# Flag each (receipt, candidate) row whose candidate equals the target item_id.
cosmetic_top5_hit = cosmetic_target["support_item_x"] == cosmetic_target["item_id"]
cosmetic_target["accuracy_2"] = cosmetic_top5_hit
# A receipt counts as a hit when any of its rows matched.
cosmetic_hit_by_receipt = cosmetic_target.groupby(["receipt_id"])["accuracy_2"].max()
print("hit@5 valid: %.2f" % (cosmetic_hit_by_receipt.mean() * 100))

hit@5 valid: 25.29


# Supermarket

In [10]:
# Collapse every training receipt to the list of its distinct item ids,
# then keep only receipts that contain more than one distinct item.
super_train_checks = (
    super_train
    .groupby(["receipt_id"])["item_id"]
    .apply(lambda items: list(set(items.tolist())))
    .loc[lambda baskets: baskets.apply(len) > 1]
    .tolist()
)

In [11]:
%%time
super_te = TransactionEncoder()
te_ary = super_te.fit(super_train_checks).transform(super_train_checks)
df_super = pd.DataFrame(te_ary, columns=super_te.columns_)

CPU times: user 874 ms, sys: 1.99 s, total: 2.87 s
Wall time: 2.87 s


In [12]:
%%time
super_itemsets = apriori(df_super, min_support=1e-2, use_colnames=True, max_len=2)
super_itemsets = super_itemsets[super_itemsets["itemsets"].apply(len) > 1]
super_itemsets["first_item"] = super_itemsets["itemsets"].apply(lambda x: list(x)[0])
super_itemsets["second_item"] = super_itemsets["itemsets"].apply(lambda x: list(x)[1])
super_itemsets.drop("itemsets", axis=1, inplace=True)

CPU times: user 3.36 s, sys: 185 ms, total: 3.54 s
Wall time: 3.54 s


In [58]:
# For every validation row, look up the pairs that contain its item and expose
# the other member of the pair as `support_item`. A pair stores its members in
# arbitrary order, so the lookup is done once per orientation and stacked.
super_res = pd.concat(
    [
        super_val.merge(
            super_itemsets.rename(columns={anchor_col: "item_id",
                                           partner_col: "support_item"}),
            on=["item_id"], how="left"
        )[["receipt_id", "support", "support_item"]]
        for anchor_col, partner_col in (
            ("first_item", "second_item"),
            ("second_item", "first_item"),
        )
    ],
    axis=0,
)

In [59]:
# Top-1 recommendation: per receipt keep the candidate with the highest support.
super_best = (
    super_res
    .sort_values("support", ascending=False)
    .drop_duplicates(subset=["receipt_id"])
    .drop(columns="support")
)
# Attach the recommendation to every target row; receipts without any
# candidate get the placeholder -1.
# NOTE: this rebinds `super_target`, so the cell is not meant to be re-run
# without reloading the target frame first.
super_target = (
    super_best
    .merge(super_target, on=["receipt_id"], how="right")
    .fillna({"support_item": -1})
    .astype({"support_item": int})
)

In [60]:
# Row-wise check: does the single recommended item equal the target item_id?
super_top1_hit = super_target["support_item"] == super_target["item_id"]
super_target["accuracy_1"] = super_top1_hit
print("accuracy valid: %.2f" % (100 * super_target["accuracy_1"].mean()))

accuracy valid: 1.93


In [61]:
# Top-5 recommendations: per receipt keep the five distinct candidates with the
# highest support. Implemented with drop_duplicates + groupby().head() instead
# of groupby().apply(): same rows, no per-group Python call, and no reliance on
# the grouping column being passed into the applied function.
super_top5 = (
    super_res
    .sort_values("support", ascending=False)
    # After sorting, the first occurrence of a (receipt, candidate) pair is the
    # one with its highest support.
    .drop_duplicates(subset=["receipt_id", "support_item"])
    .groupby("receipt_id")
    .head(5)
    .reset_index(drop=True)
    .drop(columns="support")
)
# `super_target` is expected to already hold the top-1 `support_item` column,
# so the merge suffixes the new candidates as `support_item_x`.
# Receipts without any candidate get the placeholder -1.
super_target = (
    super_top5
    .merge(super_target, on=["receipt_id"], how="right")
    .fillna({"support_item_x": -1})
    .astype({"support_item_x": int})
)

In [72]:
# Flag each (receipt, candidate) row whose candidate equals the target item_id.
super_top5_hit = super_target["support_item_x"] == super_target["item_id"]
super_target["accuracy_2"] = super_top5_hit
# A receipt counts as a hit when any of its rows matched.
super_hit_by_receipt = super_target.groupby(["receipt_id"])["accuracy_2"].max()
print("hit@5 valid: %.2f" % (super_hit_by_receipt.mean() * 100))

hit@5 valid: 3.05
