In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("../scripts/")

from utils import logger
import metrics

In [3]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
from typing import List
from dataclasses import dataclass
import pandas as pd
pd.set_option("mode.chained_assignment", "raise")

from scripts.first_stage_models.BaseModelv2 import BaseRecommenderv2

attribute_name_key = "attribute_name"
min_support_key = "min_support_key"
item_id_column_key = "item_id_column_key"
user_id_column_key = "user_id_column_key"
items_key = "items_key"


class ARulesRecommender(BaseRecommenderv2):
    def __init__(self,
                 config: dict,
                 cold_items_recommender=None,
                 int_article_id=None,
                 int_customer_id=None):
        """

        :param config:
        :param cold_items_recommender:
        :param int_article_id:
        :param int_customer_id:
        """
        super().__init__(cold_items_recommender=cold_items_recommender,
                         int_article_id=int_article_id,
                         int_customer_id=int_customer_id)

        self.config = config

        self.encoder = None
        self.frequent_itemsets = None
        self.rules = None
        self.previous_interactions = None

    def _rename_duplicates(self, items: List[str]):
        seen = set()
        dupes = [item for item in items if item in seen or seen.add(item)]
        renamed_items = [item + "_dup" for item in dupes]
        return list(set(items + renamed_items))

    def _interactions_to_item_lists(self, interactions: pd.DataFrame) -> pd.DataFrame:
        """
        Create a list of items for each user
        :param interactions:
        :return:
        """
        grouped_interactions = interactions.groupby(by=[self.config[user_id_column_key]])[
            [self.config[attribute_name_key]]].agg(list).reset_index()

        grouped_interactions.loc[:, self.config[attribute_name_key]] = grouped_interactions[
            self.config[attribute_name_key]].apply(
            self._rename_duplicates)
        return grouped_interactions

    def _association_rules_analysis(self, user_item_lists: pd.DataFrame):
        encoder = TransactionEncoder()
        interaction_matrix = encoder.fit_transform(user_item_lists[self.config[attribute_name_key]])
        df = pd.DataFrame(interaction_matrix, columns=encoder.columns_)

        frequent_itemsets = fpgrowth(df,
                                     min_support=self.config[min_support_key],
                                     use_colnames=True,
                                     max_len=10)
        rules = association_rules(frequent_itemsets,
                                  metric="confidence",
                                  min_threshold=self.config[min_support_key])
        return frequent_itemsets, rules, encoder

    def _recommend_items(self, previous_user_items):
        # TODO: fix const = 30
        tmp = self.rules[self.rules['antecedents'].apply(lambda x: x.issubset(previous_user_items))][
                  "consequents"].drop_duplicates().tolist()[:20]
        result = list(set().union(*tmp))[:12]
        return result

    def _predict(self,
                customers: list,
                top_k: int = 12) -> pd.DataFrame:
        """
        Make predictions for c in customers which were  presented in train  interactions
        :param customers:
        :param top_k:
        :return:
        """
        previous_interactions = self.previous_interactions[
            self.previous_interactions[self.config[user_id_column_key]].isin(customers)]

        predictions = self._interactions_to_item_lists(previous_interactions)
        predictions["predicted_ids"] = predictions[self.config[attribute_name_key]].apply(
            self._recommend_items)
        predictions = predictions.rename(columns={config[item_id_column_key]: "previous_ids"})
        predictions["predicted_ids"] = predictions["predicted_ids"].apply(lambda x: [int(xi.split('_')[0]) for xi in x])
        predictions["predicted_ids"] = predictions["predicted_ids"].apply(lambda x: x[:top_k])
        predictions = predictions[predictions.predicted_ids.apply(len)>0]
        predictions["score"] = 1
        return predictions

    def fit(self, interactions: pd.DataFrame):
        interactions = interactions.copy()
        items = self.config[items_key]
        interactions.loc[:, self.config[attribute_name_key]] = interactions.loc[:, self.config[attribute_name_key]].astype(str)
        if self.config[item_id_column_key] != self.config[attribute_name_key]:
            interactions = interactions.merge(items[[self.config[item_id_column_key], self.config[attribute_name_key]]],
                                              on=self.config[item_id_column_key],
                                              how="left")
        user_item_lists = self._interactions_to_item_lists(interactions)
        frequent_itemsets, rules, encoder = self._association_rules_analysis(user_item_lists)
        self.encoder = encoder
        self.frequent_itemsets = frequent_itemsets
        self.rules = rules
        self.previous_interactions = interactions

In [4]:
import numpy as np
import os
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules
from typing import List
from dataclasses import dataclass
import pandas as pd
from tqdm import tqdm, tqdm_notebook

In [5]:
import seaborn as sns
from matplotlib import pyplot as plt

In [6]:
sns.set()
plt.rcParams['figure.figsize'] = [10, 4]
plt.rcParams['figure.dpi'] = 100
plt.style.use('seaborn-whitegrid')
np.set_printoptions(precision=4)

pd.options.display.max_rows = 50
pd.options.display.max_columns = 50
np.set_printoptions(edgeitems=10)
np.core.arrayprint._line_width = 500
pd.set_option('display.width', 1000)



In [7]:
transactions = pd.read_parquet('../data/compressed_dataset/transactions.parquet')
articles = pd.read_parquet('../data/compressed_dataset/articles.parquet')
customers = pd.read_parquet('../data/compressed_dataset/customers.parquet')

article_id_int = pd.read_pickle('../data/compressed_dataset/article_id_int.pickle')
int_article_id = pd.read_pickle('../data/compressed_dataset/int_article_id.pickle')

customer_id_int = pd.read_pickle('../data/compressed_dataset/customer_id_int.pickle')
int_customer_id = pd.read_pickle('../data/compressed_dataset/int_customer_id.pickle')

transactions["t_dat"] = pd.to_datetime(transactions["t_dat"])

# Association Rules

In [8]:
def split_transactions(transactions: pd.DataFrame,
                       assessed_date: pd.Timestamp,
                       history_size_days: int
                       ):
    """
    Split all transaction into two parts:
    train_transactions - [assessed_date - history_size_days, assesed_days)
    test_transactions - [assessed_date, assessed_date + 6d]
    :param transactions:
    :param assessed_date:
    :param history_size_days:
    :return:
    """
    max_test_date = assessed_date + pd.Timedelta("6d")
    min_train_date = assessed_date - pd.Timedelta(f"{history_size_days}d")
    test_transactions = transactions[
        (transactions["t_dat"] >= assessed_date) &
        (transactions["t_dat"] <= max_test_date)]
    train_transactions = transactions[
        (transactions["t_dat"] >= min_train_date) &
        (transactions["t_dat"] < assessed_date)]
    return train_transactions, test_transactions

In [9]:
assessed_date = transactions["t_dat"].max() - pd.Timedelta("12d")
train_transactions, test_transactions = split_transactions(transactions,
                                                           assessed_date,
                                                           history_size_days=5)

In [10]:
print("n_train_transactions = ", len(train_transactions))
print("n_test_transactions = ", len(test_transactions))

n_train_transactions =  193917
n_test_transactions =  240643


In [11]:
print(train_transactions["t_dat"].min(), train_transactions["t_dat"].max())

2020-09-05 00:00:00 2020-09-09 00:00:00


In [12]:
print(test_transactions["t_dat"].min(), test_transactions["t_dat"].max())

2020-09-10 00:00:00 2020-09-16 00:00:00


In [13]:
config = {
    attribute_name_key: "article_id",
    min_support_key: 0.0001,
    item_id_column_key: "article_id",
    user_id_column_key: "customer_id",
    items_key: articles
}
recommender = ARulesRecommender(config)
recommender.fit(train_transactions)
recommender.rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(104157),(17134),0.004375,0.001882,0.000161,0.036885,19.593794,0.000153,1.036343
1,(17134),(104157),0.001882,0.004375,0.000161,0.085714,19.593794,0.000153,1.088965
2,(3091),(17134),0.006741,0.001882,0.000143,0.021277,11.302330,0.000131,1.019816
3,(17134),(3091),0.001882,0.006741,0.000143,0.076190,11.302330,0.000131,1.075177
4,(105258),(103303),0.004984,0.003406,0.000108,0.021583,6.335895,0.000091,1.018577
...,...,...,...,...,...,...,...,...,...
8545,(100418_dup),(100418),0.000108,0.000502,0.000108,1.000000,1992.035714,0.000108,inf
8546,(103557_dup),(103557),0.000108,0.000843,0.000108,1.000000,1186.744681,0.000107,inf
8547,(103557),(103557_dup),0.000843,0.000108,0.000108,0.127660,1186.744681,0.000107,1.146218
8548,(66517),(66517_dup),0.001614,0.000108,0.000108,0.066667,619.744444,0.000107,1.071313


# Test predictions

In [14]:
test_customers = list(test_transactions.customer_id.unique())
predictions = recommender.predict(customers=test_customers)
predictions

Unnamed: 0,customer_id,previous_ids,predicted_ids,score
0,38,"[100027, 104448, 105273, 103170, 17134]","[17125, 105274, 3091, 17132, 104157, 104155, 1...",1
2,327,"[66525, 80304, 69749, 80298, 70911]","[99655, 25799, 70911, 70916, 105179, 66517, 94...",1
4,402,"[85058, 105181, 70640, 105179]","[104986, 105258, 105179, 105181, 105306, 70911...",1
5,527,"[105306, 103791_dup, 105306_dup, 103791, 10176...","[103186, 105179, 105181, 105306, 105306, 10548...",1
6,919,"[97918, 101279, 93738, 71106, 105270, 98987]","[71101, 105270, 99398, 100282, 104434, 101278,...",1
...,...,...,...,...
10375,1370855,"[90436, 99254]","[93370, 99254, 99255]",1
10376,1371336,"[100162_dup, 43705, 43701, 100162, 97520]","[43711, 43695, 43705, 94657, 43701, 43708, 975...",1
10378,1371462,"[7762, 103668]","[103669, 103667, 53914, 103670, 103665, 53892,...",1
10380,1371691,"[104157, 104192_dup, 104192, 104193_dup, 104193]","[17125, 97251, 101192, 98237, 53896, 104434, 1...",1


In [15]:
def average_prec(true, pred):
    n_correct_items = 0.0
    score = 0.0
    for k, pred_item in enumerate(pred):
        if pred_item in true:
            n_correct_items += 1
            prec_k = n_correct_items / (k + 1)
            score += prec_k
    return score / len(pred)


def mean_average_prec(true, pred):
    assert len(true) == len(pred), "Different number of users"
    n = 0
    score = 0.0
    for true_items, pred_items in tqdm(list(zip(true, pred))):
        if len(true_items) > 0:
            score += average_prec(true_items, pred_items)
            n += 1
    print('users with purchases = ', n)
    return score / n

In [16]:
true_items = test_transactions.groupby(by="customer_id")["article_id"].agg(list).reset_index().rename(
    columns={"article_id": "true_article_ids"}
)
true_items["true_ids"] = true_items["true_article_ids"].apply(lambda x: x[:12])
df = true_items.merge(predictions, on="customer_id")[
    ["customer_id", "true_article_ids", "previous_ids", "predicted_ids"]]
df

Unnamed: 0,customer_id,true_article_ids,previous_ids,predicted_ids
0,38,[61916],"[100027, 104448, 105273, 103170, 17134]","[17125, 105274, 3091, 17132, 104157, 104155, 1..."
1,327,"[98445, 98445]","[66525, 80304, 69749, 80298, 70911]","[99655, 25799, 70911, 70916, 105179, 66517, 94..."
2,402,"[100947, 102939]","[85058, 105181, 70640, 105179]","[104986, 105258, 105179, 105181, 105306, 70911..."
3,527,"[104215, 103156]","[105306, 103791_dup, 105306_dup, 103791, 10176...","[103186, 105179, 105181, 105306, 105306, 10548..."
4,919,[104372],"[97918, 101279, 93738, 71106, 105270, 98987]","[71101, 105270, 99398, 100282, 104434, 101278,..."
...,...,...,...,...
7417,1370855,[90436],"[90436, 99254]","[93370, 99254, 99255]"
7418,1371336,"[100162, 75213]","[100162_dup, 43705, 43701, 100162, 97520]","[43711, 43695, 43705, 94657, 43701, 43708, 975..."
7419,1371462,[7762],"[7762, 103668]","[103669, 103667, 53914, 103670, 103665, 53892,..."
7420,1371691,"[104157, 104157, 104157, 87698, 2480, 82631]","[104157, 104192_dup, 104192, 104193_dup, 104193]","[17125, 97251, 101192, 98237, 53896, 104434, 1..."


In [17]:
mean_average_prec(df["true_article_ids"].tolist(),
                  df["predicted_ids"].tolist())

100%|██████████| 7422/7422 [00:00<00:00, 354117.60it/s]

users with purchases =  7422





0.03056040177816023

# Try to include all users (last 5 transactions (or maybe more if they were within short period of time))

In [18]:
def split_transactions2(transactions: pd.DataFrame,
                       assessed_date: pd.Timestamp,
                       last_interactions_n
                       ):
    """
    Split all transaction into two parts:
    train_transactions - [assessed_date - history_size_days, assesed_days)
    test_transactions - [assessed_date, assessed_date + 6d]
    :param transactions:
    :param assessed_date:
    :param history_size_days:
    :return:
    """
    max_test_date = assessed_date + pd.Timedelta("6d")
    test_transactions = transactions[
        (transactions["t_dat"] >= assessed_date) &
        (transactions["t_dat"] <= max_test_date)]
    train_transactions = transactions[
        transactions["t_dat"] < assessed_date]
    train_transactions = train_transactions.sort_values(by = ["customer_id", "t_dat"]).groupby(by="customer_id").head(last_interactions_n)
    return train_transactions, test_transactions

In [19]:
assessed_date = transactions["t_dat"].max() - pd.Timedelta("6d")
train_transactions, test_transactions = split_transactions2(transactions,
                                                           assessed_date,
                                                           5)

In [None]:
config = {
    attribute_name_key: "article_id",
    min_support_key: 0.001,
    item_id_column_key: "article_id",
    user_id_column_key: "customer_id",
    items_key: articles
}
recommender = ARulesRecommender(config)
recommender.fit(train_transactions)
recommender.rules

In [None]:
test_customers = list(test_transactions.customer_id.unique())
predictions = recommender.predict(customers=test_customers)
predictions

In [None]:
true_items = test_transactions.groupby(by="customer_id")["article_id"].agg(list).reset_index().rename(
    columns={"article_id": "true_article_ids"}
)
true_items["true_ids"] = true_items["true_article_ids"].apply(lambda x: x[:12])
df = true_items.merge(predictions, on="customer_id")[
    ["customer_id", "true_article_ids", "previous_ids", "predicted_ids"]]
df

In [None]:
mean_average_prec(df["true_article_ids"].tolist(),
                  df["predicted_ids"].tolist())