In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pip install implicit

In [None]:
import datetime
from tqdm.notebook import tqdm
import scipy
import implicit

In [None]:
def apk(actual, predicted, k=3):
    """
    Average precision at k for a single list of predictions.

    Parameters
    ----------
    actual : list
        Relevant elements that should be predicted (order is irrelevant).
    predicted : list
        Ranked predictions (order matters); only the first ``k`` are scored.
    k : int, optional
        Cut-off: maximum number of predictions taken into account.

    Returns
    -------
    score : double
        Average precision at k. ``0.0`` when ``actual`` is ``None`` or empty.
    """
    if actual is None or len(actual) == 0:
        return 0.0

    top_k = predicted[:k]

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # Skip predictions that are not relevant at all.
        if item not in actual:
            continue
        # A relevant item only counts the first time it is predicted.
        if item in top_k[: rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=3):
    """
    Mean average precision at k over many prediction lists.

    Parameters
    ----------
    actual : list
        One list of relevant elements per case (order inside each list is
        irrelevant).
    predicted : list
        One ranked list of predictions per case, paired positionally with
        ``actual`` (order inside each list matters).
    k : int, optional
        Cut-off: maximum number of predictions scored per case.

    Returns
    -------
    score : double
        Mean of the per-case average precision at k.
    """
    per_case_scores = []
    for truth, ranked in zip(actual, predicted):
        per_case_scores.append(apk(truth, ranked, k))
    return np.mean(per_case_scores)
    

def get_date_k_weeks_before(cur_date: str, k: int = 2) -> str:
    """
    Shift a ``YYYY-MM-DD`` date string back by ``k`` weeks.

    The result is returned in the same ``YYYY-MM-DD`` format.
    """
    date_format = "%Y-%m-%d"
    parsed = datetime.datetime.strptime(cur_date, date_format)
    shifted = parsed - datetime.timedelta(weeks=k)
    return shifted.strftime(date_format)


def remove_duplicates_order(x: list) -> list:
    """
    Return the items of ``x`` with repeats dropped, keeping first-seen order.
    """
    seen = set()
    unique_items = []
    for item in x:
        if item in seen:
            continue
        seen.add(item)
        unique_items.append(item)
    return unique_items


def get_validation_split(
    df: pd.DataFrame, timestamp_to_split: str
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split interactions by time into train / validation data plus ground truth.

    The timeline is cut at two points:

    * ``timestamp < split - 2 weeks``           -> ``data_train``
    * ``split - 2 weeks <= timestamp < split``  -> ``data_val``
    * ``timestamp >= split``                    -> ground-truth window

    Only users that appear both in the ``data_val`` window and in the
    ground-truth window are kept in ``data_val``. Products a user already
    interacted with inside the ``data_val`` window are removed from that
    user's ground truth; users left without any new product are dropped
    from the two ground-truth frames (``data_val`` is not re-filtered).

    WARNING (from the original author): something might be missed here.

    Parameters
    ----------
    df : pd.DataFrame
        Interaction log; the columns ``timestamp``, ``user_id`` and
        ``product_id`` are used.
    timestamp_to_split : str
        Split date in ``YYYY-MM-DD`` format.

    Returns
    -------
    data_train : pd.DataFrame
        Rows older than two weeks before the split (index reset).
    data_val : pd.DataFrame
        Rows from the two weeks before the split, restricted to users that
        also appear on/after the split (index reset).
    gt_new : pd.DataFrame
        Columns ``user_id``, ``product_id``: per user, the list of products
        seen on/after the split that were not seen in ``data_val``.
    seen_in_val : pd.DataFrame
        Columns ``user_id``, ``product_id_x``: per user, the list of products
        seen in ``data_val``. Rows are aligned with ``gt_new``.
    """
    # Newest first, so the per-user product lists are most-recent-first.
    df = df.sort_values("timestamp", ascending=False)
    before_split = df[df["timestamp"] < timestamp_to_split]
    after_split = df[df["timestamp"] >= timestamp_to_split]

    future_items = (
        after_split.groupby("user_id")[["product_id"]].agg(list).reset_index()
    )
    future_items["product_id"] = future_items["product_id"].apply(
        remove_duplicates_order
    )

    val_start = get_date_k_weeks_before(timestamp_to_split, k=2)
    data_val = before_split[before_split["timestamp"] >= val_start]
    data_train = before_split[before_split["timestamp"] < val_start].reset_index(
        drop=True
    )

    # Keep only users present in both the validation and ground-truth windows.
    future_items = future_items[
        future_items["user_id"].isin(set(data_val["user_id"]))
    ].reset_index(drop=True)
    data_val = data_val[
        data_val["user_id"].isin(set(future_items["user_id"]))
    ].reset_index(drop=True)

    seen_items = data_val.groupby("user_id")[["product_id"]].agg(list).reset_index()

    # Both sides carry a "product_id" column, so the merge yields
    # "product_id_x" (seen in validation) and "product_id_y" (future).
    merged = seen_items.merge(future_items, on="user_id")

    # Both columns always hold lists after agg(list); address them by name
    # rather than by position. The order of the future list is preserved so
    # the output is deterministic.
    new_items = []
    for seen, future in zip(merged["product_id_x"], merged["product_id_y"]):
        already_seen = set(seen)
        unseen = [product for product in future if product not in already_seen]
        # np.nan is the canonical spelling; the upper-case aliases are legacy
        # names that newer NumPy releases no longer export.
        new_items.append(unseen if unseen else np.nan)

    merged["product_id"] = pd.Series(new_items, index=merged.index, dtype=object)

    # Drop users for whom nothing new is left to predict.
    merged = merged.dropna(subset=["product_id"])

    return (
        data_train,
        data_val,
        merged[["user_id", "product_id"]],
        merged[["user_id", "product_id_x"]],
    )

In [None]:
# Read the train / test parquet files and cast the `price` column to float.
data_tr = pd.read_parquet(
    '/kaggle/input/ef-msu-2024-comp-3/archive/train.parquet'
).astype({"price": float})
data_te = pd.read_parquet(
    '/kaggle/input/ef-msu-2024-comp-3/archive/test.parquet'
).astype({"price": float})

In [None]:
# Keep only the interactions dated 2020-02-01 or later.
is_feb_or_later = data_tr["timestamp"] >= "2020-02-01"
data_tr_feb = data_tr.loc[is_feb_or_later]

# Median timestamp of the retained rows (shown as the cell output).
data_tr_feb["timestamp"].quantile(q=0.5, interpolation="nearest")

In [None]:
# Split at 2020-02-14. The fourth frame returned by get_validation_split holds
# the per-user products from the validation window; give it the same column
# names as gt_val.
data_train, data_val, gt_val, gt_test = get_validation_split(
    data_tr_feb, "2020-02-14"
)
gt_test = gt_test.set_axis(["user_id", "product_id"], axis=1)

In [None]:
# Number of validation-window interactions per (user, product) pair.
data_grouped = (
    data_val.groupby(["user_id", "product_id"], as_index=False)["timestamp"]
    .count()
    .rename(columns={"timestamp": "count"})
)

In [None]:
# Map every product id / user id to a dense 0-based matrix index.
pred_set = set(data_grouped["product_id"])
pred_dict = {product: col for col, product in enumerate(pred_set)}
usr_set = set(data_grouped["user_id"])
usr_dict = {user: row for row, user in enumerate(usr_set)}

In [None]:
# Interaction counts as a sparse user x product matrix (COO format).
values = data_grouped["count"].to_numpy()
rows = data_grouped["user_id"].map(usr_dict).to_numpy()
cols = data_grouped["product_id"].map(pred_dict).to_numpy()
print(values.max(), rows.max(), cols.max())

n_rows = rows.max() + 1
n_cols = cols.max() + 1
matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(n_rows, n_cols))

In [None]:
from implicit.nearest_neighbours import bm25_weight
# BM25-weight the raw counts, then convert to CSR so single user rows can be
# sliced out (user_product[userid]) later on.
user_product = bm25_weight(matrix, K1=100, B=0.8).tocsr()

In [None]:
# ALS model for the validation run; random_state is fixed for repeatability.
als = implicit.als.AlternatingLeastSquares(
    factors=16,
    regularization=0.01,
    alpha=2.0,
    iterations=100,
    random_state=42,
)
# Fit on the BM25-weighted user x product matrix.
als.fit(user_product)

In [None]:
preds = []
print(gt_val["user_id"].shape[0])

# Products ranked by interaction count. NOTE: despite the variable name, the
# ranking is computed from data_val (the validation window).
popular_from_train = (
    data_val.groupby(["product_id"])[["event_time"]]
    .count()
    .reset_index()
    .sort_values(by="event_time", ascending=False)
    .rename(columns={"event_time": "count"})
).reset_index(drop=True)

# Matrix column index -> product id. pred_dict was filled with consecutive
# indices in insertion order, so the position of a key equals its index.
# Built once here instead of once per user inside the loop.
index_to_product = np.array(list(pred_dict.keys()))

# Wrapping the Series (not the enumerate object) lets tqdm show a total.
for i, user in enumerate(tqdm(gt_val["user_id"])):
    userid = usr_dict[user]
    ids, scores = als.recommend(
        userid, user_product[userid], N=3, filter_already_liked_items=True
    )
    # Trust ALS only when its best score clears the threshold; an empty
    # result also falls through to the popularity fallback.
    if len(scores) > 0 and scores[0] > 0.3:
        preds.append(list(index_to_product[ids]))
    else:
        # gt_test row i is aligned with gt_val row i and holds the products the
        # user already interacted with; exclude them from the popular items.
        prod = gt_test.iloc[i, 1]
        preds.append(
            popular_from_train[~popular_from_train["product_id"].isin(prod)]
            .head(3)["product_id"]
            .to_list()
        )

In [None]:
# MAP@3 of the validation predictions, rounded to 5 decimals for display.
val_map_at_3 = mapk(gt_val["product_id"].to_list(), preds, k=3)
round(val_map_at_3, 5)

In [None]:
# Number of test-set interactions per (user, product) pair.
data_grouped = (
    data_te.groupby(["user_id", "product_id"], as_index=False)["timestamp"]
    .count()
    .rename(columns={"timestamp": "count"})
)
data_grouped.info()

In [None]:
# Map every product id / user id of the test data to a dense 0-based index.
pred_set = set(data_grouped["product_id"])
print(f"{len(pred_set)} unique product_id")
pred_dict = {product: col for col, product in enumerate(pred_set)}

usr_set = set(data_grouped["user_id"])
print(f"{len(usr_set)} unique users")
usr_dict = {user: row for row, user in enumerate(usr_set)}

In [None]:
# Interaction counts as a sparse user x product matrix (COO format).
values = data_grouped["count"].to_numpy()
rows = data_grouped["user_id"].map(usr_dict).to_numpy()
cols = data_grouped["product_id"].map(pred_dict).to_numpy()
print(values.max(), rows.max(), cols.max())

n_rows = rows.max() + 1
n_cols = cols.max() + 1
matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(n_rows, n_cols))

In [None]:
from implicit.nearest_neighbours import bm25_weight

# Apply BM25 weighting to the raw interaction counts (K1=100, B=0.8).
# No transpose is needed: rows are already users and columns products.
# The result is converted to CSR so that single user rows can be sliced
# out (user_product[userid]) when recommending.
user_product = bm25_weight(matrix, K1=100, B=0.8).tocsr()

In [None]:
# ALS model for the final (test) run; same hyper-parameters as validation.
als = implicit.als.AlternatingLeastSquares(
    factors=16,
    regularization=0.01,
    alpha=2.0,
    iterations=100,
    random_state=42,
)

In [None]:
# Fit the ALS factors on the BM25-weighted user x product matrix built from data_te.
als.fit(user_product)

In [None]:
# Per test user: de-duplicated list of products they already interacted with.
gt_val_test = data_te.groupby("user_id")[["product_id"]].agg(list).reset_index()
gt_val_test["product_id"] = gt_val_test["product_id"].apply(remove_duplicates_order)


preds = []

# Products ranked by interaction count. NOTE: despite the variable name, the
# ranking is computed from data_te (the test data).
popular_from_train = (
    data_te.groupby(["product_id"])[["event_time"]]
    .count()
    .reset_index()
    .sort_values(by="event_time", ascending=False)
    .rename(columns={"event_time": "count"})
).reset_index(drop=True)

# Matrix column index -> product id. pred_dict was filled with consecutive
# indices in insertion order, so the position of a key equals its index.
# Built once here instead of once per user inside the loop.
index_to_product = np.array(list(pred_dict.keys()))

for i, user in enumerate(gt_val_test["user_id"]):
    userid = usr_dict[user]
    ids, scores = als.recommend(
        userid, user_product[userid], N=3, filter_already_liked_items=True
    )
    # Trust ALS only when its best score clears the threshold; an empty
    # result also falls through to the popularity fallback.
    if len(scores) > 0 and scores[0] > 0.3:
        preds.append(list(index_to_product[ids]))
    else:
        # Fall back to the most popular products the user has not touched yet.
        prod = gt_val_test.iloc[i, 1]
        preds.append(
            popular_from_train[~popular_from_train["product_id"].isin(prod)]
            .head(3)["product_id"]
            .to_list()
        )

In [None]:
# One row per test user: the user id and the recommended product ids joined
# into a single space-separated string.
df_sub = pd.DataFrame(
    {
        "user_id": gt_val_test["user_id"],
        "product_id": [" ".join(str(item) for item in rec) for rec in preds],
    }
)
df_sub

In [None]:
# Write the predictions to CSV in the current directory; index=False keeps the
# DataFrame row index out of the file.
df_sub.to_csv("sample_submission.csv", index=False)