In [1]:
from collections import defaultdict
import os
import random

from tqdm.auto import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm.auto import tqdm, trange
from implicit.nearest_neighbours import CosineRecommender, BM25Recommender, TFIDFRecommender
from implicit.als import AlternatingLeastSquares
from implicit import gpu
from lightfm.data import Dataset
from lightfm import LightFM

In [2]:
gpu.HAS_CUDA

True

In [3]:
sns.set_style("darkgrid")

In [4]:
%load_ext autoreload
%autoreload 2

from recs_utils.metrics import compute_metrics, model_cross_validate, mean_average_prec, join_true_pred_and_preprocess
from recs_utils.load_data import load_users, load_items, load_interactions, sample_true_rec_data
from recs_utils.simple_rec import PopularRecommender, PopularRecommenderPerAge
from recs_utils.split import train_test_split, TimeRangeSplit
from recs_utils.matrix_ops import interactions_to_csr_matrix
from recs_utils.implicit_model import ImplicitRecommender, LightFMRecommender
from recs_utils.utils import get_direct_and_inv_mapping

In [5]:
data_dir = "data"

In [16]:
df = load_interactions(os.path.join(data_dir, "interactions.csv"))
df_users = load_users(os.path.join(data_dir, "users.csv"))
df_items = load_items(os.path.join(data_dir, "items.csv"))

In [17]:
len(df)

1532998

In [13]:
df = df.groupby(["user_id", "item_id"]).sample(frac=0.25)

In [15]:
len(df)

1532998

In [7]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,progress,rating,start_date
user_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
126706,14433,80,,2018-01-01
127290,140952,58,,2018-01-01
66991,198453,89,,2018-01-01
46791,83486,23,5.0,2018-01-01
79313,188770,88,5.0,2018-01-01


In [8]:
df_users.head()

Unnamed: 0_level_0,age,sex
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,45_54,
2,18_24,0.0
3,65_inf,0.0
4,18_24,0.0
5,35_44,0.0


In [9]:
df_items.head()

Unnamed: 0_level_0,title,genres,authors,year
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
128115,Ворон-челобитчик,"Зарубежные детские книги,Сказки,Зарубежная кла...",Михаил Салтыков-Щедрин,1886
210979,Скрипка Ротшильда,"Классическая проза,Литература 19 века,Русская ...",Антон Чехов,1894
95632,Испорченные дети,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1869
247906,Странный человек,"Пьесы и драматургия,Литература 19 века",Михаил Лермонтов,1831
294280,Господа ташкентцы,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1873


In [10]:
users_mapping, users_inv_mapping = get_direct_and_inv_mapping(df, "user_id")
len(users_mapping)

151600

In [11]:
items_mapping, items_inv_mapping = get_direct_and_inv_mapping(df, "item_id")
len(items_mapping)

59599

In [12]:
df_items["title"] = df_items["title"].str.strip().str.lower()
df_items["genres"] = df_items["genres"].str.strip().str.lower()

In [39]:
last_date = df['start_date'].max().normalize()
folds = 7
start_date = last_date - pd.Timedelta(weeks=folds)
start_date, last_date

(Timestamp('2019-11-12 00:00:00'), Timestamp('2019-12-31 00:00:00'))

In [40]:
df["start_date"].min().normalize()

Timestamp('2018-01-01 00:00:00')

In [41]:
cv = TimeRangeSplit(start_date=start_date, periods=folds + 1, freq="W")
cv.max_n_splits, cv.get_n_splits(df, datetime_column='start_date')

(7, 6)

In [43]:
folds_with_stats = list(cv.split(
    df, 
    user_column='user_id',
    item_column='item_id',
    datetime_column='start_date',
    fold_stats=True
)
)

folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

In [44]:
folds_info_with_stats

Unnamed: 0,Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
0,2019-11-17,2019-11-24,1438057,26,37,0,0,0,14809
1,2019-11-24,2019-12-01,1452903,15,20,0,0,0,14695
2,2019-12-01,2019-12-08,1467618,19,38,0,0,0,14380
3,2019-12-08,2019-12-15,1482036,6,8,0,0,0,14638
4,2019-12-15,2019-12-22,1496682,16,23,0,0,0,14977
5,2019-12-22,2019-12-29,1511682,7,10,0,0,0,14940


In [45]:
top_N = 10

In [46]:
top_N_negative_sampling = top_N * 2

In [47]:
train_idx, test_idx, info = folds_with_stats[0]

train = df.loc[train_idx, :]
test = df.loc[test_idx, :]
train.shape, test.shape

((1438057, 3), (14809, 3))

In [48]:
recommender = ImplicitRecommender(
    TFIDFRecommender(K=top_N_negative_sampling),
    users_mapping, items_mapping,
    items_inv_mapping
)

In [49]:
recommender.fit(train)



  0%|          | 0/59599 [00:00<?, ?it/s]

In [50]:
train_pred_neg_sampling = recommender.recommend(train.index.get_level_values("user_id").unique(), n=top_N_negative_sampling).reset_index("item_id")

In [51]:
train_pred_neg_sampling.head()

Unnamed: 0_level_0,item_id,rank
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
126706,307855,1
126706,315738,2
126706,20631,3
126706,108460,4
126706,145074,5


In [54]:
train.shape

(1438057, 3)

In [52]:
def sample_neg_examples(interactions: pd.DataFrame, neg_predictions_per_user: pd.DataFrame):
    assert "user_id" in neg_predictions_per_user.index.names, "Cannot find"
    new_train = []

    for _, row in tqdm(interactions.reset_index().iterrows(), miniters=4, total=len(interactions)):
        neg_samples = train_pred_neg_sampling.loc[row["user_id"], "item_id"].sample(top_N, random_state=12213).reset_index()
        neg_samples["start_date"] = row["start_date"]

        neg_samples = pd.concat(
            (
            row.to_frame(), neg_samples
            )
        )
        neg_samples["target"] = 0
        neg_samples.iloc[0, neg_samples.columns.get_loc("target")] = 1
        new_train.append(neg_samples)
    return new_train

In [53]:
sample_neg_examples(df, train_pred_neg_sampling)

  0%|          | 0/1532998 [00:00<?, ?it/s]

KeyboardInterrupt: 