In [None]:
import warnings

import implicit
import numpy as np
import pandas as pd
from implicit.evaluation import mean_average_precision_at_k
from pandas import DataFrame
from scipy.sparse import coo_matrix, csr_matrix

warnings.filterwarnings("ignore")

In [None]:
import sys

# Extend the import path so that the `constants` and `utils` modules imported
# below can be found (presumably they live in ../src — confirm).
# The entry is relative, so it is resolved against the kernel's current working
# directory: the notebook has to be started from its own folder for this to work.
sys.path.append("../src")

import constants
from utils import MovieEncoder, train_test_split

# Read data

In [None]:
# Load the ratings and movies tables from the CSV paths declared in `constants`.
ratings, movies = (
    pd.read_csv(csv_path)
    for csv_path in (constants.RATINGS_PATH_SANDBOX, constants.MOVIE_PATH_SANDBOX)
)

# Data preprocessing

In [None]:
# Optional (currently disabled): to speed up experiments, keep only the ratings of a random 30% of users
# rand_userIds = np.random.choice(
#     ratings["userId"].unique(),
#     size=int(len(ratings["userId"].unique()) * 0.3),
#     replace=False,
# )

# ratings = ratings.loc[ratings["userId"].isin(rand_userIds)]
# print("There are {} rows of data from {} users".format(len(ratings), len(rand_userIds)))

In [None]:
# Raw IDs exactly as they occur in the data; their position in these lists
# becomes the row (user) / column (item) index of the interaction matrices.
ALL_USERS = ratings["userId"].unique().tolist()
ALL_ITEMS = movies["movieId"].unique().tolist()

# Index -> raw ID lookups (used to translate model output back to real IDs).
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# Raw ID -> index lookups (used to translate the ratings into matrix coordinates).
user_map = {raw_id: position for position, raw_id in enumerate(ALL_USERS)}
item_map = {raw_id: position for position, raw_id in enumerate(ALL_ITEMS)}

# Adds the two index columns to `ratings` in place.
# Note: Series.map yields NaN for any movieId in `ratings` that is missing from `movies`.
ratings["mapped_user_id"] = ratings["userId"].map(user_map)
ratings["mapped_movie_id"] = ratings["movieId"].map(item_map)

### Train-test split

In [None]:
# Split the ratings with the project helper imported from `utils`.
# The split strategy is defined in that module and is not visible in this notebook.
train_ratings, test_ratings = train_test_split(ratings)

### Check that model works ok with data

In [None]:
# Build a users x items interaction matrix from the train split:
# one entry of 1.0 per rating row, the rating value itself is not used.
n_users, n_items = len(ALL_USERS), len(ALL_ITEMS)
row, col = (
    train_ratings[column].values for column in ("mapped_user_id", "mapped_movie_id")
)
data = np.ones(len(train_ratings))
coo_train = coo_matrix((data, (row, col)), shape=(n_users, n_items))
coo_train

In [None]:
# Quick fit with a deliberately small configuration (10 factors, 2 iterations),
# only to confirm that the matrix built above is accepted by the model.
# `model` is reassigned by the real training run further down.
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)

# Validation

### Functions required for validation

In [None]:
def to_user_item_coo(df: DataFrame, shape=None):
    """Turn a dataframe of interactions into a COO sparse users x items matrix.

    Rows are indexed by ``mapped_user_id`` and columns by ``mapped_movie_id``.
    Every dataframe row contributes a value of 1.0; rating values are ignored.

    Parameters
    ----------
    df : DataFrame
        Must contain the integer columns ``mapped_user_id`` and ``mapped_movie_id``.
    shape : tuple of (int, int), optional
        Explicit ``(n_users, n_items)`` shape. When omitted, the shape is
        inferred from the largest index present in ``df``, so matrices built
        from different subsets (e.g. train and test) may end up with different
        dimensions. Pass the full shape to keep them aligned.

    Returns
    -------
    scipy.sparse.coo_matrix
        The users x items interaction matrix.
    """
    user_idx = df["mapped_user_id"].values
    item_idx = df["mapped_movie_id"].values
    interactions = np.ones(df.shape[0])
    return coo_matrix((interactions, (user_idx, item_idx)), shape=shape)

*The code for hyperparameter selection used to be here*

It's now in src/grid_search_ALS_hyperparams.py

In [None]:
# Hyperparameters forwarded to train() below.
# NOTE(review): presumably the result of the search in
# src/grid_search_ALS_hyperparams.py — confirm. Regularization is switched off (0).
best_params = {"factors": 100, "iterations": 12, "regularization": 0}

# Training over the full dataset

In [None]:
# Build the train and test interaction matrices with one common shape.
# to_user_item_coo() infers the dimensions from the largest index it sees, so the
# two splits could otherwise disagree with each other and with the
# (len(ALL_USERS), len(ALL_ITEMS)) shape used for the sanity-check matrix above.
# Mapped indices are always below these lengths, so resizing only adds empty
# rows/columns and never drops an interaction.
matrix_shape = (len(ALL_USERS), len(ALL_ITEMS))

coo_train = to_user_item_coo(train_ratings)
coo_train.resize(matrix_shape)
csr_train = coo_train.tocsr()

coo_test = to_user_item_coo(test_ratings)
coo_test.resize(matrix_shape)
csr_test = coo_test.tocsr()

In [None]:
def train(
    coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True
):
    """Fit a CPU ALS model on an interaction matrix and return it.

    Parameters
    ----------
    coo_train
        Interaction matrix handed to ``fit`` unchanged (the notebook passes a
        CSR matrix despite the parameter name).
    factors, iterations, regularization
        Forwarded to the ``AlternatingLeastSquares`` constructor.
    show_progress
        Forwarded to ``fit``.

    Returns
    -------
    The fitted ``AlternatingLeastSquares`` instance. The random state is fixed
    to 42.
    """
    hyperparams = dict(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    als = implicit.cpu.als.AlternatingLeastSquares(**hyperparams)
    als.fit(coo_train, show_progress=show_progress)
    return als

In [None]:
# Fit on the train split; best_params overrides train()'s default
# factors / iterations / regularization.
model = train(csr_train, **best_params)

---DEBUG ZONE---

In [None]:
import os

# Create the output directory if it is missing. exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating.
os.makedirs("weights", exist_ok=True)

# Persist the trained model next to the notebook.
model.save(r"weights/als.npz")

In [None]:
# Replaces the in-memory `model` with weights loaded from disk, so everything
# below evaluates the loaded model when the notebook is run top to bottom.
# NOTE(review): this path (../src/weights/) differs from the one the previous
# cell saves to (weights/) — confirm which file is meant to be evaluated.
model = implicit.cpu.als.AlternatingLeastSquares.load(r"../src/weights/als.npz")

--- END OF DEBUG ---

In [None]:
# Mean average precision at K=6 for `model`, evaluated against the test matrix.
mean_average_precision_at_k(model, csr_train, csr_test, K=6, show_progress=True)
# Recorded value (presumably from an earlier run): 0.08810264778725628


In [None]:
# NDCG at K=20 for `model`, evaluated against the test matrix.
implicit.evaluation.ndcg_at_k(model, csr_train, csr_test, K=20, show_progress=True)
# Recorded value (presumably from an earlier run): 0.13433445514353154


# Debug Zone (checking whether the model's recommendations look reasonable)

In [None]:
# Pick an arbitrary user and judge the recommendations by eye.
USERID = 8  # raw userId, as it appears in the ratings file

# The model and csr_train are indexed by mapped_user_id (the matrix row), not by
# the raw userId, so translate before querying the model.
user_idx = user_map[USERID]

encoder = MovieEncoder(movie_csv_path=constants.MOVIE_PATH_SANDBOX)
user_viewed_movie_ids = ratings[ratings["userId"] == USERID]["movieId"].values
print(f"Пользователь просмотрел эти фильмы ({len(user_viewed_movie_ids)} шт.):")
for viewed_movie_id in user_viewed_movie_ids:
    print(encoder.to_title(viewed_movie_id))

print()

ids, scores = model.recommend(user_idx, csr_train[user_idx])
print("Мы рекомендуем ему эти:")
for item_idx, score in zip(ids, scores):
    # recommend() returns matrix column indices; convert them back to raw
    # movieIds before the title lookup. The previous version passed the index
    # itself to the encoder, which is why one "ID" had to be skipped by hand.
    print(encoder.to_title(item_ids[int(item_idx)]), score)

In [None]:
# similar_users() expects a matrix row index, so translate the raw userId first.
# The returned indices are translated back to raw userIds for display.
similar_user_idx, similar_user_scores = model.similar_users(user_map[USERID])
[
    (user_ids[int(idx)], score)
    for idx, score in zip(similar_user_idx, similar_user_scores)
]