In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np

import implicit
from implicit.evaluation import mean_average_precision_at_k
from scipy.sparse import coo_matrix
from tqdm import tqdm


In [None]:
import sys
sys.path.append('../src')

import constants
from utils import train_test_split, MovieEncoder

# Read data

In [None]:
# Load the MovieLens 20M tables from the sibling dataset directory.
# Forward slashes are used because they resolve on Windows, Linux and macOS alike,
# whereas a backslash is only treated as a path separator on Windows.
ratings = pd.read_csv("../MovieLens_20M_Dataset/rating.csv")
movies = pd.read_csv("../MovieLens_20M_Dataset/movie.csv")

# Data preprocessing

In [None]:
# For training purposes we keep the ratings of only 30% of the users.
# A seeded generator makes the sampled subset identical after a kernel restart.
# NOTE: this cell overwrites `ratings`, so re-running it without reloading the data
# samples again from the already-reduced frame.
unique_user_ids = ratings['userId'].unique()
sampling_rng = np.random.default_rng(42)
rand_userIds = sampling_rng.choice(unique_user_ids,
                                   size=int(len(unique_user_ids) * 0.3),
                                   replace=False)

ratings = ratings.loc[ratings['userId'].isin(rand_userIds)]
print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))

### Train-test split

In [None]:
# Split the sampled ratings into a train part and a test part using the
# project helper imported from utils.
train_ratings, test_ratings = train_test_split(ratings)

### Converting the dataset into an implicit feedback dataset

In [None]:
# Treat every recorded rating as a positive interaction (implicit feedback).
# `assign` returns a new frame instead of writing through `.loc`, so the result does
# not depend on whether `train_ratings` is a view or a copy of `ratings`, no
# SettingWithCopyWarning is raised, and re-running the cell is harmless.
train_ratings = train_ratings.assign(rating=1)

train_ratings.sample(5)

In [None]:
# Build the sparse interaction matrix: rows are indexed by userId, columns by
# movieId, and the stored values come from the 'rating' column.
row, col, data = (train_ratings[column].values
                  for column in ('userId', 'movieId', 'rating'))
coo_train = coo_matrix((data, (row, col)))
coo_train

### Check that model works ok with data

In [None]:
# Smoke test: fit a deliberately tiny model (10 factors, 2 iterations) just to
# confirm that the interaction matrix is accepted by the library.
# The matrix is converted to CSR explicitly because implicit's ALS works on CSR
# input; handing it a COO matrix leaves the conversion to the library.
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train.tocsr())

# Validation

### Functions required for validation

In [None]:
def to_user_item_coo(df: DataFrame, shape=None):
    """Turn a dataframe of transactions into a COO sparse (users x items) matrix.

    Rows are indexed by ``userId``, columns by ``movieId`` and the stored values
    are taken from the ``rating`` column.

    Args:
        df: dataframe with ``userId``, ``movieId`` and ``rating`` columns.
        shape: optional ``(n_users, n_items)`` tuple. When omitted, scipy infers
            the shape from the largest ids present in ``df``. Pass it explicitly
            when several matrices (e.g. train and test) must share dimensions.

    Returns:
        A ``scipy.sparse.coo_matrix`` holding one stored entry per row of ``df``.
    """
    user_idx = df['userId'].values
    item_idx = df['movieId'].values
    values = df['rating'].values
    return coo_matrix((values, (user_idx, item_idx)), shape=shape)

def get_val_matrices(df: DataFrame):
    """Split ``df`` into train/test parts and return both as CSR matrices.

    Returns a dictionary with the following keys:
            csr_train: training data in CSR sparse format and as (users x items)
            csr_test:  test data in CSR sparse format and as (users x items)

    Both matrices are padded to one common shape, so a given row/column index
    refers to the same user/movie in either matrix.
    """
    df_train, df_test = train_test_split(df)

    coo_train = to_user_item_coo(df_train)
    coo_test = to_user_item_coo(df_test)

    # Each COO matrix infers its shape from the largest ids it contains, so the
    # two parts can end up with different dimensions. Use the larger extent of
    # each axis for both.
    common_shape = (max(coo_train.shape[0], coo_test.shape[0]),
                    max(coo_train.shape[1], coo_test.shape[1]))

    def _as_padded_csr(coo):
        # Rebuild from the stored triplets with the explicit common shape.
        return coo_matrix((coo.data, (coo.row, coo.col)), shape=common_shape).tocsr()

    return {'csr_train': _as_padded_csr(coo_train),
            'csr_test': _as_padded_csr(coo_test)
          }

def validate(matrices: dict, factors=200, iterations=20, regularization=0.01, show_progress=True,
             random_state=42):
    """Train an ALS model on the training matrix and score it with MAP@30.

    Args:
        matrices: dictionary with the keys ``'csr_train'`` and ``'csr_test'``.
        factors: embedding dimension of the ALS model.
        iterations: number of ALS iterations.
        regularization: regularization factor passed to the ALS model.
        show_progress: forwarded to both ``fit`` and the MAP@K computation.
        random_state: seed for the model initialisation. It is fixed by default
            (matching ``train``) so that runs with different hyper-parameters
            differ only because of those hyper-parameters.

    Returns:
        The MAP@30 value, which is also printed together with the parameters.
    """
    csr_train, csr_test = matrices['csr_train'], matrices['csr_test']

    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization,
                                                 random_state=random_state)
    model.fit(csr_train, show_progress=show_progress)

    # Note from the original author: implicit's MAP@K cannot take repeated items
    # into account, which is said to be the case for this data.
    map30 = mean_average_precision_at_k(model, csr_train, csr_test, K=30, show_progress=show_progress)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@30: {map30:6.5f}")
    return map30

In [None]:
# Build the train/test CSR matrices consumed by validate() from the sampled ratings.
matrices = get_val_matrices(ratings)
matrices

In [None]:
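# Grid search over the ALS hyper-parameters (left commented out because a full run
# is very slow; the chosen parameters are hard-coded in the next cell)
## TQDM initialization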
# factors_params = [40, 50, 60, 100, 200, 500, 1000]
# iter_params = [3, 12, 14, 15, 20]
# regularization_params = [0, 0.1, 0.01]

# total_iterations = len(factors_params) * len(iter_params) * len(regularization_params)
# pbar = tqdm(total=total_iterations, desc="Progress")

# best_map30 = 0
# for factors in factors_params:
#     for iterations in iter_params:
#         for regularization in regularization_params:
#             map30 = validate(matrices, factors, iterations, regularization, show_progress=False)
#             pbar.update(1)
#             if map30 > best_map30:
#                 best_map30 = map30
#                 best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
#                 print(f"Best MAP@30 found. Updating: {best_params}")

In [None]:
# The search reached factors = 500; after that it became too slow
# (~80 sec/it compared with 15 sec/it at the start), so it was stopped there.
best_params = {'factors': 100, 'iterations': 14, 'regularization': 0.1}

In [None]:
# Drop the validation matrices; presumably to free memory before the full-dataset run.
del matrices

# Training over the full dataset

In [None]:
# Reload the complete ratings table (the earlier cells reduced `ratings` to a
# 30% user sample) and split it again.
# Forward slashes keep the relative path valid on Windows, Linux and macOS.
ratings = pd.read_csv("../MovieLens_20M_Dataset/rating.csv")
train_ratings, test_ratings = train_test_split(ratings)

In [None]:
coo_train = to_user_item_coo(train_ratings)
coo_test = to_user_item_coo(test_ratings)

# Each COO matrix infers its shape from the largest ids it contains, so train and
# test can end up with different dimensions. Pad both to the larger extent of each
# axis so that row/column indices line up when the two matrices are used together.
common_shape = (max(coo_train.shape[0], coo_test.shape[0]),
                max(coo_train.shape[1], coo_test.shape[1]))

csr_train = coo_matrix((coo_train.data, (coo_train.row, coo_train.col)),
                       shape=common_shape).tocsr()
csr_test = coo_matrix((coo_test.data, (coo_test.row, coo_test.col)),
                      shape=common_shape).tocsr()

In [None]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    """Fit an ALS model on the given interaction matrix and return it.

    Args:
        coo_train: sparse interaction matrix to fit on (despite the name, this
            notebook passes a CSR matrix).
        factors: embedding dimension of the ALS model.
        iterations: number of ALS iterations.
        regularization: regularization factor passed to the ALS model.
        show_progress: forwarded to ``fit``.

    Returns:
        The fitted ``AlternatingLeastSquares`` model (seeded with 42).
    """
    als_settings = {
        'factors': factors,
        'iterations': iterations,
        'regularization': regularization,
        'random_state': 42,
    }
    als_model = implicit.als.AlternatingLeastSquares(**als_settings)
    als_model.fit(coo_train, show_progress=show_progress)
    return als_model

In [None]:
# Fit the final model on the full training matrix with the hard-coded best parameters.
model = train(csr_train, **best_params)

In [None]:
# MAP@30 of the final model, computed by implicit from the train and test matrices.
mean_average_precision_at_k(model, csr_train, csr_test, K=30, show_progress=True)

# Debug Zone (checking whether the model's recommendations look sensible)

In [None]:
# Get the list of users together with the number of movies each of them has watched,
# then keep only the users with more than one movie.
user_movie_counts = ratings.groupby('userId')['movieId'].count()
users_with_multiple_movies = user_movie_counts[user_movie_counts > 1]
users_with_multiple_movies

In [None]:
# Pick an arbitrary user from the list above and eyeball the recommendations
USERID = 4


encoder = MovieEncoder(movie_csv_path=constants.MOVIE_PATH)
print('Пользователь просмотрел эти фильмы:')
# Single `.loc` lookup instead of chained indexing (filter rows, then pick the column).
user_viewed_movie_ids = ratings.loc[ratings['userId'] == USERID, 'movieId'].values
for movieId in user_viewed_movie_ids:
    print(encoder.to_title(movieId))
print()
ids, scores = model.recommend(USERID, csr_train[USERID])
print('Мы рекомендуем ему эти')
# The loop variable is `movie_id` rather than `id`: this loop runs at notebook top
# level, so a variable called `id` would shadow the builtin in every later cell.
for movie_id, score in zip(ids, scores):
    print(encoder.to_title(movie_id), score)