# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from tqdm import tqdm
import scipy.sparse as sp

import warnings
warnings.simplefilter('ignore')

from numpy.linalg import svd

from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from rectools.metrics.ranking import MAP
import scipy.sparse as scs

import os

In [2]:
us = os.getcwd()
us

'/Users/liliyaivannikova/Documents/project/git/RecSys_Films/recsyc_part1'

In [3]:
if 'liliyaivannikova' in us:
    PATH = r'/Users/liliyaivannikova/Documents/project/ml-latest/'
    movies = pd.read_csv(PATH + r'movies.csv')
    rating = pd.read_csv(PATH + r'ratings.csv')
elif 'Владислав' in us:
    movies = pd.read_csv(r'dataset/movies.csv')
    rating = pd.read_csv(r'dataset/ratings.csv')

In [4]:
print(f'''
movies: {movies.shape}
rating: {rating.shape}
''')


movies: (86537, 3)
rating: (33832162, 4)



In [5]:
movies.columns = [col.upper() for col in movies.columns]

movies['REALEASE'] = movies['TITLE'].str.extract("\((\d{4})\)", expand=True)
movies['REALEASE'] = pd.to_datetime(movies['REALEASE'], format='%Y')
movies['REALEASE'] = movies['REALEASE'].dt.year
movies['TITLE'] = movies['TITLE'].str[:-7]

In [6]:
movies['TITLE'] = movies['TITLE'].astype('category')
movies['REALEASE'] = movies['REALEASE'].astype('float16')
movies['GENRES'] = movies['GENRES'].str.replace('|', ',')

In [7]:
rating.columns = [col.upper() for col in rating.columns]
rating['TIMESTAMP'] = pd.to_datetime(rating['TIMESTAMP'], unit='s')

# Обработка полей

In [8]:
rating_stat = rating.groupby('USERID')['MOVIEID'].count().reset_index()
rating_stat['MOVIEID'].quantile(0.05)

3.0

In [9]:
#rating_stat['FLAG_05'] = np.where(rating_stat['MOVIEID'] > rating_stat['MOVIEID'].quantile(0.05), 1, 0)
rating_stat['FLAG_05'] = np.where(rating_stat['MOVIEID'] > 20, 1, 0)

In [10]:
rating = rating.merge(rating_stat[['USERID', 'FLAG_05']], how = 'left', on = 'USERID')

In [11]:
rating.head()

Unnamed: 0,USERID,MOVIEID,RATING,TIMESTAMP,FLAG_05
0,1,1,4.0,2008-11-03 17:52:19,1
1,1,110,4.0,2008-11-05 06:04:46,1
2,1,158,4.0,2008-11-03 17:31:43,1
3,1,260,4.5,2008-11-03 18:00:04,1
4,1,356,5.0,2008-11-03 17:58:39,1


In [12]:
# удаляем пропуски
rating = rating[~((rating.TIMESTAMP.isna())|(rating.USERID.isna())|(rating.RATING.isna()))]
# удаляем странного юзера
rating.query('USERID != 189614', inplace = True)
# удалим юзеров, у которых оценок меньше 3 (по 5 квантилю)
rating.query('FLAG_05 == 1', inplace = True)
print(rating.shape)

(32346391, 5)


# train-test split

In [13]:
rating.rename({'USERID':'user_id',
            'MOVIEID':'item_id',
            'RATING':'weight',
            'TIMESTAMP':'datetime'}, axis=1, inplace=True)

Разделим выборку на обучение и тест следующим образом: для каждого пользователя в тестовую выборку попадут 10 его последних оценок.

In [14]:
num_users, num_movies = rating['user_id'].nunique(), rating['item_id'].nunique()
num_users, num_movies

(198927, 81555)

In [15]:
user_df = pd.DataFrame(np.vstack((np.arange(1, num_users+1), rating['user_id'].unique())).T, 
                         columns= ['user_id_mod', 'user_id'])
item_df = pd.DataFrame(np.vstack((np.arange(1, num_movies+1), rating['item_id'].unique())).T, 
                         columns= ['item_id_mod', 'item_id'])

In [16]:
rating = rating.merge(user_df, how = 'left', on = 'user_id')
rating = rating.merge(item_df, how = 'left', on = 'item_id')

In [17]:
train_ratings, test_ratings = [], []
num_test_samples = 10

for userId, user_data in rating.groupby('user_id'):
    train_ratings += [user_data[:-num_test_samples]]
    test_ratings += [user_data[-num_test_samples:]]

train_ratings = pd.concat(train_ratings)
test_ratings = pd.concat(test_ratings)
train_ratings.shape, test_ratings.shape

((30357121, 7), (1989270, 7))

In [18]:
del rating

In [19]:
train_ratings.head(3)

Unnamed: 0,user_id,item_id,weight,datetime,FLAG_05,user_id_mod,item_id_mod
0,1,1,4.0,2008-11-03 17:52:19,1,1,1
1,1,110,4.0,2008-11-05 06:04:46,1,1,2
2,1,158,4.0,2008-11-03 17:31:43,1,1,3


In [20]:
user_items = scs.coo_array((train_ratings['weight'], (train_ratings['user_id_mod'], train_ratings['item_id_mod'])), 
                  shape=(num_users + 1, num_movies + 1)).tocsr()

In [21]:
from implicit.als import AlternatingLeastSquares

In [22]:
als_model = AlternatingLeastSquares(iterations=50, regularization = 0.05,
                                    random_state=0, 
                                    alpha=2.0, num_threads=2)

In [23]:
als_model.fit(user_items)

100%|██████████| 50/50 [20:49<00:00, 24.98s/it]


In [32]:
userid = 1
ids, scores = als_model.recommend(userid, user_items[:, [userid]], 
                                  N=10, filter_already_liked_items=False)

In [33]:
scores, ids

(array([1.073283  , 1.0533159 , 1.0225102 , 1.0154611 , 1.0140581 ,
        0.98006964, 0.9677618 , 0.95948553, 0.956915  , 0.932565  ],
       dtype=float32),
 array([ 27,  39,  41,   2,  13,  11,  46,  23,   4, 226], dtype=int32))

In [34]:
test_user_items = scs.coo_array((test_ratings['weight'], 
                                 (test_ratings['user_id_mod'], test_ratings['item_id_mod'])), 
                                  shape=(num_users + 1, num_movies + 1)).tocsr()

# recommend

In [35]:
ids, scores = als_model.recommend(test_ratings['user_id_mod'].unique(), 
                                  test_user_items, filter_already_liked_items=False)

In [36]:
reco = pd.DataFrame()
reco['user_id'] = pd.DataFrame(np.repeat(test_ratings['user_id_mod'].unique(), 
                                             10, axis=0), columns=['user_id_mod'])
reco['item_id'] = np.ravel(ids)
reco['score'] = np.ravel(scores)
reco['rank'] = reco.groupby("user_id")["score"].rank(method="first", ascending=False)

In [37]:
reco.head()

Unnamed: 0,user_id,item_id,score,rank
0,1,27,1.073283,1.0
1,1,39,1.053316,2.0
2,1,41,1.02251,3.0
3,1,2,1.015461,4.0
4,1,13,1.014058,5.0


In [38]:
# это пока выглядит как костыль, я поправлю :)
train_ratings['user_id_true'] = train_ratings['user_id']
train_ratings['user_id'] = train_ratings['user_id_mod']
train_ratings['item_id_true'] = train_ratings['item_id']
train_ratings['item_id'] = train_ratings['item_id_mod']

test_ratings['user_id_true'] = test_ratings['user_id']
test_ratings['user_id'] = test_ratings['user_id_mod']
test_ratings['item_id_true'] = test_ratings['item_id']
test_ratings['item_id'] = test_ratings['item_id_mod']

In [41]:
MAP(k=3).calc_per_user(reco, test_ratings).mean() * 100

0.016136572712603117