# Import

numpy==1.26.4
pandas==1.5.3
python-dateutil==2.8.2
pytz==2024.1
six==1.16.0
implicit==0.7.2
scipy==1.11.4

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from tqdm import tqdm
import scipy.sparse as sp

import warnings
warnings.simplefilter('ignore')

from numpy.linalg import svd

from rectools.dataset import Dataset
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from rectools.metrics.ranking import MAP
import scipy.sparse as scs

import os

In [2]:
# Capture the current working directory; the data-loading cell below looks
# for a known user name inside this string to pick the dataset location.
us = os.getcwd()
us

'C:\\Users\\Владислав\\PycharmProjects\\RecSys_Films\\recsyc_part1'

In [3]:
%%time
us = os.getcwd()

if 'liliyaivannikova' in us:
    PATH = r'/Users/liliyaivannikova/Documents/project/ml-latest/'
elif 'Владислав' in us:
    PATH = 'C:/Users/Владислав/test_python_scripts/project/test project/dataset/'
    
movies = pd.read_csv(PATH + r'movies.csv')
rating = pd.read_csv(PATH + r'ratings.csv')

CPU times: total: 7.12 s
Wall time: 7.12 s


%%time
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    engine="python",
    header=None,
    names=['movieId','title','genres'],
    encoding='latin-1'
)
rating = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    header=None,
    names=['userId','movieId','rating','timestamp'],
)

In [4]:
# Report the (rows, columns) shape of both loaded tables.
print(f"\nmovies: {movies.shape}\nrating: {rating.shape}\n")


movies: (86537, 3)
rating: (33832162, 4)



In [5]:
# Normalise the movie column names to upper case.
movies.columns = [col.upper() for col in movies.columns]

# Pull the four-digit release year out of the "(YYYY)" part of the title.
# A raw string is used so that "\(" and "\d" are not read as (invalid) Python
# string escapes.
# NOTE: the column name 'REALEASE' (sic) is kept because later cells use it.
release_year = movies['TITLE'].str.extract(r"\((\d{4})\)", expand=False)
movies['REALEASE'] = pd.to_numeric(release_year)

# Remove the trailing " (YYYY)" only when it is actually present; blindly
# cutting the last 7 characters would truncate titles that carry no year or
# that end with extra whitespace.
movies['TITLE'] = movies['TITLE'].str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)

In [6]:
# Compact dtypes: titles as a categorical, release year as float16.
movies['TITLE'] = movies['TITLE'].astype('category')
movies['REALEASE'] = movies['REALEASE'].astype('float16')
# '|' is the regex alternation operator, so ask for a literal replacement
# explicitly instead of relying on the pandas-version-dependent default of
# the `regex` argument.
movies['GENRES'] = movies['GENRES'].str.replace('|', ',', regex=False)

In [7]:
# Upper-case the rating column names and convert the timestamp, given in
# seconds, into datetimes.
rating.columns = rating.columns.str.upper()
rating['TIMESTAMP'] = pd.to_datetime(rating['TIMESTAMP'], unit='s')

# Обработка полей

In [8]:
# Number of rated movies per user, followed by the 5% quantile of those counts.
rating_stat = rating.groupby('USERID', as_index=False)['MOVIEID'].count()
rating_stat['MOVIEID'].quantile(0.05)

3.0

In [9]:
# Flag users with enough history. A fixed cut-off of more than 20 ratings is
# used; cutting at the 5% quantile of the per-user counts was the earlier,
# now disabled, alternative.
MIN_USER_RATINGS = 20
rating_stat['FLAG_05'] = np.where(rating_stat['MOVIEID'] > MIN_USER_RATINGS, 1, 0)

In [10]:
# Attach each user's FLAG_05 value to every one of that user's rating rows.
user_flags = rating_stat[['USERID', 'FLAG_05']]
rating = rating.merge(user_flags, on='USERID', how='left')

In [11]:
# Preview the first rows of the ratings table after the flag merge.
rating.head()

Unnamed: 0,USERID,MOVIEID,RATING,TIMESTAMP,FLAG_05
0,1,1,4.0,2008-11-03 17:52:19,1
1,1,110,4.0,2008-11-05 06:04:46,1
2,1,158,4.0,2008-11-03 17:31:43,1
3,1,260,4.5,2008-11-03 18:00:04,1
4,1,356,5.0,2008-11-03 17:58:39,1


In [12]:
# Drop rows with a missing timestamp, user id or rating.
rating = rating.dropna(subset=['TIMESTAMP', 'USERID', 'RATING'])
# Exclude one anomalous user and keep only users whose FLAG_05 equals 1.
# NOTE(review): this step used to be described as dropping users with fewer
# than 3 ratings (the 5% quantile), but FLAG_05 is set in an earlier cell for
# users with more than 20 ratings.
rating = rating[(rating['USERID'] != 189614) & (rating['FLAG_05'] == 1)]
print(rating.shape)

(32346391, 5)


# train-test split

In [13]:
# Switch to user/item/weight/datetime column names; the "_true" suffix marks
# the original ids.
column_names = {
    'USERID': 'user_id_true',
    'MOVIEID': 'item_id_true',
    'RATING': 'weight',
    'TIMESTAMP': 'datetime',
}
rating = rating.rename(columns=column_names)

Разделим выборку на обучение и тест следующим образом: для каждого пользователя в тестовую выборку попадут 10 его последних оценок.

In [14]:
# Number of distinct users and movies left after filtering.
num_users = rating['user_id_true'].nunique()
num_movies = rating['item_id_true'].nunique()
num_users, num_movies

(198927, 81555)

In [15]:
# Lookup tables that map the original ids to consecutive ids starting at 1,
# numbered in order of first appearance in `rating`.
user_pairs = np.column_stack((np.arange(1, num_users + 1),
                              rating['user_id_true'].unique()))
user_df = pd.DataFrame(user_pairs, columns=['user_id', 'user_id_true'])

item_pairs = np.column_stack((np.arange(1, num_movies + 1),
                              rating['item_id_true'].unique()))
item_df = pd.DataFrame(item_pairs, columns=['item_id', 'item_id_true'])

In [16]:
# Add the consecutive user_id / item_id columns next to the original ids.
rating = (
    rating
    .merge(user_df, on='user_id_true', how='left')
    .merge(item_df, on='item_id_true', how='left')
)

In [17]:
# Order every user's ratings chronologically so that "last N rows" means
# "most recent N ratings" in the split below.
rating = rating.sort_values(by=['user_id_true', 'datetime'])

In [18]:
# Leave-last-N-out split: the last `num_test_samples` rows of every user (in
# the current row order) go to the test set, everything else to train.
num_test_samples = 10

# Position of each row counted from the end of its user's history
# (0 = that user's last row).
rows_from_end = rating.groupby('user_id').cumcount(ascending=False)
is_test = rows_from_end < num_test_samples

# Vectorised replacement for a per-user Python loop over every group; the
# stable sort by user_id reproduces the row order that concatenating the
# per-user slices produced.
train_ratings = rating[~is_test].sort_values('user_id', kind='stable')
test_ratings = rating[is_test].sort_values('user_id', kind='stable')
train_ratings.shape, test_ratings.shape

((30357121, 7), (1989270, 7))

In [19]:
# Drop the reference to the full ratings frame; the cells below work with
# train_ratings and test_ratings only.
del rating

In [20]:
# Preview the first three training rows.
train_ratings.head(3)

Unnamed: 0,user_id_true,item_id_true,weight,datetime,FLAG_05,user_id,item_id
2,1,158,4.0,2008-11-03 17:31:43,1,1,3
37,1,4896,4.0,2008-11-03 17:31:56,1,1,38
6,1,596,4.0,2008-11-03 17:32:04,1,1,7


In [21]:
# Sparse user x item matrix of training weights in CSR form. The consecutive
# ids start at 1, hence one extra row and column; values and indices are
# downcast to the smallest fitting numeric dtype first.
train_weights = pd.to_numeric(train_ratings['weight'], downcast="float")
train_rows = pd.to_numeric(train_ratings['user_id'], downcast="integer")
train_cols = pd.to_numeric(train_ratings['item_id'], downcast="integer")

user_items = scs.coo_array(
    (train_weights, (train_rows, train_cols)),
    shape=(num_users + 1, num_movies + 1),
).tocsr()

# Перебор гиперпараметров

In [22]:
from tqdm.contrib import itertools
import time

from implicit.cpu.als import AlternatingLeastSquares

# Grid search over ALS hyper-parameters, scored by MAP@top_k (in percent)
# against test_ratings. Each entry of mapk_list is
# [factors, regularization, alpha, map@k].
mapk_list = []

regularization_list = np.array([0.001, 0.01, 0.1, 0.5, 1])
factors_list = [3,5,10,100]
alpha_list = np.array([2.,3.,4.])

# One constant for the recommendation list length, so that N, the repeat
# count of the user-id column and the MAP cut-off cannot drift apart.
top_k = 10

# Everything below depends only on the test split, so it is built once
# instead of once per parameter combination.
test_user_ids = test_ratings['user_id'].unique()
reco_user_ids = np.repeat(test_user_ids, repeats=top_k, axis=0)
test_user_items = scs.coo_array((test_ratings['weight'],
                                 (test_ratings['user_id'],
                                  test_ratings['item_id'])),
                                shape=(num_users + 1, num_movies + 1)).tocsr()

for r, a, f in itertools.product(regularization_list, alpha_list, factors_list):
    als_model = AlternatingLeastSquares(factors=f,
                                        iterations=5,
                                        regularization=r,
                                        random_state=0,
                                        alpha=a,
                                        num_threads=8)
    als_model.fit(user_items, show_progress=False)

    # NOTE(review): with filter_already_liked_items=False the model may
    # recommend items the user already rated in train. implicit's docs also
    # describe `user_items` as the interactions of the listed users (one row
    # per entry of `userid`), not a test matrix. Confirm whether the train
    # rows with filtering enabled were intended.
    ids, scores = als_model.recommend(test_user_ids,
                                      test_user_items,
                                      N=top_k,
                                      filter_already_liked_items=False)

    # One row per (user, recommended item); each user's ids and scores are
    # laid out contiguously, matching the repeated user-id column.
    reco = pd.DataFrame({
        'user_id': reco_user_ids,
        'item_id': np.ravel(ids),
        'score': np.ravel(scores),
    })
    reco['rank'] = reco.groupby("user_id")["score"].rank(method="first", ascending=False)

    map_k = MAP(k=top_k).calc_per_user(reco, test_ratings).mean() * 100
    del reco
    mapk_list.append([f, r, a, map_k])
    

  0%|          | 0/60 [00:00<?, ?it/s]

In [23]:
# Show the ten parameter combinations with the highest MAP@10.
pd.set_option('display.max_rows', None)
grid_results = pd.DataFrame(
    mapk_list,
    columns=['Factors', 'regularization', 'alpha', 'map_10'],
)
grid_results.sort_values(['map_10'], ascending=False).head(10)

Unnamed: 0,Factors,regularization,alpha,map_10
56,3,1.0,4.0,1.02579
44,3,0.5,4.0,1.024351
32,3,0.1,4.0,1.022763
52,3,1.0,3.0,1.018938
40,3,0.5,3.0,1.016231
20,3,0.01,4.0,1.016032
8,3,0.001,4.0,1.015062
0,3,0.001,2.0,1.014536
12,3,0.01,2.0,1.014319
24,3,0.1,2.0,1.014299


# Обучение модели и рекомендации для обучающей выборки

In [33]:
from implicit.cpu.als import AlternatingLeastSquares

In [34]:
# Final model: factors, regularization and alpha equal the top row of the
# grid-search table above (3, 1.0, 4.0), trained for 50 iterations rather
# than the 5 used during the search.
als_params = dict(
    factors=3,
    iterations=50,
    regularization=1.0,
    random_state=0,
    alpha=4.0,
    num_threads=8,
)
als_model = AlternatingLeastSquares(**als_params)
als_model.fit(user_items)

  0%|          | 0/50 [00:00<?, ?it/s]

In [35]:
# Top-10 recommendations for a single user as a quick sanity check.
userid = 1
# Rows of `user_items` are users, so `user_items[[userid]]` is this user's
# 1 x n_items row. The previous `user_items[:, [userid]]` selected a column
# instead, i.e. one item's weights across all users.
# NOTE(review): implicit presumably ignores this argument while
# filter_already_liked_items is False, so the scores should not change - confirm.
ids, scores = als_model.recommend(userid,
                                  user_items[[userid]],
                                  N=10,
                                  filter_already_liked_items=False)
scores, ids

(array([0.8401008 , 0.81823945, 0.8179725 , 0.80663353, 0.7931324 ,
        0.7846249 , 0.7821287 , 0.75964725, 0.7567396 , 0.7543485 ],
       dtype=float32),
 array([115,   5, 111, 146,   4, 137,  27,   2, 134,  77]))

# Рекомендации для тестовой выборки и расчёт метрики

In [36]:
# кол-во рекомендаций

In [37]:
# Sparse user x item matrix of the held-out weights, with the same shape as
# the training matrix.
test_rows = test_ratings['user_id']
test_cols = test_ratings['item_id']
test_user_items = scs.coo_array(
    (test_ratings['weight'], (test_rows, test_cols)),
    shape=(num_users + 1, num_movies + 1),
).tocsr()

In [38]:
# Top-10 recommendations for every user that appears in the test split.
# NOTE(review): the test matrix is passed as `user_items` and already rated
# items are not filtered out - confirm this is the intended protocol.
test_user_ids = test_ratings['user_id'].unique()
ids, scores = als_model.recommend(
    test_user_ids,
    test_user_items,
    N=10,
    filter_already_liked_items=False,
)

In [39]:
# Long-format recommendation table: one row per (user, recommended item).
# Every user id is repeated 10 times to line up with the flattened ids/scores.
reco = pd.DataFrame({
    'user_id': np.repeat(test_ratings['user_id'].unique(), 10, axis=0),
    'item_id': np.ravel(ids),
    'score': np.ravel(scores),
})
# Rank within each user by descending score; ties keep their row order.
reco['rank'] = reco.groupby("user_id")["score"].rank(method="first", ascending=False)

In [40]:
# Preview the first rows of the recommendation table.
reco.head()

Unnamed: 0,user_id,item_id,score,rank
0,1,115,0.840101,1.0
1,1,5,0.818239,2.0
2,1,111,0.817973,3.0
3,1,146,0.806634,4.0
4,1,4,0.793132,5.0


In [41]:
# Mean of the per-user MAP@10 values against test_ratings, in percent.
map_per_user = MAP(k=10).calc_per_user(reco, test_ratings)
map_per_user.mean() * 100

1.0667182808785007