# Import

In [14]:
import pandas as pd
import plotly.graph_objs as go
import string
import re
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
#import cufflinks as cf
from plotly.offline import iplot
from tqdm import tqdm

from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
import plotly.express as px

#cf.go_offline()
#cf.set_config_file(offline = False, world_readable = True)

import plotly.io as pio
pio.renderers.default='notebook'

import warnings
warnings.simplefilter('ignore')

from rectools import Columns
from rectools.dataset import Dataset

from lightfm import LightFM as lf
from lightfm.data import Dataset
import scipy.sparse as sp
from lightfm.evaluation import precision_at_k

import os

In [2]:
us = os.getcwd()
us

'/Users/liliyaivannikova/Documents/project/git/RecSys_Films'

In [3]:
if 'liliyaivannikova' in us:
    PATH = r'/Users/liliyaivannikova/Documents/project/ml-latest/'
    movies = pd.read_csv(PATH + r'movies.csv')
    rating = pd.read_csv(PATH + r'ratings.csv')
elif 'Владислав' in us:
    movies = pd.read_csv(r'dataset/movies.csv')
    rating = pd.read_csv(r'dataset/ratings.csv')

In [4]:
print(f'''
movies: {movies.shape}
rating: {rating.shape}
''')


movies: (86537, 3)
rating: (33832162, 4)



In [6]:
movies.columns = [col.upper() for col in movies.columns]
movies['REALEASE'] = movies['TITLE'].str.extract("\((\d{4})\)", expand=True)
movies['REALEASE'] = pd.to_datetime(movies['REALEASE'], format='%Y')
movies['REALEASE'] = movies['REALEASE'].dt.year
movies['TITLE'] = movies['TITLE'].str[:-7]

In [7]:
# dummy-кодирование жанров
dfx = movies['GENRES'].str.get_dummies(sep='|')
for col in dfx.columns:
    dfx[col] = dfx[col].astype('int8')

In [8]:
movies = pd.concat([movies, dfx], axis=1) #.drop(columns=['GENRES'])
movies['TITLE'] = movies['TITLE'].astype('category')
movies['REALEASE'] = movies['REALEASE'].astype('float16')
movies['GENRES'] = movies['GENRES'].str.replace('|', ',')

In [10]:
rating.columns = [col.upper() for col in rating.columns]
rating['TIMESTAMP'] = pd.to_datetime(rating['TIMESTAMP'], unit='s')

# Фильтрация строк

In [11]:
# пустые поля
print(rating.shape)
rating = rating[~((rating.TIMESTAMP.isna())|(rating.USERID.isna())|(rating.RATING.isna()))]
print(rating.shape)

(33832162, 4)
(33832162, 4)


In [None]:
# при eda нашла странного юзера, уберем его при построении
print(rating.shape)
rating.query('USERID != 189614', inplace = True)
print(rating.shape)

# LightFM

In [13]:
def TrainTestSplit(df, n_days):
    '''
    df - датафрейм для разделения выборки
    n_days - количество дней, которые попадут в тест
    '''
    #df['weight'] = np.where(df['RATING'] >= 4, 1, 0)
    train_df = df.loc[df['datetime'] < df['datetime'].max() - pd.DateOffset(days=n_days),
                     ['item_id', 'user_id', 'weight']].copy()
    test_df = df.loc[df['datetime'] >= df['datetime'].max() - pd.DateOffset(days=n_days),
                    ['item_id', 'user_id', 'weight']].copy()

    return train_df, test_df

In [35]:
class LightFM():

    def __init__(self, model = None, df = None, IMDb_df = None, Genre = None):
        
        if model is None:
            self.model = LightFM(learning_rate = 0.05,
                                 loss = 'warp', 
                                 random_state = 111)
        else:
            self.model = model
        
        self.df = df
        self.IMDb_df = IMDb_df
        self.Genre = Genre
        
    def IMDb(self):
        """
            Обработка кейсов, когда нет оценок по user_id
        """
        if self.df is None:
            return None
        else:
       
            return self.IMDb_df['TITLE']
               
        
    def fit(self, df_train, epochs, item_features, num_threads):
        """
            Обучение модели LightFM
        """
        for i in tqdm(range(epochs)): 
            self.model.fit_partial(df_train, 
                            epochs = epochs, 
                            item_features = item_features,
                            num_threads = num_threads)
            
    def predict(self, user_id, k):
        """
            Рекомендации для пользователей
        """
        predictions = {}
        for user in user_id:
            predictions[user] = self.recommend(user_id=user, k=k)
        return predictions


    def recommend(self, user_id, k, movies_to_predict):
        """
            Расчет рекомендаций
        """
        # Если ранее такого пользователя не было, используем IMDb
        try:
            
            n_movies = self.df['item_id'].unique()

            prediction = self.model.predict(user_ids = user_id * len(movies_to_predict), 
                                        item_ids = movies_to_predict * len(user_id)
                                        )
            predict = np.array(n_movies)[np.argsort(-prediction)][:k]
            return predict.tolist()
        
        except:
            return self.IMDb()[:k].to_list()

In [40]:
rating.rename({'USERID':'user_id',
            'MOVIEID':'item_id',
            'RATING':'weight',
            'TIMESTAMP':'datetime'}, axis=1, inplace=True)

In [38]:
movies.rename({'MOVIEID':'item_id'}, axis=1, inplace=True)

In [41]:
train_df, test_df = TrainTestSplit(rating, 180)

In [50]:
dataset = Dataset()
dataset.fit(train_df['user_id'], train_df['item_id'], item_features=movies['GENRES'].values)
dataset.fit_partial(users=test_df['user_id'], items=test_df['item_id'])

In [51]:
interactions, _ = dataset.build_interactions(train_df.loc[:, ['user_id', 'item_id']].values.tolist())
interactions_test, _ = dataset.build_interactions(test_df.loc[:, ['user_id', 'item_id']].values.tolist())

In [52]:
train_movies = movies.loc[movies['item_id'].isin(train_df['item_id']), ['item_id', 'GENRES']]
features = [[v[0], v[1:].values.tolist()] for i, v in train_movies.loc[:, ['item_id', 'GENRES']].iterrows()]
movies_features = dataset.build_item_features(features)

In [53]:
movies_to_predict = [dataset._item_id_mapping[x] for x in test_df['item_id'].unique()]

In [56]:
model = lf(no_components=10,
                loss='warp', 
                random_state=111,
                learning_rate=0.05)

lfm_model = LightFM(model, rating)

In [57]:
# Обучение
lfm_model.fit(interactions, 
              epochs=5, 
              item_features = movies_features,
              num_threads = 40)

100%|████████████████████████████████████████████| 5/5 [13:49<00:00, 165.91s/it]


In [58]:
predict = lfm_model.recommend(user_id = [22], 
                              k = 10, 
                              movies_to_predict = movies_to_predict)

In [59]:
predict

[119145, 1639, 259, 510, 520, 8644, 533, 457, 85, 1923]

In [69]:
def mapk(model, df_test, k, item_features):
    test_precision = precision_at_k(model=model, 
                            test_interactions=df_test,
                            k=k,
                            item_features = movies_features)
    map_at_k = sum(test_precision)/len(test_precision)
    return map_at_k

In [70]:
mapk(model = lfm_model.model, df_test = interactions_test, k = 10, item_features = movies_features)

0.11825195914336005