Что делать?  

1.Датасет ml-latest  
2.Вспомнить подходы, которые мы разбирали  
3.Выбрать понравившийся подход к гибридным системам  
4.Написать свою  

Материалы здесь: https://github.com/ALKONDR/netology-recsys/blob/master/lecture-5/lecture-5-part-2.ipynb

In [1]:

from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
# movies_with_ratings.head()

In [46]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [47]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [48]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

# learning

In [71]:
from surprise import SVD

In [72]:
%%time
algo_1 = SVD(n_factors=20, n_epochs=20)
algo_1.fit(trainset)

Wall time: 2.31 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26050454af0>

In [73]:
test_pred_1 = algo_1.test(testset)

In [74]:
accuracy.rmse(test_pred_1, verbose=True)

RMSE: 0.8708


0.8708154840990902

In [75]:
from surprise import KNNWithMeans

In [76]:
# %%time
algo_2 = KNNWithMeans(k=20, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo_2.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Wall time: 17.3 s


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2605048c9d0>

In [77]:
test_pred_2 = algo_2.test(testset)

In [78]:
accuracy.rmse(test_pred_2, verbose=True)

RMSE: 0.8740


0.8739785042204141

In [79]:
from surprise import SVDpp

In [80]:
# %%time
algo_3 = SVDpp()
algo_3.fit(trainset)

Wall time: 11min 57s


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x260502b6310>

In [81]:
test_pred_3 = algo_3.test(testset)

In [82]:
accuracy.rmse(test_pred_3, verbose=True)

RMSE: 0.8583


0.8583040210997777

In [83]:
from surprise import SlopeOne

In [84]:
# %%time
algo_4 = SlopeOne()
algo_4.fit(trainset)

Wall time: 5.76 s


<surprise.prediction_algorithms.slope_one.SlopeOne at 0x260503d4070>

In [85]:
test_pred_4 = algo_4.test(testset)

In [86]:
accuracy.rmse(test_pred_4, verbose=True)

RMSE: 0.8959


0.8959120981252762

# stacking

In [122]:
predicts_df  = dataset.copy().drop(['uid','iid'], axis=1)

for tqdm_notebook(index, row in dataset.iterrows()):
    predicts_df.loc[index,'algo_1'] = algo_1.predict(uid=row['uid'], iid=row['iid']).est
    predicts_df.loc[index,'algo_2'] = algo_2.predict(uid=row['uid'], iid=row['iid']).est
    predicts_df.loc[index,'algo_3'] = algo_3.predict(uid=row['uid'], iid=row['iid']).est
    predicts_df.loc[index,'algo_4'] = algo_4.predict(uid=row['uid'], iid=row['iid']).est
#     break

In [123]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [126]:
# predicts_df

In [128]:
X = predicts_df.drop('rating',axis=1)
y = predicts_df['rating']

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [130]:
algo_5 = LinearRegression()

In [131]:
algo_5.fit(X_train, y_train)

LinearRegression()

In [133]:
final_pred = algo_5.predict(X_test)

In [134]:
from sklearn.metrics import mean_squared_error

In [135]:
rmse_final = np.sqrt(mean_squared_error(y_test, final_pred))

In [143]:
def stacking_recommendation(user_id, item_id):
    predictions = pd.DataFrame()
    
    predictions.loc[0,'algo_1'] = algo_1.predict(uid=row['uid'], iid=row['iid']).est
    predictions.loc[0,'algo_2'] = algo_2.predict(uid=row['uid'], iid=row['iid']).est
    predictions.loc[0,'algo_3'] = algo_3.predict(uid=row['uid'], iid=row['iid']).est
    predictions.loc[0,'algo_4'] = algo_4.predict(uid=row['uid'], iid=row['iid']).est
    
    fnal_est = algo_5.predict(predictions)
    
    return fnal_est

In [148]:
stacking_recommendation(184,'Toy Story (1995)')

array([3.95768502])

In [149]:
# предсказан рейтинг конкретного фильма для конкретного пользователя при помощи стакинга моделей

# метарекомендации

In [183]:
# predicts_df['algo_1_r'] = np.abs(predicts_df['algo_1'] - predicts_df['rating'])
# predicts_df['algo_2_r'] = np.abs(predicts_df['algo_2'] - predicts_df['rating'])
# predicts_df['algo_3_r'] = np.abs(predicts_df['algo_3'] - predicts_df['rating'])
# predicts_df['algo_4_r'] = np.abs(predicts_df['algo_4'] - predicts_df['rating'])

In [187]:
# predicts_df['true_algo'] = predicts_df.copy().loc[:,['algo_1_r','algo_2_r','algo_3_r','algo_4_r']].idxmin(axis="columns")

In [207]:
pred_df = predicts_df.copy()

In [214]:
pred_df = pred_df.join(dataset.drop('rating', axis=1))

In [215]:
pred_df.head()

Unnamed: 0,rating,algo_1,algo_2,algo_3,algo_4,uid,iid
0,4.0,4.63102,4.437411,4.743843,4.688279,1.0,Toy Story (1995)
1,4.0,3.810775,4.031528,3.877734,3.859359,5.0,Toy Story (1995)
2,4.5,3.441314,4.13526,3.368667,3.686063,7.0,Toy Story (1995)
3,2.5,3.68476,2.935026,3.397402,3.670467,15.0,Toy Story (1995)
4,4.5,4.082983,4.296729,4.247865,4.143816,17.0,Toy Story (1995)


In [236]:
best_algo_for_user_dict = {}

for user_id, group in tqdm_notebook(pred_df.groupby('uid')):
    rmse_list = []
    
    rmse_list.append(np.sqrt(mean_squared_error(group['rating'], group['algo_1'])))
    rmse_list.append(np.sqrt(mean_squared_error(group['rating'], group['algo_2'])))
    rmse_list.append(np.sqrt(mean_squared_error(group['rating'], group['algo_3'])))
    rmse_list.append(np.sqrt(mean_squared_error(group['rating'], group['algo_4'])))
    
    best_algo_for_user_index = np.array(rmse_list).argmin()
    
    best_algo_for_user_dict.setdefault(user_id,(f'algo_{best_algo_for_user_index+1}'))
#     break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for user_id, group in tqdm_notebook(pred_df.groupby('uid')):


HBox(children=(FloatProgress(value=0.0, max=610.0), HTML(value='')))




In [273]:
models_dict = {
    'algo_1' : algo_1, 
    'algo_2' : algo_2, 
    'algo_3' : algo_3, 
    'algo_4' : algo_4, 
}

In [252]:
best_algo_for_user_dict.get(184)

'algo_2'

In [275]:
def best_algo_recommendation(user_id, item_id):
    best_algo = best_algo_for_user_dict.get(user_id)

    pred_rait = models_dict.get(best_algo).predict(user_id, item_id)
    
    return pred_rait

In [276]:
best_algo_recommendation(184,'Toy Story (1995)')

Prediction(uid=184, iid='Toy Story (1995)', r_ui=None, est=4.5419459385279914, details={'actual_k': 20, 'was_impossible': False})

# lightfm

In [15]:
from lightfm.cross_validation import random_train_test_split

In [35]:
from scipy.sparse import coo_matrix 

In [39]:
# dataset.head(2)

In [38]:
dataset_coo = coo_matrix(dataset)

In [44]:
train, test = random_train_test_split(dataset_coo, test_percentage=0.3, random_state=42)

AttributeError: 'int' object has no attribute 'shuffle'

In [None]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

model = LightFM()
model.fit(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

In [None]:
model = LightFM(learning_rate=0.05, loss='warp')

model.fit_partial(train, epochs=10)

train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

In [None]:
def sample_recommendation(model, data, user_ids):
    n_users, n_items = data['train'].shape
    for user_id in user_ids:
        known_positives = data['item_labels'][data['train'].tocsr()                                    
                          [user_id].indices]
        
        scores = model.predict(user_id, np.arange(n_items))

        top_items = data['item_labels'][np.argsort(-scores)]

        print("User %s" % user_id)
        print("     Known positives:")
        
        for x in known_positives[:3]:
            print("        %s" % x)
        
        print("     Recommended:")
        
        for x in top_items[:3]:
            print("        %s" % x)

In [None]:
sample_recommendation(model, movielens, [10, 25, 451])