**The task is to make a hybrid recommendation system.**

In [68]:
from surprise import KNNWithMeans, SVD
from surprise import accuracy
from surprise import Reader
from surprise import Dataset

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import math

import pandas as pd

from itertools import islice

In [2]:
# Load the movie metadata and the user ratings from CSV files in the parent directory.
movies = pd.read_csv('../movies.csv')
ratings = pd.read_csv('../ratings.csv')

In [3]:
# Attach every rating to its movie (left join on movieId), renumber the rows,
# then discard any row that still contains a missing value.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [10]:
# Three-column frame (user id, item id, rating) handed to Dataset.load_from_df;
# movie titles are used as the item ids.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [5]:
# Smallest and largest rating present in the data; used as the rating scale.
min_r, max_r = ratings['rating'].min(), ratings['rating'].max()

In [13]:
# Wrap the ratings frame in a Surprise Dataset, using the observed min/max as the rating scale.
reader = Reader(rating_scale=(min_r, max_r))
data = Dataset.load_from_df(dataset, reader)

In [77]:
# First base model: user-based KNNWithMeans with Pearson-baseline similarity,
# fitted on every available rating (no hold-out).
knn_sim_options = {'name': 'pearson_baseline', 'user_based': True}
algo = KNNWithMeans(k=41, min_k=3, sim_options=knn_sim_options)
algo.fit(data.build_full_trainset())

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1ee4a50bdc8>

In [79]:
# Second base model: SVD matrix factorization, fitted on every available rating.
# SVD training is randomized; pinning random_state makes the fitted model, and
# every number derived from it further down, reproducible on Restart & Run All.
algo2 = SVD(n_factors=17, n_epochs=28, lr_all=0.009, reg_all=0.07, random_state=42)
algo2.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ee4c219ec8>

In [80]:
# preparing dataset for meta model
def get_meta(row):
    """Build one meta-model training row from a single rating row.

    Parameters
    ----------
    row : mapping-like (e.g. one row of ``movies_with_ratings``)
        Must provide the keys ``'userId'``, ``'title'`` and ``'rating'``.

    Returns
    -------
    pd.Series
        float64 Series with three entries, in this order:
        ``a1`` (estimate from ``algo``), ``a2`` (estimate from ``algo2``)
        and ``rating`` (the true rating copied from ``row``).
    """
    user, title = row['userId'], row['title']

    # Build the Series in one go with an explicit dtype: a bare ``pd.Series()``
    # is deprecated and its default dtype differs between pandas versions.
    return pd.Series(
        {
            'a1': algo.predict(uid=user, iid=title).est,
            'a2': algo2.predict(uid=user, iid=title).est,
            'rating': row['rating'],
        },
        dtype='float64',
    )


# Score every known (user, movie) rating row with both base models.
# Row-wise apply calls get_meta once per row, so this cell is slow.
meta_set = movies_with_ratings.apply(get_meta, axis=1)

In [81]:
meta_set

Unnamed: 0,a1,a2,rating
0,4.545657,4.791567,4.0
1,4.103373,3.883155,4.0
2,4.025270,3.848303,4.5
3,3.264592,3.133355,2.5
4,4.611026,4.113285,4.5
...,...,...,...
100849,3.705224,3.673726,4.0
100850,3.705224,3.562324,3.5
100851,3.705224,3.614377,3.5
100852,3.705224,3.448547,3.5


In [93]:
# Features are the two base-model estimates; the target is the true rating.
X = meta_set[['a1', 'a2']]
y = meta_set['rating']
# Fixed seed so the split, and every metric computed from it, is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [83]:
# Meta-model: linear blend of the two base estimates without an intercept term.
# The cell's value is the RMSE on the held-out rows.
model = LinearRegression(fit_intercept=False).fit(X_train, y_train)
p = model.predict(X_test)
math.sqrt(mean_squared_error(y_test, p))

0.4375869899637831

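The RMSE of the blend is lower than that of either base model on its own. Note that both base models were fitted on the full dataset, which includes these test rows, so this figure is optimistic.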

In [91]:
model.coef_

array([ 1.25645568, -0.26145054])

The weights are not good: one is greater than 1 and the other is negative. Let's try to fix them by fitting on the whole dataset.

In [94]:
# Refit the same estimator on all rows (train and test together) to see whether the weights change.
model.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)

In [95]:
model.coef_

array([ 1.25649682, -0.26129138])

In [None]:
The result is almost the same. OK, let's transform the weights: shift both by 0.5 so that they are positive, then normalize them so that they sum to 1.

In [100]:
# Shift each regression weight by 0.5, then rescale so the weights sum to 1.
new_coef = [c + 0.5 for c in model.coef_]
s = sum(new_coef)
new_coef = [x / s for x in new_coef]

In [101]:
new_coef

[0.8803588778041707, 0.11964112219582927]

In [102]:
# Weighted blend of the two base estimates, computed column-wise.
p = X_test['a1'] * new_coef[0] + X_test['a2'] * new_coef[1]

In [104]:
# RMSE of the fixed-weight blend on the held-out rows.
math.sqrt(mean_squared_error(y_test, p))

0.470017743799276

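The error increases slightly, but because the weights are now non-negative and sum to 1, the blended rating can no longer fall outside the range spanned by the two base predictions.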

In [107]:
def recommend_movie(user_id, top_n=10):
    """Recommend the highest-scoring movies the user has not rated yet.

    Each candidate title is scored by both base models (``algo`` and
    ``algo2``) and the two estimates are blended with the fixed weights in
    ``new_coef``.

    Parameters
    ----------
    user_id
        Id of the user, as stored in the ``userId`` column of
        ``movies_with_ratings``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of (title, blended_score) tuples
        Sorted by blended score, highest first; at most ``top_n`` entries.
    """
    # Titles this user already rated. Filtering rows with ``userId != user_id``
    # is not enough: a title the user rated is still present in other users'
    # rows, so it has to be excluded explicitly.
    is_own_rating = movies_with_ratings['userId'] == user_id
    seen = set(movies_with_ratings.loc[is_own_rating, 'title'])
    not_seen = [t for t in movies_with_ratings['title'].unique() if t not in seen]

    scored = []
    for mov in not_seen:
        # predictions of base models
        x1 = algo.predict(uid=user_id, iid=mov).est
        x2 = algo2.predict(uid=user_id, iid=mov).est

        # Fixed-weight blend; ``model.predict([[x1, x2]])[0]`` would use the
        # raw linear-regression fit instead.
        scored.append((mov, new_coef[0] * x1 + new_coef[1] * x2))

    # Stable sort, best score first; ties keep their first-seen order.
    scored.sort(key=lambda item: item[1], reverse=True)
    return scored[:top_n]

In [106]:
recommend_movie(10)

[('Education, An (2009)', 4.807272556810224),
 ('The Intern (2015)', 4.718238548748828),
 ('Captain Fantastic (2016)', 4.715380623445111),
 ('Wristcutters: A Love Story (2006)', 4.70792288614353),
 ('Three Billboards Outside Ebbing, Missouri (2017)', 4.661925013396432),
 ('Spectre (2015)', 4.622200608701434),
 ('Intouchables (2011)', 4.529189524474996),
 ('Despicable Me (2010)', 4.464844379960308),
 ('The Artist (2011)', 4.461875983054139),
 ('Chasing Liberty (2004)', 4.458064637923434)]