In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

### Сделаем список из 5 понравившихся фильмов для наших пользователей
понравившиеся это те у которых рейтинг выше их же среднего

In [3]:
movies_and_ratings = movies.merge(ratings, on='movieId')
movies_and_ratings.head(3)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946


In [4]:
mean_rating = movies_and_ratings.groupby('userId')['rating'].mean()
movies_and_ratings['mean_rating'] = movies_and_ratings['userId'].apply(lambda x: mean_rating[x])

In [5]:
movies_and_ratings['good_rating'] = movies_and_ratings.apply(lambda x: x['rating'] if x['mean_rating'] <= x['rating'] else np.NaN, axis=1)
movies_and_ratings = movies_and_ratings[ pd.isnull( movies_and_ratings['good_rating'] ) == 0 ]
movies_and_ratings = movies_and_ratings.drop(['mean_rating', 'good_rating'], axis=1).reset_index(drop=True)
movies_and_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0,965705637
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,21,3.5,1407618878


In [6]:
good_feedback = movies_and_ratings

In [7]:
good_feedback = movies_and_ratings.sort_values(['userId' ,'timestamp'], ascending=[True, False])

In [8]:
good_feedback_dict = {}

all_users = good_feedback['userId'].unique()

for user in all_users:
    top_movies = []
    for top in range(5):
        try:
            top_movies.append(good_feedback[good_feedback['userId']==user]['movieId'].values[top])
            good_feedback_dict[user] = top_movies
        except:
            continue

Для предсказания будем использовать каскад. На первом этапе отберем часть фильмов с помощью модели ALS и KNN. KNN будем брать от трех рандомных понравившихся фильмов, которые отобрали ранее

Модель_0 ALS 

In [9]:
movies_and_ratings['userId'] = movies_and_ratings['userId'].astype("category").cat.codes
movies_and_ratings['movieId'] = movies_and_ratings['movieId'].astype("category").cat.codes

In [10]:
shape_0 = len(movies_and_ratings['movieId'].unique())
shape_1 = len(movies_and_ratings['userId'].unique())

In [11]:
users_act = movies_and_ratings.loc[:, ['userId','movieId']].reset_index(drop=True)
users_act['act'] = 1
users_act.head(3)

Unnamed: 0,userId,movieId,act
0,4,0,1
1,6,0,1
2,16,0,1


In [12]:
activity = list(users_act['act'])
cols = users_act['movieId'].astype(int)
rows = users_act['userId'].astype(int)

In [13]:
len(rows), len(activity), len(cols)

(54732, 54732, 54732)

In [14]:
from scipy import sparse
data_sparse = sparse.csr_matrix((activity, (rows, cols)), shape=(shape_1, shape_0))

In [15]:
from implicit.als import AlternatingLeastSquares
algo_0 = AlternatingLeastSquares(factors=50)
algo_0.fit(data_sparse)

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:00<00:00, 24.86it/s]


In [16]:
userid = 1

user_items = data_sparse.T.tocsr()
recommendations = algo_0.recommend(userid, user_items, N=15)

In [17]:
recommendations_list = []
for i in recommendations:
    recommendations_list.append(i[0])

In [18]:
movies.iloc[recommendations_list]

Unnamed: 0,movieId,title,genres
291,333,Tommy Boy (1995),Comedy
180,212,Bushwhacked (1995),Adventure|Comedy|Crime|Mystery
57,64,Two if by Sea (1996),Comedy|Romance
554,665,Underground (1995),Comedy|Drama|War
42,46,How to Make an American Quilt (1995),Drama|Romance
116,141,"Birdcage, The (1996)",Comedy
583,718,"Visitors, The (Visiteurs, Les) (1993)",Comedy|Fantasy|Sci-Fi
225,261,Little Women (1994),Drama
178,210,Wild Bill (1995),Western
39,43,Restoration (1995),Drama


Теперь обучим KNN модель

In [19]:
movies_ = movies.copy()
movies_['description'] = movies_.apply(lambda x: x['genres'].replace('|', ' '), axis=1)

In [20]:
movies_ = movies_.drop('genres', axis=1)
movies_.head(3)

Unnamed: 0,movieId,title,description
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance


In [21]:
movies_list = []
description_list = []

for mov, desc in movies_[['title', 'description']].values:
    movies_list.append(mov)
    description_list.append(desc)

In [22]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
coutn_v = CountVectorizer()
X_train = coutn_v.fit_transform(description_list)
X_train.toarray(), X_train.toarray().shape

(array([[0, 1, 1, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [1, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64), (9742, 24))

In [23]:
tfidf = TfidfTransformer()
X_train_col = tfidf.fit_transform(X_train)
X_train_col.toarray(), X_train_col.toarray().shape

(array([[0.        , 0.41684567, 0.51622547, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.51236121, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.57860574, 0.        , 0.81560738, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]), (9742, 24))

In [24]:
for i in range(X_train_col.shape[1]):
    col_name = 'd{}'.format(i)
    movies_[col_name] = pd.Series(X_train_col.toarray()[:, i])

In [25]:
movies_ = movies_.drop('description', axis=1)
movies_.head(3)

Unnamed: 0,movieId,title,d0,d1,d2,d3,d4,d5,d6,d7,...,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0


In [26]:
train_data = movies_.iloc[:, 2:]
test_data = movies_[movies_['title'] == 'Jumanji (1995)'].iloc[:, 2:]

In [27]:
from sklearn.neighbors import NearestNeighbors
neighbor = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='euclidean')
neighbor.fit(train_data)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=10, p=2, radius=1.0)

In [28]:
predict = neighbor.kneighbors(test_data, return_distance=True)
movies.iloc[predict[1][0]]

Unnamed: 0,movieId,title,genres
1617,2161,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy
1556,2093,Return to Oz (1985),Adventure|Children|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
1799,2399,Santa Claus: The Movie (1985),Adventure|Children|Fantasy
109,126,"NeverEnding Story III, The (1994)",Adventure|Children|Fantasy
1618,2162,"NeverEnding Story II: The Next Chapter, The (1...",Adventure|Children|Fantasy
1514,2043,Darby O'Gill and the Little People (1959),Adventure|Children|Fantasy
53,60,"Indian in the Cupboard, The (1995)",Adventure|Children|Fantasy
3574,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
767,1009,Escape to Witch Mountain (1975),Adventure|Children|Fantasy


Теперь обучим два алгоритма из библиотеки surprise: KNNBasic и SVD

In [29]:
from surprise import KNNBasic, Dataset, Reader, accuracy, SVD
from surprise.model_selection import train_test_split, GridSearchCV

In [30]:
movies_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_ratings.dropna(inplace=True)
movies_ratings.head(3)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0


In [31]:
dataset = pd.DataFrame({
    'uid': movies_ratings.userId,
    'iid': movies_ratings.title,
    'rating': movies_ratings.rating
})

In [32]:
dataset.head(3) 

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5


In [33]:
ratings.rating.min()

0.5

In [34]:
ratings.rating.max()

5.0

In [35]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [36]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [37]:
params = {'k':np.arange(10, 101, 10),
          'sim_options': {'name': ['pearson_baseline'], 'user_based': [True]}
         }
grid_algo = GridSearchCV(KNNBasic, params, measures=['rmse', 'mae'], cv=5, n_jobs=-1)
grid_algo.fit(data)

In [38]:
grid_algo.best_params

{'rmse': {'k': 40,
  'sim_options': {'name': 'pearson_baseline', 'user_based': True}},
 'mae': {'k': 40,
  'sim_options': {'name': 'pearson_baseline', 'user_based': True}}}

In [39]:
algo_1 = KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo_1.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x246681243c8>

In [40]:
test_pred = algo_1.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.9742


0.9741719836539514

In [41]:
algo_2 = SVD(n_factors=20, n_epochs=20)
algo_2.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x24668124e80>

In [42]:
test_pred = algo_2.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8700


0.8699714165613905

Собираем финальную функцию для предсказания

In [43]:
def get_movies(user):
    '''первая часть каскада собранная из предсказания ALS и ближайших соседей к трем понравившимся 
    фильмам'''
    list_for_user = []
    recommendations = algo_0.recommend(user, user_items, N=15)
    for i in recommendations:
        list_for_user.append(i[0])
        
    films = np.random.choice(good_feedback_dict[user], 3)
    for film in films: 
        data_for_pred = movies_[movies_['movieId'] == film].iloc[:, 2:]
        predict = neighbor.kneighbors(data_for_pred, return_distance=True)
        for i in predict[1][0]:
            if i is not list_for_user:
                list_for_user.append(i)

    '''Вторая часть каскада. Находим с помошью двух обученных алгоритмов и усрядняем их оценку'''
    
    user_movies = movies_and_ratings[movies_and_ratings.userId == user].title.unique()
    
    scores = []
    titles = []
    for iid in movies_and_ratings.loc[movies_and_ratings['movieId'].isin(list_for_user)].title.unique():
        if iid is not user_movies:
            scores.append((algo_1.predict(user, iid).est + 
                          algo_2.predict(user, iid).est)/2)
            titles.append(iid)
        
        
    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [44]:
get_movies(27)

Casablanca (1942) 4.354239360141479
Pecker (1998) 4.2083158550150275
Glory (1989) 4.055441452497826
Postman, The (Postino, Il) (1994) 3.970265534634481
Citizen Kane (1941) 3.9375823728940205
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) 3.9158423305297783
Ulee's Gold (1997) 3.9076404947434353
Outsiders, The (1983) 3.8861417879407116
Living in Oblivion (1995) 3.882911211331984
They Live (1988) 3.7490443533211018
