<a href="https://colab.research.google.com/github/Hristo2076/RecSys/blob/main/RecSys_gibrid_hybrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#"ГИБРИДНЫЕ РЕКОМЕНДАТЕЛЬНЫЕ СИСТЕМЫ"

In [18]:
!pip install implicit
!pip install surprise
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [43]:
import pandas as pd
import numpy as np
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')[:200000]
tags = pd.read_csv('tags.csv')

#**1. на основе рейтингов - коллаборативная фильтрация**

In [20]:
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

In [21]:
movi_rat = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movi_rat.dropna(inplace=True)

In [22]:
data_1 = pd.DataFrame({
    'uid': movi_rat.userId,
    'iid': movi_rat.title,
    'rating': movi_rat.rating
})
data_1.head()

Unnamed: 0,uid,iid,rating
0,4.0,Toy Story (1995),4.0
1,10.0,Toy Story (1995),5.0
2,14.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),4.0
4,22.0,Toy Story (1995),4.0


In [23]:
ratings.rating.min(), ratings.rating.max()

(0.5, 5.0)

In [24]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(data_1, reader)

trainset, testset = train_test_split(data, test_size=.15, random_state =777 )
trainset.all_items()

range(0, 12011)

In [25]:
model_KNN = KNNBaseline(k=43, min_k=8, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_KNN.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7febea96b9a0>

In [26]:
test_pred_KNN = model_KNN.test(testset)
rmse_KNN = accuracy.rmse(test_pred_KNN, verbose=True)
rmse_KNN

RMSE: 0.8920


0.8920320233363735

In [27]:
model_KNN.predict(uid=351, iid='Mortal Kombat (1995)').est

3.181529834593676

In [28]:
movies[movies.title.isin(['Mortal Kombat (1995)'])]

Unnamed: 0,movieId,title,genres
43,44,Mortal Kombat (1995),Action|Adventure|Fantasy


In [29]:
ratings[(ratings.movieId==44)&(ratings.rating>3)].userId[:10]

5643      60
6340      72
12122    134
34438    339
36340    351
39230    379
41207    402
49518    489
62122    605
79445    826
Name: userId, dtype: int64

In [30]:
ratings[(ratings.movieId==44)&(ratings.userId==351)]

Unnamed: 0,userId,movieId,rating,timestamp
36340,351,44.0,3.5,1111898000.0


# **2. сделаем на основе рейтингов - SVD, данные подготовлены еще на прошлом этапе**

In [31]:
from surprise import SVD

In [32]:
%%time
model_svd = SVD(n_factors=60, n_epochs=20, random_state=777)
model_svd.fit(trainset)

CPU times: user 3.7 s, sys: 22.4 ms, total: 3.72 s
Wall time: 6.17 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7febe1248460>

In [33]:
test_pred = model_svd.test(testset)
rmse_svd=accuracy.rmse(test_pred, verbose=True)
rmse_svd

RMSE: 0.9058


0.9057997974996631

In [34]:
model_svd.predict(uid=351, iid='Mortal Kombat (1995)').est

2.7391741708885395

#**3. Контентный подход**

In [36]:
ratings = ratings.sort_values("timestamp")

train = ratings.iloc[:-20000].copy()
test = ratings.iloc[-20000:].copy()

In [37]:
pivot = train.pivot_table(index='movieId', 
                          columns='userId',
                          values='rating')

corrs = pivot.corr()

corrs = (
    corrs
    .stack()
    .rename_axis(['userId1', 'userId2'])
    .reset_index()
)
corrs.columns = ['userId1', 'userId2', 'corr']
corrs = corrs[corrs['corr'] >= 0]

corrs

Unnamed: 0,userId1,userId2,corr
0,1,1,1.000000
1,1,4,0.503227
2,1,26,0.212599
4,1,73,1.000000
5,1,79,0.852803
...,...,...,...
1282742,2025,1999,1.000000
1282743,2025,2000,0.664044
1282744,2025,2014,0.520984
1282745,2025,2017,0.242284


In [38]:
### Для каждого юзера из теста 
### Найдем всех "соседей"
### Которые смотрели те же фильмы,
### Что и юзер на тесте


import math

preds = []

for user in test['userId'].unique():
    
    ### Если юзера не было в трейне,
    ### То прогноз в выбранной парадигме дать не сможем
    
    if user in train['userId'].unique():
        part = test[test['userId']==user]

        ### Выделим соседей данного юзера
        
        neighbours = corrs[corrs['userId1']==user]
        neighbours_users = neighbours['userId2'].unique()
        
        ### Если соседей нет, то и предсказывать нечего
        ### Разве что среднее выбранного юзера по фильмам
        ### Но это сильно тупо
        
        if neighbours_users.shape[0]==0:
            continue
        
        ### Выделим фильмы, для которых нужно дать прогноз
        
        movies_ = part['movieId'].unique()

        ### Выделим часть данных с трейна про соседей
        
        train_part = train[train['userId'].isin(neighbours_users)]
        
        ### Посчитаем средние оценки соседей
        
        neighbours_means = train_part.groupby('userId')['rating'].mean()
        
        ### Присоединим эту информацию и посчитаем
        ### Остальные компоненты формулы для рассчета предсказания
        ### Относительно соседей и фильмов,
        ### Для которых возможно сделать прогноз
        
        train_part = train_part[train_part['movieId'].isin(movies_)]
        train_part = pd.merge(train_part,
                              neighbours[['userId2', 'corr']],
                              right_on='userId2',
                              left_on='userId',
                              how='left')
        
        train_part['neighbour_mean'] = train_part['userId2'].map(neighbours_means)
        train_part['diff'] = train_part['rating'] - train_part['neighbour_mean']
        train_part['diff_dot_corr'] = train_part['diff'] * train_part['corr']
        
        ### Посчитаем среднее по юзеру
    
        user_mean = train[train['userId']==user]['rating'].mean()
        
        ### Применим формулу для предсказания 
        
        upper_part = train_part.groupby('movieId')['diff_dot_corr'].sum()
        lower_part = train_part.groupby('movieId')['corr'].sum()
        
        predictions = upper_part / lower_part + user_mean
        predictions = predictions.reset_index()
        predictions.columns = ['movieId', 'prediction']
        predictions['userId'] = user
        
        preds.append(predictions)
        
preds = pd.concat(preds)

preds = pd.merge(
                    preds,
                    test[['userId', 'movieId', 'rating']],
                    on=['userId', 'movieId'],
                    how='left'
)

preds

Unnamed: 0,movieId,prediction,userId,rating
0,29.0,4.174224,667,3.5
1,97.0,4.280780,667,2.0
2,357.0,3.734219,667,4.0
3,455.0,3.019315,667,2.0
4,480.0,3.833709,667,4.0
...,...,...,...,...
2317,159858.0,4.594589,1059,4.5
2318,161634.0,4.424433,1059,5.0
2319,164179.0,5.041600,1059,4.5
2320,165105.0,3.751202,1059,4.0


In [39]:
import re

def find_num(st):
    
    nums_list = re.findall(r'\d+', st)
    
    if len(nums_list) > 0:
        return nums_list[-1]
    else:
        return '0'

def filter_missing_data(num):
    if num > 1900:
        return num
    else:
        return 2000
    
movi_rat['movieYear'] = movi_rat['title'].apply(lambda x: filter_missing_data(int(find_num(x))))

movi_rat.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,movieYear
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,4.0,1113766000.0,1995
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,5.0,948885800.0,1995
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14.0,4.5,1442169000.0,1995
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,4.0,1370810000.0,1995
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22.0,4.0,1237623000.0,1995


In [40]:
### Сделаем One-Hot-Encoding жанров!

all_genres = ['Adventure', 'Comedy', 'Action', 'Mystery', 'Crime', 'Thriller',
              'Drama', 'Animation', 'Children', 'Horror', 'Documentary',
              'Sci-Fi', 'Fantasy', 'Film-Noir', 'Western', 'Musical', 'Romance',
              '(no genres listed)', 'War']

for genre in all_genres:
    movi_rat[genre] = (
        movi_rat['genres']
        .str
        .contains(genre)
        .apply(int)
    )
    
movi_rat = movi_rat.drop('genres', axis=1)

movi_rat.head()

  .contains(genre)


Unnamed: 0,movieId,title,userId,rating,timestamp,movieYear,Adventure,Comedy,Action,Mystery,...,Horror,Documentary,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War
0,1,Toy Story (1995),4.0,4.0,1113766000.0,1995,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,Toy Story (1995),10.0,5.0,948885800.0,1995,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,Toy Story (1995),14.0,4.5,1442169000.0,1995,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,Toy Story (1995),15.0,4.0,1370810000.0,1995,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,Toy Story (1995),22.0,4.0,1237623000.0,1995,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [41]:
### Разделим на трейн-тест

train_new = movi_rat.iloc[:-20000].copy()
test_new = movi_rat.iloc[-20000:].copy()

In [44]:
### Добавим фичи 
### Сколько фильмов юзеры посмотрели
### И какие средние оценки получились (с шумом!)

user_count_views = train_new.groupby('userId').size()
user_means = train_new.groupby('userId')['rating'].mean()

train_new['userViews'] = train_new['userId'].map(user_count_views)

noise = np.random.normal(0, 0.1, [train_new.shape[0],])
train_new['userMeans'] = train_new['userId'].map(user_means) + noise

In [45]:
### Посчитаем среднее кол-во просмотров всех юзеров
### И среднюю оценку по средним оценкам всех юзеров

overall_views_mean = int(user_count_views.mean())
overall_meanrating_mean = int(user_means.mean())

test_new['userViews'] = (
    test_new['userId']
    .map(user_count_views)
    .fillna(overall_views_mean)
)


test_new['userMeans'] = (
    test_new['userId']
    .map(user_means)
    .fillna(overall_meanrating_mean)
)

train_new = train_new.drop(['userId', 'movieId',
                            'timestamp', 'title'], axis=1)

test_new = test_new.drop(['userId', 'movieId',
                          'timestamp', 'title'], axis=1)

In [46]:
train_new[:5]

Unnamed: 0,rating,movieYear,Adventure,Comedy,Action,Mystery,Crime,Thriller,Drama,Animation,...,Sci-Fi,Fantasy,Film-Noir,Western,Musical,Romance,(no genres listed),War,userViews,userMeans
0,4.0,1995,1,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,736,3.340211
1,5.0,1995,1,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,121,4.152281
2,4.5,1995,1,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,87,3.95788
3,4.0,1995,1,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,119,4.258523
4,4.0,1995,1,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,26,3.291814


In [47]:
X_train = train_new.drop('rating', axis=1)
X_test = test_new.drop('rating', axis=1)

y_train = train_new['rating']
y_test = test_new['rating']

In [48]:
from catboost import CatBoostRegressor, Pool

catboost = CatBoostRegressor()


catboost.fit(X_train,
             y_train,
             cat_features=['movieYear'],
             )

Learning rate set to 0.093006
0:	learn: 1.0695207	total: 141ms	remaining: 2m 21s
1:	learn: 1.0539974	total: 234ms	remaining: 1m 56s
2:	learn: 1.0408739	total: 315ms	remaining: 1m 44s
3:	learn: 1.0298106	total: 397ms	remaining: 1m 38s
4:	learn: 1.0206018	total: 481ms	remaining: 1m 35s
5:	learn: 1.0129306	total: 557ms	remaining: 1m 32s
6:	learn: 1.0054811	total: 652ms	remaining: 1m 32s
7:	learn: 0.9991028	total: 719ms	remaining: 1m 29s
8:	learn: 0.9933498	total: 782ms	remaining: 1m 26s
9:	learn: 0.9885749	total: 840ms	remaining: 1m 23s
10:	learn: 0.9845867	total: 911ms	remaining: 1m 21s
11:	learn: 0.9809954	total: 981ms	remaining: 1m 20s
12:	learn: 0.9781059	total: 1.04s	remaining: 1m 19s
13:	learn: 0.9753920	total: 1.11s	remaining: 1m 18s
14:	learn: 0.9731073	total: 1.17s	remaining: 1m 16s
15:	learn: 0.9712920	total: 1.26s	remaining: 1m 17s
16:	learn: 0.9697953	total: 1.3s	remaining: 1m 15s
17:	learn: 0.9683857	total: 1.37s	remaining: 1m 14s
18:	learn: 0.9669151	total: 1.45s	remaining: 

<catboost.core.CatBoostRegressor at 0x7febe0545240>

In [49]:
test_new = movi_rat.iloc[-20000:].copy()

X_test['pred'] = catboost.predict(X_test)
X_test['target'] = y_test
X_test['userId'] = test_new['userId']
X_test['movieId'] = test_new['movieId']

In [50]:
users_dsgs = []

for user in X_test['userId'].unique():
    part = X_test[X_test['userId']==user]
    part = part.sort_values('pred')
    part = part.reset_index()
    user_dsg2 = (np.log2(part.index+1) * part.target)[:2].sum()
    
    users_dsgs.append(user_dsg2)
    
print(f"Среднее DSG@2 по пользователям из теста: {np.mean(users_dsgs)}")

Среднее DSG@2 по пользователям из теста: 3.2842592592592594


In [51]:
new_preds = pd.merge(preds,
                     X_test[['userId', 'movieId', 'pred']],
                     on=['userId', 'movieId'],
                     how='left')

users_dsgs = []

for user in new_preds['userId'].unique():
    part = new_preds[new_preds['userId']==user]
    part = part.sort_values('pred')
    part = part.reset_index()
    user_dsg2 = (np.log2(part.index+1) * part.rating)[:2].sum()

    users_dsgs.append(user_dsg2)
    
print(f"Среднее DSG@2 по пользователям из теста, которые были в трейне: {np.mean(users_dsgs)}")

Среднее DSG@2 по пользователям из теста, которые были в трейне: 3.310344827586207
