In [1]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# в 2х Бустингах есть ранжировщики
from lightgbm import LGBMRanker
from xgboost import XGBRanker
from tqdm import tqdm

ratings = pd.read_csv("./data/ratings.csv")
ratings['timestamp'] = ratings['timestamp'].map(lambda x: datetime.fromtimestamp(x))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,2009-12-14 05:52:24
1,1,1029,3.0,2009-12-14 05:52:59
2,1,1061,3.0,2009-12-14 05:53:02
3,1,1129,2.0,2009-12-14 05:53:05
4,1,1172,4.0,2009-12-14 05:53:25


In [2]:
import warnings

warnings.filterwarnings('ignore')

### Формируем фичи по пользователю

In [3]:
def get_feature_by_user(df):
    res = list()
    for i, v in tqdm(df.groupby('userId')):
        res.append(
            (
                i,
                len(v['movieId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()

            )
        )
    
    # собираем набор данных с кол-вом по дням
    res = pd.DataFrame(
        res,
        columns=[
            'userId', 'revired_products', '5_star_ratings_gave', '4_star_ratings_gave',
            '3_star_ratings_gave', '2_star_ratings_gave', '1_star_ratings_gave',
            'monday_review_count_user', 'tuesday_review_count_user', 'wednesday_review_count_user', 'thursday_review_count_user',
            'friday_review_count_user', 'saturday_review_count_user', 'sunday_review_count_user','evening_reviews_by_user'
        ])
    return res
   
def get_feature_by_product(df):
    res = list()
    for i, v in tqdm(df.groupby('movieId')):
        res.append(
            (
                i,
                len(v['userId']),
                (v['rating'] == 5).sum(),
                (v['rating'] == 4).sum(),
                (v['rating'] == 3).sum(),
                (v['rating'] == 2).sum(),
                (v['rating'] == 1).sum(),
                (v['timestamp'].dt.dayofweek == 0).sum(),
                (v['timestamp'].dt.dayofweek == 1).sum(),
                (v['timestamp'].dt.dayofweek == 2).sum(),
                (v['timestamp'].dt.dayofweek == 3).sum(),
                (v['timestamp'].dt.dayofweek == 4).sum(),
                (v['timestamp'].dt.dayofweek == 5).sum(),
                (v['timestamp'].dt.dayofweek == 6).sum(),
                (v['timestamp'].dt.hour > 17).sum()
            )
        )
    
    # собираем набор данных с кол-вом по дням
    res = pd.DataFrame(
        res,
        columns=[
            'movieId', 'user_count', '1_star_ratings_recieved', '2_star_ratings_recieved',
            '3_star_ratings_recieved', '4_star_ratings_recieved', '5_star_ratings_recieved',
            'monday_review_count_item', 'tuesday_review_count_item', 'wednesday_review_count_item', 'thursday_review_count_item',
            'friday_review_count_item', 'saturday_review_count_item', 'sunday_review_count_item','evening_reviews_by_movie'
        ])
    return res

In [4]:
# самое простое разделенеи на Тестовую и Обучающую выборку
start = min(ratings['timestamp'])
end = max(ratings['timestamp'])
interval = end - start

ratings['rating'] = ratings['rating'].apply(lambda x:int(np.ceil(x)))

train = ratings[ratings['timestamp'] <= (end - interval/3)]
test = ratings[ratings['timestamp'] >= (start + interval/3)]

train_y = train[train['timestamp'] >= (start + interval/3)]
train_X = train[train['timestamp'] < (start + interval/3)]
test_y = test[test['timestamp'] >= (end - interval/3)]
test_X = test[test['timestamp'] < (end - interval/3)]

train_tgt_user = set(train_X['userId']) & set(train_y['userId'])
test_tgt_user = set(test_X['userId']) & set(test_y['userId'])

In [5]:
train_X.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
20,2,10,4,1996-06-21 14:11:33
21,2,17,5,1996-06-21 14:14:41
22,2,39,5,1996-06-21 14:13:24
23,2,47,4,1996-06-21 14:12:32
24,2,50,4,1996-06-21 14:13:06
25,2,52,3,1996-06-21 14:20:31
26,2,62,3,1996-06-21 14:15:49
27,2,110,4,1996-06-21 14:12:12
28,2,144,3,1996-06-21 14:20:16
29,2,150,5,1996-06-21 14:09:55


In [6]:
train_y

Unnamed: 0,userId,movieId,rating,timestamp
351,5,3,4,2006-11-13 02:42:37
352,5,39,4,2006-11-13 02:42:32
353,5,104,4,2006-11-13 02:37:19
354,5,141,4,2006-11-13 02:30:42
355,5,150,4,2006-11-13 02:33:24
...,...,...,...,...
99999,671,6268,3,2003-10-08 05:16:10
100000,671,6269,4,2003-10-03 05:46:41
100001,671,6365,4,2003-12-09 06:26:03
100002,671,6385,3,2003-12-09 17:21:03


In [7]:
train_X_u = get_feature_by_user(train_X)
test_X_u = get_feature_by_user(test_X)

train_X_p = get_feature_by_product(train_X)
test_X_p = get_feature_by_product(test_X)

100%|███████████████████████████████████████████████████████████████████████████████| 313/313 [00:01<00:00, 235.28it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 196/196 [00:00<00:00, 231.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3508/3508 [00:14<00:00, 238.39it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5282/5282 [00:22<00:00, 238.16it/s]


In [8]:
train_X_u.head(10)

Unnamed: 0,userId,revired_products,5_star_ratings_gave,4_star_ratings_gave,3_star_ratings_gave,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,friday_review_count_user,saturday_review_count_user,sunday_review_count_user,evening_reviews_by_user
0,2,76,11,23,36,4,2,0,0,0,0,76,0,0,0
1,4,204,119,52,23,5,5,130,7,0,0,0,14,53,32
2,7,88,13,26,41,5,3,0,0,0,0,0,0,88,0
3,9,45,8,23,9,5,0,0,1,44,0,0,0,0,44
4,10,46,9,19,13,5,0,0,46,0,0,0,0,0,46
5,12,61,8,10,14,17,12,56,5,0,0,0,0,0,0
6,14,20,1,4,9,5,1,0,0,0,0,20,0,0,0
7,15,336,56,98,98,40,44,0,0,0,336,0,0,0,0
8,18,51,2,17,25,5,2,0,0,0,0,0,50,1,0
9,19,423,71,128,195,14,15,0,0,0,423,0,0,0,0


In [9]:
# формируем данные для модели
def get_model_input(X_u, X_m, y, tgt_users):
    # формируем данные для модели
    merged = pd.merge(X_u, y, on=['userId'], how='inner')
    merged = pd.merge(X_m, merged, on=['movieId'], how='outer')
    merged = merged.query('userId in @tgt_users')

    merged.fillna(0, inplace=True)
    features_cols = list(merged.drop(columns=['userId', 'movieId', 'rating', 'timestamp']).columns)

    query_list = merged['userId'].value_counts()

    merged = merged.set_index(['userId', 'movieId'])

    query_list = query_list.sort_index()

    merged.sort_index(inplace=True)

    df_x = merged[features_cols]

    df_y = merged['rating']
    
    return df_x, df_y, query_list

X_train, y_train, query_list_train = get_model_input(train_X_u, train_X_p, train_y, train_tgt_user)
X_test, y_test, query_list_test = get_model_input(test_X_u, test_X_p, test_y, test_tgt_user)


In [10]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,user_count,1_star_ratings_recieved,2_star_ratings_recieved,3_star_ratings_recieved,4_star_ratings_recieved,5_star_ratings_recieved,monday_review_count_item,tuesday_review_count_item,wednesday_review_count_item,thursday_review_count_item,...,2_star_ratings_gave,1_star_ratings_gave,monday_review_count_user,tuesday_review_count_user,wednesday_review_count_user,thursday_review_count_user,friday_review_count_user,saturday_review_count_user,sunday_review_count_user,evening_reviews_by_user
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
15.0,2,52.0,8.0,20.0,19.0,5.0,0.0,9.0,7.0,9.0,8.0,...,40.0,44.0,0.0,0.0,0.0,336.0,0.0,0.0,0.0,0.0
15.0,5,36.0,2.0,10.0,19.0,4.0,1.0,5.0,7.0,5.0,7.0,...,40.0,44.0,0.0,0.0,0.0,336.0,0.0,0.0,0.0,0.0
15.0,6,66.0,20.0,22.0,23.0,0.0,1.0,13.0,8.0,10.0,10.0,...,40.0,44.0,0.0,0.0,0.0,336.0,0.0,0.0,0.0,0.0
15.0,10,73.0,8.0,26.0,34.0,4.0,1.0,12.0,12.0,10.0,10.0,...,40.0,44.0,0.0,0.0,0.0,336.0,0.0,0.0,0.0,0.0
15.0,11,57.0,13.0,24.0,18.0,1.0,1.0,10.0,6.0,8.0,8.0,...,40.0,44.0,0.0,0.0,0.0,336.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665.0,5502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,14.0,123.0,97.0,14.0,0.0,19.0,45.0,88.0,0.0
665.0,5679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,14.0,123.0,97.0,14.0,0.0,19.0,45.0,88.0,0.0
665.0,5952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,14.0,123.0,97.0,14.0,0.0,19.0,45.0,88.0,0.0
665.0,5991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,42.0,14.0,123.0,97.0,14.0,0.0,19.0,45.0,88.0,0.0


In [11]:
y_train

userId  movieId
15.0    2          2.0
        5          5.0
        6          4.0
        10         3.0
        11         3.0
                  ... 
665.0   5502       4.0
        5679       3.0
        5952       5.0
        5991       4.0
        6238       3.0
Name: rating, Length: 5694, dtype: float64

In [12]:
query_list_train

userId
15.0      735
30.0      149
95.0      224
99.0        7
185.0      39
219.0      19
220.0       5
311.0     668
350.0      40
367.0     155
380.0     459
381.0      68
387.0       1
388.0     218
407.0     101
427.0     136
452.0     539
472.0     170
509.0     323
510.0       4
529.0     123
547.0    1381
558.0      12
598.0      70
665.0      48
Name: count, dtype: int64

In [13]:
# используем XGB с методом оценкой для ранжирования
model = XGBRanker(objective='rank:ndcg', n_estimators=100, random_state=0,learning_rate=0.1)
model.fit(
    X_train,
    y_train,
    group=query_list_train,
    eval_metric='ndcg',
    eval_set=[(X_test, y_test)],
    eval_group=[list(query_list_test)],
    verbose =False
)

### Тестируем

In [14]:
def predict_at_k(data, model, k):
    user_ids = list()
    product_ids = list()
    ranks = list()
    
    for userId, df in data.groupby('userId'):
        
        pred = model.predict(df.loc[userId])
        productId = np.array(df.reset_index()['movieId'])
        topK_index = np.argsort(pred)[::-1][:k]
        product_ids.extend(list(productId[topK_index]))
        user_ids.extend([userId]*len(topK_index))
        ranks.extend(list(range(1, len(topK_index)+1)))

    results = pd.DataFrame({'userId': user_ids, 'movieId': product_ids, 'rank': ranks})
    
    return results


def coverage(preds,train_X_p):
    test_recs = preds['movieId'].nunique()
    train_movies = train_X_p['movieId'].nunique()
    return test_recs/train_movies

In [15]:
# предикт для всех, с рангом в 5
predicted = predict_at_k(X_test, model, 5)
predicted

Unnamed: 0,userId,movieId,rank
0,15.0,40815,1
1,15.0,3275,2
2,15.0,3639,3
3,15.0,45728,4
4,15.0,34072,5
...,...,...,...
107,648.0,6711,1
108,648.0,33166,2
109,648.0,608,3
110,648.0,27803,4


In [17]:
# покрытие каталога
coverage(predicted,train_X_p)

0.028506271379703536

In [18]:
# пример на 1 пользователя

userId = 648
print('[predicted]')
print(predicted.query(f'userId == {userId}')[['movieId', 'rank']])
print('[actual]')
print(y_test[userId].sort_values(ascending=False))

[predicted]
     movieId  rank
107     6711     1
108    33166     2
109      608     3
110    27803     4
111      175     5
[actual]
movieId
61323     5.0
96488     5.0
57845     5.0
68347     5.0
71156     5.0
         ... 
62250     3.0
69784     3.0
5515      3.0
72011     3.0
102995    2.0
Name: rating, Length: 68, dtype: float64


In [39]:
b = predicted[predicted.userId==648]
b

Unnamed: 0,userId,movieId,rank
107,648.0,6711,1
108,648.0,33166,2
109,648.0,608,3
110,648.0,27803,4
111,648.0,175,5


In [40]:
a = pd.DataFrame(y_test[648]).reset_index()
a

Unnamed: 0,movieId,rating
0,175,4.0
1,608,4.0
2,1213,4.0
3,1394,4.0
4,1884,5.0
...,...,...
63,102995,2.0
64,128616,4.0
65,140152,4.0
66,142488,5.0


In [41]:
result = b.merge(a, on = 'movieId', how = 'inner')