In [1]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

In [2]:
# Load the reaction log; the displayed frame has date, user_id, item_id, type and emoji columns.
emotions_csv = './data/emotions.csv'
emotions = pd.read_csv(emotions_csv)
emotions

Unnamed: 0,date,user_id,item_id,type,emoji
0,2023-08-03 21:37:22+03:00,user_21883648,video_2247834,v_top,v_top
1,2023-08-03 21:04:54+03:00,user_24016046,video_2247834,v_top,v_top
2,2023-08-03 20:37:06+03:00,user_20748867,video_22669,v_top,v_top
3,2023-08-03 20:44:12+03:00,user_21545120,video_645471,v_top,v_top
4,2023-08-03 20:45:17+03:00,user_28987830,video_2346806,v_top,v_top
...,...,...,...,...,...
369424,2023-08-02 11:43:43+03:00,user_21597217,video_1180057,pos_emotions,Like
369425,2023-07-25 16:26:35+03:00,user_6805283,video_1180057,pos_emotions,Like
369426,2023-07-28 13:07:10+03:00,user_5419928,video_1180057,pos_emotions,Like
369427,2023-08-07 18:16:30+03:00,user_12044809,video_1180057,pos_emotions,Like


In [3]:
# Drop rows typed 'neg_emotions'; every other reaction type is kept.
is_not_negative = emotions['type'].ne('neg_emotions')
emotions = emotions.loc[is_not_negative]

In [4]:
# Distinct (user_id, item_id) pairs that still have a reaction after filtering,
# stored as a set so later membership checks are fast.
reactions = set(zip(emotions['user_id'], emotions['item_id']))

In [5]:
# Read only the columns used below to keep the frame as small as possible.
train_columns = ['date', 'user_id', 'item_id', 'watch_time']
train_data = pd.read_parquet('./data/player_starts_train.parquet', columns=train_columns)
train_data

Unnamed: 0,date,user_id,item_id,watch_time
0,2023-07-21 19:04:50+03:00,user_12964323,video_1042531,51
1,2023-07-21 02:02:41+03:00,user_16517,video_1707159,31
2,2023-07-21 22:00:47+03:00,user_15057892,video_1989987,9
3,2023-07-21 19:09:43+03:00,user_2846972,video_1356486,-1
4,2023-07-21 11:06:58+03:00,user_20517034,video_1380654,11
...,...,...,...,...
69954375,2023-08-21 02:51:53+03:00,user_15478739,video_1449287,291
69954376,2023-08-21 08:40:18+03:00,user_25783543,video_1423321,2
69954377,2023-08-21 05:19:55+03:00,user_3507470,video_464555,261
69954378,2023-08-21 12:32:32+03:00,user_13128840,video_420973,21


In [6]:
# Keep only events on or after the cutoff timestamp. The explicit copy makes the
# result an independent frame so the column assignments below do not hit a view.
cutoff = "2023-08-01 00:00:00+03:00"
is_recent = train_data['date'] >= cutoff
train_data = train_data.loc[is_recent].copy()
train_data

Unnamed: 0,date,user_id,item_id,watch_time
23371198,2023-08-01 11:50:52+03:00,user_18556667,video_15830,51
23371199,2023-08-01 11:47:25+03:00,user_12417386,video_2194155,141
23371200,2023-08-01 11:47:14+03:00,user_12417386,video_2194155,141
23371201,2023-08-01 20:01:00+03:00,user_11892063,video_1770182,451
23371202,2023-08-01 20:20:21+03:00,user_6433209,video_854240,2612
...,...,...,...,...
69954375,2023-08-21 02:51:53+03:00,user_15478739,video_1449287,291
69954376,2023-08-21 08:40:18+03:00,user_25783543,video_1423321,2
69954377,2023-08-21 05:19:55+03:00,user_3507470,video_464555,261
69954378,2023-08-21 12:32:32+03:00,user_13128840,video_420973,21


In [7]:
# Flag each play event whose (user_id, item_id) pair appears in `reactions`.
user_item_pairs = zip(train_data['user_id'], train_data['item_id'])
train_data['reaction'] = [
    pair in reactions
    for pair in tqdm(user_item_pairs, total=len(train_data))
]

  0%|          | 0/46583182 [00:00<?, ?it/s]

In [8]:
# Extract the third '-'-separated field of the part before the first space
# (the day of month for 'YYYY-MM-DD hh:mm:ss+tz' strings) as an integer.
# NOTE(review): day-of-month is only a usable recency key while the data stays
# within a single calendar month - confirm against the date filter above.
date_part = train_data['date'].str.split(' ').str[0]
train_data['day'] = date_part.str.split('-').str[2].astype('int64')

In [9]:
# rank: how many days before the most recent day the event happened (0 = latest day).
latest_day = train_data['day'].max()
train_data['rank'] = latest_day - train_data['day'].to_numpy()

# rating = recency weight * log-scaled watch time * reaction boost.
recency_weight = 1 / (train_data['rank'] + 1) ** 2   # inverse-square decay with age in days
watch_weight = np.log(train_data['watch_time'] + 2)  # watch_time == -1 maps to log(1) == 0
reaction_boost = 1 + 10 * train_data['reaction']     # 11 when the pair has a reaction, else 1
train_data['rating'] = recency_weight * watch_weight * reaction_boost
train_data

Unnamed: 0,date,user_id,item_id,watch_time,reaction,day,rank,rating
23371198,2023-08-01 11:50:52+03:00,user_18556667,video_15830,51,False,1,20,0.009003
23371199,2023-08-01 11:47:25+03:00,user_12417386,video_2194155,141,False,1,20,0.011254
23371200,2023-08-01 11:47:14+03:00,user_12417386,video_2194155,141,False,1,20,0.011254
23371201,2023-08-01 20:01:00+03:00,user_11892063,video_1770182,451,False,1,20,0.013868
23371202,2023-08-01 20:20:21+03:00,user_6433209,video_854240,2612,False,1,20,0.017843
...,...,...,...,...,...,...,...,...
69954375,2023-08-21 02:51:53+03:00,user_15478739,video_1449287,291,False,21,0,5.680173
69954376,2023-08-21 08:40:18+03:00,user_25783543,video_1423321,2,False,21,0,1.386294
69954377,2023-08-21 05:19:55+03:00,user_3507470,video_464555,261,False,21,0,5.572154
69954378,2023-08-21 12:32:32+03:00,user_13128840,video_420973,21,False,21,0,3.135494


In [10]:
# Total rating per item, ordered from the highest-scoring item down.
item_scores = train_data.groupby('item_id')['rating'].sum()
popularity = item_scores.sort_values(ascending=False)
popularity

item_id
video_283933     1.090204e+06
video_68646      2.235797e+05
video_1761620    1.693183e+05
video_1594159    9.728119e+04
video_170129     9.678266e+04
                     ...     
video_910569     0.000000e+00
video_1301743    0.000000e+00
video_2229864    0.000000e+00
video_1802329    0.000000e+00
video_1542739    0.000000e+00
Name: rating, Length: 1844876, dtype: float64

In [11]:
# The ten highest-scoring items, used as the constant fallback recommendation.
const_predict = popularity.index[:10].tolist()
const_predict

['video_283933',
 'video_68646',
 'video_1761620',
 'video_1594159',
 'video_170129',
 'video_2052690',
 'video_803844',
 'video_1508623',
 'video_1210283',
 'video_1508131']

In [14]:
# Per-user recommendation lists (column `recs`, indexed by user_id) produced elsewhere.
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
predictions_path = './data/old_users_prediction.pickle'
submission = pd.read_pickle(predictions_path)
submission

Unnamed: 0_level_0,recs
user_id,Unnamed: 1_level_1
user_26511551,"[video_1545210, video_221466, video_2323123, v..."
user_29194819,[]
user_29734049,[]
user_955460,"[video_1371119, video_756683, video_1357467, v..."
user_7065521,"[video_5763, video_364393, video_737808, video..."
...,...
user_29281681,[]
user_3912848,"[video_1938070, video_665164, video_914241, vi..."
user_28389099,"[video_302657, video_1594159, video_144691, vi..."
user_18951296,"[video_1994927, video_1499979, video_2172399, ..."


In [15]:
def _pad_with_popular(rec, fallback=const_predict, k=10):
    """Return `rec` topped up to `k` items with entries from `fallback`.

    Items already present in `rec` are skipped, so the filler never introduces a
    duplicate recommendation. A list that already holds `k` or more items is
    returned unchanged (the original slice `fallback[:k - len(rec)]` turned
    negative in that case and appended items instead of none).
    """
    padded = list(rec)
    seen = set(padded)
    for item in fallback:
        if len(padded) >= k:
            break
        if item not in seen:
            padded.append(item)
            seen.add(item)
    return padded


# One pass over the column instead of a per-row `.loc` assignment: faster, and
# it does not depend on user_id labels being unique in the index.
submission['recs'] = submission['recs'].map(_pad_with_popular)
submission

Unnamed: 0_level_0,recs
user_id,Unnamed: 1_level_1
user_26511551,"[video_1545210, video_221466, video_2323123, v..."
user_29194819,"[video_283933, video_68646, video_1761620, vid..."
user_29734049,"[video_283933, video_68646, video_1761620, vid..."
user_955460,"[video_1371119, video_756683, video_1357467, v..."
user_7065521,"[video_5763, video_364393, video_737808, video..."
...,...
user_29281681,"[video_283933, video_68646, video_1761620, vid..."
user_3912848,"[video_1938070, video_665164, video_914241, vi..."
user_28389099,"[video_302657, video_1594159, video_144691, vi..."
user_18951296,"[video_1994927, video_1499979, video_2172399, ..."


In [16]:
# Sanity check: distribution of recommendation-list lengths across users.
recs_lengths = submission['recs'].map(len)
recs_lengths.value_counts()

10    97240
Name: recs, dtype: int64

In [17]:
# Write the final recommendations; the user_id index is written as the first column.
submission_path = './submission.csv'
submission.to_csv(submission_path)