In [1]:
import gc

import implicit
from scipy import sparse

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

In [2]:
%%time

history = pd.read_parquet(
    './data/player_starts_train.parquet', columns = ['date', 'user_id', 'item_id', 'watch_time']
).sort_values('date')
history

CPU times: total: 2min 49s
Wall time: 2min 31s


Unnamed: 0,date,user_id,item_id,watch_time
1527749,2023-07-21 00:00:00+03:00,user_19898812,video_486615,34
1390939,2023-07-21 00:00:00+03:00,user_17676945,video_1448534,2
889025,2023-07-21 00:00:00+03:00,user_6623636,video_1099511,192
1412905,2023-07-21 00:00:00+03:00,user_7117207,video_2254837,3014
1224980,2023-07-21 00:00:00+03:00,user_13848737,video_1482726,22
...,...,...,...,...
69306170,2023-08-21 23:59:59+03:00,user_16859213,video_2025319,-1
69399923,2023-08-21 23:59:59+03:00,user_22953064,video_283933,-1
68750086,2023-08-21 23:59:59+03:00,user_27253715,video_451750,-1
68794261,2023-08-21 23:59:59+03:00,user_25150497,video_2237701,0


In [3]:
user_ids = pd.read_csv('./data/sample_submission.csv').user_id
history = history[history.user_id.isin(user_ids)]

counts = history.user_id.value_counts()
users_to_drop = counts[counts <= 10].index
test_history = history[history.user_id.isin(users_to_drop)].copy()
history = history[~history.user_id.isin(users_to_drop)].copy()

history

Unnamed: 0,date,user_id,item_id,watch_time
896634,2023-07-21 00:00:00+03:00,user_25044016,video_1230539,191
2075489,2023-07-21 00:00:00+03:00,user_565332,video_1979002,80
635725,2023-07-21 00:00:01+03:00,user_23875400,video_1549108,1
2059486,2023-07-21 00:00:02+03:00,user_20278263,video_775169,193
676818,2023-07-21 00:00:03+03:00,user_3646011,video_184682,-1
...,...,...,...,...
69195677,2023-08-21 23:59:58+03:00,user_5673927,video_291501,-1
69094893,2023-08-21 23:59:59+03:00,user_4629059,video_136516,0
69683331,2023-08-21 23:59:59+03:00,user_5039586,video_1548248,1
68413835,2023-08-21 23:59:59+03:00,user_26942238,video_2242952,1


In [4]:
def enumerated_dict(values):
    enum_dict = {}
    reverse_dict = {}
    
    for n, value in enumerate(values):
        enum_dict[value] = n
        reverse_dict[n] = value
        
    return enum_dict, reverse_dict

In [5]:
user_dict, reverse_user_dict = enumerated_dict(history["user_id"].unique())
item_dict, reverse_item_dict = enumerated_dict(history["item_id"].unique())

In [6]:
matrix_shape = (max(reverse_user_dict.keys()) + 1, max(reverse_item_dict.keys()) + 1)
matrix_shape # (36671, 258740)

(36671, 258740)

In [7]:
history['r_pos'] = -1
history['r_pos'] = history.groupby('user_id')['r_pos'].cumsum()
history['r_pos'] -= history.groupby('user_id')['r_pos'].transform('min')
history

Unnamed: 0,date,user_id,item_id,watch_time,r_pos
896634,2023-07-21 00:00:00+03:00,user_25044016,video_1230539,191,5790
2075489,2023-07-21 00:00:00+03:00,user_565332,video_1979002,80,221
635725,2023-07-21 00:00:01+03:00,user_23875400,video_1549108,1,451
2059486,2023-07-21 00:00:02+03:00,user_20278263,video_775169,193,23996
676818,2023-07-21 00:00:03+03:00,user_3646011,video_184682,-1,10098
...,...,...,...,...,...
69195677,2023-08-21 23:59:58+03:00,user_5673927,video_291501,-1,0
69094893,2023-08-21 23:59:59+03:00,user_4629059,video_136516,0,0
69683331,2023-08-21 23:59:59+03:00,user_5039586,video_1548248,1,0
68413835,2023-08-21 23:59:59+03:00,user_26942238,video_2242952,1,0


In [8]:
val_data = history[history.r_pos < 10][['user_id', 'item_id', 'watch_time']]
train_data = history[history.r_pos >= 10][['user_id', 'item_id', 'watch_time']]

In [9]:
train_data

Unnamed: 0,user_id,item_id,watch_time
896634,user_25044016,video_1230539,191
2075489,user_565332,video_1979002,80
635725,user_23875400,video_1549108,1
2059486,user_20278263,video_775169,193
676818,user_3646011,video_184682,-1
...,...,...,...
68856226,user_26186143,video_1386970,12
69359796,user_26186143,video_1961210,22
67656542,user_26186143,video_1841347,2
69603713,user_26186143,video_516461,12


In [10]:
val_data

Unnamed: 0,user_id,item_id,watch_time
1125178,user_11156094,video_249783,-1
777380,user_419176,video_902590,121
369901,user_18905783,video_1264738,50
73709,user_419176,video_836422,1
519390,user_419176,video_902590,991
...,...,...,...
69195677,user_5673927,video_291501,-1
69094893,user_4629059,video_136516,0
69683331,user_5039586,video_1548248,1
68413835,user_26942238,video_2242952,1


In [11]:
enum_users = np.array([user_dict[user] for user in tqdm(train_data["user_id"].values)])
enum_items = np.array([item_dict[item] for item in tqdm(train_data["item_id"].values)])

  0%|          | 0/3566142 [00:00<?, ?it/s]

  0%|          | 0/3566142 [00:00<?, ?it/s]

In [12]:
sparse_matrix = sparse.csr_matrix(
    (np.ones(shape=(len(enum_users))), (enum_users, enum_items)), 
    shape=matrix_shape
)
print("Sparticity: ", 100 - train_data.shape[0] / (sparse_matrix.shape[0] * sparse_matrix.shape[1]))

Sparticity:  99.99962415194649


In [13]:
model = implicit.als.AlternatingLeastSquares(
    factors=512, iterations=30, alpha=60,
    random_state=56
)
model.fit(sparse_matrix, show_progress=True)

  0%|          | 0/30 [00:00<?, ?it/s]

In [14]:
model_nn = implicit.nearest_neighbours.CosineRecommender(K=128)
model_nn.fit((sparse_matrix))




  0%|          | 0/258740 [00:00<?, ?it/s]

In [15]:
ids = val_data.user_id.unique()

In [16]:
def predict_user_nn(model, user_id, items, item_dict, reverse_item_dict):    
    enum_users = np.zeros(len(items))
    # enum_items = np.array([item_dict[item[0]] for item in items])
    enum_items = np.array([item_dict[item] for item in items])
        
    cur_sparse_matrix = sparse.csr_matrix(
        (np.ones(shape=(len(enum_users))), (enum_users, enum_items)), 
        shape=(1, max(reverse_item_dict.keys()) + 1)
    )

    rec = model.recommend(0, cur_sparse_matrix, N=100, recalculate_user=True,
                     filter_already_liked_items=False)
    idx = range(len(rec[0]))
    idx = sorted(idx, key=lambda x: rec[1][x], reverse=True)
    return [(reverse_item_dict[r], score) for r, score in zip(rec[0][idx], rec[1][idx])]

In [17]:
def predict_user_als(als_model, user_id, items, item_dict, reverse_item_dict): 
    enum_users = np.zeros(len(items))
    # enum_items = np.array([item_dict[item[0]] for item in items])
    enum_items = np.array([item_dict[item] for item in items])
    # timespents = np.array([item[1] for item in items])
    timespents =  np.ones(shape=(len(items)))
        
    used_items = set(enum_items)
    
    cur_sparse_matrix = sparse.csr_matrix(
        (timespents, (enum_users, enum_items)), 
        shape=(1, max(reverse_item_dict.keys()) + 1)
    )

    rec = als_model.recommend(0, cur_sparse_matrix, N=100, recalculate_user=True,
                     filter_already_liked_items=False)
    idx = range(len(rec[0]))
    idx = sorted(idx, key=lambda x: rec[1][x], reverse=True)
    return [(reverse_item_dict[r], score) for r, score in zip(rec[0][idx], rec[1][idx])]

In [18]:
items = train_data.groupby('user_id')['item_id'].agg(list)

In [19]:
pred_candidates_nn = {}
for user in tqdm(ids):
    candidates = predict_user_nn(
        model_nn, user, 
        items[user], 
        item_dict, reverse_item_dict
    )
    pred_candidates_nn[user] = candidates

  0%|          | 0/36671 [00:00<?, ?it/s]

In [20]:
pred_candidates = {}
for user in tqdm(ids):
    candidates = predict_user_als(
        model, user, 
        items[user], 
        item_dict, reverse_item_dict
    )
    pred_candidates[user] = candidates

  0%|          | 0/36671 [00:00<?, ?it/s]

In [21]:
from sklearn.model_selection import train_test_split

train_ids, val_ids = train_test_split(ids, train_size=0.7, random_state=56)
len(train_ids), len(val_ids)

(25669, 11002)

In [22]:
correct_candidates = val_data.groupby('user_id')['item_id'].agg(list)

In [23]:
p, q = 0, 0
for user_id in tqdm(ids):
    X1 = [item[0] for item in pred_candidates_nn[user_id]]
    p += len(set(X1) & set(correct_candidates[user_id]))
    q += len(set(correct_candidates[user_id]))
print(p / q)

  0%|          | 0/36671 [00:00<?, ?it/s]

0.5868209181152061


In [24]:
p, q = 0, 0
for user_id in tqdm(ids):
    X2 = [item[0] for item in pred_candidates[user_id]]
    p += len(set(X2) & set(correct_candidates[user_id]))
    q += len(set(correct_candidates[user_id]))
print(p / q)

  0%|          | 0/36671 [00:00<?, ?it/s]

0.6167762069864544


In [25]:
p, q = 0, 0
for user_id in tqdm(ids):
    X1 = [item[0] for item in pred_candidates_nn[user_id]]
    X2 = [item[0] for item in pred_candidates[user_id]]
    p += len((set(X1) | set(X2)) & set(correct_candidates[user_id]))
    q += len(set(correct_candidates[user_id]))
print(p / q)

  0%|          | 0/36671 [00:00<?, ?it/s]

0.6678301677542873


In [26]:
als_item_factors = model.item_factors

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from typing import List

def calculate_similarities(item_factors: np.ndarray, user_candidates: np.ndarray, user_history: List[int]):
    return cosine_similarity(item_factors[user_candidates], item_factors[user_history])

In [28]:
from functools import partial

def calculate_similarity_stats(candidates, history):
    res = model_nn.similarity[candidates] * model_nn.similarity[history].T
    return res.toarray()

SIMILARITY_AGGS_MAP = {
    'similarity_mean': partial(np.mean, axis=1),
    'similarity_min': partial(np.min, axis=1),
    'similarity_max': partial(np.max, axis=1),
    'similarity_std': partial(np.std, axis=1),
    'similarity_var': partial(np.var, axis=1),
}

In [29]:
videos_data = pd.read_parquet(
    './data/videos.parquet',
    columns=[
        'item_id', 
        'video_title', 'author_title',
        'duration', 'channel_sub', 'tv_sub',
        'ctr.CTR_10days_21_07', 'ctr.CTR_10days_01_08', 'ctr.CTR_10days_10_08', 'ctr.CTR_10days_21_08',
        'publicated',
        'category_title'
    ]
).set_index('item_id').fillna(-1)
videos_data = videos_data.loc[history.item_id.unique()]
videos_data['video_title'] = videos_data['video_title'].apply(lambda x: x.lower().strip())
videos_data['author_title'] = videos_data['author_title'].apply(lambda x: x.lower().strip())
videos_data['author_title'] = videos_data['author_title'].astype('category').cat.codes
videos_data['publicated_date'] = pd.to_datetime(videos_data['publicated'].apply(lambda x: x.split(' ')[0]))
videos_data

Unnamed: 0_level_0,video_title,author_title,duration,channel_sub,tv_sub,ctr.CTR_10days_21_07,ctr.CTR_10days_01_08,ctr.CTR_10days_10_08,ctr.CTR_10days_21_08,publicated,category_title,publicated_date
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
video_1230539,кринжово крашу маму // мне конец ч3,6264,382336,5221,0,0.000000,0.000000,-1.000000,0.000000,2023-03-07 19:51:35+03:00,Развлечения,2023-03-07
video_1979002,первый канал. прямой эфир,22593,0,45848,0,0.000000,0.000000,0.000000,0.000000,2022-05-19 15:11:57+03:00,Телепередачи,2022-05-19
video_1549108,некуда идти. мужское / женское. выпуск от 14.0...,22593,2436200,45848,0,0.092534,0.070556,0.060198,0.060447,2023-07-17 17:39:11+03:00,Телепередачи,2023-07-17
video_775169,поездка в крым. 10 часть/ таврида /меганом/ пу...,24472,359606,4985,0,0.000000,0.000000,0.000000,0.000000,2023-07-16 01:50:17+03:00,Путешествия,2023-07-16
video_184682,kranvagn- на скиле,26753,714850,588,0,0.000000,0.000000,0.000000,0.000000,2023-01-28 18:09:35+03:00,Видеоигры,2023-01-28
...,...,...,...,...,...,...,...,...,...,...,...,...
video_538199,детсад банбана 4 взломал чу чу чарли garten o...,4847,914240,809,0,-1.000000,-1.000000,-1.000000,-1.000000,2023-08-21 14:01:17+03:00,Видеоигры,2023-08-21
video_1900477,"прогулка по ладожскому вокзалу, часть 1",15021,311519,6,0,0.000000,0.000000,0.000000,0.000000,2023-07-15 13:41:39+03:00,Разное,2023-07-15
video_553698,днк на нтв сегодняшний выпуск 19 мая 2023 смот...,26315,68002,72,0,0.000000,0.000000,0.000000,-1.000000,2023-05-19 19:39:38+03:00,Развлечения,2023-05-19
video_593391,vid_20230821_230314_421,25041,13734,5,0,0.000000,0.000000,0.000000,0.000000,2023-08-21 23:10:22+03:00,Юмор,2023-08-21


In [30]:
emotions = pd.read_csv('./data/emotions.csv')
emotions

Unnamed: 0,date,user_id,item_id,type,emoji
0,2023-08-03 21:37:22+03:00,user_21883648,video_2247834,v_top,v_top
1,2023-08-03 21:04:54+03:00,user_24016046,video_2247834,v_top,v_top
2,2023-08-03 20:37:06+03:00,user_20748867,video_22669,v_top,v_top
3,2023-08-03 20:44:12+03:00,user_21545120,video_645471,v_top,v_top
4,2023-08-03 20:45:17+03:00,user_28987830,video_2346806,v_top,v_top
...,...,...,...,...,...
369424,2023-08-02 11:43:43+03:00,user_21597217,video_1180057,pos_emotions,Like
369425,2023-07-25 16:26:35+03:00,user_6805283,video_1180057,pos_emotions,Like
369426,2023-07-28 13:07:10+03:00,user_5419928,video_1180057,pos_emotions,Like
369427,2023-08-07 18:16:30+03:00,user_12044809,video_1180057,pos_emotions,Like


In [31]:
reactions_count = emotions.item_id.value_counts()

pos_reactions_count = emotions[emotions.type == 'pos_emotions'].item_id.value_counts()
neg_reactions_count = emotions[emotions.type == 'neg_emotions'].item_id.value_counts()
v_top_reactions_count = emotions[emotions.type == 'v_top'].item_id.value_counts()

pos_reactions_count /= reactions_count[pos_reactions_count.index]
neg_reactions_count /= reactions_count[neg_reactions_count.index]
v_top_reactions_count /= reactions_count[v_top_reactions_count.index]

v_top_reactions_count

video_1196103    0.696301
video_934162     0.666901
video_667301     0.965204
video_1565629    0.603591
video_567158     0.580874
                   ...   
video_22434      1.000000
video_1641207    0.500000
video_623053     0.500000
video_1615213    0.500000
video_2043942    0.500000
Name: item_id, Length: 35305, dtype: float64

In [32]:
mean_watch_time = train_data.groupby('item_id')['watch_time'].mean()
sum_watch_time = train_data.groupby('item_id')['watch_time'].sum()
popularity = train_data.item_id.value_counts()

mean_watch_time

item_id
video_0           531.0
video_1           128.2
video_100000      412.5
video_1000004    1293.0
video_1000016     627.0
                  ...  
video_999945      102.0
video_999947        1.0
video_999952      190.5
video_999979      132.0
video_999981     1145.2
Name: watch_time, Length: 244035, dtype: float64

In [33]:
from fuzzywuzzy import fuzz


def make_df(ids):    
    groups, df = [], []
    for user_id in tqdm(ids):        
        y = set(correct_candidates[user_id])
        
        X1 = [item[0] for item in pred_candidates[user_id]]
        scores1 = [item[1] for item in pred_candidates[user_id]]
        ranks1 = [1 / i for i in range(1, len(X1) + 1)]
        
        X2 = [item[0] for item in pred_candidates_nn[user_id]]
        scores2 = [item[1] for item in pred_candidates_nn[user_id]]
        ranks2 = [1 / i for i in range(1, len(X2) + 1)]
        
        cur_df = pd.DataFrame()
        X = np.unique(X1 + X2)
        cur_df.index = X
        cur_df.loc[X1, 'als_score'] = scores1
        cur_df.loc[X1, 'als_rank'] = ranks1
        cur_df.loc[X2, 'nn_score'] = scores2
        cur_df.loc[X2, 'nn_rank'] = ranks2
        
        last_author = videos_data.loc[items[user_id][-1], 'author_title']
        cur_df['is_author_eq'] = videos_data.loc[X, 'author_title'] == last_author
        last_title = videos_data.loc[items[user_id][-1], 'video_title']
        values = []
        for title in videos_data.loc[X, 'video_title']:
            values.append(fuzz.token_set_ratio(title, last_title))
        cur_df['title_ratio'] = values
        
        popularities = popularity[X]
        cur_df['popularity'] = popularities
        cur_df['popularity/max'] = popularities / np.max(popularities)
        
        cur_videos_data = videos_data.loc[X]
        # cur_df['season'] = cur_videos_data['season']
        cur_df['duration'] = cur_videos_data['duration']
        cur_df['channel_sub'] = cur_videos_data['channel_sub']
        cur_df['tv_sub'] = cur_videos_data['tv_sub']
        # cur_df['tv_title_len'] = cur_videos_data['tv_title_len']
        cur_df['category_title'] = cur_videos_data['category_title']
        cur_df['pub_date'] = (pd.Timestamp(year=2023, month=8, day=22) - cur_videos_data['publicated_date']).dt.days
        for feature in ('ctr.CTR_10days_21_07', 'ctr.CTR_10days_01_08', 'ctr.CTR_10days_10_08', 'ctr.CTR_10days_21_08'):
            cur_df[feature] = cur_videos_data[feature]
        
        cur_df['reactions_count'] = reactions_count.reindex(X).fillna(0)
        cur_df['pos_reactions_count'] = pos_reactions_count.reindex(X)
        cur_df['neg_reactions_count'] = neg_reactions_count.reindex(X)
        cur_df['v_top_reactions_count'] = v_top_reactions_count.reindex(X)
        
        time_sums = sum_watch_time[X]
        cur_df['sum_time'] = time_sums
        cur_df['sum_time/max'] = time_sums / np.max(time_sums)
        
        cur_df['mean_watch_time'] = mean_watch_time[X]
        cur_df['mean_time/duration'] = cur_df['mean_watch_time'] / (cur_df['duration'] + 1)
        
        last_pub_date = videos_data.loc[items[user_id][-1], 'publicated_date']
        cur_df['pub_date_diff'] = (last_pub_date - cur_videos_data['publicated_date']).dt.days
        
        similarities  = calculate_similarity_stats(
            np.array([item_dict[item] for item in X]),
            np.array([item_dict[item] for item in items[user_id]]),
        )
        for agg_name, agg_fn in SIMILARITY_AGGS_MAP.items():
            cur_df[agg_name] = agg_fn(similarities)
        
        als_similarities = calculate_similarities(
            als_item_factors, 
            np.array([item_dict[item] for item in X]), 
            np.array([item_dict[item] for item in items[user_id]])
        )
        for agg_name, agg_fn in SIMILARITY_AGGS_MAP.items():
            cur_df['als_' + agg_name] = agg_fn(als_similarities)
            
        labels = [int(item in y) for item in X]
        cur_df['label'] = labels
        
        mx = np.max(scores1)
        if mx > 0:
            cur_df.loc[X1, 'als_score/max'] = list(np.array(scores1) / mx)
        mx = np.max(scores2)
        if mx > 0:
            cur_df.loc[X2, 'nn_score/max'] = list(np.array(scores2) / mx)
        
        groups += [user_id] * len(X)
        
        df.append(cur_df.fillna(0).reset_index(drop=True))
        
    df = pd.concat(df)
    # df['items_count'] = items[groups].apply(len).values
    
    return df, groups

train_df, train_groups = make_df(train_ids)
val_df, val_groups = make_df(val_ids)
train_df

  0%|          | 0/25669 [00:00<?, ?it/s]

  0%|          | 0/11002 [00:00<?, ?it/s]

Unnamed: 0,als_score,als_rank,nn_score,nn_rank,is_author_eq,title_ratio,popularity,popularity/max,duration,channel_sub,...,similarity_std,similarity_var,als_similarity_mean,als_similarity_min,als_similarity_max,als_similarity_std,als_similarity_var,label,als_score/max,nn_score/max
0,0.982616,0.250000,13.488682,0.333333,True,73,51,0.153153,3011600,207061,...,3.971266,15.770951,0.725824,0.431565,1.000000,0.132008,0.017426,0,0.960616,0.792578
1,0.504969,0.012195,5.640104,0.017241,True,72,9,0.027027,2993600,207061,...,1.737685,3.019551,0.744516,0.460812,0.867193,0.111090,0.012341,0,0.493663,0.331406
2,0.599419,0.017544,7.380912,0.033333,True,68,19,0.057057,2971034,207061,...,2.627089,6.901597,0.733865,0.422266,0.850528,0.127372,0.016224,0,0.585999,0.433693
3,0.972741,0.142857,12.439330,0.100000,True,71,38,0.114114,3025040,207061,...,2.842387,8.079162,0.808992,0.457810,1.000000,0.149129,0.022239,0,0.950962,0.730920
4,0.492360,0.011628,5.313399,0.015385,True,56,14,0.042042,3017400,207061,...,2.858206,8.169344,0.718099,0.451967,0.833030,0.099566,0.009913,0,0.481336,0.312209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,0.890366,0.025641,11.413063,0.016667,False,7,21,0.228261,1015409,2207,...,7.678527,58.959781,0.802759,0.635388,1.000000,0.064674,0.004183,0,0.878823,0.334268
109,0.748163,0.013514,14.332284,0.025641,False,5,32,0.347826,673704,778,...,9.198761,84.617196,0.877226,0.680290,0.957580,0.065216,0.004253,0,0.738463,0.419767
110,0.806807,0.015873,8.592995,0.012048,False,5,16,0.173913,856422,764,...,5.175136,26.782032,0.875076,0.680043,0.957247,0.066101,0.004369,0,0.796347,0.251674
111,0.877085,0.022727,17.790390,0.035714,False,5,18,0.195652,563433,764,...,7.334956,53.801584,0.902614,0.708024,1.000000,0.055725,0.003105,0,0.865714,0.521049


In [34]:
(train_df['label'] > 0).mean(), (train_df['label'] > 0).sum() # (0.031447384168334626, 120083)

(0.031447384168334626, 120083)

In [35]:
from catboost import CatBoostRanker, Pool

cat_features = ['category_title']

train_pool = Pool(
    data=train_df.drop('label', axis=1),
    label=train_df['label'],
    group_id=train_groups,
    cat_features=cat_features
)

val_pool = Pool(
    data=val_df.drop('label', axis=1),
    label=val_df['label'],
    group_id=val_groups,
    cat_features=cat_features
)

  from pandas import MultiIndex, Int64Index



In [36]:
params = {
    'task_type': 'CPU',
    'loss_function': 'YetiRank',
    'eval_metric': 'MAP:top=10',
    # 'iterations': 500,
    'iterations': 1000,
    'max_depth': 8,
}

In [37]:
model_cb = CatBoostRanker(**params, random_seed=56)
model_cb.fit(train_pool, eval_set=val_pool, plot=True, verbose=False, use_best_model=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x22c56d88f70>

In [38]:
np.max(model_cb.evals_result_['validation']['MAP:top=10'])

0.4816107784738649

In [39]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [40]:
actual, predicted = [], []
for user_id in tqdm(correct_candidates.index):
    actual.append(list(set(correct_candidates[user_id])))
    predicted.append(
        [item[0] for item in pred_candidates[user_id]]
    )

  0%|          | 0/36671 [00:00<?, ?it/s]

In [41]:
mapk(actual, predicted)

0.14692099562063712

In [42]:
actual, predicted = [], []
for user_id in tqdm(correct_candidates.index):
    actual.append(list(set(correct_candidates[user_id])))
    predicted.append(
        [item[0] for item in pred_candidates_nn[user_id]]
    )

  0%|          | 0/36671 [00:00<?, ?it/s]

In [43]:
mapk(actual, predicted)

0.1547626799056435

In [88]:
submission = pd.read_csv('./data/sample_submission.csv').set_index('user_id')
submission['recs'] = [list()] * len(submission)
submission

Unnamed: 0_level_0,recs
user_id,Unnamed: 1_level_1
user_26511551,[]
user_29194819,[]
user_29734049,[]
user_955460,[]
user_7065521,[]
...,...
user_29281681,[]
user_3912848,[]
user_28389099,[]
user_18951296,[]


In [89]:
test_items = pd.concat([
    history.groupby('user_id')['item_id'].agg(list), 
    test_history[test_history.item_id.isin(history.item_id)].groupby('user_id')['item_id'].agg(list)
])
test_items

user_id
user_10000266    [video_803844, video_803844, video_803844, vid...
user_10001792    [video_2023333, video_844330, video_1184499, v...
user_10002681    [video_1179953, video_1179953, video_1179953, ...
user_10002686    [video_460130, video_828036, video_1580806, vi...
user_10003740    [video_2142690, video_344825, video_899674, vi...
                                       ...                        
user_9995598     [video_1631397, video_1631397, video_2274041, ...
user_999588             [video_499265, video_1970165, video_68646]
user_9995884     [video_2052690, video_2052690, video_2052690, ...
user_9996635     [video_1508131, video_1593368, video_1593368, ...
user_9998714     [video_1180057, video_1180057, video_214606, v...
Name: item_id, Length: 56383, dtype: object

In [90]:
test_user_ids = list(set(submission.index) & set(test_items.index))
len(test_user_ids), len(test_user_ids) / len(submission) # (56383, 0.5798334018922254)

(56383, 0.5798334018922254)

In [91]:
def predict(user_id):        
    candidates_nn = predict_user_nn(
        model_nn, user_id, 
        test_items[user_id], 
        item_dict, reverse_item_dict
    )
    candidates_als = predict_user_als(
        model, user_id, 
        test_items[user_id], 
        item_dict, reverse_item_dict
    )
        
    X1 = [item[0] for item in candidates_als]
    scores1 = [item[1] for item in candidates_als]
    ranks1 = [1 / i for i in range(1, len(X1) + 1)]
        
    X2 = [item[0] for item in candidates_nn]
    scores2 = [item[1] for item in candidates_nn]
    ranks2 = [1 / i for i in range(1, len(X2) + 1)]
        
    X = np.unique(X1 + X2)
    if len(X) == 0:
        return []
    
    cur_df = pd.DataFrame()
    cur_df.index = X
    cur_df.loc[X1, 'als_score'] = scores1
    cur_df.loc[X1, 'als_rank'] = ranks1
    cur_df.loc[X2, 'nn_score'] = scores2
    cur_df.loc[X2, 'nn_rank'] = ranks2
    
    last_author = videos_data.loc[test_items[user_id][-1], 'author_title']
    cur_df['is_author_eq'] = videos_data.loc[X, 'author_title'] == last_author
    last_title = videos_data.loc[test_items[user_id][-1], 'video_title']
    values = []
    for title in videos_data.loc[X, 'video_title']:
        values.append(fuzz.token_set_ratio(title, last_title))
    cur_df['title_ratio'] = values
        
    popularities = popularity[X]
    cur_df['popularity'] = popularities
    cur_df['popularity/max'] = popularities / np.max(popularities)
        
    cur_videos_data = videos_data.loc[X]
    # cur_df['season'] = cur_videos_data['season']
    cur_df['duration'] = cur_videos_data['duration']
    cur_df['channel_sub'] = cur_videos_data['channel_sub']
    cur_df['tv_sub'] = cur_videos_data['tv_sub']
    # cur_df['tv_title_len'] = cur_videos_data['tv_title_len']
    cur_df['category_title'] = cur_videos_data['category_title']
    cur_df['pub_date'] = (pd.Timestamp(year=2023, month=8, day=22) - cur_videos_data['publicated_date']).dt.days
    for feature in ('ctr.CTR_10days_21_07', 'ctr.CTR_10days_01_08', 'ctr.CTR_10days_10_08', 'ctr.CTR_10days_21_08'):
        cur_df[feature] = cur_videos_data[feature]
        
    cur_df['reactions_count'] = reactions_count.reindex(X).fillna(0)
    cur_df['pos_reactions_count'] = pos_reactions_count.reindex(X)
    cur_df['neg_reactions_count'] = neg_reactions_count.reindex(X)
    cur_df['v_top_reactions_count'] = v_top_reactions_count.reindex(X)
        
    time_sums = sum_watch_time[X]
    cur_df['sum_time'] = time_sums
    cur_df['sum_time/max'] = time_sums / np.max(time_sums)
        
    cur_df['mean_watch_time'] = mean_watch_time[X]
    cur_df['mean_time/duration'] = cur_df['mean_watch_time'] / (cur_df['duration'] + 1)
    
    last_pub_date = videos_data.loc[test_items[user_id][-1], 'publicated_date']
    cur_df['pub_date_diff'] = (last_pub_date - cur_videos_data['publicated_date']).dt.days
        
    similarities  = calculate_similarity_stats(
        np.array([item_dict[item] for item in X]),
        np.array([item_dict[item] for item in test_items[user_id]]),
    )
    for agg_name, agg_fn in SIMILARITY_AGGS_MAP.items():
        cur_df[agg_name] = agg_fn(similarities)
        
    als_similarities = calculate_similarities(
        als_item_factors, 
        np.array([item_dict[item] for item in X]), 
        np.array([item_dict[item] for item in test_items[user_id]])
    )
    for agg_name, agg_fn in SIMILARITY_AGGS_MAP.items():
        cur_df['als_' + agg_name] = agg_fn(als_similarities)
            
    cur_df['als_score/max'] = 0
    if len(scores1) > 0:
        mx = np.max(scores1)
        if mx > 0:
            cur_df.loc[X1, 'als_score/max'] = list(np.array(scores1) / mx)
    cur_df['nn_score/max'] = 0
    if len(scores2) > 0:
        mx = np.max(scores2)
        if mx > 0:
            cur_df.loc[X2, 'nn_score/max'] = list(np.array(scores2) / mx)
        
    groups = [user_id]*len(X)

    test_pool = Pool(
        data=cur_df.fillna(0).reset_index(drop=True),
        group_id=groups,
        cat_features=cat_features
    )
        
    preds = model_cb.predict(test_pool)
    idx = range(len(X))
    idx = sorted(idx, key=lambda x: preds[x], reverse=True)
    candidates = list(X[idx])
        
    return candidates[:10]

In [92]:
for user_id in tqdm(test_user_ids):
    submission.loc[user_id, 'recs'] = predict(user_id)
submission

  0%|          | 0/56383 [00:00<?, ?it/s]

Unnamed: 0_level_0,recs
user_id,Unnamed: 1_level_1
user_26511551,"[video_1545210, video_221466, video_2323123, v..."
user_29194819,[]
user_29734049,[]
user_955460,"[video_1371119, video_756683, video_1357467, v..."
user_7065521,"[video_5763, video_364393, video_737808, video..."
...,...
user_29281681,[]
user_3912848,"[video_1938070, video_665164, video_914241, vi..."
user_28389099,"[video_302657, video_1594159, video_144691, vi..."
user_18951296,"[video_1994927, video_1499979, video_2172399, ..."


In [95]:
submission.recs.apply(len).value_counts()

10    56383
0     40857
Name: recs, dtype: int64

In [96]:
submission.to_pickle('./data/old_users_prediction.pickle')