In [2]:
import numpy as np
import pandas as pd
from psutil import virtual_memory

import recsys
from catboost import CatBoostClassifier
from recsys.boosting.candidates import train_candidates_models
from recsys.boosting.feature_engineering import get_engineering_features
from recsys.boosting.negative_sampling import sample_random_negatives
%load_ext autoreload
%autoreload 2

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 17.2 gigabytes of available RAM



  from .autonotebook import tqdm as notebook_tqdm


# Data overview

In [3]:
events_df = pd.read_csv('../data/events.csv')
item_features_df = pd.read_csv('../data/item_features.csv')
user_features_df = pd.read_csv('../data/user_features.csv')
item_features_df.shape, user_features_df.shape, events_df.shape

((3706, 19), (6040, 3), (894149, 4))

In [4]:
item_features_df.head(2)

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [5]:
user_features_df.head(2)

Unnamed: 0,user_id,gender,age
0,4855,F,1
1,4065,M,56


In [6]:
events_df.head(2)

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1


In [7]:
n_users = user_features_df['user_id'].nunique()
n_items = item_features_df['item_id'].nunique()
print(f'Total users: {n_users}; Total items: {n_items}')
print('Number of users in events:', events_df['user_id'].nunique())
print('Number of items in events:', events_df['item_id'].nunique())
print('Number of timestamps in events:', events_df['timestamp'].nunique())

Total users: 6040; Total items: 3706
Number of users in events: 6040
Number of items in events: 3690
Number of timestamps in events: 2256


In [8]:
events_df['rating'].value_counts()

rating
4    312170
3    233824
5    201898
2     96001
1     50256
Name: count, dtype: int64

In [9]:
events_df.groupby('user_id')['timestamp'].apply(list).head(5)

user_id
0    [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15...
1    [0, 1, 2, 3, 6, 8, 9, 10, 11, 12, 13, 14, 15, ...
2    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 14, 15, 16,...
3    [0, 1, 2, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 1...
4    [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14...
Name: timestamp, dtype: object

In [10]:
# В среднем у пользователя 148 просмотренных фильма
# По медиане у пользователя 84.5 просмотренных фильма
# Минимум просмотренных фильмов одним пользователем = 14
# Максимум просмотренных фильмов одним пользователем = 2056
# Пользователь не смотрел один фильм два раза
events_df.groupby('user_id')['item_id'].nunique().describe()

count    6040.000000
mean      148.037914
std       173.461681
min        14.000000
25%        38.000000
50%        84.500000
75%       186.000000
max      2056.000000
Name: item_id, dtype: float64

# Candidates scoring

In [11]:
# Для обучения модели отбора кандидатов возьму долю от количества интеракций пользователя
CANDIDATES_TRAIN_SHARE = 2/3

def truncate_rows(group):
    n_rows = int(len(group) * CANDIDATES_TRAIN_SHARE)
    return group.head(n_rows)


candidates_df = events_df.sort_values(
    ['user_id', 'timestamp']
).groupby('user_id').apply(truncate_rows).reset_index(drop=True)

  ).groupby('user_id').apply(truncate_rows).reset_index(drop=True)


In [12]:
candidates_df['user_id'].nunique(), candidates_df['item_id'].nunique()

(6040, 3620)

In [13]:
candidates_df['weight'] = candidates_df['rating'].apply(lambda x: -1 * x if x <= 2 else x)
candidates_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,weight
0,0,1505,4,0,4
1,0,3669,3,1,3
2,0,584,4,2,4
3,0,3390,3,3,3
4,0,2885,4,4,4


In [14]:
candidates_model, candidates_scores = train_candidates_models(candidates_df, n_users, n_items)

  check_blas_config()
100%|█████████████████████████████████████████████████████████████████████████| 50/50 [00:16<00:00,  2.95it/s, loss=0.037]
100%|█████████████████████████████████████████████████| 200/200 [00:16<00:00, 11.89it/s, train_auc=93.93%, skipped=18.11%]
100%|█████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:05<00:00, 11.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3706/3706 [00:00<00:00, 17196.73it/s]


In [15]:
print(f'Number of candidates model: {len(candidates_model) + len(candidates_scores)}')

Number of candidates model: 4


# Ranking

In [25]:
# Для обучения модели ранжирования возьму все до последнего (не включительно) оставшихся интеракций пользователя
def truncate_rows(group):
    n_rows = len(group) - 1
    return group.head(n_rows)


ranking_df = pd.merge(
    events_df,
    candidates_df[['user_id', 'item_id', 'timestamp']].assign(__tmp__=True),
    on=['user_id', 'item_id', 'timestamp'],
    how='left'
)

ranking_df = ranking_df[ranking_df['__tmp__'].isna()].drop('__tmp__', axis=1)
ranking_df = ranking_df.sort_values(
    ['user_id', 'timestamp']
).groupby('user_id').apply(truncate_rows).reset_index(drop=True)
ranking_df['weight'] = ranking_df['rating'].apply(lambda x: -1 * x if x <= 2 else x)
# ranking_df = ranking_df[ranking_df['weight'] > 0]
# ranking_df['target'] = 1

  ).groupby('user_id').apply(truncate_rows).reset_index(drop=True)


## Features for ranking
User-based

Item-based

In [26]:
user_feature_engineering_df, item_feature_engineering_df = get_engineering_features(
    candidates_df,
    user_features_data=user_features_df,
    item_features_data=item_features_df
)

In [27]:
ranking_negatives_df = sample_random_negatives(
    pd.concat(
        (
            candidates_df.loc[candidates_df['weight'] > 0, ['user_id', 'item_id', 'timestamp']],
            ranking_df.loc[ranking_df['weight'] > 0, ['user_id', 'item_id', 'timestamp']]
        ),
        ignore_index=True
    ),
    n_items=n_items
)

In [28]:
ranking_total_df = pd.concat(
    (
        ranking_df[ranking_df['weight'] > 0].assign(target=1)[['user_id', 'item_id', 'target']],
        ranking_negatives_df[['user_id', 'item_id', 'target']]
    ),
    ignore_index=True
).sample(frac=1)

In [29]:
for i, model in enumerate(candidates_model):
    ui_scores = (model.user_factors @ model.item_factors.T)
    ranking_total_df[f'colaborative_model{i}'] = ranking_total_df.apply(lambda row: ui_scores[row['user_id'], row['item_id']], axis=1)

for i, scores in enumerate(candidates_scores):
    ui_scores = np.clip(scores, -10, None)
    ranking_total_df[f'i2i_model{i}'] = ranking_total_df.apply(lambda row: ui_scores[row['user_id'], row['item_id']], axis=1)

In [31]:
ranking_total_df = ranking_total_df.merge(
    user_feature_engineering_df,
    on='user_id',
    how='left'
).merge(
    item_feature_engineering_df,
    on='item_id',
    how='left'
)

In [32]:
clf = CatBoostClassifier(random_state=777, cat_features=['gender'])
clf.fit(ranking_total_df.drop(['user_id', 'item_id', 'target'], axis=1), ranking_total_df['target'], verbose_eval=50);

Learning rate set to 0.249767
0:	learn: 0.4286874	total: 120ms	remaining: 1m 59s
50:	learn: 0.2240355	total: 5.01s	remaining: 1m 33s
100:	learn: 0.2203482	total: 9.79s	remaining: 1m 27s
150:	learn: 0.2177696	total: 14.7s	remaining: 1m 22s
200:	learn: 0.2156786	total: 19.6s	remaining: 1m 17s
250:	learn: 0.2139314	total: 24.4s	remaining: 1m 12s
300:	learn: 0.2122865	total: 29.1s	remaining: 1m 7s
350:	learn: 0.2108105	total: 33.9s	remaining: 1m 2s
400:	learn: 0.2094538	total: 38.9s	remaining: 58.1s
450:	learn: 0.2082191	total: 44s	remaining: 53.6s
500:	learn: 0.2070336	total: 48.6s	remaining: 48.4s
550:	learn: 0.2059478	total: 53.5s	remaining: 43.6s
600:	learn: 0.2047412	total: 58.1s	remaining: 38.6s
650:	learn: 0.2035949	total: 1m 2s	remaining: 33.7s
700:	learn: 0.2026583	total: 1m 7s	remaining: 28.8s
750:	learn: 0.2017174	total: 1m 12s	remaining: 23.9s
800:	learn: 0.2007746	total: 1m 16s	remaining: 19s
850:	learn: 0.1997864	total: 1m 21s	remaining: 14.3s
900:	learn: 0.1988909	total: 1m 

In [23]:
ranking_df.shape, candidates_df.shape

((294017, 5), (594092, 5))

In [33]:
# Refit candidates models
candidates_model, candidates_scores = train_candidates_models(
    pd.concat(
        (
            candidates_df,
            ranking_df
        ),
        ignore_index=True
    ),
    n_users,
    n_items
)

100%|████████████████████████████████████████████████████████████████████████| 50/50 [00:21<00:00,  2.30it/s, loss=0.0518]
100%|█████████████████████████████████████████████████| 200/200 [00:27<00:00,  7.29it/s, train_auc=92.37%, skipped=22.93%]
100%|█████████████████████████████████████████████████████████████████████████████████████| 60/60 [00:07<00:00,  8.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 3706/3706 [00:00<00:00, 10363.88it/s]


In [34]:
inference_df = events_df.groupby('user_id')['item_id'].apply(
    lambda x: list(range(n_items))
).reset_index().explode('item_id')

for i, model in enumerate(candidates_model):
    ui_scores = (model.user_factors @ model.item_factors.T)
    inference_df[f'colaborative_model{i}'] = inference_df.apply(lambda row: ui_scores[row['user_id'], row['item_id']], axis=1)

for i, scores in enumerate(candidates_scores):
    ui_scores = np.clip(scores, -10, None)
    inference_df[f'i2i_model{i}'] = inference_df.apply(lambda row: ui_scores[row['user_id'], row['item_id']], axis=1)

In [35]:
inference_df = inference_df.merge(
    user_feature_engineering_df,
    on='user_id',
    how='left'
).merge(
    item_feature_engineering_df,
    on='item_id',
    how='left'
)

In [36]:
result_df = inference_df[['user_id', 'item_id']]
result_df['ranking_score'] = clf.predict_proba(inference_df.drop(['user_id', 'item_id'], axis=1))[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['ranking_score'] = clf.predict_proba(inference_df.drop(['user_id', 'item_id'], axis=1))[:, 1]


In [37]:
result_df.head()

Unnamed: 0,user_id,item_id,ranking_score
0,0,0,0.167862
1,0,1,0.002595
2,0,2,0.120919
3,0,3,0.082452
4,0,4,0.009002


In [38]:
result_df['ranking_score'].describe()

count    2.238424e+07
mean     9.717208e-02
std      1.842990e-01
min      1.388585e-06
25%      1.063400e-03
50%      8.584715e-03
75%      8.441030e-02
max      9.994393e-01
Name: ranking_score, dtype: float64

In [39]:
# Remove previously recommended
result_filtered_df = result_df.merge(
    candidates_df[['user_id', 'item_id']].assign(__tmp__=True),
    on=['user_id', 'item_id'],
    how='left'
)
result_filtered_df = result_filtered_df[result_filtered_df['__tmp__'].isna()].drop('__tmp__', axis=1)

result_filtered_df = result_filtered_df.merge(
    ranking_df[['user_id', 'item_id']].assign(__tmp__=True),
    on=['user_id', 'item_id'],
    how='left'
)
result_filtered_df = result_filtered_df[result_filtered_df['__tmp__'].isna()].drop('__tmp__', axis=1)

result_filtered_df = result_filtered_df.sort_values('ranking_score', ascending=False).groupby('user_id')['item_id'].apply(
    lambda item_ids: ' '.join(map(str, item_ids.tolist()[:10]))
)

In [40]:
result_filtered_df.head()

user_id
0      1001 2335 3298 3036 2999 1699 52 331 2606 583
1         3677 169 584 1811 1039 512 2338 93 232 452
2    1831 1956 2732 1560 640 2603 2256 1711 452 2774
3      169 3022 220 3597 1223 1001 1543 2956 3435 36
4    2338 1560 2342 1583 1809 512 2210 1315 1490 138
Name: item_id, dtype: object

In [41]:
result_filtered_df.reset_index().to_csv('./boosting_submission_5.csv', header=True, index=False)

In [42]:
test_df = events_df.merge(
    candidates_df[['user_id', 'item_id', 'timestamp']].assign(__tmp__=True),
    on=['user_id', 'item_id', 'timestamp'],
    how='left'
)
test_df = test_df[test_df['__tmp__'].isna()].drop('__tmp__', axis=1)

test_df = test_df.merge(
    ranking_df[['user_id', 'item_id', 'timestamp']].assign(__tmp__=True),
    on=['user_id', 'item_id', 'timestamp'],
    how='left'
)
test_df = test_df[test_df['__tmp__'].isna()].drop('__tmp__', axis=1)

In [43]:
test_df.shape[0] + candidates_df.shape[0] + ranking_df.shape[0], events_df.shape[0]

(894149, 894149)

In [44]:
def precision_at_k(gts, predictions, k=10):
    return sum(1 if prediction in gts else 0 for prediction in predictions[:k]) / k


def recall_at_k(gts, predictions, k=10):
    return sum(1 if prediction in gts else 0 for prediction in predictions[:k]) / len(gts)


def evaluate(gts_users, predictions_users):
    return {
        'precision_at_k': np.mean([precision_at_k(gt, prediction) for gt, prediction in zip(gts_users, predictions_users)]),
        'recall_at_k': np.mean([recall_at_k(gt, prediction) for gt, prediction in zip(gts_users, predictions_users)])
    }

In [45]:
evaluate(
    test_df.groupby('user_id')['item_id'].apply(list),
    result_filtered_df.apply(lambda x: list(map(int, x.split())))
)

{'precision_at_k': 0.008576158940397352, 'recall_at_k': 0.0857615894039735}