In [None]:
import datetime
import numpy as np
import random
from itertools import islice, cycle
from more_itertools import pairwise
import keras
import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns
sns.set(style='whitegrid')
sns.set(rc={'figure.figsize':(17, 9)})

from IPython.core.display import display, HTML, clear_output
display(HTML('<style>.container { width:80% !important; }</style>'))
display(HTML('<style>.prompt { min-width:10ex !important; }</style>'))
display(HTML('<style>div#notebook { font-size:12px !important; }</style>'))

#from preprocessing import leave_last_out, transform_indices, reindex_data, generate_interactions_matrix, get_interaction_matrix
from datetime import datetime
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import euclidean_distances as ED

In [None]:
import pandas as pd
from scipy.sparse import csr_matrix


#

def leave_last_out(data, userid='userid', timeid='timestamp'):
    '''
    Split off the chronologically last interaction of every user.

    Rows are ordered by `timeid`; for each distinct value of `userid` the final
    row in that ordering becomes the holdout and all other rows are kept.
    Returns a `(remaining, holdout)` tuple: `remaining` keeps the row order of
    `data`, `holdout` is ordered by `timeid`.
    '''
    chronological = data.sort_values(timeid)
    last_per_user = chronological.drop_duplicates(subset=[userid], keep='last')
    # dropping by index label leaves the non-holdout rows in their original order
    return data.drop(last_per_user.index), last_per_user


def transform_indices(data, users, items):
    '''
    Replace the user and item id columns with contiguous 0-based codes.

    `users` and `items` are the column names to re-encode. Returns the
    re-encoded copy of `data` together with a dict holding, under the keys
    'users' and 'items', the index that maps each new code back to the
    original id.
    '''
    data_index = {}
    for entity, field in (('users', users), ('items', items)):
        codes, mapping = to_numeric_id(data, field)
        data_index[entity] = mapping
        # assign() returns a new frame, so the caller's data is left untouched
        data = data.assign(**{f'{field}': codes})
    return data, data_index


def to_numeric_id(data, field):
    '''
    Encode `data[field]` as contiguous integer codes via a categorical cast.

    Returns `(codes, categories)`: the per-row codes and the index of
    original values (named `field`) whose position is the code.
    '''
    as_category = data[field].astype("category").cat
    return as_category.codes, as_category.categories.rename(field)


def reindex_data(data, data_index, fields=None):
    '''
    Re-encode columns of `data` with an existing index mapping.

    `data_index` maps an entity key to a named index; the index name is the
    column to re-encode and the position of a value in the index is its new
    code (values absent from the index become -1). `fields` selects which
    entity keys to apply: all of them when None, a single key when given as a
    string, otherwise an iterable of keys. Returns a re-encoded copy.
    '''
    if fields is None:
        selected = data_index.keys()
    elif isinstance(fields, str):
        selected = [fields]  # a lone key was passed as a bare string
    else:
        selected = fields

    for key in selected:
        mapping = data_index[key]
        column = mapping.name
        # assign() builds a new frame on every pass; the input is never mutated
        data = data.assign(**{f'{column}': mapping.get_indexer(data[column])})
    return data


# generate training matrix
def generate_interactions_matrix(data, data_description, rebase_users=False):
    '''
    Build a sparse user-item matrix from a dataframe of interactions.

    Args:
        data (pandas.DataFrame): interactions; user and item columns must
            already hold integer positions usable as matrix coordinates.
        data_description (dict): describes `data` with the keys
            - 'n_users' (int): number of rows of the matrix (ignored when `rebase_users` is set).
            - 'n_items' (int): number of columns of the matrix.
            - 'users' (str): name of the user id column.
            - 'items' (str): name of the item id column.
            - 'feedback' (str): name of the column with the interaction values.
        rebase_users (bool, optional): when True, user ids are re-encoded to a
            contiguous 0-based range in sorted order and the row count becomes
            the number of distinct users in `data`. Defaults to False.
    Returns:
        scipy.sparse.csr_matrix: matrix of shape (n_users, n_items).
    '''
    n_users = data_description['n_users']
    n_items = data_description['n_items']

    rows = data[data_description['users']].values
    if rebase_users:
        # test users may carry gaps in their ids; sorted factorization closes them
        rows, distinct_users = pd.factorize(rows, sort=True)
        n_users = len(distinct_users)

    cols = data[data_description['items']].values
    values = data[data_description['feedback']].values
    return csr_matrix((values, (rows, cols)), shape=(n_users, n_items))



def warm_start_timepoint_split(data, time_split_q=0.95):
    """
    Split data into training, testset, and holdout datasets based on a timepoint split
    and according to the `warm-start` evaluation strategy.
    Parameters
    ----------
    data : pd.DataFrame
        The input dataset containing columns `userid`, `movieid`, and `timestamp`.
    time_split_q : float, optional
        The quantile value used to split the dataset based on the `timestamp` column.
        Default is 0.95.
    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]
        A tuple of three pandas DataFrames: training, testset, and holdout.
        `training` holds the pre-timepoint interactions of users who have no interaction at or after the timepoint.
        `testset` holds the pre-timepoint interactions of the test users, restricted to items seen in `training`.
        `holdout` holds, for each test user, the first interaction at or after the timepoint.
    Notes
    -----
    The function splits the input `data` into three subsets: `training`, `testset`, and `holdout`.
    The split is performed based on the `timestamp` column of `data`, using `time_split_q` as the quantile value.
    The `holdout` dataset contains only the immediate interactions following the fixed timepoint for each test user from the `testset`.
    The set of users in `training` is disjoint with the set of users in the `testset`, which implements the `warm-start` scenario.
    Both `testset` and `holdout` are sorted by `userid` and cover the same set of users.
    """
    # 'nearest' interpolation makes the split point an actually observed timestamp
    timepoint = data.timestamp.quantile(q=time_split_q, interpolation='nearest')
    test_ = data.query('timestamp >= @timepoint')
    rest_ = data.drop(test_.index)
    holdout_ = (
        test_
        .sort_values('timestamp')
        .drop_duplicates(subset=['userid'], keep='first')
    )
    # the holdout dataframe contains interactions closest to certain timepoint from the right,
    # i.e., the corresponding items are the first in each test user profile after this timepoint
    training = rest_.query('userid not in @holdout_.userid')
    train_items = training.movieid.unique()
    # NOTE: the query strings below resolve `@name` against these local variables,
    # so the local names and the strings must be kept in sync
    testset_ = rest_.query('userid in @holdout_.userid and movieid in @train_items')
    test_users = testset_.userid.unique()
    holdout = holdout_.query(
        # if user is not in `test_users` then no evaluation is possible,
        # if item is not in `train_items` it's cold start -> must be excluded
        'userid in @test_users and movieid in @train_items'
    ).sort_values('userid')
    testset = testset_.query(
        # make sure testset and holdout contain the same set of users
        'userid in @holdout.userid'
    ).sort_values('userid')
    return training, testset, holdout


def get_interaction_matrix(data, n_items, userid='user_id', itemid='item_id', rating='watched_pct'):
    """
    Build a sparse user-item matrix plus lookup tables between raw ids and matrix positions.

    Row and column positions are the categorical codes of the `userid` and
    `itemid` columns. As a side effect the codes are written into `data` as the
    columns 'uid' and 'iid'. The matrix has one row per distinct user in `data`
    and `n_items` columns; its values come from the `rating` column
    (no row normalisation is applied).

    Returns:
        tuple: (interactions_vec, iid_to_item_id, item_id_to_iid, uid_to_user_id, user_id_to_uid)
    """
    # NOTE: these two assignments modify the frame passed in by the caller
    data['uid'] = data[userid].astype('category').cat.codes
    data['iid'] = data[itemid].astype('category').cat.codes

    n_rows = data.uid.nunique()
    interactions_vec = csr_matrix(
        (data[rating], (data['uid'], data['iid'])),
        shape=(n_rows, n_items),
    )

    # one de-duplicated (code, raw id) table per entity serves both lookup directions
    item_pairs = data[['iid', itemid]].drop_duplicates()
    iid_to_item_id = item_pairs.set_index('iid').to_dict()[itemid]
    item_id_to_iid = item_pairs.set_index(itemid).to_dict()['iid']

    user_pairs = data[['uid', userid]].drop_duplicates()
    uid_to_user_id = user_pairs.set_index('uid').to_dict()[userid]
    user_id_to_uid = user_pairs.set_index(userid).to_dict()['uid']

    return interactions_vec, iid_to_item_id, item_id_to_iid, uid_to_user_id, user_id_to_uid

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
import zipfile
# Unpack every raw-data archive into the current working directory.
for archive_name in ("users.csv.zip", "items.csv.zip", "interactions.csv.zip"):
    with zipfile.ZipFile(archive_name, "r") as z:
        z.extractall(".")

In [None]:
# Load the raw tables extracted from the archives.
users_df = pd.read_csv('users.csv')
items_df = pd.read_csv('items.csv')
# last_watch_dt is parsed as a datetime because later cells call .timestamp() on it;
# the file is read once (the original cell parsed it twice)
interactions_df = pd.read_csv('interactions.csv', parse_dates=['last_watch_dt'])
#submission = pd.read_csv('sample_submission.csv')

In [None]:
from collections import Counter
# Keep only users and items that have more than MIN_INTERACTIONS recorded interactions.
MIN_INTERACTIONS = 10

# both counts are taken on the unfiltered frame, before any rows are removed
user_counts = interactions_df.user_id.value_counts()
valid_users = user_counts[user_counts > MIN_INTERACTIONS].index.tolist()
# likewise collect the items that were watched by more than MIN_INTERACTIONS users
item_counts = interactions_df.item_id.value_counts()
valid_items = item_counts[item_counts > MIN_INTERACTIONS].index.tolist()

# drop unpopular items and inactive users
interactions_df = interactions_df[interactions_df.user_id.isin(valid_users)]
interactions_df = interactions_df[interactions_df.item_id.isin(valid_items)]
print(f"N users after: {interactions_df.user_id.nunique()}")
print(f"N items after: {interactions_df.item_id.nunique()}")

N users after: 128486
N items after: 8175


In [None]:
def preprocessing(interactions_df, users_df_ohe, items_df_ohe, cold_users_split = 5, itemid = 'last_watch_dt'):
    """
    Build training data, holdouts and interaction matrices for the standard and cold-start scenarios.

    Args:
        interactions_df: interactions with `user_id`, `item_id`, `last_watch_dt`
            (datetime) and `watched_pct` columns. The caller's frame is not modified.
        users_df_ohe: one-hot encoded user features including a `user_id` column.
        items_df_ohe: one-hot encoded item features including an `item_id` column.
        cold_users_split: users with more than 2 and fewer than this many
            interactions are treated as cold users.
        itemid: name of the column used to order each user's history when the
            last interaction is split off (despite its name it is passed to
            `leave_last_out` as the time column).

    Returns:
        tuple: (train_val, data_description, train_matrix, train_matrix_indices,
                cold_users_matrix, cold_start_matrix_indices)
    """
    # explicit copy: new columns are added below and must not hit a slice of the caller's frame
    interactions_df = interactions_df[interactions_df.user_id.isin(users_df_ohe.user_id.unique())].copy()
    interactions_df['last_watch_dt_ts'] = interactions_df['last_watch_dt'].apply(lambda x: int(x.timestamp()))
    num_interaction_pu = interactions_df.groupby('user_id')['item_id'].count()
    # cold users: a short history, but more than 2 interactions
    cold_users = num_interaction_pu.loc[
        (num_interaction_pu < cold_users_split) & (num_interaction_pu > 2)
    ].index

    # every other user (including those with 2 or fewer interactions) counts as warm
    warm_users_history = interactions_df[~interactions_df.user_id.isin(cold_users)]
    # copied because get_interaction_matrix() writes 'uid'/'iid' columns into it
    cold_users_history = interactions_df[interactions_df.user_id.isin(cold_users)].copy()

    # standard scenario train/holdout split
    training, holdout = leave_last_out(warm_users_history, userid='user_id', timeid=itemid)
    train_val, data_index_train = transform_indices(training, 'user_id', 'item_id')
    holdout_val = reindex_data(holdout, data_index_train, fields="items")

    # cold-start scenario train/holdout split
    # NOTE(review): the index, the test frame and the cold matrix below are built from
    # the full cold history, i.e. including each user's holdout row, and use an item
    # encoding separate from the training one — confirm this is intended.
    _, holdout_cold = leave_last_out(cold_users_history, userid='user_id', timeid=itemid)
    cu_val, data_index_cu = transform_indices(cold_users_history, 'user_id', 'item_id')
    cu_holdout = reindex_data(holdout_cold, data_index_cu, fields="items")

    # feature matrices come from the frames passed as arguments, restricted to training ids
    train_user_features = users_df_ohe[users_df_ohe.user_id.isin(data_index_train['users'])]
    train_item_features = items_df_ohe[items_df_ohe.item_id.isin(data_index_train['items'])]

    data_description = dict(
        users = data_index_train['users'].name,
        items = data_index_train['items'].name,
        feedback = 'watched_pct',
        n_users_train = len(data_index_train['users']),
        n_items = data_index_train['items'].shape[0],
        user_features = csr_matrix(train_user_features.drop(columns='user_id').values),
        item_features = csr_matrix(train_item_features.drop(columns='item_id').values),
        holdout_standard = holdout_val,
        holdout_cs = cu_holdout,
        cold_start_test = cu_val,
    )

    # interaction matrix of the training users (also adds 'uid'/'iid' columns to train_val)
    train_matrix, iid_to_item_id, item_id_to_iid, uid_to_user_id, user_id_to_uid = \
        get_interaction_matrix(train_val, n_items = data_description['n_items'])

    train_matrix_indices = dict(
        iid_to_itemid = iid_to_item_id,
        itemid_to_iid = item_id_to_iid,
        uid_to_user_id = uid_to_user_id,
        user_id_to_uid = user_id_to_uid,
    )

    # interaction matrix of the cold users
    cold_users_matrix, iid_to_item_id_cu, item_id_to_iid_cu, uid_to_user_id_cu, user_id_to_uid_cu = \
        get_interaction_matrix(cold_users_history, n_items = data_description['n_items'])

    cold_start_matrix_indices = dict(
        iid_to_itemid = iid_to_item_id_cu,
        itemid_to_iid = item_id_to_iid_cu,
        uid_to_user_id = uid_to_user_id_cu,
        user_id_to_uid = user_id_to_uid_cu,
    )

    return train_val, data_description, train_matrix, train_matrix_indices, cold_users_matrix, cold_start_matrix_indices
    

In [None]:
interactions_df

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0


# User features preprocessing

In [None]:
def ohe(features : list, df, items = False):
    """
    One-hot encode the listed columns of `df` next to its id column.

    The result starts with `df.item_id` when `items` is true and with
    `df.user_id` otherwise, followed by the dummy columns of every entry in
    `features` (each prefixed with the feature name), in the given order.
    """
    encoded = df.item_id if items else df.user_id
    for feature in features:
        dummies = pd.get_dummies(df[feature], prefix = feature)
        encoded = pd.concat([encoded, dummies], axis=1)
    return encoded

In [None]:
# One-hot encoding of categorical features
user_cat_features = ['age','income','sex','kids_flg']

user_ohe_df  =  ohe(user_cat_features,users_df) #.drop(columns='user_id')

# Item features preprocessing

In [None]:
item_cat_feats = ['content_type','for_kids','release_year', 'age_rating','studios','countries','directors']
item_ohe_df = ohe(item_cat_feats,items_df, items = True) #.drop(columns='item_id')

In [None]:
train_val,data_description, train_matrix,train_matrix_indices, cold_users_matrix, cold_start_matrix_indices = \
preprocessing(interactions_df, user_ohe_df, item_ohe_df, cold_users_split = 5, itemid = 'last_watch_dt')

data_description['holdout_standard']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uid'] = data[userid].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uid'] = data['uid'].cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['iid'] = data[itemid].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,last_watch_dt_ts
904071,730407,1707,2021-03-13,389,100.0,1615593600
1286551,947857,5555,2021-03-13,2347,43.0,1615593600
4137025,762776,6887,2021-03-13,7369,100.0,1615593600
1906227,446006,7485,2021-03-13,1399,1.0,1615593600
45596,465927,3785,2021-03-13,38,0.0,1615593600
...,...,...,...,...,...,...
4962797,1056443,811,2021-08-22,1641,38.0,1629590400
4876653,972909,5158,2021-08-22,35879,80.0,1629590400
5183141,1002282,124,2021-08-22,1326,20.0,1629590400
3235385,489176,6022,2021-08-22,927,5.0,1629590400


In [None]:
cold_users_matrix

<124842x8175 sparse matrix of type '<class 'numpy.float64'>'
	with 426076 stored elements in Compressed Sparse Row format>

### Video

In [None]:
interactions_df.item_id.value_counts()

10440    202457
15297    193123
9728     132865
13865    122119
4151      91167
          ...  
9266         11
12458        11
4451         11
15557        11
7029         11
Name: item_id, Length: 8175, dtype: int64

In [None]:
common_users = set(interactions_df.user_id.unique()).intersection(set(user_ohe_df.user_id. unique()))
common_items = set(interactions_df.item_id.unique()).intersection(set(item_ohe_df.item_id.unique()))
print(len(common_users))
print(len(common_items))
interactions_df = interactions_df[interactions_df.item_id.isin(common_items)]
interactions_df = interactions_df[interactions_df.user_id.isin(common_users)]
item_ohe_df = item_ohe_df[item_ohe_df.item_id.isin(common_items)]
user_ohe_df = user_ohe_df[user_ohe_df.user_id.isin(common_users)]

743707
8175


In [None]:
# Add contiguous 0-based codes for users (uid) and items (iid).
# assign() returns a new frame, so nothing is written into a filtered slice
# (direct column assignment on such a slice triggers SettingWithCopyWarning).
interactions_df = interactions_df.assign(
    uid=interactions_df["user_id"].astype("category").cat.codes,
    iid=interactions_df["item_id"].astype("category").cat.codes,
)
print(sorted(interactions_df.iid.unique())[:5])
print(sorted(interactions_df.uid.unique())[:5])
interactions_df.head()

[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4]


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,uid,iid
0,176549,9506,2021-05-11,4250,72.0,20617,4713
1,699317,1659,2021-05-29,8317,100.0,81922,811
6,1016458,354,2021-08-14,1672,25.0,118861,170
7,884009,693,2021-08-04,703,14.0,103573,337
12,667487,12173,2021-05-29,96,1.0,78173,6012


In [None]:
cold_users_matrix

In [None]:
# Binarise the interactions: 1 where the stored value is positive, 0 elsewhere.
# A plain comparison works for both a scipy sparse matrix and a NumPy array, and it
# does not densify a sparse input; the original `.A` access raised AttributeError
# in the recorded run.
cold_users_matrix = (cold_users_matrix > 0).astype(int)

AttributeError: ignored

In [None]:
cold_users_matrix = csr_matrix(cold_users_matrix)

In [None]:
# NOTE(review): the recorded run of this cell ended in an IndexError. The array is
# sized from cold_users_matrix, but the loop indexes it with the uid/iid codes of
# interactions_df, so presumably those codes exceed that shape — confirm which
# frame this cell was meant to iterate over before reusing it.
cold_users_matrix = np.zeros((cold_users_matrix.shape[0],
                            cold_users_matrix.shape[1]))
for user_id, item_id in zip(interactions_df.uid, interactions_df.iid):
    cold_users_matrix[user_id, item_id] += 1
# (disabled) per-row normalisation:
# res = cold_users_matrix.sum(axis=1)
# for i in range(len(cold_users_matrix.data)):
#     cold_users_matrix[i] /= res[i]

IndexError: ignored

In [None]:
# Dense user x item matrix of interaction counts, indexed by the uid/iid codes.
interactions_vec = np.zeros((interactions_df.uid.nunique(),
                             interactions_df.iid.nunique()))
# np.add.at accumulates repeated (uid, iid) pairs correctly and replaces the
# per-row Python loop over the whole interactions frame
np.add.at(interactions_vec,
          (interactions_df.uid.to_numpy(), interactions_df.iid.to_numpy()),
          1)
# per-user totals; the rows themselves are left as raw counts (not normalised)
res = interactions_vec.sum(axis=1)

In [None]:
iid_to_item_id = interactions_df[["iid", "item_id"]].drop_duplicates().set_index("iid").to_dict()["item_id"]
item_id_to_iid = interactions_df[["iid", "item_id"]].drop_duplicates().set_index("item_id").to_dict()["iid"]
uid_to_user_id = interactions_df[["uid", "user_id"]].drop_duplicates().set_index("uid").to_dict()["user_id"]
user_id_to_uid = interactions_df[["uid", "user_id"]].drop_duplicates().set_index("user_id").to_dict()["uid"]

In [None]:
item_ohe_df["iid"] = item_ohe_df["item_id"].apply(lambda x: item_id_to_iid[x])
item_ohe_df = item_ohe_df.set_index("iid")
user_ohe_df["uid"] = user_ohe_df["user_id"].apply(lambda x: user_id_to_uid[x])
user_ohe_df = user_ohe_df.set_index("uid")

KeyError: ignored

In [None]:
from keras import backend as K

In [None]:
def triplet_loss(y_true, y_pred, n_dims=128, alpha=0.4):
    """
    Triplet margin loss over concatenated (anchor, positive, negative) embeddings.

    `y_pred` holds, per row, three consecutive blocks of `n_dims` values:
    anchor, positive and negative. The loss for a row is
    max(||a - p||^2 - ||a - n||^2 + alpha, 0). `y_true` is not used.
    """
    anchor, positive, negative = (
        y_pred[:, start:start + n_dims] for start in (0, n_dims, n_dims * 2)
    )
    # squared Euclidean distances from the anchor to each of the two items
    to_positive = K.sum(K.square(anchor - positive), axis=1)
    to_negative = K.sum(K.square(anchor - negative), axis=1)
    return K.maximum(to_positive - to_negative + alpha, 0.0)

In [None]:
def generator(items, users, interactions, batch_size=1024):
    """
    Endlessly yield training batches of (user, positive item, negative item) triplets.

    Args:
        items: DataFrame of item features; row position is the item index.
        users: DataFrame of user features; row position is the user index.
        interactions: dense 2-D array (users x items) of non-negative interaction
            weights. Rows may hold raw counts; they are normalised to
            probabilities here before sampling.
        batch_size: number of triplets per batch.

    Yields:
        (inputs, targets) where both are lists of four arrays:
        user features, user interaction rows, positive item features,
        negative item features. The targets repeat the inputs.

    Raises:
        ValueError: if no user has any interaction to sample a positive from.
    """
    n_items = interactions.shape[1]
    row_totals = np.asarray(interactions.sum(axis=1)).ravel()
    # a positive item can only be drawn for users with at least one interaction
    active_users = np.flatnonzero(row_totals > 0)
    if active_users.size == 0:
        raise ValueError("interactions contains no user with at least one interaction")

    while True:
        uid_meta = []
        uid_interaction = []
        pos = []
        neg = []
        for _ in range(batch_size):
            # pick a random user among those that have interactions
            uid_i = int(active_users[random.randrange(active_users.size)])
            user_row = interactions[uid_i]
            # positive item: drawn proportionally to the interaction weights;
            # np.random.choice requires probabilities that sum to 1
            pos_i = np.random.choice(n_items, p=user_row / row_totals[uid_i])
            # negative item: drawn uniformly over all items
            neg_i = np.random.choice(n_items)
            # user features
            uid_meta.append(users.iloc[uid_i])
            # vector of items the user interacted with
            uid_interaction.append(user_row)
            # features of the positive item
            pos.append(items.iloc[pos_i])
            # features of the negative item
            neg.append(items.iloc[neg_i])
        inputs = [np.array(uid_meta), np.array(uid_interaction), np.array(pos), np.array(neg)]
        yield inputs, list(inputs)

In [None]:
gen = generator(items=item_ohe_df.drop(["item_id"], axis=1),
                users=user_ohe_df.drop(["user_id"], axis=1),
                interactions=interactions_vec)
ret = next(gen)


print(f"вектор фичей юзера: {ret[0] [0].shape}")
print(f"вектор взаимодействий юзера с айтемами: {ret[0] [1].shape}")
print(f"вектор 'хорошего' айтема: {ret[0] [2].shape}")
print(f"вектор 'плохого' айтема: {ret[0] [3].shape}")
print()
print(f"вектор фичей юзера: {ret [1] [0].shape}")
print(f"вектор взаимодействий юзера с айтемами: {ret[1] [1].shape}")



NameError: ignored

In [None]:
N_FACTORS = 128

ITEM_MODEL_SHAPE = (item_ohe_df.drop(["item_id"], axis=1).shape[1], )
USER_META_MODEL_SHAPE = (user_ohe_df.drop(["user_id"], axis=1).shape[1], )
USER_INTERACTION_MODEL_SHAPE = (interactions_vec.shape[1], )
print(f"N_FACTORS: {N_FACTORS}")
print(f"ITEM_MODEL_SHAPE: {ITEM_MODEL_SHAPE}")
print(f"USER_META_MODEL_SHAPE: {USER_META_MODEL_SHAPE}")
print(f"USER_INTERACTION_MODEL_SHAPE: {USER_INTERACTION_MODEL_SHAPE}")

In [None]:
def item_model(n_factors=N_FACTORS):
    """
    Build the item tower: item feature vector -> `n_factors`-dimensional embedding.

    Two ELU dense layers joined by a residual addition, followed by a linear
    output layer. All layers have `n_factors` units, no bias and L2 kernel and
    activity regularisation.

    NOTE(review): `triplet_loss` slices the concatenated embeddings with its own
    `n_dims` argument, so presumably it must be given the same width when
    `n_factors` differs from its default — confirm at the compile call.
    """
    def dense(activation):
        # each call creates a new layer with its own regularizer objects
        return keras.layers.Dense(
            n_factors, activation=activation, use_bias=False,
            kernel_regularizer=keras.regularizers.l2(1e-6),
            activity_regularizer=keras.regularizers.l2(l2=1e-6))

    # input layer
    inp = keras.layers.Input(shape=ITEM_MODEL_SHAPE)
    # fully connected layers
    layer_1 = dense('elu')(inp)
    layer_2 = dense('elu')(layer_1)
    # residual connection: add the two layers so gradients do not vanish during training
    add = keras.layers.Add()([layer_1, layer_2])
    # output layer
    out = dense('linear')(add)
    return keras.models.Model(inp, out)

In [None]:
def user_model(n_factors=N_FACTORS):
    """
    Build the user tower: (user features, interaction vector) -> `n_factors`-dimensional embedding.

    The user-feature branch is two ELU dense layers joined by a residual
    addition; the interaction branch is a single ELU dense layer. Both branches
    are concatenated and projected by a linear output layer. All dense layers
    have `n_factors` units, no bias and L2 kernel and activity regularisation.

    NOTE(review): `triplet_loss` slices the concatenated embeddings with its own
    `n_dims` argument, so presumably it must be given the same width when
    `n_factors` differs from its default — confirm at the compile call.
    """
    def dense(activation):
        # each call creates a new layer with its own regularizer objects
        return keras.layers.Dense(
            n_factors, activation=activation, use_bias=False,
            kernel_regularizer=keras.regularizers.l2(1e-6),
            activity_regularizer=keras.regularizers.l2(l2=1e-6))

    # input for the user feature vector (from user_ohe_df)
    inp_meta = keras.layers.Input(shape=USER_META_MODEL_SHAPE)
    # input for the vector of watched items (from interactions_vec)
    inp_interaction = keras.layers.Input(shape=USER_INTERACTION_MODEL_SHAPE)
    # first fully connected layer of each branch
    layer_1_meta = dense('elu')(inp_meta)
    layer_1_interaction = dense('elu')(inp_interaction)
    # residual connection on the feature branch so gradients do not vanish during training
    layer_2_meta = dense('elu')(layer_1_meta)
    add = keras.layers.Add()([layer_1_meta, layer_2_meta])
    # concatenate the feature branch with the interaction branch
    concat_meta_interaction = keras.layers.Concatenate()([add, layer_1_interaction])
    # output layer
    out = dense('linear')(concat_meta_interaction)
    return keras.models.Model([inp_meta, inp_interaction], out)
# Assemble the triplet model: one user tower and one item tower that is shared
# between the positive and the negative item.
# instantiate the item and user models
i2v = item_model()
u2v = user_model()
# input for the user feature vector (from user_ohe_df)
ancor_meta_in = keras.layers.Input(shape=USER_META_MODEL_SHAPE)
# input for the user's interaction vector (from interactions_vec)

ancor_interaction_in = keras.layers.Input(shape=USER_INTERACTION_MODEL_SHAPE)
# input for the "good" (positive) item vector
pos_in = keras.layers.Input(shape=ITEM_MODEL_SHAPE)
# input for the "bad" (negative) item vector
neg_in = keras.layers.Input(shape=ITEM_MODEL_SHAPE)
# compute the user embedding
ancor = u2v([ancor_meta_in, ancor_interaction_in])
# compute the embedding of the "good" item
pos = i2v(pos_in)
# compute the embedding of the "bad" item (same i2v instance, so weights are shared)
neg = i2v(neg_in)
# concatenate the three embeddings in the order anchor, positive, negative
res = keras.layers.Concatenate(name="concat_ancor_pos_neg")([ancor, pos, neg])
# build the full model
model = keras.models.Model([ancor_meta_in, ancor_interaction_in, pos_in, neg_in], res)

In [None]:
model_name = 'DSSM'
# log the training process to TensorBoard under runs/<model_name>
t_board = keras.callbacks.TensorBoard(log_dir=f'runs/{model_name}')
# multiply the learning rate by 0.8 when the loss has not improved for two epochs
decay = keras.callbacks.ReduceLROnPlateau(monitor='loss', patience=2, factor=0.8, verbose=1)
# write a checkpoint named after the epoch number and loss into the <model_name>/ directory
# NOTE(review): save_best_only is not passed, so presumably a file is written after
# every epoch rather than only when the loss decreased — confirm against the Keras default
check = keras.callbacks.ModelCheckpoint(filepath=model_name + '/epoch{epoch}-{loss:.2f}.h5', monitor="loss")

In [None]:
# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and is rejected by recent Keras releases
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss=triplet_loss, optimizer=opt)

In [None]:
item_model().summary()

In [None]:
user_model().summary()

In [None]:
model.fit(generator(items=item_ohe_df.drop(["item_id"], axis=1),
                    users=user_ohe_df.drop(["user_id"], axis=1),
                    interactions=interactions_vec,
                    batch_size=64), 
          steps_per_epoch = 100,
          epochs = 30,
          initial_epoch = 0,
          callbacks = (decay, t_board, check))

In [None]:
def topn_recommendations(scores, topn=10):
    """
    Return, for every row of the 2-D `scores` array, the indices of its
    `topn` largest values ordered from highest to lowest score.
    """
    return np.apply_along_axis(topidx, axis=1, arr=scores, topn=topn)
def topidx(a, topn):
    """
    Return the indices of the `topn` largest entries of the 1-D array `a`,
    ordered from the largest value to the smallest.
    """
    # partial selection first: only the chosen candidates are then fully sorted
    candidates = np.argpartition(a, -topn)[-topn:]
    ranking = np.argsort(-a[candidates])
    return candidates[ranking]

In [None]:
def model_evaluate(recommended_items, holdout, holdout_description, topn=10):
    """
    Compute hit rate, mean reciprocal rank and catalogue coverage at `topn`.

    Args:
        recommended_items: 2-D array, one row of ranked item ids per test user.
        holdout: DataFrame with one held-out item per test user, in the same
            row order as `recommended_items`.
        holdout_description: dict with 'items' (name of the item column in
            `holdout`) and 'n_items' (catalogue size used for coverage).
        topn: number of leading recommendations taken into account.

    Returns:
        tuple: (hr, mrr, cov)
    """
    itemid = holdout_description['items']
    holdout_items = holdout[itemid].values
    assert recommended_items.shape[0] == len(holdout_items)
    # all three metrics are computed on the same top-n cut of the recommendations
    top_items = recommended_items[:, :topn]
    hits_mask = top_items == holdout_items.reshape(-1, 1)
    # HR calculation
    hr = np.mean(hits_mask.any(axis=1))
    # MRR calculation: users without a hit contribute 0
    n_test_users = recommended_items.shape[0]
    hit_rank = np.where(hits_mask)[1] + 1.0
    mrr = np.sum(1 / hit_rank) / n_test_users
    # coverage calculation (previously counted items beyond the top-n cut as well)
    n_items = holdout_description['n_items']
    cov = np.unique(top_items).size / n_items
    return hr, mrr, cov

In [None]:
eval_holdout = data_description['holdout_standard'][0:2000]
eval_holdout_id = data_description['holdout_standard']['user_id'][0:2000].values

In [None]:
user_ohe_df

In [None]:
interactions_vec.shape

(106350, 8175)

In [None]:
user_id_to_uid = cold_start_matrix_indices['user_id_to_uid']

In [None]:
holdout_cs = data_description['holdout_cs']

In [None]:
def top20(pers_uid, cs = False):
    """
    Return the positions of the 20 items closest to a user in the embedding space.

    Args:
        pers_uid: raw `user_id` of the user to score.
        cs: when True the user's interaction row is taken from the cold-start
            matrix (`cold_users_matrix`), otherwise from `interactions_vec`.

    Returns:
        numpy.ndarray: 20 row positions into `item_ohe_df`, nearest item first.
    """
    # user features
    user_meta_feats = user_ohe_df.loc[user_ohe_df.user_id == pers_uid].drop(columns='user_id')

    if cs:
        # the cold-start mapping is read into its own local name; assigning to
        # `user_id_to_uid` here would make that name local to the whole function
        # and break the other branch with UnboundLocalError
        cold_uid = cold_start_matrix_indices['user_id_to_uid'][pers_uid]
        user_interaction_vec = cold_users_matrix[cold_uid]
    else:
        # rows of interactions_vec follow the `uid` codes stored in interactions_df,
        # so the row is looked up there instead of through a rebindable global dict
        warm_uid = interactions_df.loc[interactions_df.user_id == pers_uid, 'uid'].iloc[0]
        user_interaction_vec = interactions_vec[warm_uid]

    # a row taken from a scipy sparse matrix has to be densified before it is fed to Keras
    if hasattr(user_interaction_vec, 'toarray'):
        user_interaction_vec = user_interaction_vec.toarray()

    # user embedding
    user_vec = u2v.predict([np.array(user_meta_feats).reshape(1, -1),
                            np.array(user_interaction_vec).reshape(1, -1)])

    # item embeddings
    # NOTE(review): these are recomputed on every call although they do not depend
    # on the user; consider computing them once outside when scoring many users.
    items_feats = item_ohe_df.drop(["item_id"], axis = 1).to_numpy()
    items_vecs = i2v.predict(items_feats)

    # Euclidean distance from the user to every item; smallest distance first
    dists = ED(user_vec, items_vecs)

    return np.argsort(dists, axis = 1)[0][:20]

In [None]:
# Cold-start evaluation: reciprocal rank and per-user coverage of the top-20 list.
max_eval_users = 5000  # evaluate at most this many holdout users
n_items = data_description['n_items']
mrr_full = []
cov_full = []
for i, (user_id, true_item) in enumerate(zip(holdout_cs.user_id, holdout_cs.item_id)):
    # stop before the expensive top20() call once the limit is reached
    if i == max_eval_users:
        break
    top_preds = top20(user_id, cs = True)

    # each user's recommendations are compared with that user's own held-out item only
    # NOTE(review): top20 returns row positions of item_ohe_df while the holdout holds
    # re-encoded item ids — presumably the two encodings agree; verify.
    hit_positions = np.flatnonzero(top_preds == true_item)
    # reciprocal rank of the hit, 0 when the held-out item is not in the list
    mrr = 1.0 / (hit_positions[0] + 1.0) if hit_positions.size else 0.0
    # share of the catalogue covered by this user's list
    cov = np.unique(top_preds).size / n_items

    mrr_full.append(mrr)
    cov_full.append(cov)




In [None]:
id = 721985
top20(id)



array([3785, 1878, 3152, 6012, 3735, 4813, 8055, 4539, 6418, 3189, 6434,
       4170, 3676,  763, 4952, 7170, 5412, 8003, 1337, 5158])

In [None]:
holdout = data_description['holdout_standard']

In [None]:
# Standard-scenario evaluation: reciprocal rank and per-user coverage of the top-20 list.
max_eval_users = 5000  # evaluate at most this many holdout users
n_items = data_description['n_items']
mrr_full = []
cov_full = []
for i, (user_id, true_item) in enumerate(zip(holdout.user_id, holdout.item_id)):
    # stop before the expensive top20() call once the limit is reached
    if i == max_eval_users:
        break
    top_preds = top20(user_id)

    # each user's recommendations are compared with that user's own held-out item only
    # NOTE(review): top20 returns row positions of item_ohe_df while the holdout holds
    # re-encoded item ids — presumably the two encodings agree; verify.
    hit_positions = np.flatnonzero(top_preds == true_item)
    # reciprocal rank of the hit, 0 when the held-out item is not in the list
    mrr = 1.0 / (hit_positions[0] + 1.0) if hit_positions.size else 0.0
    # share of the catalogue covered by this user's list
    cov = np.unique(top_preds).size / n_items

    mrr_full.append(mrr)
    cov_full.append(cov)






KeyboardInterrupt: ignored

In [None]:
np.array(mrr_full).mean()

0.02195887969558362

In [None]:
np.array(cov_full).mean()

0.002446483180428134

In [None]:
holdout_cs = data_description['holdout_cs']

In [None]:
cold_users_matrix

<2x8175 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [None]:
# Evaluation over every user of `holdout` (no limit on the number of users).
n_items = data_description['n_items']
mrr_full = []
cov_full = []
for user_id, true_item in zip(holdout.user_id, holdout.item_id):
    top_preds = top20(user_id)

    # each user's recommendations are compared with that user's own held-out item only
    # NOTE(review): top20 returns row positions of item_ohe_df while the holdout holds
    # re-encoded item ids — presumably the two encodings agree; verify.
    hit_positions = np.flatnonzero(top_preds == true_item)
    # reciprocal rank of the hit, 0 when the held-out item is not in the list
    mrr = 1.0 / (hit_positions[0] + 1.0) if hit_positions.size else 0.0
    # share of the catalogue covered by this user's list
    cov = np.unique(top_preds).size / n_items

    mrr_full.append(mrr)
    cov_full.append(cov)


# Total preprocessing

In [None]:
train_val,data_description, train_matrix,train_matrix_indices, cold_users_matrix, cold_start_matrix_indices = \
preprocessing(interactions_df, user_ohe_df, item_ohe_df, cold_users_split = 5, itemid = 'last_watch_dt')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions_df['last_watch_dt_ts'] = interactions_df['last_watch_dt'].apply(lambda x: int(x.timestamp()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uid'] = data[userid].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['uid'] = data['uid'].cat.codes
A value is tryin