In [1]:
import implicit

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import scipy
from scipy.sparse.linalg import svds
from numpy.linalg import svd 
from scipy.sparse import csr_matrix
import time
from sklearn import preprocessing

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
def prepare_df(df):
    aggr1 = {
    'session_duration': 'max', 
    'video_duration': 'max',
    'month': 'max'
    }
    k = df.groupby(['user_id', 'primary_video_id']).agg(aggr1).reset_index().reset_index()
    k['session_duration_clean'] = k[['session_duration', 'video_duration']].min(axis=1)
    k['watching_percentage'] = k['session_duration_clean']/k['video_duration']
    k = k[['month', 'user_id', 'primary_video_id', 'watching_percentage']]
    k = k[k['watching_percentage']>=0.5]
    return k

In [5]:
def prepare_matrix(df):
    user_mapper = {k:v for v, k in dict(enumerate(df.user_id.unique())).items()}
    item_mapper = {k:v for v, k in dict(enumerate(df.primary_video_id.unique())).items()}
    df['user_num'] = df['user_id'].map(user_mapper)
    df['item_num'] = df['primary_video_id'].map(item_mapper)
    row  = df['user_num'].values
    col  = df['item_num'].values
    data = df['watching_percentage'].values
    matrix = csr_matrix(coo_matrix((data, (row, col)), shape=(df['user_num'].nunique(), df['item_num'].nunique())))   
    print(df.user_id.nunique(), df.primary_video_id.nunique(), matrix.shape)  
    return matrix, item_mapper, user_mapper

In [6]:
def map_at_k(df, k):
    df = df[df['rnk'] < k]
    df['target_cumcum'] = df[['user_id', 'targer']].groupby('user_id').cumsum()
    df['for_running_sum'] = df['target_cumcum']/(df['rnk']+1)
    agg_df = df[df['targer']==1][['user_id', 'for_running_sum']].groupby('user_id').mean().reset_index()
    agg_df.columns = ['user_id', 'ap']
    return agg_df['ap'].mean()

In [7]:
# генерация сетки для модели
def get_params(dict_):
    result = {}
    for i in params:
        result[i] = np.random.choice(dict_[i])
    return result

def get_grid(shape=200):
    dict_of_params = {}
    i = 0 
    while len(dict_of_params) < shape:
        par = get_params(params)
        if par not in dict_of_params.values():
            dict_of_params[i] = par
            i += 1
    return dict_of_params

## read data

In [8]:
train = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/train_data_full.csv')
item_info = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/video_meta_data_full.csv')
test = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/sample_submission_full.csv')

In [9]:
train['session_start_datetime'] = pd.to_datetime(train['session_start_datetime'])
train['month'] = train['session_start_datetime'].apply(lambda x: x.month)

In [10]:
train.month.value_counts()

7    2866200
9    2742848
8    2640335
Name: month, dtype: int64

## grid search

In [11]:
params = {
    'K': [5, 10, 20, 30, 50, 100, 120, 150, 180, 200, 230, 300],
    'item_cnt': [5, 10, 20, 50],
    'user_cnt': [1, 3, 5, 10],
    'n_item_count': [300, 500, 800, 1000, 2000, 5000]
}

dict_of_params = get_grid(shape=30)

In [12]:
train_p = prepare_df(train)
train_df = train_p[train_p['month'] != 9]
test_df  = train_p[train_p['month'] == 9]
test_df['targer'] = [1]*test_df.shape[0]
matrix_m, item_mapper, user_mapper = prepare_matrix(train_df)

328140 5159 (328140, 5159)


In [13]:
param = dict_of_params[0]

In [14]:
# cut matrix
mask_users = np.diff(matrix_m.tocsr().indptr) > param['user_cnt']
mask_items = np.diff(matrix_m.tocsc().indptr) > param['item_cnt']
matrix = matrix_m.copy()
matrix = matrix[:, mask_items]
matrix = matrix[mask_users, :]

In [17]:
matrix = matrix.todense()

In [18]:
# маска из нанов
mask = np.isnan(matrix)
masked_arr = np.ma.masked_array(matrix, mask)
item_means = np.mean(masked_arr, axis=0)

In [19]:
# замена средним для нанов
matrix = masked_arr.filled(item_means)
x = np.tile(item_means, (matrix.shape[0], 1))

In [20]:
# вычитание
matrix = matrix - x
total_mean = np.mean(matrix)
col_means  = np.mean(matrix, axis=0) - total_mean
row_means  = np.mean(matrix, axis=1) - total_mean
matrix = matrix - row_means[:, np.newaxis] - col_means - total_mean

ValueError: operands could not be broadcast together with shapes (114523,5082) (1,114523) 

In [34]:
row_means[:, np.newaxis].shape

(1, 1, 114523)

In [35]:
col_means.shape

(1, 5082)

In [32]:
matrix - row_means[:, np.newaxis]

ValueError: operands could not be broadcast together with shapes (114523,5082) (1,114523) 

In [26]:
matrix - col_means

masked_matrix(
  data=[[-7.10444861e-03, -1.93079898e-02, -6.43734509e-03, ...,
         -3.30462252e-05, -5.21693075e-05, -4.28293018e-05],
        [-7.10444861e-03, -1.93079898e-02, -6.43734509e-03, ...,
         -3.30462252e-05, -5.21693075e-05, -4.28293018e-05],
        [-7.10444861e-03, -1.93079898e-02, -6.43734509e-03, ...,
         -3.30462252e-05, -5.21693075e-05, -4.28293018e-05],
        ...,
        [-7.10444861e-03, -1.93079898e-02, -6.43734509e-03, ...,
         -3.30462252e-05, -5.21693075e-05, -4.28293018e-05],
        [-7.10444861e-03, -1.93079898e-02, -6.43734509e-03, ...,
         -3.30462252e-05, -5.21693075e-05, -4.28293018e-05],
        [-7.10444861e-03, -1.93079898e-02, -6.43734509e-03, ...,
         -3.30462252e-05, -5.21693075e-05, -4.28293018e-05]],
  mask=False,
  fill_value=1e+20)

In [None]:
metric = []
for k, param in dict_of_params.items():
    print(f'iter {k+1} of {len(dict_of_params)}: {time.ctime()}')
    
    # cut matrix
    mask_users = np.diff(matrix_m.tocsr().indptr) > param['user_cnt']
    mask_items = np.diff(matrix_m.tocsc().indptr) > param['item_cnt']
    matrix = matrix_m.copy()
    matrix = matrix[:, mask_items]
    matrix = matrix[mask_users, :]
    
    '''
        с нормировкой + заполнениями и сдвигом
    '''
    
    # маска из нанов
    mask = np.isnan(matrix)
    masked_arr = np.ma.masked_array(matrix, mask)
    item_means = np.mean(masked_arr, axis=0)
    # замена средним для нанов
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (matrix.shape[0], 1))
    # вычитание
    matrix = matrix - x
    total_mean = np.mean(matrix)
    col_means  = np.mean(matrix, axis=0) - total_mean
    row_means  = np.mean(matrix, axis=1) - total_mean
    matrix = matrix - row_means[:, np.newaxis] - col_means - total_mean
    # нормировка
    row_norms = np.sqrt(np.sum(matrix*matrix, axis=1))[:, np.newaxis]
    col_norms = np.sqrt(np.sum(matrix*matrix, axis=0))[np.newaxis, :]
    matrix = matrix / np.sqrt(row_norms) / np.sqrt(col_norms)

    # mappers
    new_users = np.array([x[0] for x in sorted([(k,v) for k,v in user_mapper.items()], key=lambda x: x[1])])[mask_users]
    new_user_mapper = {v:k for k,v in enumerate(new_users)}
    new_user_mapper_rev = {k:v for k,v in enumerate(new_users)}
    
    new_items = np.array([x[0] for x in sorted([(k,v) for k,v in item_mapper.items()], key=lambda x: x[1])])[mask_items]
    new_item_mapper = {v:k for k,v in enumerate(new_items)}
    new_item_mapper_rev = {k:v for k,v in enumerate(new_items)}
    
    # fit
    model = implicit.nearest_neighbours.CosineRecommender(K=param['K'],)
    model.fit(matrix.T)
    
    # predict
    cold_users = set(test_df['user_id'].unique()) - set(new_user_mapper.keys())
    hot_users = set(test_df['user_id'].unique())&(set(new_user_mapper.keys()))
    hot_users_labels = [new_user_mapper[x] for x in hot_users]
    
    ## recs for hot users
    res1 = []
    n = len(hot_users)

    for user in hot_users_labels:
        rec = model.recommend(user, matrix, N=10)
        res1 += [[user, r] for r, v in rec]

    result1 = pd.DataFrame(res1, columns=['user_num', 'item_num'])
    result1['user_id'] = result1['user_num'].map(new_user_mapper_rev)
    result1['primary_video_id'] = result1['item_num'].map(new_item_mapper_rev)
    result1.drop(['user_num', 'item_num'], axis=1, inplace=True)
    
    ## recs for cold users
    # самые популярные товары
    item_pop_matrix = pd.DataFrame([])
    item_pop_matrix['item_id'] = new_item_mapper.keys()
    item_pop_matrix['cnt_not_null'] = (matrix.T > 0).sum(axis=1)

    # среднее по товарам
    (x,y,z) = scipy.sparse.find(matrix)
    countings = np.bincount(y)
    sums = np.bincount(y, weights=z)
    averages = sums/countings

    item_pop_matrix['item_means'] = averages

    top10_items = item_pop_matrix[item_pop_matrix['cnt_not_null'] > param['n_item_count']]\
    .sort_values('item_means', ascending=False).head(10)['item_id'].values

    result2 = pd.DataFrame(cold_users, columns=['user_id'])
    items = [x for x in top10_items]*result2.shape[0]
    result2 = result2.loc[result2.index.repeat(10)]
    result2['primary_video_id'] = items

    # join
    finalDF = result1.append(result2).reset_index(drop=True)
    finalDF = finalDF[finalDF['user_id'].isin(test_df['user_id'].unique())].reset_index(drop=True)

    # map@k
    finalDF = finalDF.merge(test_df[['user_id', 'primary_video_id', 'targer']], 
                            on=['user_id', 'primary_video_id'], 
                            how='left').fillna(0)
    finalDF['rnk'] = finalDF.groupby('user_id').cumcount()+1
    print(k, param, map_at_k(finalDF, 10))
    print(time.ctime(), '\n')
    metric.append((k, param, map_at_k(finalDF, 10)))

iter 1 of 30: Wed Jun 24 18:20:05 2020
0 {'K': 300, 'item_cnt': 20, 'user_cnt': 1, 'n_item_count': 5000} 0.26444457346901007
Wed Jun 24 18:23:00 2020 

iter 2 of 30: Wed Jun 24 18:23:01 2020
1 {'K': 50, 'item_cnt': 50, 'user_cnt': 1, 'n_item_count': 2000} 0.2708496034386116
Wed Jun 24 18:24:16 2020 

iter 3 of 30: Wed Jun 24 18:24:16 2020
2 {'K': 20, 'item_cnt': 20, 'user_cnt': 10, 'n_item_count': 300} 0.23641381694399452
Wed Jun 24 18:24:39 2020 

iter 4 of 30: Wed Jun 24 18:24:40 2020
3 {'K': 150, 'item_cnt': 50, 'user_cnt': 1, 'n_item_count': 300} 0.2616572971569356
Wed Jun 24 18:26:34 2020 

iter 5 of 30: Wed Jun 24 18:26:34 2020
4 {'K': 30, 'item_cnt': 20, 'user_cnt': 10, 'n_item_count': 500} 0.2554418853640816
Wed Jun 24 18:27:01 2020 

iter 6 of 30: Wed Jun 24 18:27:01 2020
5 {'K': 100, 'item_cnt': 10, 'user_cnt': 3, 'n_item_count': 1000} 0.26811009130307106
Wed Jun 24 18:28:22 2020 

iter 7 of 30: Wed Jun 24 18:28:22 2020
6 {'K': 150, 'item_cnt': 10, 'user_cnt': 10, 'n_item_cou

## predict

In [None]:
best = sorted(metric, lambda x: x[2], reverse=True)[0]
print(best)
best_params = best[1]

In [None]:
matrix_m, item_mapper, user_mapper = prepare_matrix(train_p)

In [None]:
# cut matrix
mask_users = np.diff(matrix_m.tocsr().indptr) > best_params['user_cnt']
mask_items = np.diff(matrix_m.tocsc().indptr) > best_params['item_cnt']
matrix = matrix_m.copy()
matrix = matrix[:, mask_items]
matrix = matrix[mask_users, :]

# mappers
new_users = np.array([x[0] for x in sorted([(k,v) for k,v in user_mapper.items()], key=lambda x: x[1])])[mask_users]
new_user_mapper = {v:k for k,v in enumerate(new_users)}
new_user_mapper_rev = {k:v for k,v in enumerate(new_users)}

new_items = np.array([x[0] for x in sorted([(k,v) for k,v in item_mapper.items()], key=lambda x: x[1])])[mask_items]
new_item_mapper = {v:k for k,v in enumerate(new_items)}
new_item_mapper_rev = {k:v for k,v in enumerate(new_items)}

In [None]:
# fit
model = implicit.nearest_neighbours.CosineRecommender(K=best_params['K'])
model.fit(matrix.T)

In [None]:
# predict
cold_users = set(test['user_id'].unique()) - set(new_user_mapper.keys())
hot_users = set(test['user_id'].unique())&(set(new_user_mapper.keys()))
hot_users_labels = [new_user_mapper[x] for x in hot_users]

In [None]:
## recs for hot users
res1 = []
n = len(hot_users)

for user in hot_users_labels:
    rec = model.recommend(user, matrix, N=10)
    res1 += [[user, r] for r, v in rec]

result1 = pd.DataFrame(res1, columns=['user_num', 'item_num'])
result1['user_id'] = result1['user_num'].map(new_user_mapper_rev)
result1['primary_video_id'] = result1['item_num'].map(new_item_mapper_rev)
result1.drop(['user_num', 'item_num'], axis=1, inplace=True)

In [None]:
## recs for cold users

# самые популярные товары
item_pop_matrix = pd.DataFrame([])
item_pop_matrix['item_id'] = new_item_mapper.keys()
item_pop_matrix['cnt_not_null'] = (matrix.T > 0).sum(axis=1)

# среднее по товарам
(x,y,z) = scipy.sparse.find(matrix)
countings = np.bincount(y)
sums = np.bincount(y, weights=z)
averages = sums/countings

item_pop_matrix['item_means'] = averages

top10_items = item_pop_matrix[item_pop_matrix['cnt_not_null'] > best_params['n_item_count']]\
.sort_values('item_means', ascending=False).head(10)['item_id'].values

result2 = pd.DataFrame(cold_users, columns=['user_id'])
items = [x for x in top10_items]*result2.shape[0]
result2 = result2.loc[result2.index.repeat(10)]
result2['primary_video_id'] = items

In [None]:
# join
finalDF = result1.append(result2).reset_index(drop=True)
finalDF = finalDF[finalDF['user_id'].isin(test['user_id'].unique())].reset_index(drop=True)

In [None]:
finalDF = finalDF[['user_id', 'primary_video_id']]
result = finalDF.groupby('user_id').apply(list)['primary_video_id']
result['primary_video_id'] = result['primary_video_id'].apply(lambda x: ' '.join([str(a) for a in x]))

In [None]:
result.to_csv('/data/agoryach/datagym-recsys-01/big-hw-02/output/submission_7.csv', index=False)