In [1]:
import implicit

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
import scipy
from scipy.sparse.linalg import svds
from numpy.linalg import svd 
from scipy.sparse import csr_matrix
import time
from sklearn import preprocessing

In [10]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def prepare_df(df):
    aggr1 = {
    'session_duration': 'max', 
    'video_duration': 'max',
    'month': 'max'
    }
    k = df.groupby(['user_id', 'primary_video_id']).agg(aggr1).reset_index().reset_index()
    k['session_duration_clean'] = k[['session_duration', 'video_duration']].min(axis=1)
    k['watching_percentage'] = k['session_duration_clean']/k['video_duration']
    k = k[['month', 'user_id', 'primary_video_id', 'watching_percentage']]
    k = k[k['watching_percentage']>=0.5]
    return k

In [4]:
def prepare_matrix(df):
    user_mapper = {k:v for v, k in dict(enumerate(df.user_id.unique())).items()}
    item_mapper = {k:v for v, k in dict(enumerate(df.primary_video_id.unique())).items()}
    df['user_num'] = df['user_id'].map(user_mapper)
    df['item_num'] = df['primary_video_id'].map(item_mapper)
    row  = df['user_num'].values
    col  = df['item_num'].values
    data = df['watching_percentage'].values
    matrix = csr_matrix(coo_matrix((data, (row, col)), shape=(df['user_num'].nunique(), df['item_num'].nunique())))   
    print(df.user_id.nunique(), df.primary_video_id.nunique(), matrix.shape)  
    return matrix, item_mapper, user_mapper

In [5]:
def map_at_k(df, k):
    df = df[df['rnk'] < k]
    df['target_cumcum'] = df[['user_id', 'targer']].groupby('user_id').cumsum()
    df['for_running_sum'] = df['target_cumcum']/(df['rnk']+1)
    agg_df = df[df['targer']==1][['user_id', 'for_running_sum']].groupby('user_id').mean().reset_index()
    agg_df.columns = ['user_id', 'ap']
    return agg_df['ap'].mean()

In [6]:
# генерация сетки для модели
def get_params(dict_):
    result = {}
    for i in params:
        result[i] = np.random.choice(dict_[i])
    return result

def get_grid(shape=200):
    dict_of_params = {}
    i = 0 
    while len(dict_of_params) < shape:
        par = get_params(params)
        if par not in dict_of_params.values():
            dict_of_params[i] = par
            i += 1
    return dict_of_params

## read data

In [7]:
train = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/train_data_full.csv')
item_info = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/video_meta_data_full.csv')
test = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/sample_submission_full.csv')

In [8]:
train['session_start_datetime'] = pd.to_datetime(train['session_start_datetime'])
train['month'] = train['session_start_datetime'].apply(lambda x: x.month)

In [9]:
train.month.value_counts()

7    2866200
9    2742848
8    2640335
Name: month, dtype: int64

## grid search

In [52]:
params = {
    'K': [5, 10, 20, 30, 50, 100, 120, 150, 180, 200, 230, 300],
    'item_cnt': [5, 10, 20, 50],
    'user_cnt': [1, 3, 5],
    'n_item_count': [50, 100, 200, 300, 500, 800, 1000, 2000]
}

dict_of_params = get_grid(shape=50)

In [53]:
train_p = prepare_df(train)
train_df = train_p[train_p['month'] != 9]
test_df  = train_p[train_p['month'] == 9]
test_df['targer'] = [1]*test_df.shape[0]
matrix_m, item_mapper, user_mapper = prepare_matrix(train_df)

328140 5159 (328140, 5159)


In [54]:
metric = []
for k, param in dict_of_params.items():
    print(f'iter {k+1} of {len(dict_of_params)}: {time.ctime()}')
    
    # cut matrix
    mask_users = np.diff(matrix_m.tocsr().indptr) > param['user_cnt']
    mask_items = np.diff(matrix_m.tocsc().indptr) > param['item_cnt']
    matrix = matrix_m.copy()
    matrix = matrix[:, mask_items]
    matrix = matrix[mask_users, :]
    
    # mappers
    new_users = np.array([x[0] for x in sorted([(k,v) for k,v in user_mapper.items()], key=lambda x: x[1])])[mask_users]
    new_user_mapper = {v:k for k,v in enumerate(new_users)}
    new_user_mapper_rev = {k:v for k,v in enumerate(new_users)}
    
    new_items = np.array([x[0] for x in sorted([(k,v) for k,v in item_mapper.items()], key=lambda x: x[1])])[mask_items]
    new_item_mapper = {v:k for k,v in enumerate(new_items)}
    new_item_mapper_rev = {k:v for k,v in enumerate(new_items)}
    
    # fit
    model = implicit.nearest_neighbours.CosineRecommender(K=param['K'],)
    model.fit(matrix.T)
    
    # predict
    cold_users = set(test_df['user_id'].unique()) - set(new_user_mapper.keys())
    hot_users = set(test_df['user_id'].unique())&(set(new_user_mapper.keys()))
    hot_users_labels = [new_user_mapper[x] for x in hot_users]
    
    ## recs for hot users
    res1 = []
    n = len(hot_users)

    for user in hot_users_labels:
        rec = model.recommend(user, matrix, N=10)
        res1 += [[user, r] for r, v in rec]

    result1 = pd.DataFrame(res1, columns=['user_num', 'item_num'])
    result1['user_id'] = result1['user_num'].map(new_user_mapper_rev)
    result1['primary_video_id'] = result1['item_num'].map(new_item_mapper_rev)
    result1.drop(['user_num', 'item_num'], axis=1, inplace=True)
    
    ## recs for cold users
    # самые популярные товары
    item_pop_matrix = pd.DataFrame([])
    item_pop_matrix['item_id'] = new_item_mapper.keys()
    item_pop_matrix['cnt_not_null'] = (matrix.T > 0).sum(axis=1)

    # среднее по товарам
    (x,y,z) = scipy.sparse.find(matrix)
    countings = np.bincount(y)
    sums = np.bincount(y, weights=z)
    averages = sums/countings

    item_pop_matrix['item_means'] = averages

    top10_items = item_pop_matrix[item_pop_matrix['cnt_not_null'] > param['n_item_count']]\
    .sort_values('item_means', ascending=False).head(10)['item_id'].values

    result2 = pd.DataFrame(cold_users, columns=['user_id'])
    items = [x for x in top10_items]*result2.shape[0]
    result2 = result2.loc[result2.index.repeat(10)]
    result2['primary_video_id'] = items

    # join
    finalDF = result1.append(result2).reset_index(drop=True)
    finalDF = finalDF[finalDF['user_id'].isin(test_df['user_id'].unique())].reset_index(drop=True)

    # map@k
    finalDF = finalDF.merge(test_df[['user_id', 'primary_video_id', 'targer']], 
                            on=['user_id', 'primary_video_id'], 
                            how='left').fillna(0)
    finalDF['rnk'] = finalDF.groupby('user_id').cumcount()+1
    print(k, param, map_at_k(finalDF, 10))
    print(time.ctime(), '\n')
    metric.append((k, param, map_at_k(finalDF, 10)))

iter 1 of 50: Wed Jun 24 19:02:37 2020
0 {'K': 150, 'item_cnt': 20, 'user_cnt': 3, 'n_item_count': 2000} 0.26747180939436266
Wed Jun 24 19:04:17 2020 

iter 2 of 50: Wed Jun 24 19:04:17 2020
1 {'K': 180, 'item_cnt': 50, 'user_cnt': 5, 'n_item_count': 500} 0.24746427978218877
Wed Jun 24 19:05:42 2020 

iter 3 of 50: Wed Jun 24 19:05:43 2020
2 {'K': 230, 'item_cnt': 5, 'user_cnt': 1, 'n_item_count': 50} 0.2722501117258788
Wed Jun 24 19:08:13 2020 

iter 4 of 50: Wed Jun 24 19:08:14 2020
3 {'K': 120, 'item_cnt': 10, 'user_cnt': 3, 'n_item_count': 300} 0.2573451712990449
Wed Jun 24 19:09:43 2020 

iter 5 of 50: Wed Jun 24 19:09:43 2020
4 {'K': 30, 'item_cnt': 50, 'user_cnt': 1, 'n_item_count': 300} 0.2626071214133745
Wed Jun 24 19:10:49 2020 

iter 6 of 50: Wed Jun 24 19:10:49 2020
5 {'K': 200, 'item_cnt': 10, 'user_cnt': 1, 'n_item_count': 300} 0.26110104415856894
Wed Jun 24 19:13:09 2020 

iter 7 of 50: Wed Jun 24 19:13:10 2020
6 {'K': 20, 'item_cnt': 5, 'user_cnt': 1, 'n_item_count': 50

## predict

In [55]:
best = sorted(metric, key = lambda x: x[2], reverse=True)[0]
print(best)
best_params = best[1]

(40, {'K': 5, 'item_cnt': 5, 'user_cnt': 1, 'n_item_count': 50}, 0.27695139007145086)


In [56]:
matrix_m, item_mapper, user_mapper = prepare_matrix(train_p)

397294 5255 (397294, 5255)


In [57]:
# cut matrix
mask_users = np.diff(matrix_m.tocsr().indptr) > best_params['user_cnt']
mask_items = np.diff(matrix_m.tocsc().indptr) > best_params['item_cnt']
matrix = matrix_m.copy()
matrix = matrix[:, mask_items]
matrix = matrix[mask_users, :]

# mappers
new_users = np.array([x[0] for x in sorted([(k,v) for k,v in user_mapper.items()], key=lambda x: x[1])])[mask_users]
new_user_mapper = {v:k for k,v in enumerate(new_users)}
new_user_mapper_rev = {k:v for k,v in enumerate(new_users)}

new_items = np.array([x[0] for x in sorted([(k,v) for k,v in item_mapper.items()], key=lambda x: x[1])])[mask_items]
new_item_mapper = {v:k for k,v in enumerate(new_items)}
new_item_mapper_rev = {k:v for k,v in enumerate(new_items)}

In [58]:
# fit
model = implicit.nearest_neighbours.CosineRecommender(K=best_params['K'])
model.fit(matrix.T)

In [59]:
# predict
cold_users = set(test['user_id'].unique()) - set(new_user_mapper.keys())
hot_users = set(test['user_id'].unique())&(set(new_user_mapper.keys()))
hot_users_labels = [new_user_mapper[x] for x in hot_users]

In [60]:
## recs for hot users
res1 = []
n = len(hot_users)

for user in hot_users_labels:
    rec = model.recommend(user, matrix, N=10)
    res1 += [[user, r] for r, v in rec]

result1 = pd.DataFrame(res1, columns=['user_num', 'item_num'])
result1['user_id'] = result1['user_num'].map(new_user_mapper_rev)
result1['primary_video_id'] = result1['item_num'].map(new_item_mapper_rev)
result1.drop(['user_num', 'item_num'], axis=1, inplace=True)

In [61]:
## recs for cold users

# самые популярные товары
item_pop_matrix = pd.DataFrame([])
item_pop_matrix['item_id'] = new_item_mapper.keys()
item_pop_matrix['cnt_not_null'] = (matrix.T > 0).sum(axis=1)

# среднее по товарам
(x,y,z) = scipy.sparse.find(matrix)
countings = np.bincount(y)
sums = np.bincount(y, weights=z)
averages = sums/countings

item_pop_matrix['item_means'] = averages

top10_items = item_pop_matrix[item_pop_matrix['cnt_not_null'] > best_params['n_item_count']]\
.sort_values('item_means', ascending=False).head(10)['item_id'].values

result2 = pd.DataFrame(cold_users, columns=['user_id'])
items = [x for x in top10_items]*result2.shape[0]
result2 = result2.loc[result2.index.repeat(10)]
result2['primary_video_id'] = items

In [62]:
# join
finalDF = result1.append(result2).reset_index(drop=True)
finalDF = finalDF[finalDF['user_id'].isin(test['user_id'].unique())].reset_index(drop=True)

In [63]:
finalDF = finalDF[['user_id', 'primary_video_id']]
result = finalDF.groupby('user_id')['primary_video_id'].apply(list).reset_index()
result['primary_video_id'] = result['primary_video_id'].apply(lambda x: ' '.join([str(a) for a in x]))

In [64]:
left_users = list(set(test.user_id.unique())-set(finalDF.user_id.unique()))
print(left_users)

[101615009, 48046089, 63958124, 36271950, 97501040, 83920689, 122937333, 60276693, 38025751, 69072088, 9015609, 100913946]


In [65]:
result2 = pd.DataFrame(left_users, columns=['user_id'])
result2['primary_video_id'] = ' '.join([str(x) for x in top10_items])

In [66]:
result = result.append(result2)#.reset_index(drop=True)

In [67]:
result.to_csv('/data/agoryach/datagym-recsys-01/big-hw-02/output/submission_9.csv', index=False)