In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

from scipy.sparse.linalg import svds
from numpy.linalg import svd
from scipy.sparse import csr_matrix
import time

## submit 0

### base matrix

In [2]:
train = pd.read_csv('/data/agoryach/datagym-recsys-01/big-hw-02/input/train_data_full.csv')

In [3]:
train.user_id.nunique(), train.primary_video_id.nunique()

(407381, 5266)

In [4]:
aggr1 = {
    'session_duration': 'max', 
    'video_duration': 'max'
}

train_s = train.groupby(['user_id', 'primary_video_id'])\
               .agg(aggr1).reset_index().reset_index()

train_s['session_duration_clean'] = train_s[['session_duration', 'video_duration']].min(axis=1)
train_s['watching_percentage'] = train_s['session_duration_clean']/train_s['video_duration']
train_s = train_s[['user_id', 'primary_video_id', 'watching_percentage']]

In [5]:
by_user = train_s[['user_id', 'primary_video_id']].groupby('user_id').count().reset_index()
freq_users = by_user[by_user['primary_video_id']>10].user_id.unique()

by_item = train_s[['user_id', 'primary_video_id']].groupby('primary_video_id').count().reset_index()
freq_items = by_item[by_item['user_id'] > 10].primary_video_id.unique()

train_ss = train_s[(train_s['user_id'].isin(freq_users))&
                   (train_s['primary_video_id'].isin(freq_items))]

In [6]:
def create_utility_matrix(data, formatizer={'user': 0, 'item': 1, 'value': 2}):
    '''
        функция создания матрицы
    '''
    itemField = formatizer['item'] 
    userField = formatizer['user']
    valueField = formatizer['value']
    userList = data.iloc[:, userField].tolist()
    itemList = data.iloc[:, itemField].tolist()
    valueList = data.iloc[:, valueField].tolist()
    users = list(set(data.iloc[:, userField]))
    items = list(set(data.iloc[:, itemField]))
    users_index = {users[i]: i for i in range(len(users))}
    pd_dict = {item: [np.nan for i in range(len(users))] for item in items}
    for i in range(0, len(data)):
        item = itemList[i]
        user = userList[i]
        value = valueList[i]
        pd_dict[item][users_index[user]] = value
    X = pd.DataFrame(pd_dict)
    X.index = users
    itemcols = list(X.columns)
    items_index = {itemcols[i]: i for i in range(len(itemcols))}
    return X, users_index, items_index

In [7]:
%%time
utilMat_r, users_index, items_index = create_utility_matrix(train_ss)

CPU times: user 1min 56s, sys: 7.75 s, total: 2min 4s
Wall time: 2min 4s


In [10]:
print(f'start {time.ctime()}')

SyntaxError: invalid syntax (<ipython-input-10-b45acef102be>, line 1)

In [9]:
def svd_fillna_shift_norm(train, k):
    '''
        svd с нормировкой + заполнениями и сдвигом
    '''
    
    print(f'start {time.ctime()}')
    utilMat = train.copy()
    # маска из нанов
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    print(f'done mask {time.ctime()}')
    # замена средним для нанов
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0], 1))
    print(f'done fillna {time.ctime()}')
    # вычитание
    utilMat = utilMat - x
    total_mean = np.mean(utilMat)
    col_means = np.mean(utilMat, axis=0) - total_mean
    row_means = np.mean(utilMat, axis=1) - total_mean
    utilMat = utilMat - row_means[:, np.newaxis] - col_means - total_mean
    print(f'done centering {time.ctime()}')
    row_norms = np.sqrt(np.sum(utilMat*utilMat, axis=1))[:, np.newaxis]
    col_norms = np.sqrt(np.sum(utilMat*utilMat, axis=0))[np.newaxis, :]
    utilMat = utilMat / np.sqrt(row_norms) / np.sqrt(col_norms)
    print(f'done norm {time.ctime()}')
    U, s, V = np.linalg.svd(utilMat, full_matrices=False)
    print(f'done svd {time.ctime()}')
    s = np.diag(s)
    s = s[0:k, 0:k]
    U = U[:, 0:k]
    V = V[0:k, :]
    s_root = np.sqrt(s)
    Usk = np.dot(U, s_root)
    skV = np.dot(s_root, V)
    UsV = np.dot(Usk, skV)
    print(f'done dot {time.ctime()}')
    # восстановление
    UsV = UsV * np.sqrt(row_norms) * np.sqrt(col_norms)
    UsV = UsV + row_means[:, np.newaxis] + col_means + total_mean
    UsV = UsV + x
    print(f'done recover {time.ctime()}')
    return UsV

SyntaxError: invalid syntax (<ipython-input-9-3061682836c2>, line 5)

In [None]:
%%time
#svdout = svd_fillna_shift_norm(utilMat, k=10)
UsV = svd_fillna_shift_norm(utilMat, k=10)

In [None]:
UsV.dump("input/UsV_baseline.dat")

### cold start

In [None]:
test = pd.read_csv('input/sample_submission_full.csv')

In [None]:
# get top 10 for hot users
# заполнить 0 те айтемы, с котооорыми юзер уже взаимодействовал
idx = np.argpartition(A, -10)
# ranked = np.argsort(UsV)
# UsV_s = ranked[::-1]


In [None]:
cold_users = [x for x in test['user_id'].unique() if x not in users_index.keys()]

In [None]:
item_pop_matrix = pd.DataFrame([])
item_pop_matrix['item'] = items_index.keys()
item_pop_matrix['cnt_not_null'] = (utilMat.T > 0).sum(axis=1)

# среднее по товарам
mask = np.isnan(utilMat.T)
masked_arr = np.ma.masked_array(utilMat.T, mask)
item_means = np.mean(masked_arr.T, axis=1)

item_pop_matrix['item_means'] = item_means

In [None]:
item_pop_matrix.cnt_not_null.hist()

In [None]:
item_pop_matrix.item_means.hist()

In [None]:
# сделать функцию map k
# cross val params:
# - svd k
# mean by user/item

In [13]:
# train_pivot = train_ss[['user_id', 'primary_video_id', 'watching_percentage']]\
#                     .pivot_table(values='watching_percentage',
#                                  index='user_id',
#                                  columns='primary_video_id')

In [17]:
train_pivot.shape

(121336, 5199)

In [None]:
U, s, Vt = svd(train_pivot)

In [None]:
svd_res = np.dot(np.dot(U, np.diag(s)), Vt)

In [None]:
s

In [None]:
sum(sum(svd_res))

In [11]:
user_mapper = {k:v for v, k in dict(enumerate(train_s.user_id.unique())).items()}
item_mapper = {k:v for v, k in dict(enumerate(train_s.primary_video_id.unique())).items()}

train_s['user_num'] = train_s['user_id'].map(user_mapper)
train_s['item_num'] = train_s['primary_video_id'].map(item_mapper)

row  = train_s['user_num'].values
col  = train_s['item_num'].values
data = train_s['watching_percentage'].values
matrix = csr_matrix(coo_matrix((data, (row, col)), shape=(len(user_mapper), len(item_mapper))))

train.user_id.nunique(), train.primary_video_id.nunique(), matrix.shape

(407381, 5266, (407381, 5266))

In [117]:
matrix_d = pd.DataFrame(matrix.todense())
matrix_d.replace(0, np.nan, inplace=True)

In [None]:
matrix_d.replace(0, np.nan, inplace=True)

In [None]:
matrix_d.fillna()

In [109]:
U, s, Vt = svds(matrix)

In [115]:
Vt.shape

(6, 5266)

In [110]:
svd_res = np.dot(np.dot(U, np.diag(s)), Vt)

In [102]:
sum(sum(svd_res))

0.0