In [1]:
import pandas as pd
import numpy as np

In [2]:
from tqdm import tqdm

In [3]:
# Load the raw interaction logs; the `day` column of each log is parsed into datetimes.
clicks = pd.read_csv('train_clicks.csv', parse_dates=['day'])
category_views = pd.read_csv('train_category_views.csv', parse_dates=['day'])
product_views = pd.read_csv('train_product_views.csv', parse_dates=['day'])
# Users for whom the category predictions below are produced.
test_users = pd.read_csv('test_users.csv')

In [4]:
# Size the dense (user x category) matrices from the largest ids in the data.
# Ids are used directly as array indices, so every table that is later
# accumulated into, or used to index, those matrices has to be considered.
number_of_categories = clicks.category_id.max()
print (number_of_categories)
number_of_categories = max(number_of_categories, category_views.category_id.max())
print (number_of_categories)
# Previously only `clicks` was consulted; a larger user id in `category_views`
# or `test_users` would then index past the end of the matrix.
number_of_users = max(
    clicks.user_id.max(),
    category_views.user_id.max(),
    test_users.user_id.max(),
)

2653
2682


# Skip

In [5]:
# Baseline feature: user_clicks[u, c] = number of click rows of user u on category c.
user_clicks = np.zeros((number_of_users + 1, number_of_categories + 1))
# np.add.at accumulates repeated (user, category) pairs correctly (a plain
# fancy-indexed `+=` would count each pair once) and replaces the much slower
# row-by-row iterrows loop with an equivalent single call.
np.add.at(
    user_clicks,
    (clicks['user_id'].values, clicks['category_id'].values),
    1,
)

In [6]:
# For every test user, order the categories by descending click count and keep
# the first five category ids.
# The user ids are selected by column name: flattening `test_users.values`
# only works while the frame has a single column and breaks as soon as the
# `categories` column has been added (e.g. when this cell is re-run).
test_categories = np.argsort(-user_clicks[test_users.user_id.values, :], axis=1)[:, :5]

In [7]:
# Peek at the five selected category ids of the first five test users.
test_categories[:5]

array([[1409,  134,  108,  429,  755],
       [1898,    0, 1764, 1765, 1766],
       [ 200,    0, 1765, 1766, 1767],
       [2138,  160,    0, 1765, 1766],
       [2273,    0, 1765, 1766, 1767]])

In [5]:
def join_categories(row):
    """Render a row of category ids as one space-separated string.

    The result is right-padded with spaces to a width of 25 characters;
    strings that are already 25 characters or longer are returned unpadded.
    """
    ids_as_text = [str(category_id) for category_id in row]
    return ' '.join(ids_as_text).ljust(25)

In [9]:
# Turn each row of five category ids into one space-separated string per test user
# and store it next to the user id.
test_users['categories'] = np.apply_along_axis(join_categories, 1, test_categories)
test_users.head()

Unnamed: 0,user_id,categories
0,8,1409 134 108 429 755
1,12,1898 0 1764 1765 1766
2,27,200 0 1765 1766 1767
3,39,2138 160 0 1765 1766
4,40,2273 0 1765 1766 1767


In [10]:
# Save the baseline predictions to baseline.csv.
test_users.to_csv('baseline.csv', index=None)

# My improvement

## Metrics

In [6]:
def apk(actual, predicted, k=5):
    """Average precision at k for a single prediction list.

    Parameters
    ----------
    actual : sequence
        The relevant items (list or numpy array; order does not matter).
    predicted : sequence
        The predicted items in ranked order; only the first `k` are scored.
    k : int, optional
        Maximum number of predictions to score (default 5).

    Returns
    -------
    float
        Sum of precision@i over every position i that holds a relevant item
        not already predicted earlier, divided by min(len(actual), k).
        0.0 when `actual` is empty or None.
    """
    # Use len() rather than truthiness: `not actual` raises ValueError for a
    # numpy array with more than one element.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction only counts the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=5):
    """Mean of `apk` over paired rows of `actual` and `predicted`.

    `actual` and `predicted` are iterated in lockstep (extra rows of the
    longer one are ignored by zip); `k` is forwarded to `apk` unchanged.
    """
    per_user_scores = []
    for truth, guess in zip(actual, predicted):
        per_user_scores.append(apk(truth, guess, k))
    return np.mean(per_user_scores)

## Train-Val Split

In [7]:
# Time-based split: clicks on or after the cut-off day form the validation set,
# everything earlier is used for training.
# `pd.Timestamp` is the public constructor; the private `pd.tslib` module that
# was used here before no longer exists in current pandas releases.
val_start = pd.Timestamp('2016-09-19')
clicks_val = clicks[clicks.day >= val_start]
clicks_train = clicks[clicks.day < val_start]

# GC

In [7]:
import gc

In [9]:
# Force a garbage-collection pass; the displayed value is what gc.collect() returns.
gc.collect()

0

In [None]:
# Free the baseline-only arrays to reduce memory use.
# `clicks` and `test_users` are deliberately NOT deleted: the "Test Predictions"
# cells further down still read both, so removing them here would break a
# top-to-bottom run of the notebook.
del user_clicks, test_categories

## Heuristics

In [15]:
# Ground truth for validation: y_val[u, c] counts the validation-period click
# rows of user u on category c.
y_val = np.zeros((number_of_users + 1, number_of_categories + 1))
for _, row in clicks_val.iterrows():
    y_val[row['user_id'], row['category_id']] += 1

In [23]:
# Sanity check: total validation click count on category 1056 across all users.
y_val[:, 1056].sum()

35.0

In [22]:
# NOTE(review): reads the loop variable `row` left behind by the most recently
# executed iterrows loop (i.e. its last row) — hidden state, order-dependent.
row['category_id']

1056

In [12]:
# Replace the count matrix by, per user, the ids of the five categories with the
# highest validation counts.
# NOTE(review): `y_val` is overwritten in place, so re-running this cell ranks
# the already-ranked ids instead of the counts. Users with fewer than five
# clicked categories get zero-count categories as filler (see the displayed rows).
y_val = np.argsort(-y_val, axis=1)[:, :5]

In [13]:
# Inspect the per-user top-5 category ids computed above.
y_val

array([[   0, 1765, 1766, 1767, 1768],
       [   0, 1765, 1766, 1767, 1768],
       [   0, 1765, 1766, 1767, 1768],
       ..., 
       [   0, 1765, 1766, 1767, 1768],
       [   0, 1765, 1766, 1767, 1768],
       [   0, 1765, 1766, 1767, 1768]])

In [39]:
# Recency-weighted scores from the training clicks: each click adds
# 1 / (1 + days between the click and 2016-09-25) to its (user, category) cell,
# so more recent clicks weigh more.
# `pd.Timestamp` replaces the removed private `pd.tslib.Timestamp`, and the
# row-by-row iterrows loop is replaced by one vectorised accumulation.
y_val_pred = np.zeros((number_of_users + 1, number_of_categories + 1))
days_ago = (pd.Timestamp('2016-09-25') - clicks_train['day']).dt.days.values
# np.add.at sums the weights of repeated (user, category) pairs, exactly like
# the original `+=` inside the loop.
np.add.at(
    y_val_pred,
    (clicks_train['user_id'].values, clicks_train['category_id'].values),
    1. / (1 + days_ago),
)

In [19]:
# Keep, per user, the ids of the five highest-scoring categories.
# NOTE(review): `y_val_pred` is overwritten in place, so this cell is not
# idempotent — re-running it would rank the ids rather than the scores.
y_val_pred = np.argsort(-y_val_pred, axis=1)[:, :5]

## Test Predictions

In [10]:
def add_column(user_clicks, column, reference_day='2016-09-25'):
    """Accumulate recency-weighted events from `column` into `user_clicks`.

    Every row of `column` adds 1 / (1 + d) to
    user_clicks[user_id, category_id], where d is the whole number of days
    between the row's `day` and `reference_day`.

    Parameters
    ----------
    user_clicks : numpy.ndarray
        2-D score matrix indexed as [user_id, category_id]; modified in place.
    column : pandas.DataFrame
        Event log with `user_id`, `category_id` and `day` columns.
    reference_day : str or pandas.Timestamp, optional
        Day the recency is measured from (default '2016-09-25', the value
        that used to be hard-coded).

    Returns
    -------
    None

    Notes
    -----
    Assumes every `day` is on or before `reference_day`; a row dated one day
    after it would make the denominator zero.
    """
    if len(column) == 0:
        # Nothing to accumulate.
        return

    # pd.Timestamp is the public constructor (pd.tslib no longer exists).
    days_ago = (pd.Timestamp(reference_day) - pd.to_datetime(column['day'])).dt.days.values
    weights = 1. / (1 + days_ago)
    # np.add.at sums the weights of repeated (user, category) pairs, matching
    # the former row-by-row `+=`.
    np.add.at(
        user_clicks,
        (column['user_id'].values, column['category_id'].values),
        weights,
    )

In [11]:
# Fresh (user x category) score matrix for the final test predictions.
user_clicks = np.zeros((number_of_users + 1, number_of_categories + 1))

In [12]:
# Accumulate the recency-weighted category views into the score matrix.
add_column(user_clicks, category_views)

In [13]:
# Accumulate the recency-weighted clicks into the same score matrix.
add_column(user_clicks, clicks)

In [14]:
# (rows, columns) of the product-view log.
product_views.shape

(831168, 4)

In [15]:
# (rows, columns) of the click log.
clicks.shape

(704469, 6)

In [16]:
# (rows, columns) of the category-view log.
category_views.shape

(708724, 3)

In [18]:
# First rows of the product-view log; it has a product_id but no category_id column.
product_views.head()

Unnamed: 0,user_id,product_id,type_id,day
0,106,58196,7,2016-08-04
1,175,110179,8,2016-08-04
2,175,110179,3,2016-08-04
3,432,122057,7,2016-08-04
4,432,122057,7,2016-08-04


In [19]:
# Rank the categories of every test user by descending score, keep the first
# five ids, render them as one padded string per user and write the submission.
test_user_ids = test_users.user_id.values.reshape(-1)
test_user_scores = user_clicks[test_user_ids, :]
test_categories = np.argsort(-test_user_scores, axis=1)[:, :5]
test_users['categories'] = np.apply_along_axis(join_categories, 1, test_categories)
test_users.to_csv('clicks_views_1_div_diff+1.csv', index=None)

In [25]:
# Category strings of the submission produced in this notebook.
y_my = test_users['categories'].values

In [22]:
# Load another submission file to compare against.
y_dima = pd.read_csv('1_divided_by_1__diff.csv')

In [26]:
# Keep only the category strings of that submission.
# NOTE(review): `y_dima` changes from a DataFrame to an array here, so this
# cell cannot be re-run without re-running the read_csv cell first.
y_dima = y_dima['categories'].values

In [27]:
# Inspect this notebook's category strings.
y_my

array(['1079 429 426 1064 1409   ', '1898 789 0 1785 1786     ',
       '200 0 1784 1785 1786     ', ..., '466 0 1785 1786 1787     ',
       '435 0 1784 1785 1786     ', '672 0 1785 1786 1787     '], dtype=object)

In [28]:
# Inspect the other submission's category strings for comparison with `y_my`.
# (The cell previously held an unfinished `for i in range()` statement, which
# is a SyntaxError; the recorded output is the display of `y_dima`.)
y_dima

array(['1079 429 0 426 1064      ', '1898 789 0 1785 1786     ',
       '200 0 1784 1785 1786     ', ..., '466 0 1785 1786 1787     ',
       '435 0 1784 1785 1786     ', '672 0 1785 1786 1787     '], dtype=object)