In [2]:
import datetime
import pandas as pd

# Read data

### Read events data

In [3]:
raw_events = pd.read_csv('events.csv')

In [3]:
raw_events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [4]:
# chech empty data
raw_events.isna().sum() / len(raw_events)

timestamp        0.000000
visitorid        0.000000
event            0.000000
itemid           0.000000
transactionid    0.991852
dtype: float64

In [5]:
cat_tree = pd.read_csv('category_tree.csv')

In [6]:
cat_tree # really this data doesn't help

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0
...,...,...
1664,49,1125.0
1665,1112,630.0
1666,1336,745.0
1667,689,207.0


### Combine item properties data

In [5]:
item_prop_1 = pd.read_csv('item_properties_part1.csv')

In [6]:
item_prop_1

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
10999994,1439694000000,86599,categoryid,618
10999995,1435460400000,153032,1066,n1020.000 424566
10999996,1440298800000,421788,888,35975 856003 37346
10999997,1437879600000,159792,400,n552.000 639502 n720.000 424566


In [7]:
item_prop_1.columns

Index(['timestamp', 'itemid', 'property', 'value'], dtype='object')

In [8]:
item_prop_2 = pd.read_csv('item_properties_part2.csv', sep=',')

In [9]:
item_props = pd.concat([item_prop_1, item_prop_2], axis=0, ignore_index=True)

In [10]:
props_df = item_props.drop_duplicates(subset=['itemid', 'property'], ignore_index=True, keep='last')\
    .sort_values('itemid').reset_index(drop=True)

In [11]:
props_df = props_df[['itemid', 'property', 'value']]

In [12]:
props_df

Unnamed: 0,itemid,property,value
0,0,888,478989
1,0,11,n15360.000 628176 n12288.000
2,0,227,1152934 1238769
3,0,776,318611
4,0,127,1168476
...,...,...,...
12003809,466866,896,769062
12003810,466866,341,769062
12003811,466866,850,670753 114844 808585 1169076
12003812,466866,706,892415 670753 114844 808585


### Check itemid intersection

In [14]:
def is_sublist(sublist, full_list):
    return set(sublist) <= set(full_list)

In [15]:
is_sublist(raw_events.itemid.values, props_df.itemid.values)

False

Some itemids will have NaN in proprerties columns

In [16]:
# Find share of itemids have presented in events.csv
len(set(raw_events.itemid.values).intersection(set(props_df.itemid.values))) / len(set(raw_events.itemid))

0.7880762865809301

# Items factors preparing

In [22]:
# add a property name to a property value to prevent cases when a same value for different properties
props_df['prop_val'] = props_df['property'].astype(str) + " " + props_df['value'].astype(str)

In [23]:
# availabily is temp property and doesn't help for modeling
props_df = props_df.query('property != "available"')

In [25]:
# create property and value combinations most popular
props_cum_perc = (props_df['prop_val'].value_counts().cumsum()/props_df['prop_val'].value_counts().sum())

In [26]:
# turn Series to DataFrame and rename colums
props_cum_perc = props_cum_perc.to_frame().reset_index().rename(columns={'index': 'prop_val', 'prop_val': 'cum_perc'})

In [27]:
# get property-values combinations that cover 80% of all variations
top80_prop_val = props_cum_perc.query('cum_perc < 0.8').prop_val.values.tolist()

In [29]:
len(top80_prop_val)

99452

In [31]:
# filter items properties
props_df_cut = props_df.query('prop_val in @top80_prop_val')

In [32]:
# colect all property-value combinations for each itemid - that will be out 'texts'
items_prop_vals = props_df_cut.groupby('itemid').prop_val.unique().reset_index()

In [33]:
items_prop_vals.loc[:, 'prop_val'] = items_prop_vals.prop_val.map("|||".join)

In [34]:
items_prop_vals

Unnamed: 0,itemid,prop_val
0,0,11 n15360.000 628176 n12288.000|||227 1152934 ...
1,1,1036 1154859|||33 1128577 1000087 421694|||296...
2,2,839 147366 343631|||698 822092 325894 504272||...
3,3,326 769062|||1025 769062|||562 769062|||689 15...
4,4,897 324209|||115 n24.000|||6 588652 1091491|||...
...,...,...
417048,466862,29 769062|||186 575816 n432.000|||348 1102430|...
417049,466863,678 1194687 550565|||881 n60.000|||790 n43320....
417050,466864,720 1279814 n12000.000|||159 519769|||1036 115...
417051,466865,277 769062|||614 668981|||698 1088309|||713 76...


In [35]:
# Save to clean laptop memory, load this data and carry on
items_prop_vals.to_csv('items_texts')

# Contextual recommendations

### Create FastText model

In [36]:
from gensim.models import FastText

In [37]:
items_prop_vals = pd.read_csv('items_texts', index_col=0)

In [38]:
items_prop_vals.loc[:, 'prop_val'] = items_prop_vals.prop_val.map(lambda x: x.split('|||'))

In [39]:
# check stats about prop-val lists lenth for each itemid
items_prop_vals.prop_val.map(len).describe()

count    417053.000000
mean         22.225971
std           7.625993
min           5.000000
25%          17.000000
50%          20.000000
75%          25.000000
max          51.000000
Name: prop_val, dtype: float64

In [40]:
# get max count of "words" in text
max_lenth = int(items_prop_vals.prop_val.map(len).describe().loc['max'])

In [41]:
max_lenth

51

In [42]:
sentenses = items_prop_vals.prop_val.values.tolist()

In [46]:
sentenses[0]

['11 n15360.000 628176 n12288.000',
 '227 1152934 1238769',
 '127 1168476',
 '764 1285872',
 '1036 1276750',
 '698 1152934 1238769',
 '112 679677',
 '561 1294803 101489',
 '225 1301326',
 '6 66094',
 '42 n204.000',
 '839 372274',
 'categoryid 209',
 '177 n96.000 1206660',
 '159 519769',
 '790 n91200.000',
 '139 n4800.000 270060 924073',
 '678 372274',
 '869 769062',
 '1056 n3.168 1144008',
 '189 708480']

In [47]:
vectory_size = max_lenth

In [48]:
modelFT = FastText(sentences=sentenses, vector_size=vectory_size, min_count=100, window=1)

In [49]:
modelFT.save('ft.model')

In [50]:
modelFT = FastText.load('ft.model')

In [51]:
import numpy as np
import annoy
from tqdm import tqdm

In [52]:
items_prop_vals = items_prop_vals.reset_index(drop=True)

In [54]:
ft_index = annoy.AnnoyIndex(max_lenth ,'angular')

index_to_counter = {}
counter = 0

for i in tqdm(range(len(items_prop_vals))):
    n_ft = 0
    index_to_counter[counter] = items_prop_vals.loc[i, "itemid"]
    vector_ft = np.zeros(max_lenth)
    # Каждое слово обернем в эмбеддинг
    for word in items_prop_vals.loc[i, 'prop_val']:
        word_str = str(word)
        if word in modelFT.wv:
            vector_ft += modelFT.wv[word_str]
            n_ft += 1
    if n_ft > 0:
        vector_ft = vector_ft / n_ft
    ft_index.add_item(counter, vector_ft)
    counter += 1

# 
ft_index.build(10)

100%|██████████| 417053/417053 [02:45<00:00, 2519.48it/s]


True

In [55]:
def recommend(itemid, n_recomms=3):
    # Получим тайтл айтема по идентификатору

    prop_vals = items_prop_vals.query('itemid == @itemid').values
    
    vector_ft = np.zeros(max_lenth)
    n_ft = 0
    # Каждое слово обернем в эмбеддинг
    for word in prop_vals:
        word_str = str(word)
        if word in modelFT.wv:
            vector_ft += modelFT.wv[word_str]
            n_ft += 1
    if n_ft > 0:
        vector_ft = vector_ft / n_ft
    # Найдем ближайшие по тайтлу
    rec_items = ft_index.get_nns_by_vector(vector_ft, n_recomms+1)
    recommendations =  [index_to_counter[i] for i in rec_items]
    if itemid in recommendations:
        recommendations.remove(itemid)
        return recommendations
    return recommendations[: -1]

recommend(466862)

[82381, 105494, 101640]

In [56]:
visitors_transactions = raw_events.query('event == "transaction"').groupby('visitorid').itemid.unique().reset_index()

In [57]:
vis_trans_above1 = visitors_transactions[
    visitors_transactions.itemid.map(len) > 1
    ]

In [58]:
map_at3 = 0
couter = 0

for _, data in tqdm(vis_trans_above1.iterrows()):
    last_bought_item = data['itemid'][-1]
    # print('Last bought item')
    # print(last_bought_item)
    
    all_bought_items = set(data['itemid']) - set([last_bought_item])
    # print('All bouth items')
    # print(all_bought_items)
    recommendations = []
    recommendations = set(recommend(last_bought_item, 3))
    # print('Recommendations')
    # print(recommendations)
    
    map_at3 += len(all_bought_items.intersection(recommendations)) / len(recommendations)
    counter += 1
    # break
map_at3 = map_at3 / len(visitors_transactions)
map_at3

2174it [00:11, 191.77it/s]


0.0011661973433455643

In [59]:
print('Mean Average Precision at 3: {:.2%}'.format(map_at3))

Mean Average Precision at 3: 0.12%


# Collaborative filtering

In [60]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
import lightfm

In [61]:
train, test = train_test_split(raw_events.query('event == "transaction"'), test_size=0.3, shuffle=False)

In [62]:
test

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
1944943,1432692565573,404403,transaction,218626,5610.0
1945138,1432661976535,1050485,transaction,45870,2384.0
1945211,1432663302215,327437,transaction,215522,1785.0
1945394,1432695433085,286616,transaction,27832,9114.0
1945399,1432698340626,958255,transaction,340800,16796.0
...,...,...,...,...,...
2755294,1438377176570,1050575,transaction,31640,8354.0
2755349,1438379878779,861299,transaction,456602,3643.0
2755508,1438357730123,855941,transaction,235771,4385.0
2755603,1438355560300,548772,transaction,29167,13872.0


In [63]:
from lightfm.evaluation import precision_at_k

In [64]:
from lightfm.data import Dataset

In [65]:
# all_visitorids = raw_events.query('event == "transaction"').visitorid.unique()
# all_itemids = raw_events.query('event == "transaction"').itemid.unique()

all_visitorids = raw_events.visitorid.unique()
all_itemids = raw_events.itemid.unique()

In [66]:
dataset = Dataset()

In [67]:
dataset.fit(users=all_visitorids, items=all_itemids)

In [68]:
# train_coo = dataset.build_interactions(train[['visitorid', 'itemid']].values)
train_coo = dataset.build_interactions(train.query('event =="transaction"')[['visitorid', 'itemid']].values)

In [69]:
# test_coo = dataset.build_interactions(test[['visitorid', 'itemid']].values)
test_coo = dataset.build_interactions(test.query('event =="transaction"')[['visitorid', 'itemid']].values)

In [70]:
model = lightfm.LightFM(loss='warp')
model.fit(train_coo[0], epochs=30)

<lightfm.lightfm.LightFM at 0x7f02629b7fa0>

In [71]:
p_at3 = precision_at_k(model, test_coo[0], k=3).mean()
print('Mean Average Precision at 3: {:.2%}'.format(p_at3))

Mean Average Precision at 3: 0.93%


# ALS

In [72]:
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k

  from .autonotebook import tqdm as notebook_tqdm


In [73]:
model = AlternatingLeastSquares(factors=10, random_state=42)
model.fit(train_coo[0])

100%|██████████| 15/15 [00:02<00:00,  5.21it/s]


In [74]:
p_at3_als = mean_average_precision_at_k(model, train_coo[0], test_coo[0], K=3)
print('Mean Average Precision at 3: {:.2%}'.format(p_at3_als))

100%|██████████| 3476/3476 [00:06<00:00, 558.06it/s]

Mean Average Precision at 3: 0.09%





# XGBoost

Bad idea because of extreme class disbalancing (0.87% data share is 1 class). But we'll try...

In [71]:
import pandas as pd
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

In [19]:
items_prop_vals = pd.read_csv('items_texts', index_col=0)

In [29]:
items_prop_vals['prop_val'] = items_prop_vals['prop_val'].map(lambda x: x.replace(' ', "_"))

In [30]:
# Create a sentences of property-values

items_prop_vals['prop_val'] = items_prop_vals['prop_val'].map(lambda x: x.replace('|||', " "))

In [31]:
items_prop_vals

Unnamed: 0,itemid,prop_val
0,0,11_n15360.000_628176_n12288.000 227_1152934_12...
1,1,1036_1154859 33_1128577_1000087_421694 296_866...
2,2,839_147366_343631 698_822092_325894_504272 332...
3,3,326_769062 1025_769062 562_769062 689_150169_1...
4,4,897_324209 115_n24.000 6_588652_1091491 841_13...
...,...,...
417048,466862,29_769062 186_575816_n432.000 348_1102430 546_...
417049,466863,678_1194687_550565 881_n60.000 790_n43320.000 ...
417050,466864,720_1279814_n12000.000 159_519769 1036_1154859...
417051,466865,277_769062 614_668981 698_1088309 713_769062 9...


In [32]:
dataset = pd.merge(left=raw_events, right=items_prop_vals, on='itemid', how='inner')\
    .dropna(subset=['itemid'])

dataset['datetime'] = pd.to_datetime(dataset['timestamp'], unit='ms')
dataset['day_of_week'] = dataset['datetime'].dt.weekday
dataset['month'] = dataset['datetime'].dt.month
dataset['day'] = dataset['datetime'].dt.day
dataset['hour'] = dataset['datetime'].dt.hour
dataset['minute'] = dataset['datetime'].dt.minute
dataset['target'] = 1 - dataset['transactionid'].isna()
dataset = dataset.sort_values('timestamp')

In [39]:
dataset

Unnamed: 0,timestamp,visitorid,event,itemid,prop_val,day_of_week,month,day,hour,minute,target
1298902,1430622004384,693516,addtocart,297662,678_820477 790_n14280.000 884_769062 159_51976...,6,5,3,3,0,0
213171,1430622011289,829044,view,60987,120_769062 809_769062 1_1021755 112_679677 764...,6,5,3,3,0,0
1456585,1430622024154,1125936,view,33661,764_1285872 6_550504_827388 categoryid_1628 79...,6,5,3,3,0,0
1298901,1430622026228,693516,view,297662,678_820477 790_n14280.000 884_769062 159_51976...,6,5,3,3,0,0
652199,1430622027031,1149227,view,29757,689_471783_827388 71_376905 19_769062 558_6403...,6,5,3,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2425354,1442545164029,472345,view,301436,332_n60.000 689_150169_1037891 6_253511 277_n1...,4,9,18,2,59,0
2000475,1442545165499,1207677,view,310922,227_21098_980118 1032_769062 464_n51.600 550_7...,4,9,18,2,59,0
2407648,1442545174109,255126,view,47467,689_150169_435459_16718 653_1314495_237874_536...,4,9,18,2,59,0
827626,1442545181778,622226,view,345308,434_769062 6_985131 764_1285872 243_985131 960...,4,9,18,2,59,0


In [40]:
dataset['target'] = 1 - dataset['transactionid'].isna()
dataset = dataset.drop(['transactionid', 'datetime', 'event'], axis=1)

In [41]:
dataset.drop('event', axis=1, inplace=True)

In [44]:
cat_features = ['visitorid', 'itemid', 'day_of_week',
       'month', 'day', 'hour', 'minute']
target = 'target'
num_features = 'timestamp'
text_features = 'prop_val'

# dataset[cat_features] = dataset[cat_features].astype('category')

In [49]:
dataset[cat_features]

Unnamed: 0,visitorid,itemid,day_of_week,month,day,hour,minute
1298902,693516,297662,6,5,3,3,0
213171,829044,60987,6,5,3,3,0
1456585,1125936,33661,6,5,3,3,0
1298901,693516,297662,6,5,3,3,0
652199,1149227,29757,6,5,3,3,0
...,...,...,...,...,...,...,...
2425354,472345,301436,4,9,18,2,59
2000475,1207677,310922,4,9,18,2,59
2407648,255126,47467,4,9,18,2,59
827626,622226,345308,4,9,18,2,59


In [63]:
cat_data = pd.get_dummies(dataset[cat_features], columns=cat_features, sparse=True)

In [132]:
import numpy as np
from scipy.sparse import coo_matrix, hstack

In [124]:
cat_data_sparse = cat_data.sparse.to_coo()

In [141]:
data_X_array = hstack([
    cat_data_sparse.tocsr(), 
    dataset[num_features].values.reshape(-1, 1)])
data_y_array = dataset[target].values

In [138]:
data_X_array

(2500516, 1421406)

In [142]:
X_train, X_test, y_train, y_test = train_test_split(data_X_array, data_y_array, test_size=0.3, shuffle=False)

In [146]:
texts_train, texts_test = train_test_split(dataset[text_features], test_size=0.3, shuffle=False)

In [173]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(texts_train)

In [151]:
X_test_counts = count_vect.transform(texts_test)

In [157]:
X_train_xgb = hstack([X_train, X_train_counts])
X_test_xgb = hstack([X_test, X_test_counts])

In [179]:
xgb = XGBClassifier(objective='binary:logistic', eval_metric='mae')

In [180]:
xgb.fit(X_train_xgb, y_train, verbose=2)

In [182]:
preds = xgb.predict(X_test_xgb)

In [183]:
from sklearn.metrics import mean_absolute_percentage_error as mape

In [184]:
index_of_1 = [idx for idx, val in enumerate(y_test) if val == 1]

In [185]:
mape(y_test[index_of_1], preds[index_of_1])

1.0

As expected there is no positive results using XGBoost. 