# Preprocessing

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### Import data

In [19]:
# Load the raw events log.
df_events = pd.read_csv(filepath_or_buffer="events.csv")
# Preview a handful of rows, picked by row label, that cover each event type.
df_events.reindex(index=[0, 1, 17, 19, 130, 304])

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
17,1433223236124,287857,addtocart,5206,
19,1433221078505,158090,addtocart,10572,
130,1433222276276,599528,transaction,356475,4000.0
304,1433193500981,121688,transaction,15335,11117.0


In [20]:
# The event types "view", "addtocart" and "transaction" are implicit customer feedback.
# They are treated as ratings, so the categorical labels are mapped to numeric weights.

# The weights are subject to tuning together with the hyperparameters.
# Initial weights: view=1, add to cart=2, purchase=3.

weight_view = 1
weight_addtocart = 2
weight_transaction = 3

# Assign the result back to the column instead of calling
# `df_events.event.replace(..., inplace=True)`: an inplace method on a column
# pulled out of the frame is chained assignment, which leaves `df_events`
# unchanged when pandas Copy-on-Write is active.
# Values that are not keys of the mapping are left untouched, so re-running
# this cell after the labels were already replaced changes nothing.
# `infer_objects()` keeps the column integer-typed on pandas versions where
# `replace` no longer downcasts an object result on its own.
df_events["event"] = (
    df_events["event"]
    .replace(
        {
            "view": weight_view,
            "addtocart": weight_addtocart,
            "transaction": weight_transaction,
        }
    )
    .infer_objects()
)

In [22]:
# The event names have now been replaced with their corresponding weights.

df_events.reindex(index=[0, 1, 17, 19, 130, 304])

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,1,355908,
1,1433224214164,992329,1,248676,
17,1433223236124,287857,2,5206,
19,1433221078505,158090,2,10572,
130,1433222276276,599528,3,356475,4000.0
304,1433193500981,121688,3,15335,11117.0


In [24]:
# Load the category tree (one row per category with its parent category).

df_categories = pd.read_csv(filepath_or_buffer="category_tree.csv")
df_categories.head(n=5)

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [26]:
# Load the item properties, which are shipped as two CSV parts, and stack them.
# `pd.read_csv` already returns a DataFrame, so no extra `pd.DataFrame(...)` wrapper is needed.

df_properties1 = pd.read_csv("item_properties_part1.csv")
df_properties2 = pd.read_csv("item_properties_part2.csv")
# NOTE: `concat` keeps each part's own row labels, so labels repeat across the
# two parts. Select rows by position (`iloc`) or reset the index before relying
# on label uniqueness.
df_properties = pd.concat([df_properties1, df_properties2])

df_properties.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
5,1436065200000,285026,available,0
6,1434250800000,89534,213,1121373
7,1431831600000,264312,6,319724
8,1433646000000,229370,202,1330310
9,1434250800000,98113,451,1141052 n48.000
