# Prepare Data
We're going to prepare the RecSys Challenge 2015 dataset for use with PyTorch Geometric. The challenge behind this dataset is the following:

Given a sequence of click events performed by some user during a typical session in an e-commerce website, the goal is to predict whether the user is going to buy something or not, and if he is buying, what would be the items he is going to buy. The task could therefore be divided into two sub goals:

* Is the user going to buy items in this session? Yes|No
* If yes, what are the items that are going to be bought?


#### References
* https://2015.recsyschallenge.com/challenge.html
* https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8
* https://www.kaggle.com/danofer/2015-recsys-challenge-starter/notebook
* https://github.com/romovpa/ydf-recsys2015-challenge
* https://github.com/khuangaf/Pytorch-Geometric-YooChoose
* https://arxiv.org/pdf/1812.08434.pdf
* https://sxkdz.github.io/research/SR-GNN/
* https://github.com/userbehavioranalysis/SR-GNN_PyTorch-Geometric
* https://github.com/CRIPAC-DIG/SR-GNN
* https://arxiv.org/pdf/1811.00855.pdf

In [1]:
import os
#os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
#os.environ["MODIN_ENGINE"] = "dask"  # Modin will use Dask
# https://modin.readthedocs.io/en/latest/
#import modin.pandas as pd
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import torch
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import Data, InMemoryDataset
from tqdm import tqdm

  import pandas.util.testing as tm


#### Download data

In [2]:
#!wget https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z

#### Parse .dat files (using csv format)

In [12]:
# Get buy events (explicit column names are supplied for the file)
buys_raw = pd.read_csv(
    'yoochoose-buys.dat',
    names=['session_id', 'timestamp', 'item_id', 'price', 'quantity'],
)
# Get click events.
# `category` is declared as a string column up front so pandas does not
# infer a different type per chunk (which triggers a mixed-types
# DtypeWarning and leaves ints and strings side by side in one column).
clicks_raw = pd.read_csv(
    'yoochoose-clicks.dat',
    names=['session_id', 'timestamp', 'item_id', 'category'],
    dtype={'category': str},
)

# Drop rows containing NaN; after this every remaining `category` value is
# already a string, so no separate astype(str) conversion is required.
buys_raw = buys_raw.dropna()
clicks_raw = clicks_raw.dropna()

  interactivity=interactivity, compiler=compiler, result=result)


In [13]:
# Preview the first rows of the buy events.
buys_raw.head()

Unnamed: 0,session_id,timestamp,item_id,price,quantity
0,420374,2014-04-06T18:44:58.314Z,214537888,12462,1
1,420374,2014-04-06T18:44:58.325Z,214537850,10471,1
2,281626,2014-04-06T09:40:13.032Z,214535653,1883,1
3,420368,2014-04-04T06:13:28.848Z,214530572,6073,1
4,420368,2014-04-04T06:13:28.858Z,214835025,2617,1


In [14]:
# Preview the first rows of the click events.
clicks_raw.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0


#### Preprocessing
Force item_id to start at zero

In [15]:
# Replace the raw item ids with consecutive integer codes.
item_encoder = LabelEncoder()
clicks_raw['item_id'] = item_encoder.fit_transform(clicks_raw['item_id'])

# Same treatment for the category strings.
category_encoder = LabelEncoder()
clicks_raw['category'] = category_encoder.fit_transform(clicks_raw['category'])

clicks_raw.head()

Unnamed: 0,session_id,timestamp,item_id,category
0,1,2014-04-07T10:51:09.277Z,2053,0
1,1,2014-04-07T10:54:09.868Z,2052,0
2,1,2014-04-07T10:54:46.998Z,2054,0
3,1,2014-04-07T10:57:00.306Z,9876,0
4,2,2014-04-07T13:56:37.614Z,19448,0


##### Get Label
We will generate a label column that is true whenever a session_id from clicks_raw also exists in buys_raw.

In [18]:
# A click is labelled True when its session id also occurs in the buy events.
clicks_raw['label'] = clicks_raw['session_id'].isin(buys_raw['session_id'])
# Show seven random rows to inspect the new column.
clicks_raw.sample(n=7)

Unnamed: 0,session_id,timestamp,item_id,category,label
1443699,490143,2014-04-07T19:42:09.465Z,43898,0,False
27204835,9309854,2014-08-27T17:49:41.695Z,49804,338,False
27668607,9612643,2014-08-29T19:52:29.603Z,21518,0,False
5117910,1667823,2014-04-27T12:08:33.573Z,41222,0,True
26627031,9278102,2014-08-29T16:25:43.190Z,49838,338,True
16040658,5340766,2014-06-28T10:12:25.797Z,41963,338,False
24388918,8147532,2014-08-13T13:58:52.714Z,31127,329,False


In [19]:
# Group the click events by session so each group holds one session's clicks.
grouped = clicks_raw.groupby('session_id')

In [None]:
# Number of session_id groups, i.e. distinct sessions in clicks_raw.
len(grouped)

#### Create Pytorch Geometric Dataset
Here, we treat each item in a session as a node, so all items in the same session form a graph. To build the dataset, we group the preprocessed data by session_id and iterate over these groups. In each iteration, the item_ids in each group are categorically encoded again because, for each graph, the node indices should count from 0.

In [None]:
class YooChooseBinaryDataset(InMemoryDataset):
    """In-memory PyTorch Geometric dataset with one graph per click session.

    Every session found in the module-level ``clicks_raw`` frame becomes one
    graph: the nodes are the distinct items clicked in that session (node
    feature = the globally encoded ``item_id``), the edges connect each click
    to the click that follows it, and the graph label is the session's
    ``label`` value.

    NOTE(review): the processed file name mentions "1M_sess", but
    ``process`` consumes every session present in ``clicks_raw`` — confirm
    whether ``clicks_raw`` is meant to be subsampled beforehand.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        """Initialise the base dataset, then load the collated tensors."""
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """No raw files are declared; the data comes from ``clicks_raw``."""
        return []

    @property
    def processed_file_names(self):
        """Name of the single file that stores the collated dataset."""
        return ['./yoochoose_click_binary_1M_sess.dataset']

    def download(self):
        """Nothing to download."""
        pass

    def process(self):
        """Build one ``Data`` graph per session and save the collated result."""
        data_list = []

        # process by session_id
        for _, group in tqdm(clicks_raw.groupby('session_id')):
            # np.unique returns the sorted distinct item ids of the session
            # and, for every click, the position of its item in that sorted
            # array. Those positions are the per-graph node indices (counting
            # from 0) and the sorted ids are the node features, in node order.
            item_ids, sess_item_id = np.unique(
                group['item_id'].values, return_inverse=True
            )
            x = torch.as_tensor(item_ids, dtype=torch.long).unsqueeze(1)

            # Each click points to the next click of the same session. The
            # arrays are stacked in NumPy first because building a tensor
            # from a Python list of ndarrays is very slow. A one-click
            # session yields an empty (2, 0) edge index.
            edge_index = torch.as_tensor(
                np.stack([sess_item_id[:-1], sess_item_id[1:]]),
                dtype=torch.long,
            )

            # The label is read from the first row of the session.
            y = torch.tensor([float(group['label'].values[0])], dtype=torch.float)

            data_list.append(Data(x=x, edge_index=edge_index, y=y))

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
# Instantiate the dataset with '../' as its root directory.
# NOTE(review): process() presumably runs only when the processed file is not
# already present under the root — confirm against the InMemoryDataset docs.
dataset = YooChooseBinaryDataset(root='../')

#### Shuffle data and separate train/val/test

In [None]:
# Shuffle once, then cut the dataset by position into 80% train, 10%
# validation and 10% test. The cut points are derived from the dataset size
# (integer arithmetic, so no float rounding); for exactly 1,000,000 graphs
# they are 800000 and 900000.
dataset = dataset.shuffle()
num_graphs = len(dataset)
train_end = num_graphs * 8 // 10
val_end = num_graphs * 9 // 10
train_dataset = dataset[:train_end]
val_dataset = dataset[train_end:val_end]
test_dataset = dataset[val_end:]
len(train_dataset), len(val_dataset), len(test_dataset)