## Predspracovanie dát a ETL

In [1]:
# dependencies
import numpy as np
import json
import pandas as pd
from google.protobuf import json_format
import tensorflow_data_validation as tfdv
from tensorflow.python.lib.io import file_io

In [2]:
# local modules
import sys
sys.path.append("..")

from _ import constants
from _.functions import drop, generate_statistics_from_parquet

In [3]:
def _load_or_drop(raw_path, dropped_path):
    """Return the reduced frame for ``raw_path``, cached at ``dropped_path``.

    If ``dropped_path`` does not exist yet, ``drop(raw_path)`` is called and
    its result is written to ``dropped_path`` as parquet (without the index).
    Otherwise the cached parquet file is read back instead.
    """
    if dropped_path.exists():
        return pd.read_parquet(dropped_path)
    frame = drop(raw_path)
    frame.to_parquet(dropped_path, index=False)
    return frame


# drop redundant data only when the cached parquet files do not exist yet
df_train = _load_or_drop(constants.TRAIN, constants.TRAIN_DROPPED)
df_test = _load_or_drop(constants.TEST, constants.TEST_DROPPED)

In [4]:
# preview the first 10 rows of the training frame
df_train.head(10)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


### Úprava multi-value atribútov

Rozdelenie "|" separovaných hodnôt na list.

In [5]:
# Split only real strings. Missing values (None/NaN) and values that are
# already lists pass through unchanged, so the cell is also safe to re-run.
df_train['impressions'] = df_train['impressions'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

In [6]:
# Split only real strings. Missing values (None/NaN) and values that are
# already lists pass through unchanged, so the cell is also safe to re-run.
df_train['prices'] = df_train['prices'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

In [7]:
# Split only real strings. Missing values (None/NaN) and values that are
# already lists pass through unchanged, so the cell is also safe to re-run.
df_train['current_filters'] = df_train['current_filters'].apply(
    lambda value: value.split('|') if isinstance(value, str) else value
)

### Prvá interakcia s referenciou (položkou)

Pridanie atribútu "first_interaction" s hodnotou vyjadrujúcou timestamp, kedy bola prvá interakcia s položkou, pre RFM analýzu.

In [8]:
# Earliest timestamp per reference, computed only over references that are
# purely numeric strings. Non-string references (e.g. missing values) are
# skipped instead of raising AttributeError on `.isnumeric()`.
is_numeric_reference = df_train['reference'].apply(
    lambda ref: isinstance(ref, str) and ref.isnumeric()
)
first_interaction = (
    df_train[is_numeric_reference]
    .groupby('reference')
    .agg({'timestamp': 'min'})
    .reset_index()
    .rename(columns={'timestamp': 'first_interaction'})
)

In [9]:
# left join on reference: every row of df_train is kept, and rows whose
# reference has no entry in first_interaction get a missing value
df_train = pd.merge(
    df_train,
    first_interaction,
    how='left',
    on=['reference'],
).reset_index(drop=True)

In [10]:
# preview the frame including the merged first_interaction column
df_train.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,first_interaction
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1541038000.0
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1541038000.0
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1541038000.0
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1541033000.0


## Enkódovanie 

### Enkódovanie kategorických atribútov

In [11]:
# columns that are treated as categorical attributes
categorical_attributes = [
    'user_id',
    'session_id',
    'action_type',
    'platform',
    'city',
]

In [12]:
# replace every categorical attribute by the integer code of its category
for attribute in categorical_attributes:
    df_train[attribute] = df_train[attribute].astype('category').cat.codes

In [13]:
# preview the frame after the categorical attributes were replaced by codes
df_train.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,first_interaction
0,478,469138,1541037460,1,9,Newtown,4,26837,mobile,,,,
1,478,469138,1541037522,2,4,666856,4,26837,mobile,,,,1541038000.0
2,478,469138,1541037522,3,4,666856,4,26837,mobile,,,,1541038000.0
3,478,469138,1541037532,4,4,666856,4,26837,mobile,,,,1541038000.0
4,478,469138,1541037532,5,4,109038,4,26837,mobile,,,,1541033000.0


### One-hot enkódovanie

Zamýšľali sme sa nad one-hot enkódovaním aj atribútu platform, ale keďže nie je z nášho pohľadu až tak dôležitý pre predikciu a obsahuje 55 rozličných hodnôt, rozhodli sme sa len pre device.

In [14]:
# The dtype is pinned explicitly: pandas >= 2.0 makes get_dummies return bool
# columns by default, while older versions return uint8 (0/1). Pinning it keeps
# the written parquet file independent of the pandas version.
df_train = pd.get_dummies(df_train, columns=["device"], dtype=np.uint8)

In [15]:
# preview the frame after one-hot encoding of the device column
df_train.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,current_filters,impressions,prices,first_interaction,device_desktop,device_mobile,device_tablet
0,478,469138,1541037460,1,9,Newtown,4,26837,,,,,0,1,0
1,478,469138,1541037522,2,4,666856,4,26837,,,,1541038000.0,0,1,0
2,478,469138,1541037522,3,4,666856,4,26837,,,,1541038000.0,0,1,0
3,478,469138,1541037532,4,4,666856,4,26837,,,,1541038000.0,0,1,0
4,478,469138,1541037532,5,4,109038,4,26837,,,,1541033000.0,0,1,0


## Výstup

Zoradenie podľa stepu v každej session a zahodenie atribútu step

In [16]:
# sort_values returns a new frame and leaves df_train untouched, so the result
# has to be assigned back; otherwise the rows stay in their original order
df_train = df_train.sort_values(['session_id', 'step'])
# no longer needed column
del df_train['step']

In [17]:
# persist the preprocessed training frame as parquet, without the index
df_train.to_parquet(constants.TRAIN_PREPROCESSED, index=False)

### Vytvorenie schémy 

In [18]:
# statistics of the preprocessed parquet file, produced by the project helper;
# they are the input of tfdv.infer_schema and tfdv.visualize_statistics
df_stats = generate_statistics_from_parquet(constants.TRAIN_PREPROCESSED)





Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [19]:
# infer a schema from the computed statistics
schema = tfdv.infer_schema(df_stats)

In [20]:
# visualize the computed statistics
tfdv.visualize_statistics(df_stats)

In [25]:
# Workaround until https://github.com/NVIDIA-Merlin/Transformers4Rec/issues/357 is fixed:
# every feature entry of the schema gets an empty 'annotation' field before the
# schema is written out as JSON.
def add_annotation(x):
    """Set ``x['annotation']`` to an empty dict and return the same mapping."""
    x['annotation'] = {}
    return x

schema_json = json.loads(json_format.MessageToJson(schema))
schema_json['feature'] = [add_annotation(feature) for feature in schema_json['feature']]

serialized_schema = json.dumps(schema_json)
file_io.write_string_to_file(constants.SCHEMA, serialized_schema)