# Odporúčanie

In [1]:
# dependencies
import os
import pandas as pd
from transformers4rec import tf as tr
import tensorflow as tf
from transformers4rec.tf.ranking_metric import NDCGAt, RecallAt

from _ import constants
from _.functions import preprocess, generate_schema_from_parquet

In [8]:
# Training frame: reuse the cached parquet when present, otherwise
# preprocess the raw inputs and cache the result.
if constants.TRAIN_PREPROCESSED.exists():
    df_train = pd.read_parquet(constants.TRAIN_PREPROCESSED)
else:
    df_train = preprocess(constants.TRAIN_DROPPED, constants.TRAIN)
    df_train.to_parquet(constants.TRAIN_PREPROCESSED, index=False)

# Test frame: same cache-or-compute logic as for the training frame.
if constants.TEST_PREPROCESSED.exists():
    df_test = pd.read_parquet(constants.TEST_PREPROCESSED)
else:
    df_test = preprocess(constants.TEST_DROPPED, constants.TEST)
    df_test.to_parquet(constants.TEST_PREPROCESSED, index=False)

# Generate the schema from the preprocessed training parquet when
# constants.SCHEMA does not exist yet, then load it from that location.
if not constants.SCHEMA.exists():
    generate_schema_from_parquet(constants.TRAIN_PREPROCESSED, constants.SCHEMA)

schema = tr.Schema().from_json(constants.SCHEMA)



In [28]:
# Maximum sequence length; reused below by the data loader and the
# training arguments.
sequence_length = 20

# Input block derived from the schema, limited to `sequence_length`.
inputs = tr.TabularSequenceFeatures.from_schema(schema, max_sequence_length=sequence_length)

In [29]:
# Width shared by the MLP projection and the GRU layer.
d_model = 128

# Model body: input features -> MLP with one `d_model`-wide layer -> GRU.
body = tr.SequentialBlock([
    inputs,
    tr.MLPBlock([d_model]),
    tf.keras.layers.GRU(units=d_model),
])

In [30]:
# Next-item prediction task with weight tying enabled and AUC as metric.
next_item_task = tr.NextItemPredictionTask(
    weight_tying=True,
    metrics=[tf.keras.metrics.AUC],
)

# Attach the task to the body and wrap the head into the final model.
head = tr.Head(body, next_item_task)
model = tr.Model(head)

In [39]:
import transformers4rec.torch.utils
# `get_dataloader` below calls NVTabularDataLoader, which was never imported
# and therefore raised NameError on first use.
# NOTE(review): assumes the installed transformers4rec release exports
# NVTabularDataLoader from torch.utils.data_utils — confirm for your version.
from transformers4rec.torch.utils.data_utils import NVTabularDataLoader

# Names of the categorical and continuous sequence columns given to the loader.
x_cat_names, x_cont_names = ['product_id-list_seq'], []

# Per-column maximum sequence length, passed to the loader as `sparse_max`.
sparse_features_max = {
    fname: sequence_length
    for fname in x_cat_names + x_cont_names
}


def get_dataloader(path, batch_size=32):
    """Create a data loader for the training loop.

    Args:
        path: Parquet path(s), forwarded unchanged to
            ``NVTabularDataLoader.from_schema``.
        batch_size: Batch size forwarded to the loader (default 32).

    Returns:
        The object returned by ``NVTabularDataLoader.from_schema`` for the
        notebook-level ``schema``, ``sequence_length`` and sparse-column
        settings.
    """
    return NVTabularDataLoader.from_schema(
        schema,
        path,
        batch_size,
        max_sequence_length=sequence_length,
        sparse_names=x_cat_names + x_cont_names,
        sparse_max=sparse_features_max,
    )

In [None]:
from transformers4rec.config.trainer import T4RecTrainingArguments
from transformers4rec.torch import Trainer

# Training arguments consumed by the T4Rec Trainer.
train_args = T4RecTrainingArguments(
    output_dir="./tmp",
    max_sequence_length=sequence_length,
    num_train_epochs=3,
    learning_rate=0.00071,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    dataloader_drop_last=False,
    logging_steps=200,
    local_rank=-1,
    report_to=[],  # empty list to avoid logging metrics to Weights & Biases
)

In [None]:
# Create the T4Rec Trainer that drives training and evaluation.
# NOTE(review): `model` is assembled with the TensorFlow API
# (`from transformers4rec import tf as tr`), while `Trainer` is imported from
# `transformers4rec.torch` — confirm that this combination is intended.
trainer = Trainer(model=model, args=train_args, schema=schema, compute_metrics=True)

In [None]:
%%time
start_time_window_index = 1
final_time_window_index = 4
for time_index in range(start_time_window_index, final_time_window_index):
    # Set data
    time_index_train = time_index
    time_index_eval = time_index + 1
    train_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_train}/train.parquet"))
    eval_paths = glob.glob(os.path.join(OUTPUT_DIR, f"{time_index_eval}/valid.parquet"))

    # Initialize dataloaders
    trainer.train_dataloader = get_dataloader(train_paths, train_args.per_device_train_batch_size)
    trainer.eval_dataloader = get_dataloader(eval_paths, train_args.per_device_eval_batch_size)

    # Train on day related to time_index
    print('*'*20)
    print("Launch training for day %s are:" %time_index)
    print('*'*20 + '\n')
    trainer.reset_lr_scheduler()
    trainer.train()
    trainer.state.global_step +=1

    # Evaluate on the following day
    train_metrics = trainer.evaluate(metric_key_prefix='eval')
    print('*'*20)
    print("Eval results for day %s are:\t" %time_index_eval)
    print('\n' + '*'*20 + '\n')
    for key in sorted(train_metrics.keys()):
        print(" %s = %s" % (key, str(train_metrics[key])))
    trainer.wipe_memory()

In [None]:
# Write the model's metrics to results.txt, one `name:value` pair per line
# below a header line.
with open("results.txt", 'w') as results_file:
    results_file.write('GRU accuracy results:')
    results_file.write('\n')
    metrics = model.compute_metrics()
    for metric_name in metrics:
        # `.item()` is called on each metric value before formatting.
        results_file.write('%s:%s\n' % (metric_name, metrics[metric_name].item()))