In [None]:
import os
# Restrict CUDA to a single physical device. This cell runs before the cell
# that imports torch / cudf.
GPU_id = 3
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'

In [None]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

import nvtabular as nvt
from nvtabular.ops import Normalize, FillMissing, Categorify, Moments, Median, Encoder, LogOp, ZeroFill
from nvtabular.torch_dataloader import FileItrDataset, DLCollator, DLDataLoader

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Display the installed torch and cudf versions.
torch.__version__, cudf.__version__

In [None]:
%load_ext snakeviz
# Optional: loads the snakeviz IPython extension; only needed if you want to run profiling.

<h3> Dataset Gathering: Define files in the training and validation datasets. </h3>

In [None]:
# Root directory of the dataset and the two sub-directories holding the
# training / validation "part*" files.
data_path = '/datasets/outbrain/jp_out/output/'
df_valid = 'validation_feature_vectors_integral.csv/'
df_train = 'train_feature_vectors_integral_eval.csv/'


def _list_part_files(directory, limit):
    """Return the paths of up to `limit` "part*" files found in `directory`.

    os.listdir returns names in arbitrary order, so the names are sorted
    before slicing; this makes the selected subset the same on every run.
    `directory` must end with a path separator because names are appended
    by plain string concatenation.
    """
    part_names = sorted(x for x in os.listdir(directory) if x.startswith("part"))
    return [directory + x for x in part_names[:limit]]


train_set = _list_part_files(data_path + df_train, 20)
valid_set = _list_part_files(data_path + df_valid, 5)

In [None]:
# Number of training and validation part files that were selected.
len(train_set), len(valid_set)

<h4>Grab column information</h4>

In [None]:
# Column names are read from the header file stored next to the training data
# (split on line boundaries). The context manager closes the file handle,
# which the bare open(...).read() form left open.
with open(data_path + 'train_feature_vectors_integral_eval.csv.header') as header_file:
    cols = header_file.read().splitlines()

In [None]:
# Candidate categorical columns. The original list named 'doc_event_entity_id'
# twice, which would register the same column twice in the workflow; each
# name now appears once.
cat_names = [
    'display_id', 'is_leak', 'doc_event_id', 'ad_id', 'doc_id',
    'doc_ad_entity_id', 'doc_event_entity_id',
    'doc_ad_source_id', 'doc_event_source_id',
    'event_geo_location', 'ad_advertiser', 'event_country_state',
    'doc_ad_publisher_id', 'doc_event_publisher_id',
    'doc_ad_topic_id', 'doc_event_topic_id', 'event_country',
    'doc_ad_category_id', 'doc_event_category_id',
    'event_hour', 'event_platform', 'traffic_source', 'event_weekend',
    'user_has_already_viewed_doc',
]
# Candidate continuous columns.
cont_names = [
    'pop_ad_id_conf', 'pop_document_id_conf',
    'user_doc_ad_sim_categories_conf', 'user_doc_ad_sim_topics_conf',
    'pop_publisher_id_conf', 'pop_advertiser_id_conf', 'pop_campaign_id_conf',
    'pop_source_id_conf', 'pop_entity_id_conf', 'pop_topic_id_conf',
    'pop_category_id_conf',
    'pop_ad_id', 'pop_document_id', 'pop_publisher_id', 'pop_advertiser_id',
    'pop_campaign_id', 'pop_source_id', 'pop_entity_id', 'pop_topic_id',
    'pop_category_id',
    'user_doc_ad_sim_categories', 'user_doc_ad_sim_topics',
    'user_doc_ad_sim_entities',
    'doc_event_doc_ad_sim_categories', 'doc_event_doc_ad_sim_topics',
    'doc_event_doc_ad_sim_entities',
    'user_views', 'ad_views', 'doc_views',
    'doc_event_days_since_published', 'doc_event_hour',
    'doc_ad_days_since_published',
]
# Keep only the names present in the file header. dict.fromkeys drops any
# repeated name while preserving the declared order.
cat_names = [name for name in dict.fromkeys(cat_names) if name in cols]
cont_names = [name for name in dict.fromkeys(cont_names) if name in cols]


<h3>Preprocessing:</h3> <p>Create the preprocessing workflow object, select the operations to perform, create the dataset iterator objects, and collect statistics on the training dataset.</p>

In [None]:
# Give every categorical column a frequency threshold of 1.
# NOTE(review): no later cell in this notebook reads this dict; the Categorify
# op is configured with freq_threshold=1 directly.
freq_threshes = dict.fromkeys(cat_names, 1)
freq_threshes

In [None]:
%%time
# Workflow object that is told which columns are categorical, which are
# continuous, and which one is the label.
proc = nvt.Workflow(
    cat_names=cat_names,
    cont_names=cont_names,
    label_name=['label'],
)

In [None]:
# Continuous columns: FillMissing followed by Normalize.
proc.add_cont_preprocess(
    [
        FillMissing(replace=True),
        Normalize(replace=True),
    ]
)
# Categorical columns: frequency-based Categorify with a threshold of 1.
proc.add_cat_preprocess(
    Categorify(replace=True, use_frequency=True, freq_threshold=1)
)

In [None]:
%%time
# Dataset iterators over the raw CSV part files. The files carry no header
# row of their own here: column names are supplied via `names=cols`.
trains_itrs = nvt.dataset(
    train_set,
    names=cols,
    engine='csv',
)
valids_itrs = nvt.dataset(
    valid_set,
    names=cols,
    engine='csv',
)

In [None]:
# Directories passed as `output_path` to the proc.apply calls and later
# scanned for the resulting parquet files.
output_path_train = './jp_outbrains/train'
output_path_valid = './jp_outbrains/valid'

In [None]:
%%time
# Apply the workflow to the training data, writing results to
# output_path_train. record_stats=True: this is the pass whose statistics
# the later cells read from `proc.stats`.
proc.apply(
    trains_itrs,
    apply_offline=True,
    record_stats=True,
    output_path=output_path_train,
    shuffle=False,
)

In [None]:
%%time
# Apply the same workflow to the validation data, writing results to
# output_path_valid. record_stats=False here, unlike the training pass.
proc.apply(
    valids_itrs,
    apply_offline=True,
    record_stats=False,
    output_path=output_path_valid,
    shuffle=False,
)

In [None]:
# Collect the parquet files found in the two output directories.
# os.listdir returns names in arbitrary order, so the paths are sorted to keep
# the file order (and therefore the unshuffled batch order) reproducible.
new_train_set = sorted(
    os.path.join(output_path_train, name)
    for name in os.listdir(output_path_train)
    if name.endswith("parquet")
)
new_valid_set = sorted(
    os.path.join(output_path_valid, name)
    for name in os.listdir(output_path_valid)
    if name.endswith("parquet")
)

<h5>Compute the embedding sizes from the category statistics recorded while preprocessing the training dataset.</h5>

In [None]:
# Ask the Categorify op for embedding sizes, based on the recorded category
# statistics and the base categorical columns, and keep the second element
# of every returned entry.
# NOTE(review): presumably each entry is a (column, size) pair — confirm
# against get_emb_sz.
embeddings = [
    entry[1]
    for entry in proc.df_ops['Categorify'].get_emb_sz(
        proc.stats["categories"],
        proc.columns_ctx['categorical']['base'],
    )
]

In [None]:
# Display the embedding entries computed in the previous cell.
embeddings

<h5>Create the file iterators using the FileItrDataset Class.</h5>

In [None]:
%%time
# One FileItrDataset per preprocessed parquet file, for each split.
t_batch_sets = [
    FileItrDataset(path, engine='parquet', batch_size=400000)
    for path in new_train_set
]
v_batch_sets = [
    FileItrDataset(path, engine='parquet', batch_size=400000)
    for path in new_valid_set
]

In [None]:
%%time
# Chain the per-file datasets into a single dataset per split.
t_chain = torch.utils.data.ChainDataset(t_batch_sets)
v_chain = torch.utils.data.ChainDataset(v_batch_sets)

In [None]:
# Inspect the workflow's final column context.
proc.columns_ctx['final']['ctx']

<h5>Use the Deep Learning Collator to create a collate function to pass to the dataloader.</h5>

In [None]:
%%time
# Collator built around the workflow object.
# NOTE(review): apply_ops=False presumably because the ops were already
# applied by the proc.apply calls (apply_offline=True) — confirm.
dlc = DLCollator(preproc=proc, apply_ops=False)

In [None]:
%%time
# Data loaders over the chained datasets; both use the collator's gdf_col as
# collate function and run in the main process (num_workers=0).
t_data = DLDataLoader(
    t_chain,
    collate_fn=dlc.gdf_col,
    pin_memory=False,
    num_workers=0,
)
v_data = DLDataLoader(
    v_chain,
    collate_fn=dlc.gdf_col,
    pin_memory=False,
    num_workers=0,
)

<h4>After creating the data loaders, you can leverage the fastai framework to create machine learning models.</h4>

In [None]:
# Wrap the training and validation loaders in a fastai DataBunch on the GPU.
databunch = DataBunch(
    t_data,
    v_data,
    collate_fn=dlc.gdf_col,
    device="cuda",
)

In [None]:
%%time
# Tabular model: one embedding per categorical column, len(cont_names)
# continuous inputs, two hidden layers (512 and 256 units), two outputs.
model = TabularModel(
    emb_szs=embeddings,
    n_cont=len(cont_names),
    out_sz=2,
    layers=[512, 256],
)

# Learner reporting accuracy; its loss function is then set to cross-entropy
# explicitly, to go with the two-output model.
learn = Learner(databunch, model, metrics=[accuracy])
learn.loss_func = torch.nn.CrossEntropyLoss()


In [None]:
# Run the learner's learning-rate finder; the next cell plots what it recorded.
learn.lr_find()

In [None]:
# Plot the recorder output, with momentum display and a learning-rate suggestion enabled.
learn.recorder.plot(show_moms=True, suggestion=True)

In [None]:
# Training hyperparameters passed to fit_one_cycle in the next cell.
# NOTE(review): the learning rate was presumably picked from the lr_find plot — confirm.
learning_rate = 2.75e-2
epochs = 1

In [None]:
# Train with the one-cycle schedule and record the elapsed wall-clock seconds.
start = time()
learn.fit_one_cycle(epochs, learning_rate)
t_final = time() - start

In [None]:
# Elapsed seconds measured around fit_one_cycle in the previous cell.
t_final