# Download files from Criteo Dataset

For this test, only one file from the Criteo dataset was downloaded (1 day).  
You can adjust the variable "NUMBER_DAYS" to download more days.
These tests are run locally in a conda environment and in a container. To install the latest version of the NVTabular software, please refer to https://github.com/NVIDIA-Merlin/NVTabular.

In [None]:
# Download 1 file from Criteo website
import os

from nvtabular.utils import download_file

# How many daily files to fetch, starting at day 0.
NUMBER_DAYS = 1
# Local root folder for the data used by this notebook.
BASE_DIR = "/home/renatoleite/data"
# Sub-folder that receives the original downloaded files.
input_path = os.path.join(BASE_DIR, "crit_orig")

In [None]:
# Make sure the base folder and the input folder exist (in that order),
# reporting for each one whether it had to be created.
for directory in (BASE_DIR, input_path):
    if os.path.exists(directory):
        print(f'Directory "{directory}" already exists')
        continue
    os.makedirs(directory)
    print(f'Directory "{directory}" created')

In [None]:
# Fetch each requested day, skipping days that are already available locally.
for day in range(NUMBER_DAYS):
    archive = os.path.join(input_path, f"day_{day}.gz")

    # A day counts as present when the .gz archive, a converted .parquet
    # file, or the file without the .gz suffix already exists.
    known_locations = (
        archive,
        archive.replace(".gz", ".parquet").replace("crit_orig", "converted/criteo/"),
        archive.replace(".gz", ""),
    )
    if any(os.path.exists(location) for location in known_locations):
        continue

    download_file(
        f"http://azuremlsampleexperiments.blob.core.windows.net/criteo/day_{day}.gz",
        archive,
    )

In [None]:
# Cloud Storage location that receives the downloaded files.
BUCKET_NAME = "gs://renatoleite-nvtabular/crit_orig_csv"

In [None]:
# Copy everything found in input_path to BUCKET_NAME using gsutil.
! gsutil -m cp $input_path/* $BUCKET_NAME

# Data Analysis
Analysis of one Criteo file.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Column names for the file: "label", then I1..I13, then C1..C26.
HEADER = (
    ["label"]
    + [f"I{n}" for n in range(1, 14)]
    + [f"C{n}" for n in range(1, 27)]
)

In [None]:
# Read only the first sample_size rows of day 0 from the bucket.
# The file is tab-separated and the column names come from HEADER.
sample_size = 500_000

day1_dataset = pd.read_csv(
    BUCKET_NAME + "/day_0.gz", names=HEADER, sep="\t", nrows=sample_size
)

In [None]:
# Show the first five rows, transposed so that each column is displayed as a row.
day1_dataset.head(5).transpose()

In [None]:
# Count how many of the sampled rows carry each label value.
day1_dataset["label"].value_counts()

In [None]:
# Display pandas' descriptive statistics for the sampled rows.
day1_dataset.describe()

# Load file to BigQuery Table

In [None]:
# Google Cloud project and region.
PROJECT = "renatoleite-mldemos"
REGION = "us-central1"

# Cloud Storage folder holding the files to load, and how many of them to load.
DATASET_GCS_LOCATION = "gs://renatoleite-nvtabular/crit_orig_csv"
NUM_FILES = 1

# Destination BigQuery dataset and table names.
BQ_DATASET_NAME = "criteo"
BQ_TRAIN_TABLE_NAME = "train"
BQ_VALID_TABLE_NAME = "valid"

In [None]:
# Create the BigQuery dataset BQ_DATASET_NAME inside PROJECT.
# NOTE(review): the location is hard-coded to US; the REGION variable is not
# used by this command. Confirm that this is intended.
!bq --location=US mk -d \
$PROJECT:$BQ_DATASET_NAME

In [None]:
# Build the comma-separated "name:TYPE" schema string used to load the data:
# columns whose name contains "C" are STRING, every other column is INTEGER.
schema = ",".join(
    f"{name}:STRING" if "C" in name else f"{name}:INTEGER"
    for name in HEADER
)

In [None]:
# Comma-separated list of double-quoted Cloud Storage URIs, one per day file,
# for the first NUM_FILES days.
# The files are stored under their compressed names (day_<n>.gz): that is the
# name the files are downloaded as, and the name the pandas sample is read
# from in this same bucket. The URI therefore needs the ".gz" suffix.
train_files = ",".join(
    f'"{DATASET_GCS_LOCATION}/day_{day}.gz"' for day in range(NUM_FILES)
)
print(train_files)

In [None]:
# Load the files listed in train_files into the training table as
# tab-delimited CSV, passing the schema string as the last argument.
# NOTE(review): --autodetect is passed together with an explicit schema;
# confirm which of the two bq applies.
!bq load \
    --source_format=CSV \
    --field_delimiter=tab \
    --autodetect \
    --replace \
    {BQ_DATASET_NAME}.{BQ_TRAIN_TABLE_NAME} \
    {train_files} \
    {schema}

In [None]:
# Load day 0 into the validation table as tab-delimited CSV, passing the
# schema string as the last argument.
# The file is stored under its compressed name (day_0.gz), which is the name
# the pandas sample is read from in this same bucket, so the URI needs the
# ".gz" suffix.
# NOTE(review): --autodetect is passed together with an explicit schema;
# confirm which of the two bq applies.
!bq load \
    --source_format=CSV \
    --field_delimiter=tab \
    --autodetect \
    --replace \
    {BQ_DATASET_NAME}.{BQ_VALID_TABLE_NAME} \
    {DATASET_GCS_LOCATION}/day_0.gz \
    {schema}