This notebook assumes that the steps mentioned in the glovo-foodi-ml-dataset README have been followed, that is:
* The glovo-foodi-ml-dataset-sample.csv has been downloaded
* The images are already downloaded in disk by downloading the mock_dataset_ES.zip from S3

In [4]:
import os
import numpy as np
import pandas as pd
import shutil

In [5]:
import torch
torch.__version__

'1.9.1+cu102'

In [6]:
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")

# Exploration of execution

Execution 

```{bash}
cd /home/ec2-user/SageMaker/foodi-ml
source activate python3
export DATA_PATH=/home/ec2-user/SageMaker/dataset/
python run.py options/adapt/foodi-ml/i2t.yaml
```

# Configurations

In [7]:
PATH_DATA = '/home/ec2-user/SageMaker/dataset/'
PATH_FOODI = '/home/ec2-user/SageMaker/foodi-ml'
DATASET_CSV = 'glovo-foodi-ml-dataset-sample.csv'

conf = {
    "LOCAL_RAW_DATASET": os.path.join(PATH_DATA, DATASET_CSV),
    "LOCAL_DATASET": os.path.join(PATH_DATA, 'samples'),
    "LOCAL_IMAGES": os.path.join(PATH_DATA, 'dataset'),
    "LOCAL_VOCAB": os.path.join(PATH_FOODI, '.vocab_cache/foodiml_vocab.json'),
    "pth_dwn_samples": '/home/ec2-user/SageMaker/dataset/',
    "pth_vocab": '/home/ec2-user/SageMaker/foodi-ml/.vocab_cache/foodiml_vocab.json',
}

# Read all samples

In [42]:
samples = pd.read_csv(os.path.join(conf['LOCAL_RAW_DATASET']))

In [43]:
samples.shape

(10000, 5)

In [10]:
samples.head()

Unnamed: 0.1,Unnamed: 0,img_id,caption,s3_path,split
0,2076195,1643078,crema oxigenada 30v har 60 ml coloracion nan,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...,train
1,1131612,698495,"la baitagalleano, composta di ribes e peperonc...",/home/ec2-user/SageMaker/dataset/dataset/YNGJL...,train
2,1584914,1151797,"pollo asado brie bocadillos pan gran reserva, ...",/home/ec2-user/SageMaker/dataset/dataset/BFDKZ...,train
3,1412771,979654,jus de fraise jus nan,/home/ec2-user/SageMaker/dataset/dataset/JVHVQ...,train
4,1831135,1398018,auricular bola rosa auriculares nan,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...,train


## 1) Create sentences

In [10]:
samples["sentence"] = \
    np.where(samples["product_name"], samples["product_name"].astype(str), "") + " " + \
    np.where(samples["collection_section"], samples["collection_section"].astype(str), "") + " " + \
    np.where(samples["product_description"], samples["product_description"].astype(str), "")

samples["sentence"] = samples["sentence"].str.lower()
samples.rename(columns={'Unnamed: 0': 'idx'}, inplace=True)

### 1.1) Fit tokenizer

In [53]:
os.chdir(PATH_FOODI)
from retrieval.data.tokenizer import Tokenizer

In [54]:
# 1) Get all sentences 
sentences = samples["sentence"].values

# 2) Fit Tokenizer with senteces (CAREFUL, takes 6-7 min)
tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
# Fit tokenize
vocab = tokenizer.fit(sentences)

100%|██████████| 2887444/2887444 [06:46<00:00, 7094.59it/s]


In [74]:
len(vocab.word2idx)

245967

In [57]:
%%time
# 3) Saving vocabulary
tokenizer.save(conf['LOCAL_VOCAB'])

CPU times: user 334 ms, sys: 0 ns, total: 334 ms
Wall time: 341 ms


In [75]:
# 4) Load if already saved
tokenizer = tokenizer.load(conf['LOCAL_VOCAB'])
len(tokenizer.vocab)

245967

In [59]:
# LOAD equivalent
tokenizer_2 = Tokenizer(vocab_path=conf["pth_vocab"], download_tokenizer=True)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
len(tokenizer_2.vocab)

245967

### 1.2) Creating dataset parquet

In [36]:
# Keep only necessary columns for modelling
final_samples = samples[["sentence", "s3_path", "subset"]].reset_index().copy()
final_samples.rename(columns={"sentence": "caption"}, inplace=True)
final_samples.rename(columns={"subset": "split"}, inplace=True)
final_samples.rename(columns={"index": "img_id"}, inplace=True)

KeyError: "['sentence', 'subset'] not in index"

In [41]:
# Remove previous version if it exists
if os.path.exists(conf["LOCAL_DATASET"]):
    shutil.rmtree(conf["LOCAL_DATASET"])

In [44]:
# Save as partitioned parquet
samples.to_parquet(
    path=conf["LOCAL_DATASET"],
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)