# FINAL PREPROCESS
1. Read CSV
2. Fit the tokenizer (if it has already been fitted, load it from disk instead)
3. Create a parquet file called samples
4. Save the parquet

In [1]:
import os
import numpy as np
import pandas as pd
import multiprocessing
import sys
import json
from tqdm import tqdm
import shutil
from sklearn.model_selection import train_test_split

#os.chdir("/home/ec2-user/SageMaker/foodi-ml/notebooks/")

In [2]:
import torch
torch.__version__

'1.1.0'

In [3]:
# Switch the working directory to the foodi-ml repository root before importing from it.
# NOTE(review): presumably this is what makes the project-local `retrieval` package
# importable — confirm it is not installed separately.
os.chdir("/home/ec2-user/SageMaker/foodi-ml/")
from retrieval.data.tokenizer import Tokenizer

# Exploration of execution

Execution 

```bash
cd /home/ec2-user/SageMaker/foodi-ml
source activate python3
export DATA_PATH=/home/ec2-user/SageMaker/dataset/
python run.py options/adapt/foodi-ml/i2t.yaml
python test.py options/adapt/foodi-ml/i2t.yaml

#watch -n 1 "nvidia-smi"
```

# Configurations

In [4]:
# Root locations. Every path in `conf` is derived from these two constants so that
# moving the dataset or the repository only requires editing this cell.
PATH_DATA = '/home/ec2-user/SageMaker/dataset/'
PATH_FOODI = '/home/ec2-user/SageMaker/foodi-ml'
DATASET_CSV = 'glovo-foodi-ml-dataset.csv'
VOCAB_RELATIVE_PATH = '.vocab_cache/foodiml_vocab.json'

conf = {
    # Remote location of the raw dataset
    "S3_BUCKET": 'glovo-products-dataset-d1c9720d',
    "S3_KEY_DATASET": DATASET_CSV,
    # Local inputs / outputs used by this notebook
    "LOCAL_RAW_DATASET": os.path.join(PATH_DATA, DATASET_CSV),
    "LOCAL_DATASET": os.path.join(PATH_DATA, 'samples'),
    "LOCAL_IMAGES": os.path.join(PATH_DATA, 'dataset'),
    "LOCAL_VOCAB": os.path.join(PATH_FOODI, VOCAB_RELATIVE_PATH),
    # `pth_*` keys: same locations as above, previously repeated as absolute literals.
    # The resulting values are unchanged (including the trailing slashes).
    "pth_dwn_samples": PATH_DATA,
    "pth_dwn_images": os.path.join(PATH_DATA, 'dataset', ''),  # '' keeps the trailing '/'
    "pth_vocab": os.path.join(PATH_FOODI, VOCAB_RELATIVE_PATH),
    "pth_dataset_json": os.path.join(PATH_DATA, 'dataset_foodiml.json'),
}

# Read all samples

In [5]:
# Load the raw dataset CSV from the location configured in `conf`
raw_csv_path = conf["LOCAL_RAW_DATASET"]
samples = pd.read_csv(raw_csv_path)

In [6]:
# (rows, columns) of the raw CSV as loaded
samples.shape

(2887444, 13)

In [7]:
# Preview the first rows to eyeball the columns and their contents
samples.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,country_code,city_code,store_name,product_name,collection_section,product_description,subset,hash,aux_store,HIER,s3_path
0,0,0,CL,STG,AS_000,Savital Shampoo Keratina Y Sabila 550Ml (00001...,Shampoo,,train,3509449892161349181,True,False,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...
1,1,1,KE,NRK,Cold Stone Creamery Cart,Chocolate Layer Cake™,Promotions - Free waffles on every ice cream p...,Sweet Cream Ice Cream with Crumbles of Chocola...,train,1077526765743747663,False,True,/home/ec2-user/SageMaker/dataset/dataset/YKKVT...
2,2,2,UA,DNP,Khinkali & Khachapuri / Хинкали & Хачапури,Асорті на компанію,ЗАКУСКИ,"балик домашній,грудинка копчена,паштет з печін...",train,-4606644841517710049,False,True,/home/ec2-user/SageMaker/dataset/dataset/ZFSHH...
3,3,3,CL,STG,AS_001,Suav Vivere Hierbas F Jazmin Pouch 900Cc,Suavizantes,,train,-3420249778599023770,True,False,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...
4,4,4,CL,STG,AS_000,Leoncio Y El Doctor Veterinario,Infantil,,train,2295647601757404193,True,False,/home/ec2-user/SageMaker/dataset/dataset/NZTCK...


In [8]:
# Check that the images in the dataframe refer to the correct path
samples["s3_path"].iloc[0]

'/home/ec2-user/SageMaker/dataset/dataset/NZTCKFL_0017467_1193055503.png'

Some images (~30) are listed in the CSV file but are missing from the S3 bucket, and therefore from the local copy. We find them below and remove the corresponding rows.

In [9]:
# Collect every distinct image path referenced by the CSV that does not exist on disk,
# printing each one as it is found.
missing_images = []
for image_path in samples["s3_path"].unique():
    if os.path.exists(image_path):
        continue
    print(image_path)
    missing_images.append(image_path)

/home/ec2-user/SageMaker/dataset/dataset/LNQBTMC_0092690_1654903940.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0040488_1098020161.png
/home/ec2-user/SageMaker/dataset/dataset/PPKWSJR_0004646_60587075.png
/home/ec2-user/SageMaker/dataset/dataset/BSCRBMV_0011116_1220714661.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0214351_1380823197.png
/home/ec2-user/SageMaker/dataset/dataset/BFDKZRG_0032872_617098369.png
/home/ec2-user/SageMaker/dataset/dataset/BFDKZRG_0036209_809223163.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0211327_1380809266.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0213966_1380821411.png
/home/ec2-user/SageMaker/dataset/dataset/PPKWSJR_0002867_60584254.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0206886_1380791874.png
/home/ec2-user/SageMaker/dataset/dataset/QPCYVVZ_0009860_1097897031.png
/home/ec2-user/SageMaker/dataset/dataset/FZKQYZG_0004222_1586626572.png
/home/ec2-user/SageMaker/dataset/dataset/VJTTJQD_0211294_1380809134.pn

In [10]:
# Drop every row whose image was not found on disk, then show the remaining shape
has_missing_image = samples["s3_path"].isin(missing_images)
samples = samples[~has_missing_image]
samples.shape

(2887399, 13)

In [11]:
# List the available columns before building the captions
samples.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'country_code', 'city_code', 'store_name',
       'product_name', 'collection_section', 'product_description', 'subset',
       'hash', 'aux_store', 'HIER', 's3_path'],
      dtype='object')

## 1) Create sentences

In [12]:
# Build the caption ("sentence") of every product: name, collection section and
# description joined by single spaces and lower-cased.
#
# Missing values are replaced by "" in ALL three text columns first. Only
# `product_description` used to be filled; a NaN name/section is truthy for
# `np.where`, so it was stringified and the literal token "nan" ended up in the
# caption (and therefore in the fitted vocabulary).
TEXT_COLUMNS = ["product_name", "collection_section", "product_description"]

# Selecting + fillna returns a new frame, so nothing is mutated through a chained
# `inplace=True` call on a filtered DataFrame (SettingWithCopyWarning today, and a
# silent no-op under pandas copy-on-write).
text_parts = samples[TEXT_COLUMNS].fillna("").astype(str)

samples = samples.assign(
    # Keep the original side effect: descriptions no longer contain NaN afterwards
    product_description=samples["product_description"].fillna(""),
    sentence=(
        text_parts["product_name"] + " "
        + text_parts["collection_section"] + " "
        + text_parts["product_description"]
    ).str.lower(),
).rename(columns={'Unnamed: 0': 'idx'})

## 1.1) Fit Tokenizer (only if it is not fitted)

In [13]:
# 1) Get all sentences as an array; this is what the tokenizer is fitted on below
sentences = samples["sentence"].values

In [14]:
# Fit the tokenizer only when no vocabulary has been saved yet, as the section title
# says. The guard used to be commented out in favour of `if True:`, which re-fitted the
# tokenizer on every run (6-7 min on the full dataset).
# Set FORCE_REFIT_TOKENIZER = True whenever the way `sentences` is built has changed,
# otherwise a stale vocabulary file is reused.
FORCE_REFIT_TOKENIZER = False

if FORCE_REFIT_TOKENIZER or not os.path.isfile(conf["LOCAL_VOCAB"]):
    # 2) Fit Tokenizer with sentences (CAREFUL, takes 6-7 min)
    tokenizer = Tokenizer(vocab_path=None, download_tokenizer=True)
    # Fit tokenizer
    vocab = tokenizer.fit(sentences)
    # 3) Saving vocabulary
    # Make sure the cache directory exists first (harmless if Tokenizer.save
    # already creates it).
    os.makedirs(os.path.dirname(conf["LOCAL_VOCAB"]), exist_ok=True)
    tokenizer.save(conf['LOCAL_VOCAB'])

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
100%|██████████| 2887399/2887399 [06:49<00:00, 7048.82it/s]


In [15]:
# 4) Load the tokenizer from the vocabulary file saved on disk,
#    then display the number of entries in its vocabulary
tokenizer = Tokenizer(vocab_path=conf["LOCAL_VOCAB"], download_tokenizer=True)
len(tokenizer.vocab)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


245962

## 2) Creating dataset parquet
In this section we create a parquet file that contains the DataFrame with all the captions and images of our dataset. This parquet file **will be consumed by our DataLoader class.**

In [16]:
# Keep only the columns needed for modelling. The current row index becomes a regular
# column and is exposed as `img_id`; `sentence` -> `caption`, `subset` -> `split`.
MODEL_COLUMNS = ["sentence", "s3_path", "subset", "country_code"]
COLUMN_RENAMES = {"index": "img_id", "sentence": "caption", "subset": "split"}

final_samples = (
    samples[MODEL_COLUMNS]
    .reset_index()
    .rename(columns=COLUMN_RENAMES)
)

In [17]:
# Distinct split labels present in the dataset
final_samples["split"].unique()

array(['train', 'val', 'test'], dtype=object)

In [18]:
# Count the rows labelled as training samples
num_samples_train = final_samples["split"].eq("train").sum()
num_samples_train

2021174

### Action required: set subsample to True or False 

In [19]:
subsample = True # Set me
subsample_size = 10000
SUBSAMPLE_SEED = 42  # fixed seed: the subsample and its split are reproducible across runs
if subsample:
    # Restrict to products from Spain and draw at most `subsample_size` of them
    # (never more rows than exist, which would make `.sample` raise).
    final_samples_ES = final_samples[final_samples["country_code"]=="ES"]
    n_subsample = min(subsample_size, len(final_samples_ES))
    final_samples_ES = final_samples_ES.sample(n_subsample, random_state=SUBSAMPLE_SEED)

    # 70% train / 15% val / 15% test
    train, val_test = train_test_split(
        final_samples_ES, test_size=int(n_subsample*0.3), random_state=SUBSAMPLE_SEED
    )
    val, test = train_test_split(
        val_test, test_size=int(val_test.shape[0]*0.5), random_state=SUBSAMPLE_SEED
    )

    # Write the new partition into the `split` column. Without this the rows keep the
    # labels that came with the CSV, and since the parquet file is partitioned by
    # `split`, the 70/15/15 split computed above would be silently discarded.
    train = train.assign(split="train")
    val = val.assign(split="val")
    test = test.assign(split="test")

    print(train.shape, val.shape, test.shape)
    final_samples = pd.concat([train, val, test])

(7000, 5) (1500, 5) (1500, 5)


In [20]:
# Start from a clean slate: delete the parquet directory left by a previous run, if any
previous_dataset = conf["LOCAL_DATASET"]
if os.path.exists(previous_dataset):
    shutil.rmtree(previous_dataset)

In [21]:
# Save as partitioned parquet:
#   - written to conf["LOCAL_DATASET"] using the pyarrow engine
#   - partitioned by the `split` column
#   - index=False: the DataFrame index is not stored
final_samples.to_parquet(
    path=conf["LOCAL_DATASET"],
    engine="pyarrow",
    index=False,
    partition_cols=["split"],
)