In [None]:
#QUESTO VA ESEGUITO USANDO L'ENVIRONMENT "TEXT_TORCH" CHE HA PYTORCH, TORCHTEXT (che non è compatibile con la versioen di torchvision) E SPACY

In [None]:
import torch
import os
import json
from torch.utils.data import DataLoader, TensorDataset
import random
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
import torchtext
from torchtext import transforms
import lightning as lgn

from src_scratches.recipes_standardization.seq2seq_model import Encoder, Decoder, Seq2Seq, Seq2SeqDataset, Seq2SeqLightning, Seq2SeqTrainer
from settings.config import RECIPES_PATH, METADATA_FILENAME
from src_scratches.recipes_standardization.dictionaries import units, quantities_dict, modifiers


In [None]:
recipes = json.load(open(os.path.join(RECIPES_PATH, METADATA_FILENAME), 'r'))
raw_ingredients = pd.DataFrame(recipes)['ingredients'].explode().unique()
raw_ingredients

In [4]:
def synthetic_gen(num_samples):
    
    x = [0] * num_samples  
    y = [0] * num_samples  

    for i in range(num_samples):

        rnd_qty_str, rnd_qty_int = random.choice(list(quantities_dict.items()))

        no_units_present = random.choice([False, False, False, False, True])
        rnd_unit = random.choice(units)
        
        rnd_mod_present = random.choice([None, None, True])
        rnd_mod = random.choice(modifiers)

        rnd_ing = random.choice(raw_ingredients)

        # Build the output string, Y
        # e.g. {"qty": 36, "unit": "count", "item": "eggs", "mod": "scrambled"}
        if no_units_present:
            rnd_unit = 'count'  # For purposes of building Y

        if rnd_mod_present:
            y[i] = f'{{ qty: {rnd_qty_int} , unit: {rnd_unit} , item: {rnd_ing} , mod: {rnd_mod} }}'
        else:
            y[i] = f'{{ qty: {rnd_qty_int} , unit: {rnd_unit} , item: {rnd_ing} , mod: {None} }}'

        # Build the input string, X
        # e.g. "3 dozen scrambled eggs"
        mod_at_end = [False, True]
        rnd_mod_at_end = random.choice(mod_at_end)

        # avoiding double space
        if rnd_mod_present:
            if no_units_present:
                if rnd_mod_at_end:
                    x[i] = f'{rnd_qty_str} {rnd_ing} , {rnd_mod}'             # e.g. 3 eggs, scrambled
                else:
                    x[i] = f'{rnd_qty_str} {rnd_mod} {rnd_ing}'              # e.g. 3 scrambled eggs
            else:
                if rnd_mod_at_end:
                    x[i] = f'{rnd_qty_str} {rnd_unit} {rnd_ing} , {rnd_mod}'  # e.g. 3 cups eggs, scrambled
                else:
                    x[i] = f'{rnd_qty_str} {rnd_unit} {rnd_mod} {rnd_ing}'   # e.g. 3 cups scrambled eggs
        else:
            if no_units_present:
                x[i] = f'{rnd_qty_str} {rnd_ing}'                            # e.g. 3 eggs
            else:
                x[i] = f'{rnd_qty_str} {rnd_unit} {rnd_ing}'                 # e.g. 3 cups eggs

    return x, y

In [34]:
N_SAMPLES = 100000
TRAIN_SIZE = 0.95 # 95% of the data
BATCH_SIZE = 128
EPOCHS = 10
NUM_WORKERS = os.cpu_count()
data = synthetic_gen(N_SAMPLES)
data[0][:3], data[1][:3]
x, y = data

In [35]:
x[0], y[0]

('seven milliliters infused TACO BELLÂ® Thick & Chunky Medium Salsa',
 '{ qty: 7 , unit: milliliters , item: TACO BELLÂ® Thick & Chunky Medium Salsa , mod: infused }')

In [36]:
vocab_size = len(raw_ingredients) + len(modifiers) + len(units) + len(quantities_dict)
vocab_size

7336

In [37]:
def ceil_int(num, target_digit=1):
    num_digits = len(str(num))
    num = num / 10**(num_digits - target_digit) # e.g. 1234 -> 1.234
    return int(np.ceil(num)) * 10**(num_digits - target_digit) # e.g. 1.234 -> 2 * 1000 = 2000

vocab_size = ceil_int(vocab_size, 2)
vocab_size

7400

In [38]:
en_nlp = spacy.load("en_core_web_sm")
sos_token = "<sos>"
eos_token = "<eos>"
unk_token = "<unk>"
pad_token = "<pad>"

def tokenize_en(text, tokenizer=en_nlp, sos_token=sos_token, eos_token=eos_token):
    tokenized =  [token.text.lower() for token in tokenizer(text)]
    return [sos_token] + tokenized + [eos_token]

def tokenize_whitespaces(text, sos_token=sos_token, eos_token=eos_token):
    tokenized = text.lower().split()
    return [sos_token] + tokenized + [eos_token]
tokenize_en("3 cups scrambled eggs")

['<sos>', '3', 'cups', 'scrambled', 'eggs', '<eos>']

In [40]:

min_freq = 1
special_tokens = [sos_token, eos_token, unk_token, pad_token]

x_tokens = [tokenize_whitespaces(i) for i in tqdm(x)]
y_tokens = [tokenize_whitespaces(i) for i in tqdm(y)]

train_len = int(len(x_tokens) * TRAIN_SIZE)

x_train, y_train = x_tokens[:train_len], y_tokens[:train_len]
x_val, y_val = x_tokens[train_len:], y_tokens[train_len:]
x_train[:3], y_train[:3]

100%|██████████| 100000/100000 [00:00<00:00, 657910.43it/s]
100%|██████████| 100000/100000 [00:00<00:00, 230417.90it/s]


([['<sos>',
   'seven',
   'milliliters',
   'infused',
   'taco',
   'bellâ®',
   'thick',
   '&',
   'chunky',
   'medium',
   'salsa',
   '<eos>'],
  ['<sos>', '2', 'cup', 'soaked', 'nonfat', 'frozen', 'yogurt', '<eos>'],
  ['<sos>',
   'two',
   'dozen',
   't.',
   'powdered',
   'spice',
   'islands',
   'garlic',
   'salt',
   '<eos>']],
 [['<sos>',
   '{',
   'qty:',
   '7',
   ',',
   'unit:',
   'milliliters',
   ',',
   'item:',
   'taco',
   'bellâ®',
   'thick',
   '&',
   'chunky',
   'medium',
   'salsa',
   ',',
   'mod:',
   'infused',
   '}',
   '<eos>'],
  ['<sos>',
   '{',
   'qty:',
   '2',
   ',',
   'unit:',
   'cup',
   ',',
   'item:',
   'nonfat',
   'frozen',
   'yogurt',
   ',',
   'mod:',
   'soaked',
   '}',
   '<eos>'],
  ['<sos>',
   '{',
   'qty:',
   '24',
   ',',
   'unit:',
   't.',
   ',',
   'item:',
   'spice',
   'islands',
   'garlic',
   'salt',
   ',',
   'mod:',
   'powdered',
   '}',
   '<eos>']])

In [19]:
tokenize_whitespaces(x[3])

['<sos>', '11', 'milliliters', 'baby', 'artichokes', '<eos>']

In [32]:
tokenizer = 

['3', 'cups', 'scrambled', 'eggs']

In [10]:
x_vocab = torchtext.vocab.build_vocab_from_iterator(x_train, min_freq=min_freq, specials=special_tokens)
x_vocab.set_default_index(x_vocab[unk_token])
x_vocab.lookup_tokens(x_vocab.lookup_indices(tokenize_en("3 cups scrambled eggs")))

['<sos>', '3', 'cups', 'scrambled', 'eggs', '<eos>']

In [11]:
y_vocab = torchtext.vocab.build_vocab_from_iterator(y_train, min_freq=min_freq, specials=special_tokens)
y_vocab.set_default_index(y_vocab[unk_token])
y_vocab.lookup_indices(y_train[0])

[0, 9, 7, 26, 4, 8, 100, 4, 5, 150, 41, 564, 4, 6, 11, 10, 1]

In [12]:
assert x_vocab[pad_token] == y_vocab[pad_token]
train_dataset = Seq2SeqDataset(x_train, y_train, x_vocab, y_vocab, pad_token=pad_token)
val_dataset = Seq2SeqDataset(x_val, y_val, x_vocab, y_vocab, pad_token=pad_token)

In [13]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS,
                                pin_memory=True, persistent_workers=True, collate_fn=train_dataset.collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS,
                            pin_memory=True, persistent_workers=True, collate_fn=val_dataset.collate_fn)

In [30]:
input_dim = len(x_vocab)
output_dim = len(y_vocab)
encoder_embedding_dim = 256
decoder_embedding_dim = 256
hidden_dim = 512
n_layers = 2
encoder_dropout = 0.5
decoder_dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(
    input_dim,
    encoder_embedding_dim,
    hidden_dim,
    n_layers,
    encoder_dropout,
)

decoder = Decoder(
    output_dim,
    decoder_embedding_dim,
    hidden_dim,
    n_layers,
    decoder_dropout,
)

model = Seq2Seq(encoder, decoder, device)

In [32]:
len(x_vocab)

2859

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 8,516,714 trainable parameters


In [17]:
import torchinfo
model = model.to(device)
# src, trg = next(iter(train_dataloader)), next(iter(train_dataloader))
src = next(iter(train_dataloader))[0]
trg = src

print(src.shape, trg.shape)
# torchinfo.summary(model, input_data=(src, trg, 0.5), verbose=2)
model(src.to(device), trg.to(device), 0.5).shape

torch.Size([128, 28]) torch.Size([128, 28])


torch.Size([128, 28, 1130])

In [18]:
lgn_model = Seq2SeqLightning(
    model, lr=0.001, lr_scheduler=torch.optim.lr_scheduler.ReduceLROnPlateau,
    optimizer=torch.optim.SGD, loss_fn=torch.nn.CrossEntropyLoss, batch_size=BATCH_SIZE,
    momentum=0.9, weight_decay=1e-4
)
lr_monitor_callback = lgn.pytorch.callbacks.LearningRateMonitor(logging_interval="epoch")
bar_callback = lgn.pytorch.callbacks.RichProgressBar(leave=True)
timer_callback = lgn.pytorch.callbacks.Timer()

torch.set_float32_matmul_precision('medium')  # For better performance with cuda

lgn_trainer = Seq2SeqTrainer(
    max_epochs=EPOCHS,
    accelerator="gpu",
    precision="16-mixed",
    log_every_n_steps=len(train_dataloader),
    callbacks=[
        bar_callback,
        timer_callback,
        lr_monitor_callback,
    ],

    accumulate_grad_batches=5,
    enable_model_summary=False,
    )

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\polil\.conda\envs\text_torch\lib\site-packages\lightning\pytorch\trainer\connectors\logger_connector\logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [19]:
lgn_trainer.fit(model=lgn_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
