# Import packages

In [1]:
import json
from functools import partial
from pathlib import Path

import pandas as pd
import pytorch_lightning
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from attribute_extraction.models.attribute_classification import MultiAttributeClassifier
from attribute_extraction.models.mapper import Mapper
from attribute_extraction.models.train_utils import (
    AttributeDataset,
    MultiAttributeBatchSampler,
    build_callbacks,
    collate_fun_generator,
)
from attribute_extraction.utils.data_balancing_utils import add_weight_for_data_balancing


[nltk_data] Downloading package punkt to /home/20014946/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Setting up variables and hyperparameters

In [2]:
# --- Column names used by the cells below -------------------------------
attribute_code_col = "attribute_code"
attribute_lov_col = "lov_code"
context_col = "context"
weight_col = "weight"
mapped_lov_col = "mapped_lov_code"

# Output location. Kept as a string with a trailing slash because other
# cells build file names with f"{local_path}<name>".
local_path = "../outputs_train_workflow/"

# --- Model / training hyper-parameters ----------------------------------
model_name = "distilbert-base-multilingual-cased"
experiment_description = "Multi-task small GRU model with multilanguage distilledBert tokenizer"
num_epoch = 20
max_len = 512
batch_size = 128
freeze_backbone = True
learning_rate = 1e-4
projection_dim = 256
dropout = 0.2

# --- Data balancing -----------------------------------------------------
data_balance = False
upper_qn = 0.9
lower_qn = 0.1

# --- Data sources -------------------------------------------------------
lov_attribute_codes = ["02419", "01746", "00562", "15344", "99999"]
train_set_uri = "../data/train_formatted.csv"
val_set_uri = "../data/val_formatted.csv"
test_set_uri = "../data/test_formatted.csv"

# Save the hyper_parameters
hyper_parameters = {
    "model_name": model_name,
    "experiment_description": experiment_description,
    "num_epoch": num_epoch,
    "max_len": max_len,
    "batch_size": batch_size,
    "freeze_backbone": freeze_backbone,
    "learning_rate": learning_rate,
    "projection_dim": projection_dim,
    "dropout": dropout,
    "data_balance": data_balance,
    "upper_qn": upper_qn,
    "lower_qn": lower_qn,
    "lov_attribute_codes": lov_attribute_codes,
    "train_set_uri": train_set_uri,
    "val_set_uri": val_set_uri,
    "test_set_uri": test_set_uri,
}

# Create the output directory up front: open(..., "w") does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# checkout.
output_dir = Path(local_path)
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / "hyper_parameters.json", "w") as f:
    json.dump(hyper_parameters, f, indent=4)

# Loading data

In [3]:
# Read the three splits in one pass; each frame gets a fresh 0..n-1 index.
data_train, data_val, data_test = (
    pd.read_csv(split_uri).reset_index(drop=True)
    for split_uri in (train_set_uri, val_set_uri, test_set_uri)
)

In [4]:
# Normalise the attribute and LOV codes to zero-padded, 5-character strings
# (e.g. 562 -> "00562") in every split.
# The column names come from the configuration cell instead of being
# hard-coded, so changing `attribute_code_col` / `attribute_lov_col` keeps
# this step in sync with the mapper and the samplers.
# NOTE(review): presumably needed because read_csv parsed the codes as
# numbers and dropped the leading zeros; confirm against the CSV files.
for frame in (data_train, data_val, data_test):
    for code_col in (attribute_code_col, attribute_lov_col):
        frame[code_col] = frame[code_col].apply(lambda x: str(x).zfill(5))

In [25]:
# Replace every missing value with an empty string in all three splits.
data_train, data_val, data_test = (
    frame.fillna('') for frame in (data_train, data_val, data_test)
)

In [26]:
# The context column is the title followed by the cleaned description,
# separated by a single space.
for frame in (data_train, data_val, data_test):
    frame[context_col] = frame["title"] + " " + frame["description_clean"]

In [27]:
# Per-column count of missing values in the training split.
data_train.isna().sum()


adeo_product_id      0
id                   0
title                0
description          0
lang                 0
attribute_code       0
attribute_label      0
attribute_type       0
is_multivalued       0
is_lov               0
attribute_value      0
lov_code             0
step_model_code      0
mirakl_model_code    0
data_source          0
description_clean    0
attribute_eng        0
context              0
dtype: int64

In [15]:
# Show training rows that have a raw description but no cleaned description.
has_raw_description = data_train["description"].notna()
lacks_clean_description = data_train["description_clean"].isna()
data_train.loc[has_raw_description & lacks_clean_description]

Unnamed: 0,adeo_product_id,id,title,description,lang,attribute_code,attribute_label,attribute_type,is_multivalued,is_lov,attribute_value,lov_code,step_model_code,mirakl_model_code,data_source,description_clean,attribute_eng,context
9645,84466757,b'\xec\xd3\x14E\xef\xc8.\x14\xee\xd3:N\x86\xa6...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,562,Forma,CHAR,0,1,['Irregular'],64432,202303,202303|MIROIR_DECORATIF_AVEC_MOULURE|MIROIR|R1...,BOMP,,shape,
12522,82303826,b'\xe4i\xd6x\x97$i|\xd4\xca+n\xab\xa1Y3\xa2\xe...,Chéminée Bio-éthanol Camogli 2.0 Kw Blanc,<p></p> <p></p>,FR,15344,Batterie ou pile incluse,CHAR,0,1,['Non'],2,200101,200101|CHEMINEE_BIOETHANOL|CHEMINEE_ET_INSERT|...,BOMP,,batt_inclu,
16637,82551260,b'\xd3\xf0\x8bV\x88s;MW\x9c\x03\x9ap\xb5\xe6\x...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,2419,Estilo,CHAR,0,1,['Clásico chic'],207342,202372,202372|1001|R13-001-014,BOMP,,style,
20091,83716068,b'\xbd\xdb\xccFK\x93\t\t)\xa4\x00\x0c\xbak\xec...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,2419,Estilo,CHAR,0,1,['Clásico'],41852,202341,202341|RIDEAU|RIDEAU_TRINGLE_A_RIDEAU_ET_ACCES...,BOMP,,style,
21784,82232735,b'\xab\x9e\x8e\xa5i\r\xf3Poe\xc9\x1b\x94\xe1\x...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,562,Forma,CHAR,0,1,['Cuadrada'],900,201486,201486|TONNELLE_PERGOLA|TONNELLE_ET_PERGOLA|R0...,BOMP,,shape,
26846,84466758,b'\xf9\x1c\x88J\x9a\xddw[\x9c\r\x8d\xcf\xc7\xa...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,2419,Estilo,CHAR,0,1,['Contemporáneo'],525,202303,202303|MIROIR_DECORATIF_AVEC_MOULURE|MIROIR|R1...,BOMP,,style,
26847,84466758,b'r[\xe2\x8c\x98\x07\xa8#=E\x82\x92SD\xff\xce2...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,562,Forma,CHAR,0,1,['Ovalada'],1082,202303,202303|MIROIR_DECORATIF_AVEC_MOULURE|MIROIR|R1...,BOMP,,shape,
29034,84448803,b'\x9b\x11\xc1kF\xf5\x9b\xc1\xdc\xe9\x89\x0eo\...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,15344,Batería incluida,CHAR,0,1,['No'],2,202372,202372|1001|R13-001-014,BOMP,,batt_inclu,
30310,84466749,b'\xad\x03n\xe6\xcf\xd9z\x9d\xb3\x0b\xcbz\xf3\...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,562,Forma,CHAR,0,1,['Redondo'],4325,202303,202303|MIROIR_DECORATIF_AVEC_MOULURE|MIROIR|R1...,BOMP,,shape,
51816,84466757,b'\xec\xd3\x14E\xef\xc8.\x14\xee\xd3:N\x86\xa6...,$Marketing,<br/> </br> <br/></br></br></br></br> <br/><br/>,ES,99999,product_model,CHAR,0,1,['Irregular'],202303,202303,202303|MIROIR_DECORATIF_AVEC_MOULURE|MIROIR|R1...,BOMP,,product_class,


In [11]:
# Per-column count of missing values in the training split.
data_train.isna().sum()

adeo_product_id       0
id                    0
title                 0
description           2
lang                  0
attribute_code        0
attribute_label       0
attribute_type        0
is_multivalued        0
is_lov                0
attribute_value       0
lov_code              0
step_model_code       0
mirakl_model_code     0
data_source           0
description_clean    24
attribute_eng         0
context              24
dtype: int64

In [8]:
# Preview the first five context strings of the training split.
data_train[context_col].head(5)

0    Tapis tissé à la main move 4453 grau rouge - 1...
1    Fodera per cuscino Naturals Mermaids (50 x 50 ...
2    Cojín Liso Gris oscuro (40 x 16 x 40 cm) Si qu...
3    PONTE RIALZATA 13 RUBINETTO LAVABO IN OTTONE N...
4    ROBINET DE LAVABO SURÉLEVÉ SUR PLAGE 13 EN LAI...
Name: context, dtype: object

# Building Mapper

In [20]:
# Fit the mapper on the union of all three splits, then save it next to the
# other run artefacts.
all_splits = pd.concat([data_train, data_val, data_test])

mapper = Mapper(
    attribute_code_col=attribute_code_col,
    attribute_value_col=attribute_lov_col,
)
mapper.fit(all_splits)
mapper.save(f"{local_path}mapper.json")

# Mapping Columns

In [21]:
# Apply the fitted mapper to each split; the mapped value is written to the
# `mapped_lov_col` column.
data_train, data_val, data_test = (
    mapper.map_dataframe(frame, mapped_col_name=mapped_lov_col)
    for frame in (data_train, data_val, data_test)
)

# Data balancing

In [22]:
# Training rows get balancing weights only when `data_balance` is enabled;
# otherwise they receive a constant weight of 1.
if data_balance:
    data_train = add_weight_for_data_balancing(
        df=data_train,
        label_col=mapped_lov_col,
        weight_col=weight_col,
        attribute_code_col=attribute_code_col,
        upper_qn=upper_qn,
        lower_qn=lower_qn,
    )
else:
    data_train[weight_col] = 1

# Validation and test rows always carry a constant weight of 1.
for eval_frame in (data_val, data_test):
    eval_frame[weight_col] = 1

# Model Initialisation

In [23]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Scheduler inputs derived from the size of the training split.
n_train_rows = data_train.shape[0]
warmup_steps = n_train_rows // batch_size // 100
total_steps = n_train_rows * num_epoch // batch_size

model = MultiAttributeClassifier(
    vocab_size=tokenizer.vocab_size,
    class_config=mapper.mappings,
    freeze_backbone=freeze_backbone,
    warmup_steps=warmup_steps,
    estimated_stepping_batches=total_steps,
    num_cycles=num_epoch,
    projection_dim=projection_dim,
    dropout=dropout,
    learning_rate=learning_rate,
)

# Dataloaders Initialisation

In [24]:
def _build_dataset(frame):
    """Wrap ``frame`` in an ``AttributeDataset`` using the configured column names."""
    return AttributeDataset(
        data=frame,
        context_col_name=context_col,
        label_col_name=mapped_lov_col,
        attribute_code_col_name=attribute_code_col,
    )


def _build_loader(dataset, frame):
    """Create a ``DataLoader`` for ``dataset``.

    Batches come from a ``MultiAttributeBatchSampler`` built on ``frame``
    (split on the attribute-code column, using the weight column) and are
    collated by ``collate_fun_generator`` bound to the shared tokenizer and
    ``max_len``.
    """
    batch_sampler = MultiAttributeBatchSampler(
        data=frame,
        batch_size=batch_size,
        split_col=attribute_code_col,
        weight_col=weight_col,
    )
    collate_fn = partial(
        collate_fun_generator,
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=collate_fn,
        num_workers=4,
    )


# Datasets first, then one loader per split.
train = _build_dataset(data_train)
validation = _build_dataset(data_val)
test = _build_dataset(data_test)

train_loader = _build_loader(train, data_train)
validation_loader = _build_loader(validation, data_val)
test_loader = _build_loader(test, data_test)

In [84]:
# Display the `data` attribute of the training dataset for inspection.
train.data

Unnamed: 0,adeo_product_id,id,title,description,lang,attribute_code,attribute_label,attribute_type,is_multivalued,is_lov,attribute_value,lov_code,step_model_code,mirakl_model_code,data_source,description_clean,attribute_eng,context,mapped_lov_code,weight
0,85266696,b'\xb1\xd4q\x83\x89\xcc\xb1\xec\xd6\x1a\xdc\xb...,Tapis tissé à la main move 4453 grau rouge - 1...,Facile d'entretien tapis de coton Ce coton ...,FR,02419,Style,CHAR,0,1,['Moderne'],64985,200853,200853|TAPIS_DE_SALON_CHAMBRE_SALLE_A_MANGER|T...,BOMP,Facile d'entretien tapis de coton Ce coton ...,style,Tapis tissé à la main move 4453 grau rouge - 1...,44,1
1,85177753,"b'\xffT\x8d\xddQ\x9a,N^\x9d\xf3\x81\xb8\x8d2\x...",Fodera per cuscino Naturals Mermaids (50 x 50 cm),Se ti piace curare ogni singolo dettaglio dell...,IT,00562,Forma,CHAR,0,1,['Quadrato'],00900,202343,202343|HOUSSE_DE_COUSSIN|COUSSIN_ET_HOUSSE|R12...,BOMP,Se ti piace curare ogni singolo dettaglio dell...,shape,Fodera per cuscino Naturals Mermaids (50 x 50 ...,2,1
2,84833908,b'\x99\xf8\xc3\xa2\xc8\x85vl\x06\x9e\x855\xee<...,Cojín Liso Gris oscuro (40 x 16 x 40 cm),<p>Si quieres darle a tu hogar un aire de orig...,ES,02419,Estilo,CHAR,0,1,['Diseño'],00533,202345,202345|COUSSIN_DECORATIF|COUSSIN_ET_HOUSSE|R12...,BOMP,Si quieres darle a tu hogar un aire de origina...,style,Cojín Liso Gris oscuro (40 x 16 x 40 cm) Si qu...,2,1
3,84592677,"b'\\#hC\xc2\x9b""\x16\xc4\x8e\x10\x9b\x8a\xab\x...",PONTE RIALZATA 13 RUBINETTO LAVABO IN OTTONE N...,Questo rubinetto lavabo rialzato dal design el...,IT,01746,Colore,CHAR,1,1,['Nero opaco'],10106,201177,201177|MITIGEUR_DE_LAVABO|ROBINET_POUR_LAVABO_...,BOMP,Questo rubinetto lavabo rialzato dal design el...,color,PONTE RIALZATA 13 RUBINETTO LAVABO IN OTTONE N...,50,1
4,84592677,b'\xd0\x83[\xabC\xda\xef\x1e\xd3\xd8\xa8\xa3\x...,ROBINET DE LAVABO SURÉLEVÉ SUR PLAGE 13 EN LAI...,Ce robinet de lavabo surélevé au design épuré ...,FR,01746,Couleur,CHAR,1,1,['Noir mat'],10106,201177,201177|MITIGEUR_DE_LAVABO|ROBINET_POUR_LAVABO_...,BOMP,Ce robinet de lavabo surélevé au design épuré ...,color,ROBINET DE LAVABO SURÉLEVÉ SUR PLAGE 13 EN LAI...,50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87663,85980222,b'm\xc78\xb5Ie\xf3\x80\x9cd\x9a\xceT\xf6\x8b\x...,TAPISO Maya Tapis de Couloir Entrée Moderne Gr...,Les tapis de couloir TAPISO Maya c'est une col...,FR,99999,product_model,CHAR,1,1,['Gris'],200853,200853,200853|TAPIS_DE_COULOIR|TAPIS_ET_PAILLASSON|R1...,BOMP,Les tapis de couloir TAPISO Maya c'est une col...,product_class,TAPISO Maya Tapis de Couloir Entrée Moderne Gr...,15,1
87664,85024535,b'\x90l\x0b\xde\xc3\xbe\xceNU\x7f\x0e\xbb\xbe\...,Rideau de Fenêtre 140x280 cm 2 pcs Oeillet Por...,"<ul class=""a-unordered-list a-vertical a-spaci...",FR,99999,product_model,CHAR,0,1,['Design'],202341,202341,202341|RIDEAU|RIDEAU_TRINGLE_A_RIDEAU_ET_ACCES...,BOMP,"<ul class=""a-unordered-list a-vertical a-spaci...",product_class,Rideau de Fenêtre 140x280 cm 2 pcs Oeillet Por...,53,1
87665,84182320,b'\xac\xd8s\x8b\x93\xa0\x0f\x94\x93L\xa7w\xb5\...,Set de 3 abat jour en rotin blanc GREG,Laissez-vous tenter par GREG et ajoutez une to...,FR,99999,product_model,CHAR,0,1,['Rond'],202375,202375,202375|ABATS-JOUR|ABATS-JOUR|R13-001-024,BOMP,Laissez-vous tenter par GREG et ajoutez une to...,product_class,Set de 3 abat jour en rotin blanc GREG Laissez...,62,1
87666,86532625,b'\x98v\x88\xc8\x16\xeap\xeb\xc7\xbb%\x91\x8fQ...,Lettre Décorative Plexigglas Noir. Lettrage Ad...,"<p style=""text-align:justify;"">De couleur noir...",FR,99999,product_model,CHAR,0,1,['Essentiel'],202301,202301,202301|PLEXIGLASS_IMPRIME|AFFICHE_TOILE_ET_PLA...,BOMP,"<p style=""text-align:justify;"">De couleur noir...",product_class,Lettre Décorative Plexigglas Noir. Lettrage Ad...,48,1


In [87]:
# Smoke test: pull every batch out of `train_loader`. The loop body is a bare
# expression with no effect, so the only purpose is to surface errors raised
# while sampling or collating batches.
for x in train_loader:
    x

TypeError: Caught TypeError in DataLoader worker process 2.
Original Traceback (most recent call last):
  File "/home/20014946/anaconda3/envs/pem-entities/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/20014946/anaconda3/envs/pem-entities/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
    return self.collate_fn(data)
  File "/home/20014946/Documents/Document Adeo/Project/Product_Entity_Matching_Extraction/Attribute_extraction/pdp--product-entity-matching--attributes-extraction/src/attribute_extraction/models/train_utils.py", line 53, in collate_fun_generator
    network_input = tokenizer(
  File "/home/20014946/anaconda3/envs/pem-entities/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 2488, in __call__
    encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
  File "/home/20014946/anaconda3/envs/pem-entities/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 2574, in _call_one
    return self.batch_encode_plus(
  File "/home/20014946/anaconda3/envs/pem-entities/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 2765, in batch_encode_plus
    return self._batch_encode_plus(
  File "/home/20014946/anaconda3/envs/pem-entities/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py", line 429, in _batch_encode_plus
    encodings = self._tokenizer.encode_batch(
TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]


In [59]:

# `it` was never defined anywhere in the notebook, so this cell raised a
# NameError after a kernel restart. Create the iterator explicitly.
# NOTE(review): assumes the training loader is the one being inspected;
# switch to validation_loader / test_loader if that was the intent.
it = iter(train_loader)

# Fetch and print the first batch.
first = next(it)
print(first)


{'input_ids': tensor([[   101,  14248,  31486,  ...,      0,      0,      0],
        [   101,  34421,  11705,  ...,  51551,    169,    102],
        [   101,  82021,    234,  ...,      0,      0,      0],
        ...,
        [   101,    123, 110829,  ...,      0,      0,      0],
        [   101,  11255,  58445,  ...,      0,      0,      0],
        [   101,  36642,  11889,  ...,      0,      0,      0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), '00562': tensor([ 2,  3, 23,  4, 25,  4,  2,  2,  4,  4,  5,  2,  8,  4,  2,  3,  2, 15,
         4,  4,  7, 15,  2,  5,  4, 15, 15,  4,  3,  4, 13, 15,  5,  4,  7,  2,
         2,  2,  3,  4,  4,  3,  4,  3,  3,  3,  4,  4,  5,  2, 15,  4,  7, 15,
        15,  4,  3, 15, 15,  7,  2, 15,  4,  4, 53,  2, 33, 35, 44,  4,  4,  4,
         4,  4,  4,  

dict_keys(['input_ids', 'attention_mask', '00562'])

In [33]:
# Advance the existing iterator `it` (not defined in this cell) by one batch,
# overwrite `first` with it and print the batch.
first = next(it)
print(first)

{'input_ids': tensor([[  101, 34387, 89270,  ...,     0,     0,     0],
        [  101, 99549, 10107,  ...,     0,     0,     0],
        [  101, 11469, 10362,  ...,     0,     0,     0],
        ...,
        [  101, 19561, 12752,  ...,     0,     0,     0],
        [  101,   155, 11403,  ...,     0,     0,     0],
        [  101, 26037, 16419,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), '99999': tensor([40, 27, 52, 59,  7, 52, 38, 63, 46, 71, 34, 49, 48, 58, 16, 71,  2, 11,
        41, 14,  2, 79, 73, 70,  8, 35,  1, 75, 66, 48, 19, 11, 31, 73, 16, 41,
        75, 56, 83, 21, 54, 53, 19, 34, 48,  2, 55, 81, 15,  6, 66, 25, 45, 33,
        51, 11, 40,  5, 68, 67, 69, 27, 64, 69, 41, 15, 15,  5, 48, 48, 28, 69,
        21, 77, 42, 41, 37, 47, 44, 48, 57, 71, 15, 51, 6

In [None]:
# Advance the existing iterator `it` (not defined in this cell) by one more
# batch and keep it in `second`.
second = next(it)

# Build Callbacks

In [25]:
# build_callbacks returns the callbacks and the metric logger that are later
# passed to the Trainer.
callbacks_output_path = Path(local_path)
callbacks, metric_logger = build_callbacks(
    output_path=callbacks_output_path,
    model_name="model",
)

# Model Training

In [26]:
# Use one GPU with 16-bit precision when CUDA is available; otherwise train
# on CPU with 32-bit precision.
cuda_available = torch.cuda.is_available()
n_gpus = 1 if cuda_available else 0
float_precision = 16 if cuda_available else 32

trainer = pytorch_lightning.Trainer(
    gpus=n_gpus,
    logger=metric_logger,
    callbacks=callbacks,
    log_every_n_steps=1,
    max_epochs=num_epoch,
    precision=float_precision,
)

trainer.fit(model, train_loader, validation_loader)

GPU available: False, used: False
TPU available: None, using: 0 TPU cores
Missing logger folder: ../outputs_train_workflow/model

  | Name                 | Type             | Params
----------------------------------------------------------
0 | encoder              | Embedding        | 30.6 M
1 | classification_heads | ModuleDict       | 2.9 M 
2 | ce_loss              | CrossEntropyLoss | 0     
----------------------------------------------------------
33.5 M    Trainable params
0         Non-trainable params
33.5 M    Total params
133.906   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

MisconfigurationException: ModelCheckpoint(monitor='validation_loss') not found in the returned metrics: ['train_loss', 'train_accuracy_01746', 'train_accuracy_99999', 'train_accuracy_00562', 'train_accuracy_02419', 'train_accuracy_15344']. HINT: Did you call self.log('validation_loss', tensor) in the LightningModule?

# Model Evaluation

In [None]:
# Evaluate on the test loader. The result is indexed as a list whose first
# element is a dict of metrics; a copy with keys in sorted order is written
# next to the raw result to make comparing runs easier.
test_metrics = trainer.test(model, test_loader)
test_metrics_sort = [dict(sorted(test_metrics[0].items()))]

# `local_path` is a plain string, so `local_path / "metrics.json"` raised
# TypeError (str does not support the `/` operator). Convert it to a Path
# before joining.
metrics_dir = Path(local_path)

with open(metrics_dir / "metrics.json", "w") as f:
    json.dump(test_metrics, f, indent=4)

with open(metrics_dir / "metrics_sorted.json", "w") as f:
    json.dump(test_metrics_sort, f, indent=4)