# Training Notebook
 - This notebook is based on the training notebook provided by Hugging Face inside their documentation (https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb#scrollTo=YVx71GdAIrJH).
  - Note that the test results may be misleading, since all the data is labeled using heuristics rather than manually. Proper testing should be done with manually labeled data, but manual labeling was deemed too time-consuming, so I tested my models on some edge cases which I deemed important instead. Also, the app itself is a great way to test the model in the intended environment.



In [None]:
# pip install datasets evaluate seqeval # for google colab 

In [None]:
import transformers
from numpy.f2py.cfuncs import callbacks
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AdamW, TrainerCallback
from datasets import Dataset
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch
import csv
import sys
csv.field_size_limit(2**31 - 1)
from urllib.parse import urlparse

In [24]:
# Pretrained checkpoint to fine-tune; alternatives: "distilbert-base-uncased", "bert-base-uncased".
model_checkpoint = "roberta-base"
# Batch size handed to the training arguments.
batch_size = 16

# When False, only the first sub-token of every word keeps its label during alignment.
label_all_tokens = False

# The model expects integer labels, so each tag string is paired with an id (in this order).
label_map = dict(zip(('O', 'B-PRODUCT', 'I-PRODUCT'), range(3)))
# Inverse lookup (id -> tag string), used to convert the model's output back to tags for the metrics.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

### Data Parsing Methods
- The read_csv_file_grouped_by_base_url method is made specifically for grouping the data based on the url (it was initially dispersed due to having a different goal before deciding otherwise).

In [25]:
def get_base_url(url):
    """Return the ``scheme://host`` part of ``url``.

    Args:
        url: The URL string to reduce to its base.

    Returns:
        The base URL (e.g. ``"https://example.com"``), or ``None`` when ``url``
        is not a string, cannot be parsed, or lacks a scheme or a host.
    """
    if not isinstance(url, str):
        return None

    try:
        parsed_url = urlparse(url)  # parse the URL using the library
    except ValueError:
        # urlparse rejects malformed input such as an unbalanced IPv6 bracket
        return None

    # without both parts the result would be a meaningless "://" instead of a base URL
    if not parsed_url.scheme or not parsed_url.netloc:
        return None

    return f"{parsed_url.scheme}://{parsed_url.netloc}"

def read_csv_file_grouped_by_base_url(file_path):
    """Read the labeled dataset and group its rows by base URL.

    Every data row must hold three columns: the page URL, the space-separated
    tokens and the space-separated labels. Header rows (first column equal to
    "URL") and blank rows are skipped.

    Args:
        file_path: Path to the UTF-8 encoded, comma-delimited CSV file.

    Returns:
        A dict mapping each base URL (as returned by ``get_base_url``) to
        ``{'sentences': [...], 'labels': [...]}``, where ``sentences[i]`` is a
        list of tokens and ``labels[i]`` is the matching list of integer ids
        taken from ``label_map``.

    Raises:
        ValueError: If a data row does not have exactly three columns.
        KeyError: If a label is not a key of ``label_map``.
    """
    data_by_url = {}

    # newline='' is what the csv module asks for, so that line breaks inside
    # quoted fields are handled by the reader itself
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        for row in csv_reader:
            # skip blank lines (the reader yields an empty list for them) and the header
            if not row or row[0] == "URL":
                continue

            if len(row) != 3:
                raise ValueError(
                    f"{file_path}, line {csv_reader.line_num}: expected 3 columns "
                    f"(URL, tokens, labels) but found {len(row)}"
                )

            url, tokens_str, labels_str = row
            tokens = tokens_str.split(' ')
            # convert labels to integers right away since that is what the model expects
            labels = [label_map[label] for label in labels_str.split(' ')]

            # group sentences and labels by base URL; appending both in the same
            # step keeps sentences[i] paired with labels[i]
            group = data_by_url.setdefault(get_base_url(url), {'sentences': [], 'labels': []})
            group['sentences'].append(tokens)
            group['labels'].append(labels)

    return data_by_url

### Train / Test Split
- Important to note here is that, even though I am using rule-based data for testing also, I am only testing on data that was extracted from base URLs that were not used for training.
- The testing performance is directly tied to the quality of the training data.
- This ensures to some degree that the F1 score is not misleading.
- If anything, if the model is trained well, considering all the noise in the data (both training and test), an F1 of around 0.9 should be considered more than sufficient and should indicate decent model performance.

In [26]:
from sklearn.model_selection import train_test_split

# read the CSV once; the result is keyed by base URL
dataset_path = "../../Data/TrainingDatasets/100000_TL25_TR40_ST80.csv"
data_by_url = read_csv_file_grouped_by_base_url(dataset_path)

# every distinct base URL is one unit for the split
base_urls = list(data_by_url)

# split the base URLs (not the individual entries), so no base URL contributes to both sets
train_urls, test_urls = train_test_split(base_urls, test_size=0.15, random_state=42)

# flatten the per-URL groups of the training URLs into parallel sentence / label lists
train_sentences, train_labels = [], []
for base_url in train_urls:
    train_sentences += data_by_url[base_url]['sentences']
    train_labels += data_by_url[base_url]['labels']

# same for the held-out URLs
test_sentences, test_labels = [], []
for base_url in test_urls:
    test_sentences += data_by_url[base_url]['sentences']
    test_labels += data_by_url[base_url]['labels']

print(f"Training entries: {len(train_sentences)}, Testing entries: {len(test_sentences)}")


Training entries: 5051, Testing entries: 949


### Tokenization and Alignment
- Method to tokenize the sentences and align the labels in accordance with the requirements of the tokenizer.

In [27]:
def tokenize_and_align_labels(train_sentences, train_labels):
    """Tokenize pre-split sentences and align the word-level labels with the sub-tokens.

    Relies on the module-level ``tokenizer``, ``special_tokens`` and
    ``label_all_tokens``, which must be defined before this function is called.

    Args:
        train_sentences: List of sentences, each given as a list of words.
        train_labels: List of integer label lists, parallel to ``train_sentences``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of label
        ids per entry of ``train_labels``, where -100 marks the positions that
        the loss function ignores.
    """
    tokenized_inputs = tokenizer(train_sentences, truncation=True, is_split_into_words=True)

    # the manually added special tokens arrive as regular input words, so their word id
    # is not None; they are recognised by token id instead so they can be set to -100
    special_token_ids = set(tokenizer.convert_tokens_to_ids(special_tokens))

    labels = []
    for i, label in enumerate(train_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        input_ids = tokenized_inputs['input_ids'][i]
        previous_word_idx = None
        label_ids = []
        for word_idx, input_id in zip(word_ids, input_ids):
            # special tokens have a word id that is None (except the ones added manually).
            # We set the label to -100 so they are automatically ignored in the loss function.
            if word_idx is None or input_id in special_token_ids:
                label_ids.append(-100)
            # the sentence has more words than labels: ignore the token. Checking this
            # before both branches below also protects the sub-token case, which would
            # otherwise raise an IndexError when label_all_tokens is True.
            elif word_idx >= len(label):
                label_ids.append(-100)
            # the first token of a word always gets the word's label; the remaining
            # tokens of the word only get it when label_all_tokens is set
            elif word_idx != previous_word_idx or label_all_tokens:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [28]:

# markers that get registered with the tokenizer as additional special tokens
special_tokens = ['[URL]', '[TITLE]', '[TEXT]', '<NO_TITLE>', '<NO_URL>']

# load the tokenizer for the chosen checkpoint
# (remove add_prefix_space if you are using a model that doesn't require it)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# the label alignment calls word_ids() on the tokenizer output, so insist on a fast tokenizer
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

# tokenize and align labels: training split first, then the test split
train_data = tokenize_and_align_labels(train_sentences, train_labels)
test_data = tokenize_and_align_labels(test_sentences, test_labels)

# wrap both encodings in the Hugging Face Dataset format
train_dataset, test_dataset = Dataset.from_dict(train_data), Dataset.from_dict(test_data)




### Weighted Trainer Class
- This inheritance was created so I can emphasize the importance of the 'B-PRODUCT' label in the loss function since I was getting I-PRODUCT labels without the B-PRODUCT labels at the beginning.
- In the statistics, this creates an imbalance between the precision and recall in favour of the latter.

In [29]:
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the token labels.

    The weights emphasise 'B-PRODUCT' and down-weight 'O'.
    """

    # one weight per class id, in label_map order: O, B-PRODUCT, I-PRODUCT
    LOSS_CLASS_WEIGHTS = (0.1, 3.0, 1.0)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch dict that must contain a ``"labels"`` tensor.
            return_outputs: When True, return ``(loss, outputs)`` instead of just the loss.
            num_items_in_batch: Accepted because newer Trainer versions pass this
                argument to ``compute_loss``; it is not used here.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when ``return_outputs`` is True.
        """
        labels = inputs["labels"]  # keep labels in the input
        outputs = model(**inputs)
        logits = outputs.logits

        # create the class weights directly on the same device as the logits
        class_weights = torch.tensor(self.LOSS_CLASS_WEIGHTS, dtype=torch.float, device=logits.device)

        # flatten logits and labels; the class count is read from the logits themselves
        flat_logits = logits.view(-1, logits.size(-1))  # (batch_size * sequence_length, num_labels)
        flat_labels = labels.view(-1)  # (batch_size * sequence_length)

        # weighted loss; positions labeled -100 do not contribute
        loss_fct = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)
        loss = loss_fct(flat_logits, flat_labels)

        return (loss, outputs) if return_outputs else loss


### Method to log the loss
- Just so I can see the progress of the training more often.

In [30]:
class CustomLogCallback(TrainerCallback):
    """Prints the training loss on log events whose step is a multiple of ``log_interval``."""

    def __init__(self, log_interval):
        """
        Args:
            log_interval: Number of steps between printed losses (non-zero).
        """
        self.log_interval = log_interval

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the loss when the log event carries one and falls on the interval.

        NOTE: this hook only runs when the Trainer itself emits a log event
        (see ``logging_steps`` in TrainingArguments), so a loss can only be
        printed on steps where such an event happens.
        """
        # logs defaults to None and may also lack a 'loss' entry; nothing to print then
        if not logs or 'loss' not in logs:
            return
        if state.global_step % self.log_interval == 0:
            print(f"Step {state.global_step}: Loss: {logs['loss']}")

### Initializing the model

In [31]:
# now we load the model with one output class per entry of label_map
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_map))
model.resize_token_embeddings(len(tokenizer)) # this is done because of the special tokens we added - we need to resize

# the output directory is named after the last path component of the checkpoint
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-for-product-extraction",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch", # so you don't save all the checkpoints
    # emit a training-loss log every 50 steps (the default is 500); the
    # CustomLogCallback(log_interval=50) passed to the trainer can only print
    # on steps where the Trainer emits a log
    logging_steps=50,
)

# pads inputs and labels of a batch to a common length
data_collator = DataCollatorForTokenClassification(tokenizer)

# entity-level metric used by compute_metrics
metric = evaluate.load("seqeval")

# method to compute metrics
def compute_metrics(p):
    """Compute the seqeval scores for one evaluation pass.

    Uses the module-level ``metric`` and ``reverse_label_map``.

    Args:
        p: A ``(predictions, labels)`` pair. The predicted class is the argmax of
            ``predictions`` over axis 2; ``labels`` holds class ids, with -100 at
            the positions to ignore.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        corresponding "overall_*" entries of the metric result.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        # drop the ignored positions (label -100), then map the ids back to tag strings
        kept = [(pred, gold) for pred, gold in zip(predicted_row, label_row) if gold != -100]
        true_predictions.append([reverse_label_map[pred] for pred, _ in kept])
        true_labels.append([reverse_label_map[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

# sanity check: number of entries in the training and the test dataset
print(len(train_dataset), len(test_dataset))

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


5051 949


In [None]:

# assemble the weighted-loss trainer from the model, arguments and datasets
trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[CustomLogCallback(log_interval=50)],  # logs the loss every log_interval steps
)

# run the fine-tuning
trainer.train()

# evaluate once more on the test dataset and show the resulting metrics
final_metrics = trainer.evaluate()
print(final_metrics)

Epoch,Training Loss,Validation Loss
