## Training notebook made after reconsidering some choices made in the first training notebook.


In [5]:
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from datasets import Dataset
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split

import csv
from urllib.parse import urlparse

In [6]:
# Pretrained checkpoint that gets fine-tuned, and the per-device batch size.
model_checkpoint = "distilbert-base-uncased"
batch_size = 16

# Whether the non-first sub-tokens of a word also receive the word's label.
label_all_tokens = False
# The model works with integer class ids, so every tag is mapped to an id.
label_map = {'O': 0, 'B-PRODUCT': 1, 'I-PRODUCT': 2}
# Inverse lookup (id -> tag), used to turn the model's output back into tags for the metrics.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

### This function groups the data by base URL (the data was initially dispersed because it had been prepared for a different goal, which was later dropped)

In [21]:
def get_base_url(url):
    """Return the ``scheme://netloc`` prefix of ``url``.

    Args:
        url: The URL to reduce to its base, as a string.

    Returns:
        The base URL (e.g. ``"https://example.com"``), or ``None`` when
        ``url`` is not a string, cannot be parsed, or has no scheme or no
        network location (e.g. ``"example.com/page"`` or ``""``).
    """
    if not isinstance(url, str):
        return None

    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed network locations,
        # e.g. an unclosed IPv6 bracket.
        return None

    # Without this check a schemeless or empty URL would yield the bogus
    # base "://", lumping unrelated rows into a single fake site.
    if not parsed_url.scheme or not parsed_url.netloc:
        return None

    return f"{parsed_url.scheme}://{parsed_url.netloc}"

def read_csv_file_grouped_by_base_url(file_path, max_rows=4000):
    """Read the training CSV and group its rows by the base URL of each entry.

    Every data row must have three columns: URL, space-separated tokens and
    space-separated labels. Rows whose first column is ``"URL"`` (the header)
    and blank lines are skipped. Labels are converted to integer ids through
    the module-level ``label_map``; the grouping key comes from
    ``get_base_url``.

    Args:
        file_path: Path of the UTF-8 encoded CSV file.
        max_rows: Maximum number of data rows to read. ``None`` reads the
            whole file. Defaults to 4000.

    Returns:
        A dict mapping each base URL to ``{'sentences': [...], 'labels': [...]}``
        where ``sentences[i]`` is a list of tokens and ``labels[i]`` is the
        list of integer label ids of the same row.

    Raises:
        ValueError: If a non-empty row does not have exactly three columns.
        KeyError: If a label is not present in ``label_map``.
    """
    data_by_url = {}
    row_count = 0

    # newline='' is what the csv module expects, so that it handles line
    # endings (including those inside quoted fields) itself.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        csv_reader = csv.reader(file, delimiter=',')
        for row in csv_reader:
            if max_rows is not None and row_count >= max_rows:
                break

            # csv.reader yields an empty list for blank lines
            if not row:
                continue

            # Skip the header
            if row[0] == "URL":
                continue

            if len(row) != 3:
                raise ValueError(
                    f"line {csv_reader.line_num}: expected 3 columns "
                    f"(URL, tokens, labels), got {len(row)}"
                )

            url, tokens_str, labels_str = row
            tokens = tokens_str.split(' ')
            # Convert the labels to integer ids right away
            labels = [label_map[label] for label in labels_str.split(' ')]

            # Group sentences and labels by base URL
            group = data_by_url.setdefault(get_base_url(url), {'sentences': [], 'labels': []})
            group['sentences'].append(tokens)
            group['labels'].append(labels)

            row_count += 1

    return data_by_url

In [23]:
# Load and group the data by base URL
data_by_url = read_csv_file_grouped_by_base_url("../data/100000_data_ready_for_training.csv")

# Get the list of unique base URLs
base_urls = list(data_by_url.keys())

# Perform the train/test split on the base URLs (instead of individual entries),
# so that all entries sharing a base URL end up on the same side of the split.
train_urls, test_urls = train_test_split(base_urls, test_size=0.15, random_state=42)
assert set(train_urls).isdisjoint(test_urls), "a base URL appears in both train and test"

# Now collect the entries of each split from its base URLs
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

for base_url in train_urls:
    train_sentences.extend(data_by_url[base_url]['sentences'])
    train_labels.extend(data_by_url[base_url]['labels'])

for base_url in test_urls:
    test_sentences.extend(data_by_url[base_url]['sentences'])
    test_labels.extend(data_by_url[base_url]['labels'])

# Every sentence must still be paired with exactly one label sequence
assert len(train_sentences) == len(train_labels)
assert len(test_sentences) == len(test_labels)

print(f"Training entries: {len(train_sentences)}, Testing entries: {len(test_sentences)}")


Training entries: 3410, Testing entries: 590


In [24]:
def tokenize_and_align_labels(train_sentences, train_labels):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Uses the module-level ``tokenizer`` and the ``label_all_tokens`` flag.

    Args:
        train_sentences: List of sentences, each a list of words.
        train_labels: List of label-id lists, one per sentence, holding one
            label per word.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per sentence, aligned with the produced tokens. Positions
        that must be ignored by the loss are set to -100.
    """
    tokenized_inputs = tokenizer(train_sentences, truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(train_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None, and a word without a
            # corresponding label has an index past the end of `label`. Both get
            # -100 so they are ignored in the loss function. The range check
            # covers every sub-token of the word, so the label_all_tokens
            # branch below cannot index out of range.
            if word_idx is None or word_idx >= len(label):
                label_ids.append(-100)
            # If this is the first token of a word, use the corresponding label
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100,
            # depending on the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [25]:
# Load the tokenizer that belongs to the checkpoint and make sure it is a fast tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# Register the marker tokens as additional special tokens
special_tokens = ['[URL]', '[TITLE]', '[TEXT]', '<NO_TITLE>', '<NO_URL>']
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

# Tokenize and align labels for the training split first, then the test split
train_data, test_data = (
    tokenize_and_align_labels(sentences, tags)
    for sentences, tags in (
        (train_sentences, train_labels),
        (test_sentences, test_labels),
    )
)

# Wrap the tokenized splits in Hugging Face Dataset objects
train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)



In [None]:
# Show the first encoded example of each split as a quick sanity check
for split in (train_dataset, test_dataset):
    print(split[0])

In [26]:
# Load the pretrained model with one output class per entry of label_map
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_map),
)
# The tokenizer gained special tokens, so the embedding matrix has to match its new size
model.resize_token_embeddings(len(tokenizer))

# The output directory is named after the last path component of the checkpoint
model_name = model_checkpoint.rsplit("/", 1)[-1]
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before changing it.
args = TrainingArguments(
    output_dir=f"{model_name}-for-product-extraction",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

# Collator for token-classification batches, built from the tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric used by compute_metrics
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute precision, recall, F1 and accuracy for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (the class is taken with ``argmax`` over axis 2) and
            ``labels`` holds the reference label ids, with -100 marking
            positions to ignore.

    Returns:
        A dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``, taken from the ``overall_*`` entries returned by the
        module-level ``metric``.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, reference_id in zip(predicted_row, reference_row):
            # Positions labelled -100 are ignored
            if reference_id == -100:
                continue
            kept_predictions.append(reverse_label_map[predicted_id])
            kept_labels.append(reverse_label_map[reference_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# Number of examples in the training and test datasets
print(*(len(split) for split in (train_dataset, test_dataset)))

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


3410 590


In [27]:
# Bundle the model, arguments, data and metrics into a Trainer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then report the metrics of a final evaluation pass
trainer.train()

final_metrics = trainer.evaluate()
print(final_metrics)

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.091426,0.573857,0.745139,0.648377,0.971562
2,No log,0.088747,0.627064,0.818182,0.709986,0.973304
3,0.109500,0.083623,0.639351,0.787178,0.705605,0.975437


{'eval_loss': 0.08362289518117905, 'eval_precision': 0.6393512590695689, 'eval_recall': 0.7871781397792958, 'eval_f1': 0.7056052755534621, 'eval_accuracy': 0.9754369083532716, 'eval_runtime': 7.1268, 'eval_samples_per_second': 82.786, 'eval_steps_per_second': 5.192, 'epoch': 3.0}
