In [1]:
import os
import ast
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from datasets import load_dataset, DatasetDict, Sequence, ClassLabel, Dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForTokenClassification,
    CamembertTokenizerFast,
    CamembertTokenizer,
    CamembertForSequenceClassification
)
from seqeval.metrics import precision_score, recall_score, f1_score

from torch import cuda


# Setup device
device = 'cuda' if cuda.is_available() else 'cpu'
print(f"Running on device: {device}")

print("imported successfully !!!")


Running on device: cpu
imported successfully !!!


In [2]:
# Label list
label_list = ["O", "B-DEP", "I-DEP", "B-ARR", "I-ARR"]

# Path to dataset
path = "C:/Users/Vzhan/OneDrive/Bureau/Master/Epitech/TravelOrder/Travel-Order-Resolver/ai/nlp/dataset/tokens/token.csv"

# Load dataset
dataset = load_dataset('csv', data_files={'train': path}, delimiter=';')
print("Dataset loaded:", dataset["train"].column_names)

# Remove the column "spacy_ner_tags"
dataset["train"] = dataset["train"].remove_columns("spacy_ner_tags")

print("Dataset loaded:", dataset["train"].column_names)

Dataset loaded: ['text', 'tokens', 'ner_tags', 'spacy_ner_tags']
Dataset loaded: ['text', 'tokens', 'ner_tags']


In [3]:
def is_valid_row(row):
    """
    Validates whether the 'tokens' and 'ner_tags' fields in a row are properly formatted.
    """
    try:
        # Attempt to parse 'tokens' and 'ner_tags' as Python lists
        tokens = ast.literal_eval(row['tokens'])
        ner_tags = ast.literal_eval(row['ner_tags'])

        # Ensure 'tokens' is a list of strings
        if not isinstance(tokens, list) or not all(isinstance(t, str) for t in tokens):
            return False

        # Ensure 'ner_tags' is a list of integers
        if not isinstance(ner_tags, list) or not all(isinstance(tag, int) for tag in ner_tags):
            return False

        return True  # Row is valid
    except (ValueError, SyntaxError):
        return False  # Row is invalid

# Apply the filter to remove invalid rows
dataset = dataset.filter(is_valid_row)

print(f"Filtered dataset size: {len(dataset)} rows")

Filtered dataset size: 1 rows


In [4]:
# Evaluate tokens and ner_tags as lists
dataset = dataset.map(lambda line: {'tokens': ast.literal_eval(line['tokens'])})
dataset = dataset.map(lambda line: {'ner_tags': ast.literal_eval(line['ner_tags'])})

# Cast ner_tags to ClassLabel with all labels present in ner_tags
dataset = dataset.cast_column("ner_tags", Sequence(feature=ClassLabel(num_classes=len(label_list), names=label_list)))

# Limit the dataset to 80,000 rows
dataset["train"] = dataset["train"].select(range(80000))

# Shuffle train dataset, and pick 30% of it
train_data = dataset['train']
train_data = train_data.shuffle(seed=42)
train_data = train_data.train_test_split(test_size=0.7)['train']

# Split the dataset into train, test, and validation
train_test_valid = train_data.train_test_split(test_size=0.25)
test_valid = train_test_valid['test'].train_test_split(test_size=0.7)

dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

print("Split realized")

Split realized


In [5]:

# Initialize tokenizer
model_checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    label_all_tokens = False  # Define this variable to specify how labels are assigned

    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, padding=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                # If label_all_tokens is True, use the label for all tokens of a word, otherwise, use -100 for sub-tokens
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

dataset = dataset.map(tokenize_and_align_labels, batched=True)

print("tokenized and aligned")



Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

tokenized and aligned


In [6]:
# Prepare model
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, 
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

print("model preparation finished")

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model preparation finished


In [7]:
# Define training arguments
batch_size = 16
epochs = 3
args = TrainingArguments(
    output_dir="models/camembert-finetuned-token-classification-ner-trip",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01
)

print("training args defined")

training args defined


In [None]:
# Data collator
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

# Define evaluation metric using seqeval library
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Handle padding (i.e., -100) properly in labels
    y_pred = [
        [label_list[p] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    y_true = [
        [label_list[l] for p, l in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Ensure that the lengths of y_pred and y_true are consistent
    if len(y_pred) != len(y_true):
        raise ValueError(f"Inconsistent lengths: {len(y_pred)} != {len(y_true)}")

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    accuracy = (np.array(y_pred) == np.array(y_true)).mean()

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

# Initialize Trainer
trainer = Trainer(
    model,
    args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

print("model trained")

Epoch,Training Loss,Validation Loss


In [None]:
# Save the model
trainer.save_model("models/camembert-finetuned-token-classification-ner-trip")

# Evaluate model on train and validation sets
train_metrics = trainer.evaluate(dataset["train"])
validation_metrics = trainer.evaluate(dataset["valid"])

# Predict on the test set
predictions, labels, _ = trainer.predict(dataset["test"])
predictions = np.argmax(predictions, axis=2)

y_pred = [
    [label_list[p] for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
y_true = [
    [label_list[l] for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

print("done")

In [None]:
# Compute evaluation results using seqeval
results = {
    "precision": precision_score(y_true, y_pred),
    "recall": recall_score(y_true, y_pred),
    "f1": f1_score(y_true, y_pred),
    "accuracy": (np.array(y_pred) == np.array(y_true)).mean()
}

print(f"Results: {results}")

In [None]:
# Display confusion matrix
flat_y_true = [item for sublist in y_true for item in sublist]
flat_y_pred = [item for sublist in y_pred for item in sublist]

cm = confusion_matrix(flat_y_true, flat_y_pred, labels=label_list)

sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=label_list, yticklabels=label_list)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Display confusion matrix without label "O"
def filter_label(label_to_exclude, true_labels, predicted_labels):
    filtered_true = [true for true, pred in zip(true_labels, predicted_labels) if pred != label_to_exclude]
    filtered_pred = [pred for pred in predicted_labels if pred != label_to_exclude]
    return filtered_true, filtered_pred

filtered_y_true, filtered_y_pred = filter_label("O", y_true, y_pred)

flat_filtered_y_true = [item for sublist in filtered_y_true for item in sublist]
flat_filtered_y_pred = [item for sublist in filtered_y_pred for item in sublist]

cm = confusion_matrix(flat_filtered_y_true, flat_filtered_y_pred, labels=label_list[1:])

sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=label_list[1:], yticklabels=label_list[1:])
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix (Filtered "O" label)')
plt.show()