In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import os
import ast
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets
from datasets import ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import evaluate  # Corrected import
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report as seqeval_classification_report

# Tag set of the token-classification task. The list order defines the integer
# ids used by ClassLabel below and by the model head.
# NOTE(review): DEP/ARR presumably stand for departure/arrival - confirm.
label_list = ["O", "B-DEP", "I-DEP", "B-ARR", "I-ARR"]

path = "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/tokens/"

# Upper bound on the number of rows kept before sub-sampling
MAX_ROWS = 80000
# One seed for the shuffle and every split, so the partition is reproducible
SEED = 42

# Load every CSV file of the directory. The listing is sorted because
# os.listdir returns entries in arbitrary order, which would otherwise change
# which rows survive the MAX_ROWS cut from one machine to another.
csv_parts = [
    load_dataset('csv', data_files=os.path.join(path, file), delimiter=';')["train"]
    for file in sorted(os.listdir(path))
    if file.endswith(".csv")
]
if not csv_parts:
    raise FileNotFoundError(f"No CSV file found in {path}")

# Concatenate once instead of growing an initially empty dataset file by file
dataset = DatasetDict({'train': concatenate_datasets(csv_parts)})

# Remove the column "spacy_ner_tags"
dataset["train"] = dataset["train"].remove_columns("spacy_ner_tags")

# tokens and ner_tags are stored as Python-literal strings in the CSV files;
# parse both columns into real lists in a single pass
dataset = dataset.map(
    lambda line: {
        'tokens': ast.literal_eval(line['tokens']),
        'ner_tags': ast.literal_eval(line['ner_tags']),
    }
)

# Cast ner_tags to ClassLabel with all labels present in ner_tags
dataset = dataset.cast_column("ner_tags", Sequence(feature=ClassLabel(num_classes=len(label_list), names=label_list)))

# Limit the dataset to MAX_ROWS rows (or fewer when less data is available,
# since selecting indices past the end raises)
dataset["train"] = dataset["train"].select(range(min(MAX_ROWS, len(dataset["train"]))))

# Shuffle train dataset, and pick 30% of it
train_data = dataset['train']
train_data = train_data.shuffle(seed=SEED)
train_data = train_data.train_test_split(test_size=0.7, seed=SEED)['train']

# Split the dataset into train (75%), then split the held-out 25% into
# test (70% of it) and validation (30% of it)
train_test_valid = train_data.train_test_split(test_size=0.25, seed=SEED)
test_valid = train_test_valid['test'].train_test_split(test_size=0.7, seed=SEED)

dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']
})

# Initialize the tokenizer from the same checkpoint name that is passed to
# AutoModelForTokenClassification.from_pretrained further down
model_checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize and align labels
def tokenize_and_align_labels(examples, label_all_tokens=False):
    """Tokenize pre-split words and align word-level tags with sub-tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            example) and a parallel "ner_tags" entry (one list of tag ids per
            example, indexed by word position).
        label_all_tokens: When True, every sub-token of a word receives the
            word's tag. When False (default), only the first sub-token is
            labelled and the following ones get -100.

    Returns:
        The tokenizer output with an added "labels" entry holding, for each
        example, one label id per produced token.
    """
    # No padding here: padding=True would pad each example to the longest one
    # of the whole map batch and store all of that padding. The
    # DataCollatorForTokenClassification given to the Trainer pads inputs and
    # labels per training batch instead.
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have a word id that is None. We set the label
                # to -100 so they are automatically ignored in the loss function.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word carries the word's tag
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token of the same word
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run tokenization and label alignment over every split, batch by batch
dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Lookup tables between tag names and integer ids, handed to the model config
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in enumerate(label_list)}

# Pretrained encoder with a token-classification head sized to the tag set
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    num_labels=len(label_list),
)

# Batch collator used by the Trainer
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Training hyper-parameters; evaluation runs once per epoch
batch_size = 16
epochs = 3
args = TrainingArguments(
    output_dir="models/camembert-finetuned-token-classification-ner-trip",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Define evaluation metric using seqeval library
def compute_metrics(p):
    """Compute seqeval precision/recall/F1 and token accuracy for the Trainer.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores indexed as [example, position, class] (argmax is taken over
            axis 2); ``labels`` holds the reference ids indexed as
            [example, position], with -100 on positions to ignore.

    Returns:
        Dict with "precision", "recall", "f1" (entity-level, from seqeval)
        and "accuracy" (fraction of non-ignored positions whose predicted id
        equals the reference id).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Positions labelled -100 (special tokens, continuation sub-tokens,
    # padding) are excluded from every metric
    keep = labels != -100

    # seqeval expects one list of tag names per sentence
    y_pred = [
        [label_list[idx] for idx in row[mask]]
        for row, mask in zip(predictions, keep)
    ]
    y_true = [
        [label_list[idx] for idx in row[mask]]
        for row, mask in zip(labels, keep)
    ]

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Accuracy is computed on the flat masked ids. Building np.array(y_pred)
    # from the per-sentence lists fails because the sentences have different
    # lengths (ragged input raises ValueError in NumPy).
    if keep.any():
        accuracy = float((predictions[keep] == labels[keep]).mean())
    else:
        accuracy = 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

# Build the Trainer: training runs on the train split, and the per-epoch
# evaluation uses the validation split
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Persist the fine-tuned model
trainer.save_model("models/camembert-finetuned-token-classification-ner-trip")

# Metrics of the final model on the train split, then on the validation split
train_metrics, validation_metrics = [
    trainer.evaluate(dataset[split]) for split in ("train", "valid")
]

# Predict on the test set
predictions, labels, _ = trainer.predict(dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Per-sentence tag names, skipping positions whose reference label is -100
y_pred = [
    [label_list[p] for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
y_true = [
    [label_list[l] for p, l in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Flat token-level views, used for accuracy and for the confusion matrix
flat_y_true = [item for sublist in y_true for item in sublist]
flat_y_pred = [item for sublist in y_pred for item in sublist]

# Token accuracy is computed on the flat lists: the per-sentence lists have
# different lengths, so np.array(y_pred) == np.array(y_true) raises a
# ValueError (inhomogeneous shape) instead of comparing element-wise.
matching_tokens = sum(true == pred for true, pred in zip(flat_y_true, flat_y_pred))
token_accuracy = matching_tokens / len(flat_y_true) if flat_y_true else 0.0

# Compute evaluation results using seqeval
results = {
    "precision": precision_score(y_true, y_pred),
    "recall": recall_score(y_true, y_pred),
    "f1": f1_score(y_true, y_pred),
    "accuracy": token_accuracy
}

print(f"Results: {results}")

# Display confusion matrix
cm = confusion_matrix(flat_y_true, flat_y_pred, labels=label_list)

sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=label_list, yticklabels=label_list)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

# Display confusion matrix without label "O"
def filter_label(label_to_exclude, true_labels, predicted_labels):
    """Drop every (true, predicted) pair whose predicted label is excluded.

    Accepts either flat sequences of labels or sequences of per-sentence
    label lists; in the nested case each sentence is filtered on its own and
    the nesting is preserved. Pairs are always consumed together, so the two
    returned sequences stay aligned.

    Args:
        label_to_exclude: Label whose predictions are removed.
        true_labels: Reference labels, flat or nested.
        predicted_labels: Predicted labels, same structure as true_labels.

    Returns:
        Tuple ``(filtered_true, filtered_pred)`` of lists with the same
        structure as the inputs.
    """
    filtered_true = []
    filtered_pred = []
    for true, pred in zip(true_labels, predicted_labels):
        if isinstance(pred, (list, tuple)):
            # A whole sentence: comparing the list itself with the label
            # would never match, so filter inside it instead
            inner_true, inner_pred = filter_label(label_to_exclude, true, pred)
            filtered_true.append(inner_true)
            filtered_pred.append(inner_pred)
        elif pred != label_to_exclude:
            filtered_true.append(true)
            filtered_pred.append(pred)
    return filtered_true, filtered_pred

filtered_y_true, filtered_y_pred = filter_label("O", y_true, y_pred)

# Collapse the per-sentence lists into flat token-level lists
flat_filtered_y_true = [tag for sentence in filtered_y_true for tag in sentence]
flat_filtered_y_pred = [tag for sentence in filtered_y_pred for tag in sentence]

# Every label except the first one ("O"); passing it as labels= restricts
# the matrix rows and columns to these classes
entity_labels = label_list[1:]
cm = confusion_matrix(flat_filtered_y_true, flat_filtered_y_pred, labels=entity_labels)

sns.heatmap(
    cm,
    annot=True,
    cmap='Blues',
    fmt='g',
    xticklabels=entity_labels,
    yticklabels=entity_labels,
)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix (Filtered "O" label)')
plt.show()

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[codecarbon INFO @ 00:31:19] [setup] RAM Tracking...
[codecarbon INFO @ 00:31:19] [setup] GPU Tracking...
[codecarbon INFO @ 00:31:19] No GPU found.
[codecarbon INFO @ 00:31:19] [setup] CPU Tracking...
[codecarbon INFO @ 00:31:22] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-8565U CPU @ 1.80GHz
[codecarbon INFO @ 00:31:22] >>> Tracker's metadata:
[codecarbon INFO @ 00:31:22]   Platform system: Windows-10-10.0.26100-SP0
[codecarbon INFO @ 00:31:22]   Python version: 3.11.9
[codecarbon INFO @ 00:31:22]   CodeCarbon version: 2.3.2
[codecarbon INFO @ 00:31:22]   Available RAM : 7.826 GB
[codecarbon INFO @ 00:31:22]   CPU count: 8
[codecarbon INFO @ 00:31:22]   CPU mo

  0%|          | 0/3375 [00:00<?, ?it/s]

[codecarbon INFO @ 00:31:38] Energy consumed for RAM : 0.000012 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 00:31:38] Energy consumed for all CPUs : 0.000060 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 00:31:38] 0.000072 kWh of electricity used since the beginning.
[codecarbon INFO @ 00:31:53] Energy consumed for RAM : 0.000025 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 00:31:53] Energy consumed for all CPUs : 0.000118 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 00:31:53] 0.000142 kWh of electricity used since the beginning.
[codecarbon INFO @ 00:32:08] Energy consumed for RAM : 0.000037 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 00:32:08] Energy consumed for all CPUs : 0.000176 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 00:32:08] 0.000213 kWh of electricity used since the beginning.
[codecarbon INFO @ 00:32:23] Energy consumed for RAM : 0.000049 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 00:32:23] Energy consumed f

{'loss': 0.3205, 'grad_norm': 0.6803662180900574, 'learning_rate': 1.7037037037037038e-05, 'epoch': 0.44}


[codecarbon INFO @ 01:42:11] Energy consumed for RAM : 0.003461 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 01:42:11] Energy consumed for all CPUs : 0.016517 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 01:42:11] 0.019978 kWh of electricity used since the beginning.
[codecarbon INFO @ 01:42:26] Energy consumed for RAM : 0.003473 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 01:42:26] Energy consumed for all CPUs : 0.016576 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 01:42:26] 0.020049 kWh of electricity used since the beginning.
[codecarbon INFO @ 01:42:41] Energy consumed for RAM : 0.003485 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 01:42:41] Energy consumed for all CPUs : 0.016634 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 01:42:41] 0.020119 kWh of electricity used since the beginning.
[codecarbon INFO @ 01:42:56] Energy consumed for RAM : 0.003498 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 01:42:56] Energy consumed f

{'loss': 0.0826, 'grad_norm': 0.1978594809770584, 'learning_rate': 1.4074074074074075e-05, 'epoch': 0.89}


[codecarbon INFO @ 02:53:44] Energy consumed for RAM : 0.006958 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 02:53:44] Energy consumed for all CPUs : 0.033208 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 02:53:44] 0.040166 kWh of electricity used since the beginning.
[codecarbon INFO @ 02:53:59] Energy consumed for RAM : 0.006970 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 02:53:59] Energy consumed for all CPUs : 0.033266 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 02:53:59] 0.040236 kWh of electricity used since the beginning.
[codecarbon INFO @ 02:54:14] Energy consumed for RAM : 0.006982 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 02:54:14] Energy consumed for all CPUs : 0.033324 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 02:54:14] 0.040307 kWh of electricity used since the beginning.
[codecarbon INFO @ 02:54:29] Energy consumed for RAM : 0.006995 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 02:54:29] Energy consumed f

  0%|          | 0/113 [00:00<?, ?it/s]

[codecarbon INFO @ 03:11:45] Energy consumed for RAM : 0.007838 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:11:45] Energy consumed for all CPUs : 0.037409 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 03:11:45] 0.045248 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:12:00] Energy consumed for RAM : 0.007851 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:12:00] Energy consumed for all CPUs : 0.037468 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 03:12:00] 0.045318 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:12:15] Energy consumed for RAM : 0.007863 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:12:15] Energy consumed for all CPUs : 0.037526 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 03:12:15] 0.045389 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:12:30] Energy consumed for RAM : 0.007875 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:12:30] Energy consumed f

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1800,) + inhomogeneous part.

[codecarbon INFO @ 03:14:45] Energy consumed for RAM : 0.007985 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:14:45] Energy consumed for all CPUs : 0.038109 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 03:14:45] 0.046094 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:15:00] Energy consumed for RAM : 0.007997 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:15:00] Energy consumed for all CPUs : 0.038168 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 03:15:00] 0.046165 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:15:15] Energy consumed for RAM : 0.008009 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:15:15] Energy consumed for all CPUs : 0.038226 kWh. Total CPU Power : 14.0 W
[codecarbon INFO @ 03:15:15] 0.046235 kWh of electricity used since the beginning.
[codecarbon INFO @ 03:15:30] Energy consumed for RAM : 0.008022 kWh. RAM Power : 2.9346041679382324 W
[codecarbon INFO @ 03:15:30] Energy consumed f