## 1. Install dependecies

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install datasets
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

### 2. Access and Prepare data for modelling

2.1. Access to data and load into DatasetDict

Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

#change current directory

import os

os.chdir("/content/drive/MyDrive/data")
directory = os.getcwd()
print(directory)

Mounted at /content/drive
/content/drive/MyDrive/data


Access, Convert bio file to a csv ready for load_dataset function and define utility dictionaries

In [None]:
# convert bio to a csv ready for load_dataset function
def preprocess_bio2csv(file_name,delimiter):

  input_train = open(file_name + ".bio")
  input_train_right_format= open(file_name + '.csv', 'w')
  i=1


  input_train_right_format.write("tokens;ner_tags" + "\n")

  words = "" # will include the sequence
  labels = "" # will include NER tags
  line_to_add = ""
  label_count = {}  #count ner labels
  label2int = {"O":0} # associate integer to ner labels
  ner_labels = ["O"] # list of labels in right order
  ner_entities = ["O"] # ner entities

  for line in input_train:
    #start of a new sequence
    if len(line)<=1:
      line_to_add = words + ";" + labels
      input_train_right_format.write(line_to_add.strip()+ "\n") #strip removes the begin and end of line or string
      words = ""
      labels = ""
    else:
      word,label = line.strip().split(delimiter)
      words= words + " " + word
      labels = labels + " " + label

      # Update info on ner labels

      if label != "O":
        entity = label.split("-")[1]
      else:
        entity = "O"

      if entity not in ner_entities:
        ner_entities.append(entity)
        label2int["B-"+entity] = len(ner_labels)
        label2int["I-"+entity] = len(ner_labels) + 1
        ner_labels.append("B-"+entity)
        ner_labels.append("I-"+entity)

      label_count[label] = label_count.get(label,0) + 1


  input_train.close()
  input_train_right_format.close()
  return label_count, label2int, ner_labels, ner_entities

label_count, label2int, ner_labels, ner_entities = preprocess_bio2csv("/content/drive/MyDrive/data/ner_complete"," -X- _ ")

In [None]:
print(label_count, label2int, ner_labels, ner_entities)

{'O': 2426, 'B-Statistic': 339, 'B-Metric': 393, 'B-Room': 398, 'I-Room': 878, 'B-Period': 397, 'I-Period': 1599, 'I-Statistic': 50, 'I-Metric': 2} {'O': 0, 'B-Statistic': 1, 'I-Statistic': 2, 'B-Metric': 3, 'I-Metric': 4, 'B-Room': 5, 'I-Room': 6, 'B-Period': 7, 'I-Period': 8} ['O', 'B-Statistic', 'I-Statistic', 'B-Metric', 'I-Metric', 'B-Room', 'I-Room', 'B-Period', 'I-Period'] ['O', 'Statistic', 'Metric', 'Room', 'Period']


Load csv file in a DatasetDict

In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("csv", data_files="/content/drive/MyDrive/data/ner_complete.csv",delimiter=";")

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 399
    })
})


In [None]:
print(raw_datasets["train"][0])

{'tokens': 'À quoi pourrait ressembler la médiane de la luminosité dans la salle Turing pour les trois prochains mois ?', 'ner_tags': ' O O O O O B-Statistic O O B-Metric B-Room I-Room I-Room I-Room B-Period I-Period I-Period I-Period I-Period O'}


2.2. Preprocess DatasetDict to be ready for modelling

Remove duplicates as ChatGPT generated duplicate questions that were not identified at first glance

In [None]:
#remove duplicates
import datasets
import pandas as pd
from datasets import DatasetDict
from datasets import Dataset
tmp = pd.DataFrame(raw_datasets["train"])
tmp=tmp.drop_duplicates()
raw_datasets = DatasetDict({'train': Dataset.from_pandas(tmp)})
raw_datasets = raw_datasets.remove_columns("__index_level_0__")

In [None]:
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 249
    })
})


Split data into train, validation and test

In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding

#raw_datasets = load_dataset("csv",data_files = "/content/drive/MyDrive/data/annotations_ner_bio.csv",delimiter=";")


#split in train, validation and test
raw_datasets_split= raw_datasets["train"].train_test_split(test_size=0.2,seed=1234)
val_test= raw_datasets_split["test"]
val_test= val_test.train_test_split(test_size=0.5,seed=1234)
raw_datasets_final= DatasetDict({"train":raw_datasets_split["train"],"validation": val_test["train"],"test":val_test["test"]})

In [None]:
print(raw_datasets_final)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 199
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 25
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 25
    })
})


In [None]:
print(raw_datasets_final["train"][0])

{'tokens': 'Peux-tu prédire le niveau de CO2 dans la cafétariat ?', 'ner_tags': ' O B-Period O O O B-Metric O O B-Room O'}


Convert sequences strings into lists of elements and labels to numerical values

In [None]:
def string2lists_tokens(example):
    return {"tokens": example["tokens"].split()}

def listsstr2listsint_ner(example):
    example["ner_tags"] = example["ner_tags"].split()
    example["ner_tags"] = [label2int[element] for element in example["ner_tags"]]
    return {"ner_tags": example["ner_tags"]}

dataset = raw_datasets_final.map(string2lists_tokens)
dataset = dataset.map(listsstr2listsint_ner)

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [None]:
dataset["train"][0]

{'tokens': ['Peux-tu',
  'prédire',
  'le',
  'niveau',
  'de',
  'CO2',
  'dans',
  'la',
  'cafétariat',
  '?'],
 'ner_tags': [0, 7, 0, 0, 0, 3, 0, 0, 5, 0]}

Rename to use same names as the one's on Huggin Face plateform

In [None]:
label_names = ner_labels
raw_datasets = dataset

Tokenize

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "camembert/camembert-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer.is_fast

Downloading (…)lve/main/config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/809k [00:00<?, ?B/s]

True

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['<s>',
 '▁P',
 'eux',
 '-',
 'tu',
 '▁prédire',
 '▁le',
 '▁niveau',
 '▁de',
 '▁CO',
 '2',
 '▁dans',
 '▁la',
 '▁café',
 't',
 'ariat',
 '▁?',
 '</s>']

In [None]:
inputs.word_ids()

[None, 0, 0, 0, 0, 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 8, 9, None]

Align Tokens and NER labels again as there are not aligned anymore due to (1) subword tokenization and special tokens added ([CLS] for beinning and [SEP] for the end with BERT, but can be different)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [None]:
# in the example below digits 3 and 4 refers to the bio labelling of the same entity (the I part follows the B in the dict)
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 7, 0, 0, 0, 3, 0, 0, 5, 0]
[-100, 0, 0, 0, 0, 7, 0, 0, 0, 3, 4, 0, 0, 5, 6, 6, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

Import dynamic padding to handle sentences of different lengths

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
#test of data collator: -100 are added to get the same length
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    0,    0,    0,    0,    7,    0,    0,    0,    3,    4,    0,
            0,    5,    6,    6,    0, -100, -100, -100, -100, -100],
        [-100,    0,    0,    0,    0,    0,    0,    0,    3,    5,    6,    6,
            6,    6,    7,    8,    8,    8,    8,    0,    0, -100]])

In [None]:
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 0, 0, 0, 0, 7, 0, 0, 0, 3, 4, 0, 0, 5, 6, 6, 0, -100]
[-100, 0, 0, 0, 0, 0, 0, 0, 3, 5, 6, 6, 6, 6, 7, 8, 8, 8, 8, 0, 0, -100]


# 3. Modelling

3.1. Design performance metrics

In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=104b4524c631e9c6c4051422a11c7a77df245d27841bc5bf416d2a92ff9d814c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Test performance metric on a toy example

In [None]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels
predictions = labels.copy()
predictions[2] = "O"
metric.compute(predictions=[predictions], references=[labels])

{'Metric': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Period': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'Room': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

3.2. Define model

In [None]:
label2id = label2int
id2label = {i: label for i, label in enumerate(label_names)}


In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

Downloading pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.config.num_labels

9

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.592418,0.640777,0.709677,0.673469,0.894249
2,No log,0.364025,0.567797,0.72043,0.635071,0.912801
3,No log,0.331154,0.596491,0.731183,0.657005,0.920223


TrainOutput(global_step=75, training_loss=0.6219356791178385, metrics={'train_runtime': 63.2814, 'train_samples_per_second': 9.434, 'train_steps_per_second': 1.185, 'total_flos': 30495688505928.0, 'train_loss': 0.6219356791178385, 'epoch': 3.0})

## Custom data loop

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [None]:
from tqdm.auto import tqdm
import torch

output_dir = "/content/drive/MyDrive/data/ner_chatbot"

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 0: {'precision': 0.7857142857142857, 'recall': 0.6814159292035398, 'f1': 0.7298578199052133, 'accuracy': 0.9360902255639098}
epoch 1: {'precision': 0.7857142857142857, 'recall': 0.6311475409836066, 'f1': 0.7, 'accuracy': 0.9304511278195489}
epoch 2: {'precision': 0.7448979591836735, 'recall': 0.5703125, 'f1': 0.6460176991150443, 'accuracy': 0.924812030075188}


In [None]:
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "/content/drive/MyDrive/data/ner_chatbot"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Peux-tu me dire la température actuelle dans la salle Turing ?")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'Metric',
  'score': 0.97089064,
  'word': 'température',
  'start': 18,
  'end': 30},
 {'entity_group': 'Room',
  'score': 0.8848248,
  'word': 'salle',
  'start': 47,
  'end': 53},
 {'entity_group': 'Room',
  'score': 0.8468303,
  'word': 'ing',
  'start': 57,
  'end': 60}]

In [None]:
token_classifier("Quel était la moyenne du co2 à l'extérieur ces 10 derniers jours ?")

[{'entity_group': 'Statistic',
  'score': 0.97657245,
  'word': 'moyenne',
  'start': 13,
  'end': 21},
 {'entity_group': 'Metric',
  'score': 0.9624163,
  'word': 'co2',
  'start': 24,
  'end': 28},
 {'entity_group': 'Room',
  'score': 0.7957276,
  'word': "à l'extérieur",
  'start': 28,
  'end': 42},
 {'entity_group': 'Period',
  'score': 0.97877574,
  'word': 'ces 10 derniers jours',
  'start': 42,
  'end': 64}]

In [None]:
from google.colab import files
files.download('/content/drive/MyDrive/data/ner_chatbot/pytorch_model.bin')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>