In [29]:
import random
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import ast
from collections import Counter

from transformers import DistilBertTokenizer, DistilBertForTokenClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import precision_score, recall_score, f1_score

print("Libraries imported")

Libraries imported


In [30]:
model_path = "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/models/distilbert_latest/"

print("paths defined")

paths defined


In [31]:
# Chargement du tokenizer DistilBERT
def load_tokenizer_with_progress(model_name):
    with tqdm(total=100, desc="Loading DistilBERT tokenizer") as pbar:
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        pbar.update(100)
    return tokenizer


tokenizer = load_tokenizer_with_progress("distilbert-base-uncased")
print("\nDistilBERT tokenizer loaded successfully")

Loading DistilBERT tokenizer: 100%|██████████| 100/100 [00:01<00:00, 84.66it/s]


DistilBERT tokenizer loaded successfully





In [32]:
# Chargement du modèle DistilBERT pour la classification par tokens
model = DistilBertForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
print("\nDistilBERT model loaded successfully")

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



DistilBERT model loaded successfully


In [33]:
# Chargement des données
def load_multiple_datasets(dataset_paths):
    dataframes = []
    for i, path in enumerate(dataset_paths):
        print(f"Loading dataset {i+1}/{len(dataset_paths)}: {path}")
        dataset = pd.read_csv(
            path,
            delimiter=';',
            quotechar='"',
            names=["text", "tokens", "ner_tags", "spacy_ner_tags"]
        )
        dataframes.append(dataset)

    combined_dataset = pd.concat(dataframes, ignore_index=True)
    return combined_dataset


dataset_paths = [
    "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv",
    "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv",
    "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv",
    "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv"
]

dataset = load_multiple_datasets(dataset_paths)
print(f"Total dataset loaded successfully: {len(dataset)} rows")

Loading dataset 1/4: C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv
Loading dataset 2/4: C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv
Loading dataset 3/4: C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv
Loading dataset 4/4: C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/token_classification/dataset_token_classification_1.csv
Total dataset loaded successfully: 435716 rows


In [34]:
# Normalisation et tokenisation avec DistilBERT
def safe_eval(val):
    try:
        val = val.replace('""', '"').replace("'", '"')
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return None


def is_entity_well_aligned(text, start, end):
    if start > 0 and text[start - 1].isalnum():
        return False
    if end < len(text) and text[end].isalnum():
        return False
    return True

In [36]:
# Préparation des données d'entraînement
print("Preparing training data...")
CONVERTED_TRAIN_DATA = []

for _, item in tqdm(dataset.iterrows(), total=len(dataset)):
    tokens_str = item['tokens'].replace("'", '"')
    spacy_ner_tags_str = item['spacy_ner_tags'].replace("'", '"') if pd.notnull(item['spacy_ner_tags']) else None

    tokens = safe_eval(tokens_str)
    if tokens is None:
        continue

    # Créer la liste de labels avec une taille correspondant à celle des tokens
    labels = [0] * len(tokens)  # Par défaut, toutes les entités sont non-entities

    if spacy_ner_tags_str:
        annotations = safe_eval(spacy_ner_tags_str)
        if annotations:
            for annotation in annotations:
                # Vérifier la validité des indices
                start, end, label = annotation['start'], annotation['end'], annotation['label']
                
                # Token alignement - Assurez-vous que les tokens sont correctement indexés
                token_start = 0
                for idx, token in enumerate(tokens):
                    token_end = token_start + len(token)  # Déterminer la fin de chaque token
                    if token_start <= start < token_end:
                        # L'annotation commence dans ce token
                        labels[idx] = 1
                    token_start = token_end + 1  # Passer au token suivant

    CONVERTED_TRAIN_DATA.append({
        "tokens": tokens,
        "labels": labels
    })

# Tokeniser les données
tokenized_texts = []
tokenized_labels = []

for item in tqdm(CONVERTED_TRAIN_DATA):
    encoding = tokenizer(item['tokens'], is_split_into_words=True, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
    tokenized_texts.append(encoding)
    # Ajuster les labels en fonction de la tokenisation
    labels_padded = item['labels'][:64] + [0] * (64 - len(item['labels'][:64]))
    tokenized_labels.append(torch.tensor(labels_padded, dtype=torch.long))

Preparing training data...


100%|██████████| 435716/435716 [02:38<00:00, 2752.79it/s]
100%|██████████| 334704/334704 [13:01<00:00, 428.24it/s]  


In [37]:
# Séparation en ensembles d'entraînement, test et validation
train_texts, test_valid_texts = train_test_split(tokenized_texts, test_size=0.5, random_state=42)
valid_texts, test_texts = train_test_split(test_valid_texts, test_size=0.5, random_state=42)

print(f"Train data: {len(train_texts)} examples")
print(f"Test data: {len(train_texts)} examples")
print(f"Validation data: {len(valid_texts)} examples")

Train data: 167352 examples
Test data: 167352 examples
Validation data: 83676 examples


In [41]:
from sklearn.metrics import precision_score, recall_score, f1_score
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer, AdamW


# Fonction pour les métriques
def compute_metrics(pred):
    predictions, labels = pred
    predictions = torch.argmax(torch.tensor(predictions), dim=-1).numpy()
    precision = precision_score(labels, predictions, zero_division=1)
    recall = recall_score(labels, predictions, zero_division=1)
    f1 = f1_score(labels, predictions, zero_division=1)
    return {"precision": precision, "recall": recall, "f1": f1}


# Initialiser le tokenizer et le modèle
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Configurer l'entraînement
training_args = TrainingArguments(
    output_dir="/chemin/vers/mon/dossier",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    evaluation_strategy="epoch",
)

# Créer un Optimizer personnalisé
optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_texts,
    eval_dataset=valid_texts,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, None),
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


ImportError: cannot import name 'tarfile' from 'backports' (C:\Users\vikne\anaconda3\Lib\site-packages\backports\__init__.py)

In [None]:
# Sauvegarde du modèle
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Visualisation
def draw_scores(metrics_hist):
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))
    sns.lineplot(x=list(range(len(metrics_hist))), y=metrics_hist, ax=axs[0, 0])
    axs[0, 0].set_title('Precision, Recall, F1')

draw_scores([])
plt.tight_layout()
plt.show()

In [None]:
# Test du modèle
test_sentences = [
    "Je pars de Paris et j'arrive à Marseille.",
    "Je vais à Bordeaux en partant de Toulouse."
]

def test_model(model, tokenizer, sentences):
    for sentence in sentences:
        tokens = tokenizer(sentence, return_tensors="pt")
        with torch.no_grad():
            predictions = model(**tokens)
        print(predictions)

test_model(model, tokenizer, test_sentences)