In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import pandas as pd
import ast
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os

print("Libraries imported")

# Paths
# NOTE(review): both values are absolute paths on one specific Windows machine;
# they must be edited before this cell can run anywhere else.
# dataset_path is the ';'-separated CSV read by load_data().
dataset_path = "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/tokens/token.csv"
# model_path is not referenced anywhere else in the visible part of this file;
# presumably the directory where the trained model should be saved -- TODO confirm.
model_path = "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/models/bilstmcrf/"

print("Paths defined")

# Linear-chain CRF layer implemented in pure PyTorch
class CRF(nn.Module):
    """Linear-chain conditional random field over per-token tag scores.

    ``transitions[i, j]`` is the learned score of moving from tag ``i`` to
    tag ``j``. All methods index their inputs as ``(batch, seq_len, ...)``.

    Masks are expected to be right-padded (all valid positions first) and
    the first position of every sequence is expected to be valid.
    """

    def __init__(self, num_tags, batch_first=True):
        super().__init__()
        self.num_tags = num_tags
        # Kept for interface compatibility; the methods below always treat
        # dimension 0 as the batch dimension.
        self.batch_first = batch_first
        self.transitions = nn.Parameter(torch.randn(num_tags, num_tags))

    def forward(self, emissions, tags, mask):
        """Return the negative log-likelihood of ``tags``, summed over the batch.

        The loss is ``log Z - score(gold path)`` per sequence. Returning the
        raw path score instead (as a quantity to minimise) is unbounded and
        drives the loss towards minus infinity, so the log-partition term
        is required for training to be meaningful.

        Args:
            emissions: float tensor of shape (batch, seq_len, num_tags).
            tags: integer tensor of shape (batch, seq_len) with gold tags.
            mask: tensor of shape (batch, seq_len); non-zero marks real tokens.

        Returns:
            A scalar tensor, always >= 0 up to floating point error.
        """
        gold_scores = self._path_scores(emissions, tags, mask)
        log_partition = self._log_partition(emissions, mask)
        return (log_partition - gold_scores).sum()

    def _calculate_score(self, emissions, tags, mask):
        """Return the unnormalised score of the gold paths, summed over the batch."""
        return self._path_scores(emissions, tags, mask).sum()

    def _path_scores(self, emissions, tags, mask):
        """Return the unnormalised score of each gold path, shape (batch,)."""
        batch_size, seq_len, _ = emissions.size()
        weights = mask.to(emissions.dtype)
        rows = torch.arange(batch_size, device=emissions.device)
        score = torch.zeros(batch_size, dtype=emissions.dtype, device=emissions.device)

        for t in range(seq_len):
            step = emissions[rows, t, tags[:, t]]
            if t > 0:
                step = step + self.transitions[tags[:, t - 1], tags[:, t]]
            # Padded positions contribute nothing to the path score.
            score = score + step * weights[:, t]

        return score

    def _log_partition(self, emissions, mask):
        """Return log-sum-exp of the scores of all tag paths, shape (batch,)."""
        batch_size, seq_len, _ = emissions.size()
        if seq_len == 0:
            return torch.zeros(batch_size, dtype=emissions.dtype, device=emissions.device)

        valid = mask.bool()
        # alpha[b, j]: log-sum-exp of all partial paths ending in tag j.
        alpha = emissions[:, 0]
        for t in range(1, seq_len):
            # (batch, prev_tag, 1) + (prev_tag, next_tag) + (batch, 1, next_tag)
            scores = alpha.unsqueeze(2) + self.transitions + emissions[:, t].unsqueeze(1)
            candidate = torch.logsumexp(scores, dim=1)
            # Freeze alpha once a sequence has run into its padding.
            alpha = torch.where(valid[:, t].unsqueeze(1), candidate, alpha)

        return torch.logsumexp(alpha, dim=1)

    def decode(self, emissions, mask):
        """Return the highest-scoring tag sequence for each batch item (Viterbi).

        Args:
            emissions: float tensor of shape (batch, seq_len, num_tags).
            mask: tensor of shape (batch, seq_len); non-zero marks real tokens.
                ``None`` is treated as "every position is valid".

        Returns:
            A list with one list of tag indices per batch item. Each inner
            list has as many entries as the item has valid positions, so
            padded positions are not reported.
        """
        batch_size, seq_len, num_tags = emissions.size()
        if seq_len == 0:
            return [[] for _ in range(batch_size)]

        if mask is None:
            valid = torch.ones(batch_size, seq_len, dtype=torch.bool, device=emissions.device)
        else:
            valid = mask.bool()

        # Initialisation
        viterbi = emissions[:, 0]  # (batch_size, num_tags)
        backpointers = []
        # Pointer used at padded steps: every tag points back to itself, so
        # backtracking through the padding leaves the chosen tag unchanged.
        identity = torch.arange(num_tags, device=emissions.device).unsqueeze(0)

        for t in range(1, seq_len):
            # (batch, prev_tag, 1) + (prev_tag, next_tag) + (batch, 1, next_tag)
            score = viterbi.unsqueeze(2) + self.transitions + emissions[:, t].unsqueeze(1)
            best_score, best_prev = torch.max(score, dim=1)  # (batch_size, num_tags)
            keep = valid[:, t].unsqueeze(1)
            viterbi = torch.where(keep, best_score, viterbi)
            backpointers.append(torch.where(keep, best_prev, identity))

        # Move everything to Python lists once instead of calling .item()
        # for every step of every sequence.
        lengths = valid.long().sum(dim=1).tolist()
        last_tags = viterbi.argmax(dim=1).tolist()
        history = torch.stack(backpointers).tolist() if backpointers else []

        # Backtrack to recover the best sequences
        best_tags_list = []
        for b in range(batch_size):
            tag = last_tags[b]
            path = [tag]
            for pointers in reversed(history):
                tag = pointers[b][tag]
                path.append(tag)
            path.reverse()
            best_tags_list.append(path[:lengths[b]])

        return best_tags_list


# BiLSTM-CRF model
class BiLSTM_CRF(nn.Module):
    """Embedding -> bidirectional LSTM -> linear projection -> CRF tagger."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags, padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            bidirectional=True,
            batch_first=True,
        )
        # The LSTM is bidirectional, so its output width is twice hidden_dim.
        self.hidden2tag = nn.Linear(in_features=hidden_dim * 2, out_features=num_tags)
        self.crf = CRF(num_tags)

    def _emissions(self, sentences):
        """Map token indices to per-token tag scores (batch_size, seq_len, num_tags)."""
        lstm_out, _ = self.lstm(self.embedding(sentences))
        return self.hidden2tag(lstm_out)

    def forward(self, sentences, tags, mask):
        """Return whatever the CRF layer computes for the gold ``tags``."""
        return self.crf(self._emissions(sentences), tags, mask)

    def predict(self, sentences, mask):
        """Return the tag sequences decoded by the CRF layer, one list per batch item."""
        return self.crf.decode(self._emissions(sentences), mask)

# Load and preprocess the data
def load_data(dataset_path, sample_size=80000):
    """Read the token CSV and return parallel lists of tokens and NER tags.

    The file is read with ';' as separator. When it holds more than
    ``sample_size`` rows, a random subset of that size is drawn with a
    fixed seed (42). The 'tokens' and 'ner_tags' cells are Python list
    literals and are parsed with ``ast.literal_eval``.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists.
    """
    frame = pd.read_csv(dataset_path, sep=";")
    if len(frame) > sample_size:
        frame = frame.sample(n=sample_size, random_state=42)

    # Parse row by row (tokens first, then tags) so a malformed cell is
    # reported in file order.
    parsed_rows = [
        (ast.literal_eval(raw_tokens), ast.literal_eval(raw_tags))
        for raw_tokens, raw_tags in zip(frame['tokens'], frame['ner_tags'])
    ]
    sentences = [tokens for tokens, _ in parsed_rows]
    labels = [ner_tags for _, ner_tags in parsed_rows]
    return sentences, labels

# Load the data (randomly sampled when the file is larger than sample_size)
sentences, labels = load_data(dataset_path, sample_size=80000)

# Build the vocabularies: every new word / tag gets the next free index.
# Index 0 of the word vocabulary is reserved for the padding token.
word_to_idx = {"<PAD>": 0}
tag_to_idx = {}
for sentence in sentences:
    for word in sentence:
        # setdefault only inserts when the word has not been seen yet.
        word_to_idx.setdefault(word, len(word_to_idx))

for label_seq in labels:
    for label in label_seq:
        tag_to_idx.setdefault(label, len(tag_to_idx))

# Convert the data to index sequences
def prepare_data(sentences, labels, word_to_idx, tag_to_idx):
    """Replace every word and label by its index in the given mappings.

    Returns:
        ``(X_data, y_data)``: lists of index lists, in the same order as
        the inputs. A word or label missing from its mapping raises
        ``KeyError``.
    """
    def encode(sequences, mapping):
        encoded = []
        for sequence in sequences:
            encoded.append([mapping[item] for item in sequence])
        return encoded

    X_data = encode(sentences, word_to_idx)
    y_data = encode(labels, tag_to_idx)
    return X_data, y_data

X_data, y_data = prepare_data(sentences, labels, word_to_idx, tag_to_idx)

# Train/test split (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Model hyperparameters
embedding_dim, hidden_dim = 100, 128
num_tags = len(tag_to_idx)
padding_idx = word_to_idx["<PAD>"]

model = BiLSTM_CRF(
    vocab_size=len(word_to_idx),
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_tags=num_tags,
    padding_idx=padding_idx,
)

# Optimizer (the loss itself is the value returned by the model's forward pass)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch training history, appended to by train() and plotted by draw_scores()
losses_hist = []
f1_hist = []
recall_hist = []
precision_hist = []

def calculate_scores(y_true, y_pred):
    """Return token-level ``(precision, recall, f1)`` for two flat tag lists.

    A position counts as a hit when the two tags are equal; positions are
    paired with ``zip``, so extra items in the longer list are never hits.
    Precision divides the hits by ``len(y_pred)`` and recall by
    ``len(y_true)``, so for equally long inputs all three values are the
    plain token accuracy. The 1e-8 terms avoid division by zero on empty
    input.
    """
    eps = 1e-8
    hits = sum(1 for gold, guess in zip(y_true, y_pred) if gold == guess)
    # hits + false positives == len(y_pred); hits + false negatives == len(y_true)
    precision = hits / (len(y_pred) + eps)
    recall = hits / (len(y_true) + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    return precision, recall, f1

def train(model, X_train, y_train, optimizer, num_epochs=5):
    """Train ``model`` one sentence at a time and record per-epoch metrics.

    Each sentence is fed as a batch of size one. After every optimizer
    step the sentence is decoded again so that training-set precision,
    recall and F1 can be computed at the end of the epoch.

    Side effects: appends one value per epoch to the module-level lists
    ``losses_hist``, ``f1_hist``, ``recall_hist`` and ``precision_hist``,
    reads the module-level ``padding_idx`` and prints an epoch summary.
    """
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        y_true_all = []
        y_pred_all = []

        for i in tqdm(range(len(X_train))):
            # Batch of one sentence: shape (1, seq_len)
            sentence = torch.tensor(X_train[i], dtype=torch.long)[None]
            tags = torch.tensor(y_train[i], dtype=torch.long)[None]
            mask = sentence.ne(padding_idx).long()

            # One optimisation step on this sentence
            optimizer.zero_grad()
            loss = model(sentence, tags, mask)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # Decode with the updated weights to track training metrics
            with torch.no_grad():
                predicted_tags = model.predict(sentence, mask)
            y_true_all.extend(y_train[i])
            y_pred_all.extend(predicted_tags[0])  # single item in the batch

        precision, recall, f1 = calculate_scores(y_true_all, y_pred_all)
        for history, value in (
            (losses_hist, total_loss / len(X_train)),
            (f1_hist, f1),
            (recall_hist, recall),
            (precision_hist, precision),
        ):
            history.append(value)

        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(X_train)}, F1: {f1:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}')

# Train the model
train(model, X_train, y_train, optimizer, num_epochs=5)

# Plot the training curves
def draw_scores(losses_hist, f1_hist, recall_hist, precision_hist):
    """Show loss, F1, recall and precision histories on a 2x2 grid of plots."""
    fig, axs = plt.subplots(2, 2, figsize=(12, 8))

    # (axis, series, plot options), in drawing order
    panels = (
        (axs[0, 0], losses_hist, {"label": "Loss"}),
        (axs[0, 1], f1_hist, {"label": "F1 Score", "color": "orange"}),
        (axs[1, 0], recall_hist, {"label": "Recall", "color": "green"}),
        (axs[1, 1], precision_hist, {"label": "Precision", "color": "blue"}),
    )
    for axis, series, options in panels:
        axis.plot(series, **options)

    for axis in axs.flat:
        axis.legend()

    plt.tight_layout()
    plt.show()

# Plot the per-epoch histories filled in by train()
draw_scores(losses_hist, f1_hist, recall_hist, precision_hist)

# Evaluate the model
def test(model, X_test, y_test):
    """Print the token-level accuracy of ``model`` on the given sentences.

    Each sentence is decoded as a batch of size one and its predicted tags
    are compared position by position with the gold tags.

    Args:
        model: object exposing ``eval()`` and ``predict(sentences, mask)``.
        X_test: list of token-index lists.
        y_test: list of gold tag-index lists, parallel to ``X_test``.

    Reads the module-level ``padding_idx`` to build the mask.
    """
    model.eval()
    correct = 0
    total = 0
    for i in tqdm(range(len(X_test))):
        sentence = torch.tensor(X_test[i], dtype=torch.long).unsqueeze(0)
        mask = (sentence != padding_idx).long()

        with torch.no_grad():
            predicted_tags = model.predict(sentence, mask)

        # predict() returns one tag list per batch item; take the only item.
        # Zipping against the outer list would compare ints with lists and
        # never count a match.
        correct += sum(yt == yp for yt, yp in zip(y_test[i], predicted_tags[0]))
        total += len(y_test[i])

    # Avoid ZeroDivisionError when there is nothing to evaluate.
    accuracy = correct / total if total else 0.0
    print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Evaluate the model on the held-out split produced by train_test_split
test(model, X_test, y_test)

Libraries imported
Paths defined


100%|██████████| 64000/64000 [1:36:42<00:00, 11.03it/s]


Epoch 1/5, Loss: -101676.2487624628, F1: 0.06, Precision: 0.06, Recall: 0.06


 98%|█████████▊| 62435/64000 [2:09:15<08:24,  3.10it/s]  