In [1]:
import ast
import os
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
from tqdm import tqdm

# Reaching this line means every import above succeeded.
print("Libraries imported")

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Paths
# NOTE(review): both values are absolute paths on one specific Windows machine;
# they have to be edited before the notebook can run anywhere else.
# CSV file holding the tokenised sentences and their NER annotations.
dataset_path = "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/dataset/tokens/token.csv"
# Directory in which the trained CRF model is saved.
model_path = "C:/Users/vikne/Documents/Master 2/Semestre 9/Intelligence artificielle/Travel-Order-Resolver/ai/nlp/models/crf/"

# Load the data
print("Loading dataset...")
def load_dataset_with_progress(dataset_path):
    """Read the semicolon-separated dataset CSV into a DataFrame.

    The progress bar is purely cosmetic: ``read_csv`` is a single blocking
    call, so the bar jumps from 0 to 100 once the file has been read.

    Args:
        dataset_path: Location of the CSV file.

    Returns:
        A DataFrame with the columns ``text``, ``tokens``, ``ner_tags`` and
        ``spacy_ner_tags``.
    """
    column_names = ["text", "tokens", "ner_tags", "spacy_ner_tags"]
    with tqdm(total=100, desc="Loading dataset") as progress:
        # Column names are supplied explicitly, so pandas treats the first
        # line of the file as data rather than as a header.
        frame = pd.read_csv(
            dataset_path,
            delimiter=';',
            quotechar='"',
            names=column_names,
        )
        progress.update(100)
    return frame

# Read the CSV at `dataset_path`; later cells use the result as `dataset`.
dataset = load_dataset_with_progress(dataset_path)

In [None]:
# Data preprocessing
def safe_eval(val):
    """Parse a Python-literal string (such as a serialised list) into an object.

    The raw text is tried first so that apostrophes inside values (for example
    ``["j'", "arrive"]``) survive. The quote rewrites are only fallbacks for
    cells whose quoting is damaged: first un-doubling ``""``, then additionally
    turning every ``'`` into ``"``.

    Args:
        val: The cell value to parse.

    Returns:
        The parsed object, or ``None`` when ``val`` is not a string (e.g. NaN)
        or when none of the variants is a valid literal.
    """
    if not isinstance(val, str):
        # Missing cells arrive as float NaN; there is nothing to parse.
        return None

    undoubled = val.replace('""', '"')
    candidates = (val, undoubled, undoubled.replace("'", '"'))
    for candidate in candidates:
        try:
            return ast.literal_eval(candidate)
        except (ValueError, SyntaxError):
            continue
    return None

def prepare_data(dataset):
    """Convert the raw DataFrame into ``(tokens, labels)`` training pairs.

    Each row's ``tokens`` and ``spacy_ner_tags`` cells are parsed with
    ``safe_eval``. Rows whose tokens are missing or cannot be parsed are
    skipped. Every token starts with the label ``"O"`` and receives a tag's
    label when the token's start offset falls inside ``[start, end)`` of that
    tag.

    Args:
        dataset: DataFrame with at least the columns ``tokens`` and
            ``spacy_ner_tags``.

    Returns:
        A list of ``(tokens, labels)`` tuples, both lists of equal length.
    """
    sequences = []
    rows = zip(dataset["tokens"], dataset["spacy_ner_tags"])
    for raw_tokens, raw_tags in tqdm(rows, total=len(dataset)):
        # Guard the tokens cell the same way as the tags cell: an empty cell is NaN.
        tokens = safe_eval(raw_tokens) if pd.notnull(raw_tokens) else None
        if tokens is None:
            continue
        spacy_ner_tags = safe_eval(raw_tags) if pd.notnull(raw_tags) else None

        labels = ["O"] * len(tokens)
        if spacy_ner_tags:
            # Start offset of each token, computed once per row.
            # NOTE(review): this assumes the tag offsets refer to the tokens
            # joined by single spaces - confirm against how the spans were produced.
            offsets = []
            position = 0
            for token in tokens:
                offsets.append(position)
                position += len(token) + 1

            for tag in spacy_ner_tags:
                start, end, label = tag['start'], tag['end'], tag['label']
                for i, offset in enumerate(offsets):
                    if start <= offset < end:
                        labels[i] = label
        sequences.append((tokens, labels))
    return sequences

# Build the list of (tokens, labels) pairs from the loaded DataFrame.
print("Preparing data...")
data = prepare_data(dataset)

In [None]:
# CRF features
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The dict describes the token itself (lower-cased form, two suffixes,
    casing and digit flags), flags for sentence start (``BOS``) and end
    (``EOS``), and the lower-cased form and casing flags of the previous and
    next tokens when they exist.

    Args:
        sent: Sequence of token strings.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to values.
    """
    current = sent[i]
    has_previous = i > 0
    has_next = i < len(sent) - 1

    features = {
        'bias': 1.0,
        'word.lower()': current.lower(),
        'word[-3:]': current[-3:],
        'word[-2:]': current[-2:],
        'word.isupper()': current.isupper(),
        'word.istitle()': current.istitle(),
        'word.isdigit()': current.isdigit(),
        # A token is at a sentence boundary exactly when it lacks that neighbour.
        'BOS': not has_previous,
        'EOS': not has_next,
    }

    if has_previous:
        previous = sent[i - 1]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    if has_next:
        following = sent[i + 1]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(labels):
    """Return the label sequence unchanged.

    Exists as the label-side counterpart of ``sent2features``; the very same
    object that was passed in is returned.
    """
    return labels

# Data split: half of the sequences go to training, and the remaining half is
# divided evenly between the test and validation sets.
print("Splitting data...")
train, test_valid = train_test_split(data, test_size=0.5, random_state=42)
test, valid = train_test_split(test_valid, test_size=0.5, random_state=42)

# Each element is a (tokens, labels) pair: features come from the tokens,
# targets from the labels.
X_train = [sent2features(tokens) for tokens, _ in train]
y_train = [sent2labels(tags) for _, tags in train]

X_valid = [sent2features(tokens) for tokens, _ in valid]
y_valid = [sent2labels(tags) for _, tags in valid]

X_test = [sent2features(tokens) for tokens, _ in test]
y_test = [sent2labels(tags) for _, tags in test]

In [None]:
# Train the CRF model
print("Training CRF model...")
crf = CRF(algorithm='lbfgs', max_iterations=100, c1=0.1, c2=0.1, all_possible_transitions=True)

# One fit() call already runs the optimiser for up to `max_iterations`
# iterations. Calling fit() again does not continue training: it retrains from
# scratch on the same data, so an outer "epoch" loop only repeats the same work.
crf.fit(X_train, y_train)

# Validation score of the trained model.
y_pred = crf.predict(X_valid)
f1 = flat_f1_score(y_valid, y_pred, average='weighted')
print(f"F1-Score on validation set: {f1:.4f}")

# Per-iteration training loss, taken from the optimiser log that fit() stores
# on the estimator, so that the loss plot shows an actual curve.
training_log = getattr(crf, "training_log_", None)
losses = [
    step["loss"]
    for step in getattr(training_log, "iterations", [])
    if "loss" in step
]
if not losses:
    # No usable log: fall back to the single validation-based value (1 - F1).
    losses = [1 - f1]

In [None]:
# Final evaluation on the test split
print("\nEvaluating on test set...")
y_pred = crf.predict(X_test)
report = flat_classification_report(y_test, y_pred)
# Header and report are separated by a newline, exactly as two prints would do.
print("\nClassification Report on Test Set:", report, sep="\n")

In [None]:
# Save the model
print("Saving model...")
# Create the target directory first: dump() does not create missing folders,
# and failing here would throw away the freshly trained model.
os.makedirs(model_path, exist_ok=True)
joblib.dump(crf, os.path.join(model_path, "crf_model.joblib"))

# Loss curve
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_title("Loss per epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()

# Try the model on a few hand-written sentences
test_sentences = [
    "Je pars de Paris et j'arrive à Marseille.",
    "Je vais à Bordeaux en partant de Toulouse.",
    "Mon trajet va de VILLIERS SUR LOIR à JARNY."
]

print("\nTesting on new sentences...")
for sentence in test_sentences:
    # NOTE(review): splitting on whitespace keeps punctuation and elisions
    # attached to the word ("Marseille.", "j'arrive") - confirm this matches
    # how the training tokens were produced.
    tokens = sentence.split()
    features = sent2features(tokens)
    # predict() takes a batch of sequences; wrap the single sentence and unwrap the result.
    prediction = crf.predict([features])[0]
    tagged_lines = [f" - {token}: {label}" for token, label in zip(tokens, prediction)]
    print(f"\nPhrase: {sentence}", *tagged_lines, sep="\n")
