In [57]:
import spacy
from spacy.tokens import DocBin
import ast
import os

In [34]:
import pandas as pd
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

# Load the training and evaluation datasets from their CSV files
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train_2.csv', 'test.csv'))

def prepare_data(df, entity_labels=("person", "content")):
    """Convert an annotated DataFrame into a spaCy ``DocBin`` of NER examples.

    Each row must provide:
      * ``sentence`` - the raw text,
      * ``words``    - the string representation of a list of words,
      * ``labels``   - the string representation of a parallel list of labels.

    Every word whose label is in ``entity_labels`` becomes one entity span
    (one span per word) on the document built from ``sentence``.

    Args:
        df: DataFrame with the ``sentence``, ``words`` and ``labels`` columns.
        entity_labels: Labels that are turned into entity spans. Words with
            any other label are only used to keep track of the position in
            the sentence.

    Returns:
        A ``DocBin`` containing one annotated ``Doc`` per row of ``df``.

    Raises:
        ValueError / SyntaxError: if a ``words`` or ``labels`` cell is not a
            plain Python literal.
    """
    db = DocBin()  # Container for the annotated documents
    nlp = spacy.blank("en")  # Blank pipeline, only used for tokenization

    for _, row in df.iterrows():
        sentence = row['sentence']
        doc = nlp.make_doc(sentence)
        ents = []
        covered_chars = set()  # Character offsets already claimed by an entity
        cursor = 0  # Where the search for the next word starts

        # literal_eval only accepts Python literals, so a crafted CSV cell
        # cannot execute arbitrary code the way eval() would.
        words = ast.literal_eval(row['words'])
        labels = ast.literal_eval(row['labels'])

        for word, label in zip(words, labels):
            if not word:
                continue  # An empty word cannot be located in the sentence

            # Search forward from the previous match so that a repeated word
            # (e.g. a second "and") or a word that is a substring of an
            # earlier one maps to its own occurrence, not the first one.
            start = sentence.find(word, cursor)
            if start != -1:
                cursor = start + len(word)
            else:
                # Words may not be listed in sentence order: fall back to a
                # search over the whole sentence without moving the cursor.
                start = sentence.find(word)
                if start == -1:
                    continue  # Word is absent from the sentence: skip it
            end = start + len(word)

            if label not in entity_labels:
                continue

            # Skip spans overlapping one that was already added
            if covered_chars.isdisjoint(range(start, end)):
                # char_span returns None when the offsets do not line up with
                # token boundaries; such words are silently dropped.
                span = doc.char_span(start, end, label=label)
                if span is not None:
                    ents.append(span)
                    covered_chars.update(range(start, end))

        doc.ents = ents
        db.add(doc)
    return db


# Build the training and test DocBins from the two DataFrames
train_data = prepare_data(train_df)
test_data = prepare_data(test_df)

# Write the prepared data to disk so the spaCy CLI can read it
train_data.to_disk("train.spacy")
test_data.to_disk("test.spacy")

# Load the training configuration
# NOTE(review): `config` is not referenced again in the visible notebook; the
# CLI call below reads config.cfg by itself.
config = spacy.util.load_config("config.cfg")

# Train the model
# NOTE(review): the test set is passed as the dev set (--paths.dev). Later cells
# evaluate ./output/model-best on the same test.csv, so those scores are
# presumably optimistic - consider a separate validation split.
!spacy train config.cfg --output ./output --paths.train train.spacy --paths.dev test.spacy

# NOTE(review): printed unconditionally, even if the training command above
# failed or was interrupted.
print("finished")

^C
finished


In [35]:
def format_api_call(text, nlp_model):
    """Build a ``send_message`` API payload from the entities found in ``text``.

    Args:
        text: Sentence to analyse.
        nlp_model: Callable returning an object whose ``ents`` items expose
            ``label_`` and ``text`` (e.g. a loaded spaCy pipeline).

    Returns:
        A dict with the keys ``job`` (always ``"send_message"``), ``receiver``
        (the "person" entity texts joined by spaces) and ``content`` (the
        "content" entity texts joined by spaces). Entities carrying any other
        label are ignored.
    """
    parsed = nlp_model(text)

    # One bucket per label of interest, filled in document order
    texts_by_label = {"person": [], "content": []}
    for entity in parsed.ents:
        bucket = texts_by_label.get(entity.label_)
        if bucket is not None:
            bucket.append(entity.text)

    return {
        "job": "send_message",
        "receiver": " ".join(texts_by_label["person"]),
        "content": " ".join(texts_by_label["content"]),
    }

# Example usage: load the trained pipeline and build the API call for one sentence.
# `api_call_json` is reused by a later cell that calls send_message(**api_call_json).
nlp_model = spacy.load("./output/model-best")  # Load your trained model
sentence = "Ask the python teacher when is the next class?"
api_call_json = format_api_call(sentence, nlp_model)
print(api_call_json)


{'job': 'send_message', 'receiver': 'the python teacher', 'content': 'when is next class'}


In [36]:
from spacy.tokens import DocBin
# Load the trained pipeline; only its vocab is needed to deserialize the docs
nlp_trained = spacy.load("./output/model-best")

# Read back the serialized test documents
test_db = DocBin().from_disk("./test.spacy")
test_docs = list(test_db.get_docs(nlp_trained.vocab))

# Print each stored document with its entities. The model is never applied to
# the docs here, so these are the annotations saved in test.spacy.
for doc in test_docs:
    report = [f"Text: {doc.text}"]
    report.extend(f"  {ent.text} ({ent.label_})" for ent in doc.ents)
    report.append("---")
    print("\n".join(report))


Text: Create a 6 pm note in Notas.
---
Text: Can you create a 8pm note?
---
Text: Can you create a household note for María?
  María (person)
---
Text: Crea a note for Bob.
  Bob (person)
---
Text: Nota con reminder crear.
---
Text: Create a memo saying Mitbringbuffet am Mittwoch.
  Mitbringbuffet (content)
  am (content)
  Mittwoch (content)
---
Text: Make a note for Mama on my iPhone.
  Mama (person)
---
Text: I need make a reminder to get kid son aus dem Kindergarten.
  get (content)
  kid (content)
  son (content)
  aus (content)
  dem (content)
  Kindergarten (content)
---
Text: Take a note for the oficina group, please.
---
Text: Can you create a memo with a Erinnerung?
---
Text: Nota buy milk and eggs crear.
  buy (content)
  milk (content)
  and (content)
  eggs (content)
---
Text: Por favor, create a note for me.
---
Text: Schreib ein Memo Love and other drugs.
  Love (content)
  and (content)
  other (content)
  drugs (content)
---
Text: Make a note for François on my Google 

In [37]:
import pandas as pd

# Load the test.csv dataset to inspect its structure
test_csv_path = 'test.csv'
test_df = pd.read_csv(test_csv_path)

# Display the first few rows of the dataframe to understand its structure
# NOTE(review): this expression is not the last one of the cell, so its value
# is not rendered; move it to its own cell (or wrap it in display()) to see it.
test_df.head()

# Define a function to preprocess the test data from test.csv
def preprocess_test_data(df):
    """Extract the sentences and their gold (word, label) pairs from ``df``.

    Args:
        df: DataFrame with a ``sentence`` column and with ``words`` / ``labels``
            columns holding the string representation of parallel lists.

    Returns:
        A ``(sentences, entities_data)`` tuple: ``sentences`` is the list of
        raw sentences and ``entities_data`` holds, for each row, the list of
        ``(word, label)`` pairs whose label differs from ``0``.

    Raises:
        ValueError / SyntaxError: if a ``words`` or ``labels`` cell is not a
            plain Python literal.
    """
    sentences = df['sentence'].tolist()
    entities_data = []
    for _, row in df.iterrows():
        # The cells come from a CSV file: parse them with literal_eval, which
        # only accepts Python literals, instead of executing them with eval().
        words = ast.literal_eval(row['words'])
        labels = ast.literal_eval(row['labels'])
        # A label of 0 is assumed to mean "no entity"
        entities_data.append(
            [(word, label) for word, label in zip(words, labels) if label != 0]
        )
    return sentences, entities_data

# Preprocess the test data to extract the sentences and their gold entities
sentences, ground_truth_entities = preprocess_test_data(test_df)

# Show an example of the preprocessed data (first sentence and its gold
# entities); as the last expression of the cell, this tuple is what gets displayed
sentences[0], ground_truth_entities[0]


('Create a 6 pm note in Notas.', [])

In [38]:
import spacy

# Load the trained pipeline from disk
nlp_trained = spacy.load("./output/model-best")

# For every test sentence, collect the (text, label) pairs of the predicted entities
predicted_entities = []
for sentence in sentences:
    predicted_entities.append(
        [(ent.text, ent.label_) for ent in nlp_trained(sentence).ents]
    )


In [39]:
from collections import Counter

def calculate_metrics(predicted_entities, ground_truth_entities):
    """Compute micro-averaged precision, recall and F1 over all sentences.

    The two arguments are parallel sequences holding, per sentence, a list of
    hashable entity items (here ``(text, label)`` tuples). Items are compared
    as multisets, so a duplicated prediction only matches as many times as the
    item occurs in the ground truth.

    Returns:
        A ``(precision, recall, f1_score)`` tuple; each value is ``0`` when its
        denominator is zero.
    """
    tp = fp = fn = 0

    for guessed, gold in zip(predicted_entities, ground_truth_entities):
        guessed_counts, gold_counts = Counter(guessed), Counter(gold)
        tp += sum((guessed_counts & gold_counts).values())   # present in both
        fp += sum((guessed_counts - gold_counts).values())   # predicted only
        fn += sum((gold_counts - guessed_counts).values())   # gold only

    def _ratio(numerator, denominator):
        # Guard against empty denominators
        return numerator / denominator if denominator > 0 else 0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    if precision + recall > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

# Calculate the evaluation metrics by comparing, sentence by sentence, the
# predicted (text, label) pairs with the gold ones
precision, recall, f1_score = calculate_metrics(predicted_entities, ground_truth_entities)

# Print the three scores, one per line
print(f"Precision: {precision}\nRecall: {recall}\nF1 Score: {f1_score}")


Precision: 0.8300104931794333
Recall: 0.7427230046948357
F1 Score: 0.78394449950446


In [40]:
# Dump the gold (word, label) pairs of every test sentence.
# NOTE(review): this prints the whole list; consider slicing it to keep the output short.
print(ground_truth_entities)

[[], [], [('María', 'person')], [('Bob', 'person')], [], [('Mitbringbuffet', 'content'), ('am', 'content'), ('Mittwoch', 'content')], [('Mama', 'person')], [('get', 'content'), ('kid', 'content'), ('son', 'content'), ('aus', 'content'), ('dem', 'content'), ('Kindergarten', 'content')], [], [], [('buy', 'content'), ('milk', 'content'), ('and', 'content'), ('eggs', 'content')], [], [('Love', 'content'), ('and', 'content'), ('other', 'content'), ('drugs', 'content')], [('François', 'person')], [], [], [('reunión', 'content'), ('12:30', 'content')], [('Hélène', 'person'), ('Gérard', 'person')], [], [], [('mamá', 'person')], [('Roberto', 'person')], [('太郎', 'person')], [], [('Anna', 'person')], [], [('buy', 'content'), ('milk', 'content'), ('and', 'content'), ('eggs', 'content')], [], [('私', 'person')], [], [('パパ', 'person')], [('buy', 'content'), ('chocolate', 'content')], [], [], [], [], [], [], [('book', 'content'), ('the', 'content'), ('flight', 'content'), ('to', 'content'), ('Bordeaux

In [41]:
# Dump the predicted (text, label) pairs of every test sentence.
# NOTE(review): this prints the whole list; consider slicing it to keep the output short.
print(predicted_entities)

[[], [], [('María', 'person')], [('Bob', 'person')], [], [], [('Mama', 'person')], [('get', 'person'), ('kid', 'person'), ('son', 'content'), ('aus', 'content'), ('Kindergarten', 'content')], [], [], [('milk', 'content'), ('and', 'content'), ('eggs', 'content')], [], [('Love', 'content'), ('and', 'content'), ('other', 'content'), ('drugs', 'content')], [('François', 'person')], [], [], [('reunión', 'content'), ('12:30', 'content')], [('Hélène', 'person'), ('Gérard', 'person')], [], [], [('mamá', 'person')], [('Roberto', 'person')], [('太郎', 'person')], [], [('Anna', 'person')], [], [('buy', 'content'), ('milk', 'content'), ('and', 'content'), ('eggs', 'content')], [], [('私', 'person')], [], [], [('buy', 'content'), ('chocolate', 'content')], [], [], [], [], [], [], [('book', 'content'), ('the', 'content'), ('flight', 'content')], [], [], [], [], [('Bunny', 'person')], [], [], [], [('my', 'person'), ('Vater', 'person')], [('papá', 'person')], [('夜9時', 'person')], [], [('kaufen', 'content

-----

1. Développer un classifier requête -> tâche

In [42]:
def send_message(receiver, content, **kwargs):
    """Return the confirmation text for sending ``content`` to ``receiver``.

    Extra keyword arguments are accepted and ignored, so a full API-call dict
    (including keys such as ``job``) can be unpacked into the call.
    """
    confirmation = 'Sent to "{}" the message: "{}"'.format(receiver, content)
    return confirmation


def ask_RAG(question):
    """Return a two-line transcript: the question asked and a fixed placeholder reply."""
    asked_line = 'Asked to RAG: "{}"'.format(question)
    reply_line = "The RAG replied: \"I don't know\""
    return asked_line + "\n" + reply_line

In [43]:
# Unpack the API-call dict built earlier into send_message; its extra "job" key
# is absorbed by **kwargs
send_message(**api_call_json)

'Sent to "the python teacher" the message: "when is next class"'