In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

from myMongoClient import MyMongoClient, BANQUE_DOCUMENT_FOLDER
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. myMongoClient) are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
# Database name handed to MyMongoClient; its last six characters are parsed
# as the integer organisation id.
db = "documents-100039"
organisation_id = int(db[-6:])

# Client instance built for the database named above; dump_documents reads
# this module-level name.
mongo_client = MyMongoClient(db)

In [None]:
import spacy


# Load the "en_core_web_trf" SpaCy pipeline once at module level;
# extract_entities_spacy reads this global on every call.
nlp_spacy = spacy.load("en_core_web_trf")

# Function to extract entities from a given text using SpaCy
def extract_entities_spacy(text, split_paragraph):
    """Collect the entities found in ``text`` by the module-level ``nlp_spacy`` pipeline.

    Args:
        text: Raw text to analyse.
        split_paragraph: When truthy, ``text`` is cut on ``'\\n'`` and every
            non-empty piece is run through the pipeline separately; otherwise
            the whole text is processed in a single call (if it is non-empty).

    Returns:
        A list of ``(entity_text, label, label)`` tuples in the order the
        pipeline reports them. The label is stored twice in each tuple.
    """
    chunks = text.split('\n') if split_paragraph else [text]
    return [
        (ent.text, ent.label_, ent.label_)
        for chunk in chunks
        if chunk  # empty pieces are never sent to the pipeline
        for ent in nlp_spacy(chunk).ents
    ]


# Load the "flair/ner-english-ontonotes" sequence tagger once at module level;
# presumably meant to be passed as the `tagger` argument of extract_entities_flair.
tagger_flair_ontonote = SequenceTagger.load("flair/ner-english-ontonotes")

# Function to extract entities from a given text using Flair
def extract_entities_flair(text, tagger, split_paragraph = False):
    """Collect the 'ner' spans that ``tagger`` predicts on ``text``.

    Args:
        text: Raw text to analyse.
        tagger: Object whose ``predict(sentence)`` annotates a Flair
            ``Sentence`` in place.
        split_paragraph: When truthy, ``text`` is cut on ``'\\n'`` and every
            non-empty piece is tagged as its own ``Sentence``; otherwise the
            whole text is tagged at once (if it is non-empty).

    Returns:
        A list of ``(span_text, label_value)`` pairs, where ``label_value`` is
        the value of the first label attached to the span.
    """
    chunks = text.split('\n') if split_paragraph else [text]
    found = []
    for chunk in chunks:
        if not chunk:
            # Empty pieces are skipped without building a Sentence.
            continue
        sentence = Sentence(chunk)
        tagger.predict(sentence)
        for span in sentence.get_spans('ner'):
            found.append((span.text, span.get_labels()[0].value))
    return found

In [None]:
from myMongoClient import MyMongoClient, load, dump
# Gold annotations; dump_documents looks entries up as gold_labels['labels'][doc_id].
# NOTE(review): the .pkl extension suggests `load` unpickles the file — only use
# it on trusted files; confirm against myMongoClient.load.
gold_labels = load("data/labels_results/2024-02-02-gold_labels_filenames_clean.pkl")

In [None]:
def dump_documents(path) :
    """Extract entities for every document of the folder, persist them and return them.

    For each document returned by
    ``mongo_client.get_all_documents(BANQUE_DOCUMENT_FOLDER)`` the content is
    fetched and run through ``extract_entities_spacy`` twice: once split on
    newlines and once as a single text. To evaluate the Flair tagger instead,
    swap both calls for
    ``extract_entities_flair(doc_content, tagger_flair_ontonote, split_paragraph=...)``.

    Args:
        path: Destination handed to ``dump`` together with the result.

    Returns:
        The dict that was dumped, keyed by document ``_id``. Each value holds
        ``'doc_content'``, ``'entities'`` (newline-split extraction),
        ``'entities_full_doc'`` (single-pass extraction) and ``'gold_labels'``
        (the entry of ``gold_labels['labels']`` for that id, or ``{}`` when
        there is none).
    """
    labels_by_doc = gold_labels['labels']
    documents = {}
    for doc in mongo_client.get_all_documents(BANQUE_DOCUMENT_FOLDER):
        doc_id = doc['_id']
        doc_content = mongo_client.get_document_content(document_id=doc_id)
        documents[doc_id] = {
            'doc_content': doc_content,
            'entities': extract_entities_spacy(doc_content, split_paragraph=True),
            'entities_full_doc': extract_entities_spacy(doc_content, split_paragraph=False),
            'gold_labels': labels_by_doc[doc_id] if doc_id in labels_by_doc else {},
        }
    dump(documents, path)
    # Hand the dict back so callers can use it directly instead of getting None.
    return documents

In [None]:
path =  "data/labels_results/documents_spacy_entities.pkl"
# Rebuild and persist the per-document entity cache. To reuse an existing
# pickle instead of recomputing, replace the call with `documents = load(path)`.
documents = dump_documents(path)
if documents is None:
    # dump_documents may only write the file without handing the dict back;
    # read it back so `documents` is usable by the cells below.
    documents = load(path)

In [None]:
# Pick a single document by its id — presumably for ad-hoc inspection.
# Note that `doc` is rebound by the scoring loop further down.
doc = documents['NP7dYhy+sWTjSoVq5wi4SQ==']

In [None]:
from datetime import datetime 
def is_valid_date(date_string):
    """Normalise an ISO-format date/datetime string to ``'YYYY-MM-DD'``.

    Despite the ``is_`` prefix this does not return a plain boolean.

    Args:
        date_string: Text to hand to ``datetime.fromisoformat``.

    Returns:
        The date part formatted as ``'%Y-%m-%d'`` when parsing succeeds,
        ``False`` when a ``ValueError`` is raised.
    """
    try:
        parsed = datetime.fromisoformat(date_string)
        iso_day = parsed.strftime('%Y-%m-%d')
    except ValueError:
        # Not something fromisoformat accepts.
        return False
    return iso_day
    
def is_float(value):
    """Parse a number written with ``'`` separators and/or a decimal comma.

    Despite the ``is_`` prefix this does not return a plain boolean.

    Args:
        value: String to parse. Every ``'`` is removed and every ``,`` is
            turned into ``.`` before conversion.

    Returns:
        The parsed ``float`` on success, ``False`` when ``float()`` raises
        ``ValueError``. A successfully parsed ``0.0`` is falsy as well.
    """
    normalised = value.replace("'", "").replace(",", ".")
    try:
        return float(normalised)
    except ValueError:
        return False
    
def is_int(value):
    """Parse an integer that may be written with ``'`` separators.

    Despite the ``is_`` prefix this does not return a plain boolean.

    Args:
        value: String to parse. Every ``'`` is removed before conversion.

    Returns:
        The parsed ``int`` on success, ``False`` when ``int()`` raises
        ``ValueError``. A successfully parsed ``0`` is falsy as well.
    """
    digits = value.replace("'", "")
    try:
        parsed = int(digits)
    except ValueError:
        return False
    return parsed

def parse_values(value) :
    """Normalise a raw value so that equivalent spellings compare equal.

    The value is tried, in order, as an ISO date (``is_valid_date``), an
    integer (``is_int``) and a float (``is_float``); the first successful
    conversion wins.

    Args:
        value: Raw value, normally a string. Anything that is not a ``str`` is
            returned untouched, because the helpers only work on strings.

    Returns:
        A ``'YYYY-MM-DD'`` string, an ``int``, a ``float``, or ``value``
        itself when no conversion applies.
    """
    if not isinstance(value, str):
        # Already-typed values (int, float, None, ...) would make the string
        # helpers raise TypeError/AttributeError; keep them as they are.
        return value

    # The helpers signal failure with the literal False. Compare by identity
    # so that legitimately parsed zeros (0, 0.0) are not mistaken for failures.
    date = is_valid_date(value)
    if date is not False:
        return date

    int_val = is_int(value)
    if int_val is not False:
        return int_val

    float_val = is_float(value)
    if float_val is not False:
        return float_val

    return value

# Recall-style evaluation: for every gold label, check whether its normalised
# value appears among the normalised entity texts of the same document.
#   nb_value_checked[label]  -> number of gold values examined for that label
#   value_found_split[label] -> hits among the newline-split entities
#   value_found_doc[label]   -> hits among the single-pass entities
nb_value_checked, value_found_split, value_found_doc = {}, {}, {}
for doc_id, doc in documents.items():
    # Entity tuples start with the entity text but differ in length
    # (extract_entities_spacy yields 3-tuples, extract_entities_flair pairs),
    # so only the first element is unpacked and the rest is ignored.
    entities = {parse_values(ent) for ent, *_ in doc['entities']}
    entities_full_doc = {parse_values(ent) for ent, *_ in doc['entities_full_doc']}
    # These gold labels are left out of the evaluation.
    gold_values = [(gold_label, parse_values(gold_value)) for gold_label, gold_value in doc['gold_labels'].items()
                   if gold_label not in ['language', 'client', 'document type', 'immeuble']]

    for gold_label, gold_value in gold_values:
        value_split, value_doc = int(gold_value in entities), int(gold_value in entities_full_doc)
        value_found_split[gold_label] = value_found_split.get(gold_label, 0) + value_split
        value_found_doc[gold_label] = value_found_doc.get(gold_label, 0) + value_doc
        nb_value_checked[gold_label] = nb_value_checked.get(gold_label, 0) + 1

# Per-label hit rate; every key here was counted at least once above, so the
# denominators are never zero.
score_by_field_split = {key: val / nb_value_checked[key] for key, val in value_found_split.items()}
score_by_field_doc = {key: val / nb_value_checked[key] for key, val in value_found_doc.items()}

# Overall hit rate across all labels; report 0.0 instead of raising
# ZeroDivisionError when no gold value was checked at all.
total_checked = sum(nb_value_checked.values())
score_split = sum(value_found_split.values()) / total_checked if total_checked else 0.0
score_doc = sum(value_found_doc.values()) / total_checked if total_checked else 0.0

score_split, score_doc