In [1]:
from pathlib import Path
from medkit.core.text import TextDocument
from medkit.text.segmentation import SentenceTokenizer
from medkit.text.ner import RegexpMatcher, RegexpMatcherRule , RegexpMatcherNormalization
from medkit.text.context import NegationDetector, NegationDetectorRule
from medkit.text.segmentation import SyntagmaTokenizer
from medkit.text.context import FamilyDetector
from unidecode import unidecode
import os
import re
import pandas as pd
import random
pd.set_option('display.max_colwidth', None)

In [2]:
def preprocessing(text):
    """Normalise a raw clinical text before it is annotated.

    Steps, in this order:
      1. replace the literal sequences "n°" and "/d°" by "numero" and "deg";
      2. transliterate the result to ASCII with ``unidecode``;
      3. collapse every run of whitespace (newlines included) into one space.

    Returns the normalised string.
    """
    # Special sequences are rewritten first, before the ASCII conversion.
    # NOTE(review): '/d°' matches a literal slash followed by "d°"; if "a digit
    # followed by °" was meant the pattern would be r'\d°' — confirm the intent.
    substitutions = ((r'n°', 'numero'), (r'/d°', 'deg'))
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # ASCII transliteration, then whitespace normalisation.
    return re.sub(r'\s+', ' ', unidecode(text))

In [3]:
def statut_extraction_tabac(doc):
    """Derive the smoking status of a document from its "tabagisme" annotations.

    The attributes of every annotation labelled "tabagisme" are walked with a
    rolling position counter shared across annotations: it goes 1, 2, 3, and
    the attribute met while it is at 3 only resets it to 1 (that attribute is
    not read).  Each attribute that is read refreshes the last seen
    "is_negated" / "other_detected" value, and every time the counter reaches
    2 a vote is cast from the current values: nothing when other_detected is
    truthy, otherwise one negated or one affirmed mention.

    Returns "FUMEUR" when at least one affirmed mention was counted,
    "NON-FUMEUR" when only negated mentions were counted, "UNKNOWN" otherwise.
    """
    affirmed = 0
    negated = 0
    last_is_negated = False
    last_other_detected = False
    position = 0  # rolling index used to pick the attributes of interest

    for annotation in doc.anns:
        if annotation.label != "tabagisme":
            continue
        for attribute in annotation.attrs:
            if position == 3:
                # End of a cycle: restart the counter, this attribute is skipped.
                position = 1
                continue
            position += 1
            if attribute.label == "is_negated":
                last_is_negated = attribute.value
            elif attribute.label == "other_detected":
                last_other_detected = attribute.value

            # A mention is only counted on the second position of a cycle, and
            # only when it is not attributed to someone else.
            if position == 2 and not last_other_detected:
                if last_is_negated:
                    negated += 1
                else:
                    affirmed += 1

    # Any affirmed mention wins over negated ones.
    if affirmed > 0:
        return "FUMEUR"
    if negated > 0:
        return "NON-FUMEUR"
    return "UNKNOWN"

In [4]:
def statut_extraction_alcool(doc):
    """Derive the alcohol status of a document from its "alcool" annotations.

    The attributes of every annotation labelled "alcool" are walked with a
    rolling counter shared across annotations: it goes 1, 2, 3, and the
    attribute met while it is at 3 only resets it to 1 (that attribute is not
    read).  Each attribute that is read refreshes the last seen "is_negated" /
    "other_detected" value, and every time the counter reaches 2 a vote is
    cast: nothing when other_detected is truthy, otherwise one negated or one
    affirmed mention.

    Returns "ALCOOLIQUE" when at least one affirmed mention was counted,
    "NON-ALCOOLIQUE" when only negated mentions were counted, "UNKNOWN"
    otherwise.
    """
    n_oui = 0
    n_non = 0
    value_is_negated = False
    value_other_detected = False
    count = 0  # rolling index used to pick the attributes of interest

    for ann in doc.anns:
        # Fix: alcohol entities are created with the label "alcool" (see
        # regexp_rules_alcool); the previous test on "tabagisme" made this
        # function report the tobacco mentions instead.
        if ann.label != "alcool":
            continue
        for attr in ann.attrs:
            if count == 3:
                # End of a cycle: restart the counter, this attribute is skipped.
                count = 1
                continue
            count += 1
            if attr.label == "is_negated":
                value_is_negated = attr.value
            elif attr.label == "other_detected":
                value_other_detected = attr.value

            # Count the mention on the second position of a cycle, unless it
            # is attributed to someone other than the patient.
            if count == 2 and not value_other_detected:
                if value_is_negated:
                    n_non += 1
                else:
                    n_oui += 1

    # The status only depends on the final counters, so it is computed once.
    if n_oui > 0:
        return "ALCOOLIQUE"
    if n_non > 0:
        return "NON-ALCOOLIQUE"
    return "UNKNOWN"

In [5]:
def statut_extraction_situation_familiale(doc):
    """Derive the living situation of a document from its "situation" annotations.

    The attributes of every annotation labelled "situation" are walked with a
    rolling counter shared across annotations (1, 2, 3, then the next
    attribute only resets it to 1 and is not read).  Each attribute that is
    read refreshes the last seen "is_negated" / "other_detected" value.  When
    the counter is at 2 and other_detected is not set, the annotation text is
    normalised to "SEUL" / "PAS SEUL" and inverted if the mention is negated.
    The last annotation evaluated this way wins.

    Returns "SEUL", "PAS SEUL", "UNKNOWN" when nothing was evaluated, or the
    lower-cased annotation text when no normalisation pattern matches it.
    """
    # Ordered (pattern, normalised value) table; the first match wins.
    # Fix: the "pacse" pattern used to be r"\bpacse[e][s]?\b", which requires a
    # second "e" and therefore never normalised "pacse" / "pacses".
    normalisation = (
        (r"\bmarie[e]?\b", "PAS SEUL"),
        (r"\bcelibataire\b", "SEUL"),
        (r"\bdivorce[e]?\b", "SEUL"),
        (r"\bveuf\b", "SEUL"),
        (r"\bveuve\b", "SEUL"),
        (r"\bpacse[e]?[s]?\b", "PAS SEUL"),
        (r"\bconcubinage\b", "PAS SEUL"),
        (r"\b(vit|habite)\sseul(e)?\b", "SEUL"),
    )

    value_is_negated = False
    value_other_detected = False
    situation = "UNKNOWN"
    count = 0  # rolling index used to pick the attributes of interest

    for ann in doc.anns:
        if ann.label != "situation":
            continue
        for attr in ann.attrs:
            if count == 3:
                # End of a cycle: restart the counter, this attribute is skipped.
                count = 1
            else:
                count += 1
                if attr.label == "is_negated":
                    value_is_negated = attr.value
                elif attr.label == "other_detected":
                    value_other_detected = attr.value

            # Skip mentions that do not concern the patient, and only evaluate
            # on the second position of a cycle.
            if value_other_detected or count != 2:
                continue

            situation = ann.text.lower()
            for pattern, normalised in normalisation:
                if re.search(pattern, situation):
                    situation = normalised
                    break

            # A negated mention flips the normalised status.
            if value_is_negated:
                situation = "PAS SEUL" if situation == "SEUL" else "SEUL"

    return situation

In [6]:
def clinical_case_recovery(output_folder, option_melange):
    """Read every ``.txt`` file of a folder into a dictionary.

    Parameters
    ----------
    output_folder : path of the folder to scan (not recursive).
    option_melange : when truthy the file names are shuffled with
        ``random.shuffle`` (no seed is set here); otherwise they are sorted
        alphabetically.

    Returns
    -------
    dict mapping each file name to its text content, inserted in the order
    chosen above.
    """
    # Keep only the text files of the folder.
    txt_files = [f for f in os.listdir(output_folder) if f.endswith('.txt')]

    if option_melange:
        random.shuffle(txt_files)
    else:
        txt_files.sort()

    textes = {}  # {file name: clinical case text}
    for file_name in txt_files:
        file_path = os.path.join(output_folder, file_name)
        # Explicit encoding: the clinical cases contain accented characters and
        # the platform default encoding is not UTF-8 everywhere.
        with open(file_path, 'r', encoding='utf-8') as f:
            textes[file_name] = f.read()
    return textes

## LES NEG DETECTOR

In [7]:

def neg_detector_tabac():
    """Build the NegationDetector configured with the tobacco negation patterns.

    Every pattern becomes one NegationDetectorRule; the detector writes its
    result under the attribute label "is_negated".
    """
    # A few negation regexes
    patterns = (
        r"\bne\s*(semble|consomme|prend|fume)\s*pas",
        r"jamais",
        r"\bni\b",
        r"\bnon\s+\b",
        r"Tabac\s*[=:]?\s*0",
        r"(pas|ni|ou)\s+de\s+(consommation\s+de\s+)?taba",
        r"pas\s+d\'intoxication\s+tabagi",
        r"0 tabac",
    )
    rules = [NegationDetectorRule(regexp=pattern) for pattern in patterns]
    return NegationDetector(output_label="is_negated", rules=rules)

In [8]:
def neg_detector_alcool():
    """Build the NegationDetector configured with the alcohol negation patterns.

    Returns a NegationDetector whose result is written under the attribute
    label "is_negated".
    """
    # A few negation regexes
    neg_rules = [

        NegationDetectorRule(regexp=r"ne\s*boit\s*pas"),
        # Fix: whitespace is "\s"; the previous "/s*" matched a literal slash
        # followed by optional "s", so "ne consomme pas" never matched this rule.
        NegationDetectorRule(regexp=r"\bne\s*consomme\s*pas\b"),
        NegationDetectorRule(regexp=r"\bni\b"),
        NegationDetectorRule(regexp=r"\bpas\b"),
        NegationDetectorRule(regexp=r"\becarte\b"),
        # NOTE(review): "|" binds looser than "\b", so this reads as
        # (\bnulle) OR (negative\b); r"\b(nulle|negative)\b" may have been
        # intended — left unchanged pending confirmation.
        NegationDetectorRule(regexp=r"\bnulle|negative\b"),
        NegationDetectorRule(regexp=r"rarement|occasion"),

    ]
    neg_detector = NegationDetector(output_label="is_negated", rules=neg_rules)
    return neg_detector

In [9]:
def neg_detector_situation_familiale():
    """Build the NegationDetector configured with the family-situation negation patterns.

    Every pattern becomes one NegationDetectorRule; the detector writes its
    result under the attribute label "is_negated".
    """
    # A few negation regexes
    patterns = (
        r"\bn'est pas\b",
        r"\bne vit pas\b",
        r"\bn'habite pas\b",
        r"\bni\b",
    )
    rules = [NegationDetectorRule(regexp=pattern) for pattern in patterns]
    return NegationDetector(output_label="is_negated", rules=rules)

## REGEX

In [10]:
# Entity rules, one list per topic. The patterns are written without accents
# because extraction_finale runs preprocessing() (ASCII transliteration) on the
# text before matching.
regexp_rules_tabac = [
    RegexpMatcherRule(regexp=r"cigare(tte)?[s]?\b", label="tabagisme", exclusion_regexp="en bout de cigare"),
    # Fix: whitespace is "\s"; the previous "/s+" required literal slashes, so
    # this exclusion could never apply to "residus de fumee".
    RegexpMatcherRule(regexp=r"\bfume\b", label="tabagisme", exclusion_regexp=r"residu(s)?\s+de\s+fumee(s)?"),
    RegexpMatcherRule(regexp=r"taba(c|gisme|gique)[s]?", label="tabagisme"),
    RegexpMatcherRule(regexp=r"fumeur|fumeuse", label="tabagisme"),
    RegexpMatcherRule(regexp=r"fumait", label="tabagisme"),
    RegexpMatcherRule(regexp=r"nicotine", label="tabagisme"),
]
regexp_rules_alcool = [
    # Fix: "\s*" instead of "/s*" so that "acido alcoolo" / "acidoalcoolo"
    # (acid-fast staining vocabulary) is excluded as well as "acido-alcoolo".
    RegexpMatcherRule(regexp=r"alcool", label="alcool", exclusion_regexp=r"acido\s*alcoolo|acido-alcoolo"),
    RegexpMatcherRule(regexp=r"ethylisme|ethylique|ethylemie|ethylo", label="alcool"),
    RegexpMatcherRule(regexp=r"biere[s]?", label="alcool"),
]

regexp_rules_familial = [
    RegexpMatcherRule(regexp=r"\bmarie[e]?\b", label="situation"),
    RegexpMatcherRule(regexp=r"celibataire", label="situation"),
    RegexpMatcherRule(regexp=r"divorce[e]?", label="situation"),
    RegexpMatcherRule(regexp=r"veuf", label="situation"),
    RegexpMatcherRule(regexp=r"\bveuve\b", label="situation"),
    # NOTE(review): "[e]" is mandatory here, so only "pacsee"/"pacsees" match
    # and the masculine "pacse" is never detected; "[e]?" was probably
    # intended. Left unchanged because the normalisation in
    # statut_extraction_situation_familiale must accept the same forms.
    RegexpMatcherRule(regexp=r"\bpacse[e][s]?\b", label="situation"),
    RegexpMatcherRule(regexp=r"\bconcubinage\b", label="situation"),
    RegexpMatcherRule(regexp=r"\b(vit|habite)\sseul(e)?\b", label="situation"),
]

## EXTRACTION STATUTS

In [126]:
def extraction_finale(clinical_case_repo, option_melange):
    """Run the whole rule-based extraction on every clinical case of a folder.

    For each ``.txt`` file returned by ``clinical_case_recovery`` the text is
    preprocessed, split into sentences then syntagmas, annotated with the
    three negation detectors and the family detector, and matched against the
    tobacco / alcohol / family-situation rules.  The matched entities are
    added to the document and the three statuses are derived from them.

    Parameters
    ----------
    clinical_case_repo : folder containing the clinical cases.
    option_melange : forwarded to ``clinical_case_recovery`` (shuffle or sort
        the files).

    Returns
    -------
    (df, docs) : ``df`` has one row per file with the columns "nom fichier",
    "cas clinique", "tabagisme", "alcool", "situation"; ``docs`` is the list
    of annotated TextDocument objects, in the same order.
    """
    # The operations are configured identically for every file, so they are
    # built once here instead of once per document.
    sent_tokenizer = SentenceTokenizer(
        output_label="sentence",
        punct_chars=[".", "?", "!"],
    )
    synt_tokenizer = SyntagmaTokenizer(
        output_label="sentence",
        separators=[r"\bmais\b", r"\bet\b"],
    )
    # NOTE(review): the three detectors write the same "is_negated" label on
    # the same syntagmas; presumably each entity then carries all three values
    # regardless of its topic, which is what the positional counters of the
    # statut_extraction_* functions appear to work around — confirm.
    negation_detectors = [
        neg_detector_tabac(),
        neg_detector_alcool(),
        neg_detector_situation_familiale(),
    ]
    family_detector = FamilyDetector(output_label='other_detected')
    # Same order as before: tobacco, alcohol, family situation.
    matchers = [
        RegexpMatcher(rules=rules, attrs_to_copy=["is_negated", "other_detected"])
        for rules in (regexp_rules_tabac, regexp_rules_alcool, regexp_rules_familial)
    ]

    rows = []
    docs = []
    # {file name: clinical case}
    clinical_cases_dico = clinical_case_recovery(clinical_case_repo, option_melange)

    for fichier, clinical_case in clinical_cases_dico.items():
        clinical_case = preprocessing(clinical_case)  # text normalisation
        doc = TextDocument(text=clinical_case)

        # Sentences, then syntagmas
        sentences = sent_tokenizer.run([doc.raw_segment])
        syntagmas = synt_tokenizer.run(sentences)

        # Context attributes are attached to the syntagmas
        for detector in negation_detectors:
            detector.run(syntagmas)
        family_detector.run(syntagmas)

        # Entity creation; the context attributes are copied onto the entities
        for matcher in matchers:
            for entity in matcher.run(syntagmas):
                doc.anns.add(entity)

        docs.append(doc)

        tabagisme = statut_extraction_tabac(doc)
        alcool = statut_extraction_alcool(doc)
        situation = statut_extraction_situation_familiale(doc)
        rows.append([fichier, clinical_case, tabagisme, alcool, situation])

    df = pd.DataFrame(rows, columns=["nom fichier", "cas clinique", "tabagisme", "alcool", "situation"])

    return df, docs

## PREPARATION ANNOTATION TABAC

## On fait tourner le code sur ces fichiers uniquement

In [129]:
# Run the pipeline on the tobacco annotation folder, files in alphabetical order.
df, docs_medkit = extraction_finale(
    clinical_case_repo="51_fichiers_annotation_tabac",
    option_melange=False,
)
print("fin")

fin


In [132]:
# Distribution of the predicted smoking statuses over the processed files.
df['tabagisme'].value_counts()

tabagisme
FUMEUR        24
UNKNOWN       17
NON-FUMEUR    10
Name: count, dtype: int64

In [80]:
## TEST TO UNDERSTAND HOW THE EVALUATION WORKS
from medkit.core.text import TextDocument, Entity, Span
from medkit.text.metrics.ner import SeqEvalEvaluator

# Gold document: one sentence carrying two "tabagisme" entities.
# NOTE(review): in "Il fume la cigarette" the word "fume" sits at offsets 3-7
# and "cigarette" at 11-20, so the spans (and the text "fumee") below do not
# line up with the sentence — presumably deliberate for the experiment; confirm.
document = TextDocument("Il fume la cigarette", 
                        anns = [Entity(label="tabagisme",spans=[Span(4,8)],text="fumee"),
                                Entity(label="tabagisme",spans=[Span(12,20)],text="cigarette")])

# Predictions: same "cigarette" span as the gold one, but Span(4,7) instead of
# the gold Span(4,8) for the other entity.
pred_ents = [Entity(label="tabagisme",spans=[Span(12,20)],text="cigarette"),
            Entity(label="tabagisme",spans=[Span(4,7)],text="fume")]

# define an evaluator using `iob2` as tagging scheme
evaluator = SeqEvalEvaluator(tagging_scheme="iob2")
metrics = evaluator.compute(documents=[document], predicted_entities=[pred_ents])
print(metrics)

{'overall_precision': 0.5, 'overall_recall': 0.5, 'overall_f1-score': 0.5, 'overall_support': 2, 'overall_acc': 0.95, 'tabagisme_precision': 0.5, 'tabagisme_recall': 0.5, 'tabagisme_f1-score': 0.5, 'tabagisme_support': 2}


In [134]:
## PREPROCESSING OF THE BRAT FILES
import os

# Folder containing the brat .txt files.
# Fix: this assignment was commented out, so the cell only worked when
# `dossier` was left over in the kernel from an earlier run.
# NOTE(review): absolute, machine-specific path (same folder as the brat
# loading cell below) — adapt it or move it to a configuration cell.
dossier = "/home/mhassani/Documents/Stage/env/brat-master/data/tabac_test"

# WARNING: every .txt file is overwritten in place with its preprocessed text.
for filename in os.listdir(dossier):
    if filename.endswith(".txt"):
        # Full path of the file
        fichier = os.path.join(dossier, filename)

        # Read the content (explicit encoding: the raw texts contain accents)
        with open(fichier, "r", encoding="utf-8") as file:
            contenu = file.read()

        # Apply the same preprocessing as the medkit pipeline
        contenu_preprocessed = preprocessing(contenu)

        # Write the preprocessed content back to the same file
        with open(fichier, "w", encoding="utf-8") as file:
            file.write(contenu_preprocessed)
print("fin")

fin


## EVALUATION: COMPARAISON ENTRE ANNOTATION BRAT ET MEDKIT

In [135]:
# Load the brat annotations (gold standard)

from medkit.io.brat import BratInputConverter

# Define the input converter
brat_converter = BratInputConverter()

# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run
# elsewhere without editing it.
path= "/home/mhassani/Documents/Stage/env/brat-master/data/tabac_test"

# Load brat into a list of documents
docs_brat = brat_converter.load(dir_path = path)
len(docs_brat)

51

In [136]:
from medkit.core.text import Span
from medkit.core.text import Entity

def convert_to_pred_ents(docs_medkit):
    """Rebuild, for each document, a list of bare Entity objects.

    Every annotation of ``doc.anns`` is copied into a new Entity that keeps
    only its label, its spans (rebuilt as new Span objects) and its text.

    Returns a list with one list of entities per input document, in the same
    order as ``docs_medkit``.
    """
    return [
        [
            Entity(
                label=annotation.label,
                spans=[Span(start=span.start, end=span.end) for span in annotation.spans],
                text=annotation.text,
            )
            for annotation in doc.anns
        ]
        for doc in docs_medkit
    ]

In [137]:
# Entities predicted by the medkit pipeline, one list per document.
pred_ents = convert_to_pred_ents(docs_medkit)
# NOTE(review): `pred` (same conversion applied to the brat documents) is not
# used by any later visible cell.
pred = convert_to_pred_ents(docs_brat)

In [138]:
from medkit.core.text import TextDocument, Entity, Span
from medkit.text.metrics.ner import SeqEvalEvaluator

# define an evaluator using `iob2` as tagging scheme
evaluator = SeqEvalEvaluator(tagging_scheme="iob2")
# Gold = brat documents, predictions = medkit entities.
# NOTE(review): presumably the two lists are paired by position, so both must
# list the files in the same order — verify.
metrics = evaluator.compute(documents=docs_brat, predicted_entities=pred_ents)

In [140]:
# Print every metric of the last evaluation, one "name: value" per line.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.9344262295081968
overall_recall: 1.0
overall_f1-score: 0.9661016949152543
overall_support: 57
overall_acc: 0.9998264757900357
alcool_precision: 0.8636363636363636
alcool_recall: 1.0
alcool_f1-score: 0.9268292682926829
alcool_support: 19
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 1
tabagisme_precision: 1.0
tabagisme_recall: 1.0
tabagisme_f1-score: 1.0
tabagisme_support: 37


In [139]:
from medkit.core.text import TextDocument, Entity, Span
# Number of (gold, predicted) pairs that compare unequal.
count=0
# NOTE(review): documents and entities are paired by position with zip(); the
# saved output shows pairs that differ only because the same entities come in a
# different order ("fumait"/"cigarettes"), so a mismatch here is not
# necessarily a prediction error.
for gold_doc, predicted_doc in zip(docs_brat, docs_medkit):
    gold_entities = gold_doc.anns
    predicted_entities = predicted_doc.anns
    for gold_entity, predicted_entity in zip(gold_entities, predicted_entities):
        if gold_entity != predicted_entity:
            count+=1
            if gold_entity.spans!=predicted_entity.spans:
                # Text lengths are printed to check that the brat text and the
                # medkit text are aligned (same length before/after preprocessing).
                test = preprocessing(gold_doc.text)
                print(f"taille texte brat:{len(gold_doc.text)}")
                print(f"taille texte medkit:{len(predicted_doc.text)}")
                print(f"taille texte brat après prétraitement:{len(test)}")
                print(gold_entity.spans)
                print(predicted_entity.spans)
                print(gold_entity.text)
                print(predicted_entity.text)
            # Disabled alternative condition (text mismatch); the two prints
            # below therefore belong to the span-mismatch branch above.
            #if gold_entity.text!=predicted_entity.text:
                print(gold_doc.metadata["path_to_text"])
                print("\n\n")

taille texte brat:3476
taille texte medkit:3476
taille texte brat après prétraitement:3476
[Span(start=375, end=381)]
[Span(start=413, end=423)]
fumait
cigarettes
/home/mhassani/Documents/Stage/env/brat-master/data/tabac_test/filepdf-193-cas.txt



taille texte brat:3476
taille texte medkit:3476
taille texte brat après prétraitement:3476
[Span(start=413, end=423)]
[Span(start=375, end=381)]
cigarettes
fumait
/home/mhassani/Documents/Stage/env/brat-master/data/tabac_test/filepdf-193-cas.txt



taille texte brat:3162
taille texte medkit:3162
taille texte brat après prétraitement:3162
[Span(start=22, end=28)]
[Span(start=140, end=146)]
ethylo
alcool
/home/mhassani/Documents/Stage/env/brat-master/data/tabac_test/filepdf-474-cas.txt



taille texte brat:3162
taille texte medkit:3162
taille texte brat après prétraitement:3162
[Span(start=140, end=146)]
[Span(start=22, end=28)]
alcool
ethylo
/home/mhassani/Documents/Stage/env/brat-master/data/tabac_test/filepdf-474-cas.txt





In [None]:
# Leftover scratch line, disabled: `preprocessing(text):` is a syntax error
# (trailing colon) and no top-level `text` is defined in the visible cells.
# preprocessing(text)


In [159]:
from medkit.core.text import TextDocument, Entity, Span

# Fix: the counter is initialised here; the cell used to rely on an `i` left in
# the kernel by another cell.
i = 0
is_negated_medkit = False
is_negated_brat = False

# Compare the negation attributes of the gold (brat) and predicted (medkit)
# entities. Documents, entities and attributes are all paired by position.
for gold_doc, predicted_doc in zip(docs_brat, docs_medkit):
    gold_entities = gold_doc.anns
    predicted_entities = predicted_doc.anns

    for gold_entity, predicted_entity in zip(gold_entities, predicted_entities):
        if gold_entity != predicted_entity:
            i += 1

            for attr_medkit, attr_gold in zip(predicted_entity.attrs, gold_entity.attrs):
                if predicted_entity.label == "tabagisme":
                    # NOTE(review): selects the medkit attribute by its rule
                    # metadata ({'rule_id': 2}) — confirm this identifies the
                    # intended negation rule.
                    if attr_medkit.metadata == {'rule_id': 2}:
                        # Fix: `attr` was undefined in this cell; the value
                        # comes from the medkit attribute being inspected.
                        is_negated_medkit = attr_medkit.value
                    if attr_gold.label == "is_negated":
                        is_negated_brat = attr_gold.value

            # Report the documents whose gold negation value is set.
            if is_negated_brat is not None:
                print("OUI")
                print(is_negated_brat)
                print(gold_doc)


In [110]:
# Duplicate of the metric-printing cell; the saved output comes from an earlier run.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.6721311475409836
overall_recall: 0.7192982456140351
overall_f1-score: 0.6949152542372881
overall_support: 57
overall_acc: 0.9994201468961197
alcool_precision: 0.5454545454545454
alcool_recall: 0.631578947368421
alcool_f1-score: 0.5853658536585366
alcool_support: 19
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 1
tabagisme_precision: 0.7567567567567568
tabagisme_recall: 0.7567567567567568
tabagisme_f1-score: 0.7567567567567567
tabagisme_support: 37


In [35]:
# Duplicate of the metric-printing cell; the saved output comes from an earlier run.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.6557377049180327
overall_recall: 0.7142857142857143
overall_f1-score: 0.6837606837606838
overall_support: 56
overall_acc: 0.9993801570268865
alcool_precision: 0.5
alcool_recall: 0.6111111111111112
alcool_f1-score: 0.55
alcool_support: 18
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 1
tabagisme_precision: 0.7567567567567568
tabagisme_recall: 0.7567567567567568
tabagisme_f1-score: 0.7567567567567567
tabagisme_support: 37


In [19]:
# Duplicate of the metric-printing cell; the saved output comes from an earlier run.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.6
overall_recall: 0.6428571428571429
overall_f1-score: 0.6206896551724138
overall_support: 56
overall_acc: 0.9992468574627761
alcool_precision: 0.3333333333333333
alcool_recall: 0.3888888888888889
alcool_f1-score: 0.358974358974359
alcool_support: 18
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 1
tabagisme_precision: 0.7567567567567568
tabagisme_recall: 0.7567567567567568
tabagisme_f1-score: 0.7567567567567567
tabagisme_support: 37


In [67]:
# Duplicate of the metric-printing cell; the saved output comes from an earlier run.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.7115384615384616
overall_recall: 0.6607142857142857
overall_f1-score: 0.6851851851851851
overall_support: 56
overall_acc: 0.9991802076807209
alcool_precision: 0.6153846153846154
alcool_recall: 0.4444444444444444
alcool_f1-score: 0.5161290322580646
alcool_support: 18
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 1
tabagisme_precision: 0.7567567567567568
tabagisme_recall: 0.7567567567567568
tabagisme_f1-score: 0.7567567567567567
tabagisme_support: 37


## PREPARATION ANNOTATION ALCOOL

In [35]:
# Run the pipeline on "clinical_case2" with the files shuffled.
# NOTE(review): no random seed is set in the visible cells, so the document
# order changes from one run to the next.
df, docs_medkit = extraction_finale(
    clinical_case_repo="clinical_case2",
    option_melange=True,
)
print("fin")

fin


In [37]:
# NOTE(review): this re-runs the tobacco folder inside the alcohol section,
# while the gold annotations of this section are loaded from
# "51_fichiers_annotation_alcool" — confirm which folder is intended.
df, docs_medkit = extraction_finale(
    clinical_case_repo="51_fichiers_annotation_tabac",
    option_melange=False,
)
print("fin")

fin


In [38]:
# Distribution of the predicted smoking statuses over the processed files.
df["tabagisme"].value_counts()

tabagisme
FUMEUR        24
UNKNOWN       17
NON-FUMEUR    10
Name: count, dtype: int64

## EVALUATION: COMPARAISON ENTRE ANNOTATION BRAT ET MEDKIT

In [17]:
# Load the brat annotations (gold standard)

from medkit.io.brat import BratInputConverter

# Define the input converter
brat_converter = BratInputConverter()

# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run
# elsewhere without editing it.
path= "/home/mhassani/Documents/Stage/env/brat-master/data/51_fichiers_annotation_alcool"

# Load brat into a list of documents
docs_brat = brat_converter.load(dir_path = path)
len(docs_brat)

51

In [18]:
# Number of brat documents loaded (same check as at the end of the previous cell).
len(docs_brat)

51

In [19]:
# Entities predicted by the medkit pipeline, one list per document.
pred_ents = convert_to_pred_ents(docs_medkit)
# NOTE(review): `pred` (same conversion applied to the brat documents) is not
# used by any later visible cell.
pred = convert_to_pred_ents(docs_brat)

In [20]:
from medkit.core.text import TextDocument, Entity, Span
from medkit.text.metrics.ner import SeqEvalEvaluator

# define an evaluator using `iob2` as tagging scheme
evaluator = SeqEvalEvaluator(tagging_scheme="iob2")
# Gold = brat documents, predictions = medkit entities.
# NOTE(review): presumably the two lists are paired by position, so both must
# come from the same folder and list the files in the same order — verify.
metrics = evaluator.compute(documents=docs_brat, predicted_entities=pred_ents)

In [21]:
# Print every metric of the last evaluation, one "name: value" per line.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.47435897435897434
overall_recall: 0.5873015873015873
overall_f1-score: 0.524822695035461
overall_support: 63
overall_acc: 0.9980194492929149
alcool_precision: 0.43103448275862066
alcool_recall: 0.5434782608695652
alcool_f1-score: 0.4807692307692307
alcool_support: 46
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 3
tabagisme_precision: 0.6428571428571429
tabagisme_recall: 0.6428571428571429
tabagisme_f1-score: 0.6428571428571429
tabagisme_support: 14


In [66]:
from medkit.core.text import TextDocument, Entity, Span
# Number of (gold, predicted) entity pairs that compare unequal.
i=0
# Compare the entities in the documents (documents and entities are paired by position)
for gold_doc, predicted_doc in zip(docs_brat, docs_medkit):
    gold_entities = gold_doc.anns
    predicted_entities = predicted_doc.anns

    for gold_entity, predicted_entity in zip(gold_entities, predicted_entities):
        # NOTE(review): the saved result (63) equals overall_support in the
        # metrics above, which suggests every zipped pair compares unequal —
        # check whether `!=` is a meaningful comparison for these objects.
        if gold_entity != predicted_entity:
            i+=1
print(i)

63


In [41]:
# Duplicate of the metric-printing cell.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.47435897435897434
overall_recall: 0.5873015873015873
overall_f1-score: 0.524822695035461
overall_support: 63
overall_acc: 0.9980194492929149
alcool_precision: 0.43103448275862066
alcool_recall: 0.5434782608695652
alcool_f1-score: 0.4807692307692307
alcool_support: 46
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 3
tabagisme_precision: 0.6428571428571429
tabagisme_recall: 0.6428571428571429
tabagisme_f1-score: 0.6428571428571429
tabagisme_support: 14


In [29]:
# Duplicate of the metric-printing cell; the saved output comes from an earlier run.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.3888888888888889
overall_recall: 0.42424242424242425
overall_f1-score: 0.40579710144927533
overall_support: 66
overall_acc: 0.9976774837032024
alcool_precision: 0.3018867924528302
alcool_recall: 0.32653061224489793
alcool_f1-score: 0.31372549019607837
alcool_support: 49
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 3
tabagisme_precision: 0.6923076923076923
tabagisme_recall: 0.6428571428571429
tabagisme_f1-score: 0.6666666666666666
tabagisme_support: 14


In [58]:
# Duplicate of the metric-printing cell; the saved output comes from an earlier run.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.5636363636363636
overall_recall: 0.4696969696969697
overall_f1-score: 0.5123966942148761
overall_support: 66
overall_acc: 0.9979125850461298
alcool_precision: 0.5405405405405406
alcool_recall: 0.40816326530612246
alcool_f1-score: 0.46511627906976744
alcool_support: 49
situation_precision: 0.5
situation_recall: 1.0
situation_f1-score: 0.6666666666666666
situation_support: 3
tabagisme_precision: 0.6666666666666666
tabagisme_recall: 0.5714285714285714
tabagisme_f1-score: 0.6153846153846153
tabagisme_support: 14


## PREPARATION ANNOTATION SITUATION FAMILIALE

## On fait tourner le code sur ces fichiers uniquement 

In [80]:
# Run the pipeline on the family-situation annotation folder, files in alphabetical order.
df, docs_medkit = extraction_finale(
    clinical_case_repo="51_fichiers_annotation_situation",
    option_melange=False,
)
print("fin")

fin


## EVALUATION: COMPARAISON ENTRE ANNOTATION BRAT ET MEDKIT

In [81]:
# Load the brat annotations (gold standard)

from medkit.io.brat import BratInputConverter

# Define the input converter
brat_converter = BratInputConverter()

# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run
# elsewhere without editing it.
path= "/home/mhassani/Documents/Stage/env/brat-master/data/51_fichiers_annotation_situation"

# Load brat into a list of documents
docs_brat = brat_converter.load(dir_path = path)
len(docs_brat)

51

In [82]:
# Entities predicted by the medkit pipeline, one list per document.
pred_ents = convert_to_pred_ents(docs_medkit)
# NOTE(review): `pred` (same conversion applied to the brat documents) is not
# used by any later visible cell.
pred = convert_to_pred_ents(docs_brat)

In [83]:
from medkit.core.text import TextDocument, Entity, Span
from medkit.text.metrics.ner import SeqEvalEvaluator

# define an evaluator using `iob2` as tagging scheme
evaluator = SeqEvalEvaluator(tagging_scheme="iob2")
# Gold = brat documents, predictions = medkit entities.
# NOTE(review): presumably the two lists are paired by position, so both must
# list the files in the same order — verify.
metrics = evaluator.compute(documents=docs_brat, predicted_entities=pred_ents)

In [84]:
# Print every metric of the last evaluation, one "name: value" per line.
for metric in metrics:
    value = metrics[metric]
    print(f"{metric}: {value}")

overall_precision: 0.5833333333333334
overall_recall: 0.7
overall_f1-score: 0.6363636363636365
overall_support: 40
overall_acc: 0.9988566528512219
alcool_precision: 0.3157894736842105
alcool_recall: 0.375
alcool_f1-score: 0.34285714285714286
alcool_support: 16
situation_precision: 0.7391304347826086
situation_recall: 0.9444444444444444
situation_f1-score: 0.8292682926829269
situation_support: 18
tabagisme_precision: 0.8333333333333334
tabagisme_recall: 0.8333333333333334
tabagisme_f1-score: 0.8333333333333334
tabagisme_support: 6


In [85]:
from medkit.core.text import TextDocument, Entity, Span
# Number of (gold, predicted) entity pairs that compare unequal.
i=0
# NOTE(review): the three label_* counters are initialised but never updated
# in this cell.
label_tabac=0
label_alcool=0
label_situation=0
# 1-based index of the document being compared, printed as a progress marker.
num_fichier=1
# Compare the entities in the documents (documents and entities are paired by position)
for gold_doc, predicted_doc in zip(docs_brat, docs_medkit):
    print(num_fichier)
    num_fichier+=1
    gold_entities = gold_doc.anns
    predicted_entities = predicted_doc.anns

    for gold_entity, predicted_entity in zip(gold_entities, predicted_entities):
        if gold_entity != predicted_entity:
            i+=1
            # Dump the whole gold document and both entities when the labels
            # of a positional pair differ.
            if gold_entity.label != predicted_entity.label:
                print("FAUX")
                print(gold_doc)
                print(gold_entity)
                print(predicted_entity)
                print("\n")


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
FAUX
TextDocument(uid='14cdf40d-1a3a-11ee-ac6e-db8a74a26856', anns=TextAnnotationContainer(doc_id='14cdf40d-1a3a-11ee-ac6e-db8a74a26856', anns=[Entity(uid='14cdf408-1a3a-11ee-ac6e-db8a74a26856', label='situation', attrs=EntityAttributeContainer(ann_id='14cdf408-1a3a-11ee-ac6e-db8a74a26856', attrs=[]), metadata={'brat_id': 'T1'}, keys=set(), spans=[Span(start=16, end=21)], text='marié'), Entity(uid='14cdf409-1a3a-11ee-ac6e-db8a74a26856', label='alcool', attrs=EntityAttributeContainer(ann_id='14cdf409-1a3a-11ee-ac6e-db8a74a26856', attrs=[]), metadata={'brat_id': 'T2'}, keys=set(), spans=[Span(start=116, end=122)], text='alcool'), Entity(uid='14cdf40a-1a3a-11ee-ac6e-db8a74a26856', label='tabagisme', attrs=EntityAttributeContainer(ann_id='14cdf40a-1a3a-11ee-ac6e-db8a74a26856', attrs=[]), metadata={'brat_id': 'T3'}, keys=set(), spans=[Span(start=124, end=133)], text='tabagique'), Entity(uid='14cdf40b-1a3a-11