In [21]:
# %pip install beautifulsoup4
# %pip install stop-words
# !python -m spacy download fr_core_news_md

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting fr-core-news-md==3.2.0
  Using cached https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.2.0/fr_core_news_md-3.2.0-py3-none-any.whl (46.9 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_md')
Note: you may need to restart the kernel to use updated packages.


In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import lru_cache
from pathlib import Path
from tqdm import tqdm
import re
import json
from bs4 import BeautifulSoup
from collections import OrderedDict
import nltk
import Levenshtein
from nltk.tokenize import word_tokenize
from stop_words import get_stop_words
stopwords = get_stop_words('french')
# from nltk.corpus import stopwords
# stopwords = stopwords.words('french')

In [176]:
def yield_lines(filepath):
    with Path(filepath).open('r') as f:
        for line in f:
            yield line.strip('\n').strip()
def read_lines(filepath):
    return ' '.join(yield_lines(filepath))



In [177]:
REPO_DIR = Path('..').resolve()
DATA_DIR = REPO_DIR / 'resources/dataset/FIXED_ANNOTATED_CLINICALCASES'

filepaths = sorted(list(DATA_DIR.glob('*.xml')))
print('Number of files: ', len(filepaths))

files = set()
annotators = set()
for filepath in filepaths:
    chunks = filepath.stem.split('-')
    if len(chunks) > 2:
        annotators.add(chunks[0])
        files.add(int(chunks[1]))


print(len(files))
print(annotators)
print(sorted(files))

Number of files:  407
100
{'ml', 'ngr', 'dm', 'cc', 'eb', 'jd', 'ak', 'ep', 'oi'}
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [178]:
# annotators annotated number of files
for annotator in annotators:
    print(annotator, len(set([filepath.stem for filepath in filepaths if filepath.stem.startswith(annotator)])))

ml 1
ngr 100
dm 1
cc 1
eb 1
jd 3
ak 100
ep 100
oi 100


In [179]:
filepath = filepaths[0]
filepath


PosixPath('/home/kim/Sync/workspace/Lille/LexSimMed/FIXED_ANNOTATED_CLINICALCASES/ak-1-content.xml')

In [180]:
with filepath.open('r') as f:
    soup = BeautifulSoup(f.read())
    

In [181]:
import spacy

def tokenize(text, lang='french'):
    return word_tokenize(text, language=lang)

@lru_cache(maxsize=1)
def get_spacy_model():
    model = 'fr_core_news_md'
    if not spacy.util.is_package(model):
        spacy.cli.download(model)
        spacy.cli.link(model, model, force=True,
                       model_path=spacy.util.get_package_path(model))
    return spacy.load(model)  # python -m spacy download en_core_web_sm`

def to_lemmas(word):
    nlp = get_spacy_model()
    doc = nlp(word.lower())
    lemmas = []
    for token in doc:
        lemmas.append(token.lemma_.strip("'"))
    return lemmas

@lru_cache(maxsize=4048)
def is_stopword(word):
    tokens = to_lemmas(word)
    return len([token for token in tokens if token not in stopwords]) == 0

# print(len(stopwords))
# print(stopwords)
print(is_stopword('avec eux'))
print(is_stopword('est une fille'))
print(is_stopword("n'est"))



True
False
True


## Process all files

In [191]:
@lru_cache(maxsize=1)
def get_sentence_segment_model():
    from spacy.lang.fr import French

    nlp_fr = French()  # just the language with no pipeline
    nlp_fr.add_pipe("sentencizer")
    return nlp_fr
    
LABELS = ['par-défaut', 'ne-connais-pas', 'pas-sur-de-comprendre']

def process(filepath):
    doc = []    
    with filepath.open('r') as f:
        soup = BeautifulSoup(f.read())
        for p in soup.find_all('text:p'):
            text = p.getText() 
            target_words = re.findall(r"\[(.*?)\]", text)
            
            # skip target text that contains only stopwords
            target_words = [f'{w}' for w in target_words if not is_stopword(w)]
            # print(target_words)

            word_dict = OrderedDict({word: LABELS[0] for word in target_words if word})
            # print(dict)
            # print(text)
            text_spans = p.find_all('text:span')
            for span in text_spans:
                label = span.get('text:style-name')
                # print(span.getText(), label)
                if label in LABELS:
                    target_text = span.getText()
                    difficult_words = re.findall(r"\[(.*?)\]", target_text)
                    for difficult_word in difficult_words:
                        # print(difficult_word)
                        word_dict[difficult_word] = label

            text = re.sub(r'[\[\]]', '', text) # remove brackets
            nlp_fr = get_sentence_segment_model()
            nlp_doc = nlp_fr(text)

            for word in word_dict:
                
                item = {}
                item['annotator_file'] = filepath.stem.replace('-content','')
                item['paragraph'] = text # remove brackets
                
                # add sentence-level text
                item['sentence'] = ''
                for sentence in nlp_doc.sents:
                    # print(word, " :: ", sentence, " : ", word in sentence.text)
                    if re.search(re.escape(word), sentence.text, re.IGNORECASE):
                        item['sentence'] = sentence.text
                        break

                item['target'] = word
                item['label'] = word_dict[word]

                doc.append(item)
    return doc
    

import multiprocessing

pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
mapped_values = list(tqdm(pool.imap(process, filepaths), total=len(filepaths)))

100%|██████████| 407/407 [00:33<00:00, 12.17it/s]


In [192]:
data = pd.DataFrame([value for values in mapped_values for value in values])
data = data[data['sentence'] != ''] # drop rows with empty sentence

In [193]:
data

Unnamed: 0,annotator_file,paragraph,sentence,target,label
0,ak-1,Il s'agit d'une femme de 32 ans d'origine ghan...,Il s'agit d'une femme de 32 ans d'origine ghan...,s'agit,par-défaut
1,ak-1,Il s'agit d'une femme de 32 ans d'origine ghan...,Il s'agit d'une femme de 32 ans d'origine ghan...,d'une femme,par-défaut
2,ak-1,Il s'agit d'une femme de 32 ans d'origine ghan...,Il s'agit d'une femme de 32 ans d'origine ghan...,de 32 ans,par-défaut
3,ak-1,Il s'agit d'une femme de 32 ans d'origine ghan...,Il s'agit d'une femme de 32 ans d'origine ghan...,d'origine ghanéenne enceinte de 14 semaines,par-défaut
4,ak-1,Il s'agit d'une femme de 32 ans d'origine ghan...,Elle présente des vomissements depuis le début...,présente,par-défaut
...,...,...,...,...,...
51858,oi-99,"De principe , un traitement antituberculeux pa...","De principe , un traitement antituberculeux pa...",a été instauré,par-défaut
51859,oi-99,"De principe , un traitement antituberculeux pa...","De principe , un traitement antituberculeux pa...",pour 2 mois,par-défaut
51860,oi-99,"De principe , un traitement antituberculeux pa...","De principe , un traitement antituberculeux pa...",relayé,par-défaut
51861,oi-99,"De principe , un traitement antituberculeux pa...","De principe , un traitement antituberculeux pa...",par une bithérapie,pas-sur-de-comprendre


In [194]:
data.to_csv( REPO_DIR / 'resources/dataset/data_all_annotators.csv', index=False)

### Merge all data based on annotators

In [195]:
data = pd.read_csv(REPO_DIR / 'resources/dataset/data_all_annotators.csv')

# data['label'] = data['label'].replace('pas-sur-de-comprendre', 'ne-connais-pas')

In [196]:
unique_words = data['target'].unique()
# unique_words = data[data['label'] == 'ne-connais-pas']['target'].unique()
unique_words = set([word.lower() for word in unique_words])
# pd.DataFrame(unique_words).to_csv('unique_words.csv', index=False)
pd.DataFrame(unique_words)


Unnamed: 0,0
0,la voix
1,les metabolites
2,à type d'incontinence urinaire
3,et de myopie
4,au total trois mois
...,...
9704,à 5 %
9705,avec étude histologique
9706,lors de cette perfusion
9707,"de la nicotine,"


In [197]:
from collections import Counter
from tqdm import tqdm
import multiprocessing

def data_selection(word):
    fd = data[data['target'].str.lower() == word]
    labels = fd['label'].to_list()
    
    # majority approach
    # counter = Counter(labels)
    # label = 'ne-connais-pas' if counter['ne-connais-pas'] >= counter['par-défaut'] else 'par-défaut'

    # at least one ne-connais-pas approach
    label = 'ne-connais-pas' if 'ne-connais-pas' in labels else 'par-défaut'

    d = fd.iloc[0]
    d['label'] = label
    return d

pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
new_data = list(tqdm(pool.imap(data_selection, unique_words), total=len(unique_words)))


new_data = pd.DataFrame(new_data)
new_data



100%|██████████| 9709/9709 [00:26<00:00, 359.71it/s]


Unnamed: 0,annotator_file,paragraph,sentence,target,label
8642,ak-57,"En novembre 2002 , étant donné la fréquence du...","Après trois mois de ce régime , on note une ne...",la voix,par-défaut
11890,ak-89,Une femme de 29 ans est victime d'agression se...,Du GHB est retrouvé dans l'urine au taux de 4 ...,les metabolites,ne-connais-pas
482,ak-10,"Madame Nicole R., 63 ans, a été hospitalisée p...","A l'interrogatoire, il existait des troubles d...",à type d'incontinence urinaire,par-défaut
7061,ak-51,"Le patient est traité pour une acné vulgaire ,...",Il souffre par ailleurs d'allergies saisonnièr...,et de myopie,ne-connais-pas
8006,ak-54,"Au jour 0 , le jeune patient reçoit de l'acéta...",Les échanges plasmatiques pour ce patient ont ...,au total trois mois,par-défaut
...,...,...,...,...,...
69,ak-1,Lors de la première visite médicale à 11 semai...,Une perfusion intraveineuse continue de soluti...,à 5 %,par-défaut
11462,ak-82,"Mme L . K 50 ans , diabétique , est admise dan...",La biopsie avec étude histologique confirme le...,avec étude histologique,par-défaut
8026,ak-54,Le patient devait recevoir quatre doses de rit...,La dose totale reçue lors de cette perfusion a...,lors de cette perfusion,par-défaut
3029,ak-28,"Le 12 février, Mademoiselle M., 36 ans, 64 kg,...","Du méprobamate, de l'acéprométazine, des benzo...","de la nicotine,",par-défaut


In [198]:
# count each label
print(new_data[new_data['label'] == 'ne-connais-pas'].shape)
print(new_data[new_data['label'] == 'par-défaut'].shape)

(3482, 5)
(6227, 5)


In [199]:
# save final data
new_data.drop('annotator_file', inplace=True, axis=1)
new_data.to_csv(REPO_DIR / 'resources/dataset/data.csv', index=False)
new_data.to_excel(REPO_DIR / 'resources/dataset/data.xlsx', index=False)