In [37]:
import re
from bs4 import BeautifulSoup
from src.postprocess import PERSONS, match_person
from src.preprocess import fix_text
from pathlib import Path
import pandas as pd

In [2]:
PATH = Path('.data/text_1982')
SPEAKER_PATTERN = f"({'|'.join(list(PERSONS.keys())+['pr[ée]sident', 'rapporteur'])})"
TARGET_PATTERN = re.compile(f"{SPEAKER_PATTERN}(?!</u>)", re.I)
SPEAKER_PATTERN = re.compile(SPEAKER_PATTERN, re.I)

In [3]:
def get_question_id(u):
    question = u.find_parent(attrs={'type':'question'})
    return question['corresp'] if question and 'corresp' in question.attrs else ''

In [13]:
mentions = []

for path in PATH.glob('*.xml'):
    soup = BeautifulSoup(fix_text(path.read_text(encoding='utf-8')))
    speaker = ""
    for p in soup.find_all('p'):
        u = p.find('u')
        if u:
            speaker = SPEAKER_PATTERN.search(u.text)
            speaker = speaker.group() if speaker else ""
        mentions.append((speaker, TARGET_PATTERN.findall(str(p)), p.text.replace('\n', ' ').strip(), get_question_id(p), path.stem))

In [71]:
df_gold = pd.read_csv(f'data/82_speaker_target.csv')

In [22]:
ids = []

def is_p(p, sentence):
    return sentence.strip() in p[2]

for i,sentence in enumerate(df_gold.sent_text.unique()):
    m_list = list(filter(lambda x: is_p(x, sentence), mentions))
    ids.append(len(m_list))

    # assert len(m_list)>0, f"{i}, {sentence}"
    assert len(m_list)>0, f"{i}, {df_gold[df_gold['sent_text']==sentence]}"

    if len(m_list)>1:
        print(i)
        print(sentence)
        print(df_gold[df_gold['sent_text']==sentence][['mention_text', 'is_speaker']])
        print([m[0] for m in m_list])

98
Après la lecture de ce projet de décision, Monsieur le Président déclare ouverte la discussion générale.
    mention_text  is_speaker
123    Président        True
179    Président        True
224    Président        True
['Président', 'Président', 'Président']
277
Monsieur le Président indique que l'ordre du jour est le suivant :
    mention_text  is_speaker
406    Président        True
['', 'Président']


In [63]:
def eval_speaker(row):
    m = list(filter(lambda x: is_p(x, row['sent_text']), mentions))[-1]
    if row['is_speaker']:
        return  m[0], match_person({'mention_text':m[0], 'decision_id':m[3]}) == match_person(row)
    return m[0], None

In [79]:
def eval_target(row):
    m = list(filter(lambda x: is_p(x, row['sent_text']), mentions))[-1]
    if row['is_target']:
        for target in m[1]:
            if match_person({'mention_text':target, 'decision_id':m[3]}) == match_person(row):
                return m[1], True
        return m[1], False
    return m[1], None

In [80]:
df_gold[['pred_speaker', 'eval_speaker']] = df_gold.apply(eval_speaker, axis=1).tolist()
df_gold[['pred_target', 'eval_target']] = df_gold.apply(eval_target, axis=1).tolist()

  return asarray(a).ndim


In [83]:
df_gold[df_gold['eval_speaker']==False][['sent_text', 'mention_text', 'pred_speaker', 'is_speaker', 'file_name']]

Unnamed: 0,sent_text,mention_text,pred_speaker,is_speaker,file_name
9,Monsieur PERETTI et Monsieur JOXE sont d'accor...,JOXE,PERETTI,True,PV1982-01-05
67,Monsieur MONNERVILLE remercie Monsieur LECOURT...,MONNERVILLE,,True,PV1982-02-18-23
137,Monsieur GROS rappelle qu'il était rapporteur ...,rapporteur,GROS,True,PV1982-06-28
160,Monsieur SEGALAT déclare se rallier à la propo...,SEGALAT,VEDEL,True,PV1982-06-28
251,"Selon le Doyen VEDEL, l'article 34 n'invoquant...",VEDEL,SEGALAT,True,PV1982-07-30
265,Messieurs BROUILLET et MONNERVILLE déclarent a...,MONNERVILLE,BROUILLET,True,PV1982-07-30
287,Monsieur VEDEL présente le rapport suivant.,VEDEL,Président,True,PV1982-11-10
308,"En effet, Monsieur PERETTI expose que la loi d...",PERETTI,VEDEL,True,PV1982-11-18
339,"Monsieur VEDEL s'étant retiré, Monsieur le Pré...",Président,VEDEL,True,PV1982-11-18
341,"Monsieur VEDEL s'étant retiré, Monsieur le Pré...",GROS,VEDEL,True,PV1982-11-18


In [34]:
m_list

[('MONNERVILLE',
  [],
  'Monsieur MONNERVILLE lève alors la séance à 18 h 10.',
  '',
  'PV1982-12-29')]

In [26]:
df_gold.apply(lambda x: print(x['file_name']), axis=1)

PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-01-05
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-11
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-23
PV1982-02-18-2

0      None
1      None
2      None
3      None
4      None
       ... 
485    None
486    None
487    None
488    None
489    None
Length: 490, dtype: object

In [None]:
def get_mention_item(idx, mentions):
    def wrapped_get_mention_item(sent_text):
        m_list = list(filter(lambda x: is_p(x, sentence), mentions))
        

In [None]:
p = pd.DataFrame(mentions, columns=['mention_text', ])