In [1]:
from pathlib import Path
from functools import reduce
from operator import add

from spacy.matcher import DependencyMatcher
from spacy import displacy
import spacy

import pandas as pd

from src.preprocess import PE_PATTERN, get_pe_paragraphs

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nlp = spacy.load("fr_core_news_sm")

In [3]:
spacy.tokens.Doc.set_extension('fname', default=None)
spacy.tokens.Doc.set_extension('year', default=None)

In [4]:
paragraphs = get_pe_paragraphs(Path('.data/dones/').glob('*.xml'))

In [61]:
def process_doc(x):
    doc = nlp(x['text'])
    doc._.fname = x['fname']
    doc._.year = x['year']
    return list(filter(lambda x: PE_PATTERN.search(x.text), doc.sents))

docs = reduce(add, paragraphs[paragraphs['is_pe']].apply(process_doc, axis=1).values, [])

In [98]:
PE_DEP_PATTERN = [
  {
    "RIGHT_ID": "principe",
    "RIGHT_ATTRS": {"LOWER": "principe"}
  },
  {
    "LEFT_ID": "principe",
    "REL_OP": ">",
    "RIGHT_ID": "principe_egalite",
    "RIGHT_ATTRS": {"DEP": "nmod", "LOWER": {"REGEX": "[eé]galit[eé]"} },
  } 
]

DEP_PATTERNS = [
  PE_DEP_PATTERN + [ # Récupère principe d'égalité
    {
      "LEFT_ID": "principe",
      "REL_OP": "<<",
      "RIGHT_ID": "gov",
      "RIGHT_ATTRS": {"DEP":"obj"},
    },
    {
      "LEFT_ID": "gov",
      "REL_OP": ">>",
      "RIGHT_ID": "subj",
      "RIGHT_ATTRS": {"DEP":"nsubj"},
    },
  ],
  PE_DEP_PATTERN + [ # nmod of egalité
    {
      "LEFT_ID": "principe_egalite",
      "REL_OP": ">",
      "RIGHT_ID": "egalite_nmod",
      "RIGHT_ATTRS": {"DEP": "nmod"},
    }
  ],
  PE_DEP_PATTERN + 
  [ # nmod of egalité
    {
      "LEFT_ID": "principe",
      "REL_OP": ">",
      "RIGHT_ID": "other_nmod",
      "RIGHT_ATTRS": {"DEP": "nmod"},
    }
  ],
]

def get_pattern_value(pattern_key, storage):
  def wrapped_get_pattern_value(matcher, doc, i, matches):
    pattern = matcher.get("FOUNDED")[1][0]
    for _, token_ids in matches:
      for t_c, t_id in enumerate(token_ids):
        if pattern[t_c]['RIGHT_ID'] == pattern_key:
          storage.append(doc[t_id])
  return wrapped_get_pattern_value

In [99]:
govs = []
matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [DEP_PATTERNS[0]], on_match=get_pattern_value('subj', govs))
_ = list(map(matcher, docs))

In [100]:
pd.Series(map(lambda x: x.lemma_, govs)).value_counts()

qui              49
principe         15
il               14
différence        6
législateur       6
décision          6
disposition       4
cour              3
exclusion         2
moyen             2
emploi            2
on                2
jurisprudence     2
lui               2
lequel            2
procureur         1
conseil           1
je                1
réalité           1
nous              1
loi               1
ministre          1
Name: count, dtype: int64

In [101]:
options = {"compact": True}
displacy.render(list(filter(lambda x: x.lemma_ == 'qui', govs))[0].doc, style="dep", options=options)

egalite nmod

In [102]:
govs = []
matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [DEP_PATTERNS[1]], on_match=get_pattern_value('egalite_nmod', govs))
_ = list(map(matcher, docs))

In [103]:
pd.Series(map(lambda x: x.lemma_, govs)).value_counts()

suffrage      78
traitement     6
citoyen        5
impôt          2
détriment      2
accès          2
charge         2
servitude      2
article        1
élu            1
part           1
criminel       1
agent          1
party          1
CSG            1
Name: count, dtype: int64

In [104]:
options = {"compact": True}
doc = list(filter(lambda x: x.lemma_ == 'traitement', govs))[3].doc
print(doc.text)
displacy.render(doc, style="dep", options=options)

En deuxième lieu ils soutiennent que la loi déférée porte atteinte au droit de suffrage, viole le principe de libre administration des 
collectivités locales ainsi que le principe d'égalité de traitement entre les candidats. 


Other nmod of principe 

In [105]:
govs = []
matcher = DependencyMatcher(nlp.vocab)
matcher.add("FOUNDED", [DEP_PATTERNS[2]], on_match=get_pattern_value('other_nmod', govs))
_ = list(map(matcher, docs))

In [108]:
len(docs)

546

In [107]:
pd.Series(map(lambda x: x.lemma_, govs)).value_counts()

égalité          645
loi               65
justice           19
charge             8
service            8
ce                 7
liberté            6
egalite            5
respect            4
candidat           4
suffrage           4
vue                3
disposition        3
différence         3
cas                3
raison             3
article            3
expression         3
redevable          2
validation         2
atteinte           2
électeur           2
conseiller         2
matière            2
nature             2
garde              2
principe           2
application        2
égard              2
déroulement        2
rupture            2
fonction           2
entreprise         2
déclaration        2
peu                2
catégorie          2
part               2
impôt              2
sexe               2
personne           2
député             2
communication      2
égalité.qu'        1
Name: count, dtype: int64

In [109]:
options = {"compact": True}
doc = list(filter(lambda x: x.lemma_ == 'traitement', govs))[3].doc
print(doc.text)
displacy.render(doc, style="dep", options=options)

IndexError: list index out of range