# Quotation Extraction: Version 3
## 0 Imports

In [1]:
import numpy
import csv
# spaCy
import spacy
# spaCy Visualizer
from spacy import displacy
from spacy.matcher import Matcher

## 1 Text processing
### 1.1 Loading and cleaning the text file
Loads the article, removes all newline characters and replaces every variant of quotation mark with a single one (`"`). The text is then put through the NLP pipeline.

I have also noticed that the model handles ";" inconsistently as a sentence separator: it sometimes treats it as the end of a sentence and sometimes does not. I therefore replace semicolons with commas (",").

In [2]:
# Every quotation-mark variant that gets collapsed into the plain <"> character.
quotes = ["«", "»", "“", "”", "„", "‹", "›", "‟", "〝", "〞"]


def normalize_quotes(text):
    """Return `text` with every character listed in `quotes` replaced by '"'."""
    # One translation pass gives the same result as replacing each variant in turn,
    # because every entry of `quotes` is a single character.
    to_plain_quote = str.maketrans({variant: '"' for variant in quotes})
    return text.translate(to_plain_quote)


# Decode the article as UTF-8 explicitly: it holds accented French text and typographic
# quotation marks, which the platform-default encoding may decode incorrectly.
with open('../data/article01.txt', 'r', encoding='utf-8') as file:
    # Flatten the article onto a single line, then unify the quotation marks.
    text = normalize_quotes(file.read().replace('\n', ' '))

# Semicolons are replaced by commas so they are never used as sentence boundaries.
text = text.replace(";", ",")

# Load the French pipeline and run it over the cleaned article.
nlp = spacy.load("fr_core_news_md")
doc = nlp(text)

## 2 Loading Model Parameters
### 2.1 Loading cue verbs

In [3]:
# The cue verbs include accented French lemmas, so the file is decoded as UTF-8 explicitly;
# newline='' is how the csv module expects its input files to be opened.
with open('../data/cue_verbs.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Only the first row holds the cue verbs. Reading just that row avoids loading the
    # whole file, and an empty file yields an empty set instead of raising IndexError.
    cue_verbs = set(next(reader, []))

print(cue_verbs)

{'observer', 'insister', 'répondre', 'analyser', 'confirmer', 'noter', 'raconter', 'proclamer', 'relever', 'suggérer', 'établir', 'ajouter', 'annoncer', 'préciser', 'sourir', "s'interroger", 'expliquer', 'dire', 'souligner', 'recenser', 'démontrer', 'ricaner', 'découvrir', 'constater', 'proposer', 'écrire', 'abonder', 'exprimer', 'citer', 'poursuivre', 'résumer', 'formuler', 'lâcher', 'reconnaître', 'affirmer', 'montrer', 'se souvenir', 'calculer', 'définir', 'prôner', 'élaborer', 'souvenir', 'dénoncer', 'murmurer', 'adopter', 'soutenir', 'déclarer', 'prouver', 'estimer', 'remarquer', 'identifier', 'étudier', 'donner', 'prétendre', 'mettre en évidence', 'rapporter', 'déplorer', 'crier', 'commenter', 'concevoir', 'corroborer', 'décrire', 'présenter', "mettre l'accent sur", 'considérer'}


### 2.2 Loading Quotations Structures
The quotation structures are stored in a CSV file, respecting the following format. All Part-Of-Speech elements are abbreviated as follows.
* RS: Reported Speech
* CV: Cue Verb
* QT: Quotee
* text: distinct words
One structure per line, with each element separated by a comma.

In [4]:
# newline='' lets the csv module do its own line-ending handling, as its documentation
# recommends; the explicit encoding keeps the read independent of the platform default.
with open('../data/quote_structures.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # One structure per CSV row, each row kept as a list of strings.
    structures = list(reader)

print(structures)

[['parataxis', 'obj', 'ancestor'], ['parataxis', 'nsubj', 'ancestor'], ['ROOT', 'obj', 'advcl']]


## 3 Finding quote containing sentences
### 3.1 Finding sentences containing direct quotes, and replacing the quotes with special characters

In [5]:
def is_quote(text):
    """Decide whether a candidate span should be kept as a quotation.

    `text` must support len() and iteration over tokens exposing `shape_` and `pos_`.
    The span qualifies when it is at least 3 tokens long and at least one token that is
    neither a proper noun nor punctuation has a shape that does not start with 'X'
    (i.e. the considered words are not all capitalized).
    """
    long_enough = len(text) >= 3
    ignored_pos = ["PROPN", "PUNCT"]
    # One flag per considered word, computed for the whole span up front.
    starts_upper = [
        word.shape_[0] == 'X'
        for word in text
        if word.pos_ not in ignored_pos
    ]
    return long_enough and not all(starts_upper)

def quote_finder(text):
    """Locate the quoted spans of `text` that pass `is_quote`.

    Every '"' token is matched; matches are paired in order, the 1st with the 2nd, the
    3rd with the 4th, and so on. A trailing unpaired quote mark is ignored.

    Returns a list of (start, end) token-index pairs, where `start` is the index of the
    opening quote mark and `end` is one past the index of the closing quote mark.
    """
    quote_matcher = Matcher(nlp.vocab, validate=True)
    # Match ID "Quote": any token whose text is exactly the plain quote character.
    quote_matcher.add("Quote", None, [{"TEXT": '"'}])

    mark_positions = [start for _, start, _ in quote_matcher(text)]
    # Even-ranked matches open a quote, odd-ranked matches close it.
    openings = mark_positions[0::2]
    closings = [position + 1 for position in mark_positions[1::2]]

    return [
        (start, end)
        for start, end in zip(openings, closings)
        if is_quote(text[start:end])
    ]

def encode_quotes(text):
    """Return the text of `text` with each span found by `quote_finder` replaced by " * "."""
    pieces = []
    cursor = 0
    for quote_start, quote_end in quote_finder(text):
        # Keep everything between the previous quote and this one, then the placeholder.
        pieces.append(text[cursor:quote_start].text)
        pieces.append(" * ")
        cursor = quote_end
    # Whatever follows the last quote (or the whole text when no quote was found).
    pieces.append(text[cursor:].text)
    return "".join(pieces)

# Replace the direct quotes by placeholders, then re-parse the encoded text so that
# sentence boundaries are computed on the placeholder version.
encoded_text = encode_quotes(doc)
encoded_doc = nlp(encoded_text)
encoded_sentences = [sentence for sentence in encoded_doc.sents]

In [6]:
def contains_cue(sentence):
    """Return True iff at least one token of `sentence` has a lemma listed in `cue_verbs`.

    Always returns a bool (False, not None, when no cue verb is present).
    """
    return any(token.lemma_ in cue_verbs for token in sentence)

def contains_quotes(sentence):
    """Return True iff `sentence` holds at least one '*' token (an encoded quote).

    Always returns a bool (False, not None, when no placeholder is present).
    """
    return any(token.text == '*' for token in sentence)

# Keep only the sentences that hold both a cue verb and an encoded quote.
cues = [
    sentence
    for sentence in encoded_sentences
    if contains_cue(sentence) and contains_quotes(sentence)
]

for cue_sentence in cues:
    print(cue_sentence, "\n")

Mais * , observe Alexandre Roulin, de l’Université de Lausanne. 

* , souligne Luis San-Jose, premier auteur de l’étude. 

* voler * , raconte Alexandre Roulin. 

* , explique le chercheur. 



### 3.2 Observing the structure of encoded sentences containing cue verbs.

In [7]:
# Draw the dependency parse of each retained sentence in compact mode.
render_options = {"compact": True}
for cue_sentence in cues:
    displacy.render(cue_sentence, style="dep", options=render_options)

### 3.3 Extracting Named Entities that are the object of the sentence
Extracting Named Entities by simply returning named entities that are children of the cue verb

In [9]:
def extract_cue_verb(sentence):
    """Return the first token of `sentence` whose lemma is in `cue_verbs`, or None if there is none."""
    matching_tokens = (token for token in sentence if token.lemma_ in cue_verbs)
    return next(matching_tokens, None)

def extract_quotee(token):
    """Return the text of `token`'s subtree, each token followed by its own trailing whitespace."""
    return "".join(node.text + node.whitespace_ for node in token.subtree)
        
def find_quotes(encoded_doc):
    """Print the quotee of every sentence holding both a cue verb and an encoded quote.

    For each such sentence of `encoded_doc`, the quotee is the subtree text of the first
    child of the cue verb tagged "PROPN"; None is reported when the cue verb has no such
    child. Nothing is returned.
    """
    for sentence in encoded_doc.sents:
        if not (contains_cue(sentence) and contains_quotes(sentence)):
            continue
        cue_verb = extract_cue_verb(sentence)
        # Only the first proper-noun child of the cue verb is considered.
        first_proper_noun = next(
            (child for child in cue_verb.children if child.pos_ == "PROPN"),
            None,
        )
        if first_proper_noun is None:
            quotee = None
        else:
            quotee = extract_quotee(first_proper_noun)
        print("In <", sentence, ">, the quotee was", quotee)

# Run the extraction on the encoded article; one line is printed per matching sentence.
find_quotes(encoded_doc)

In < Mais * , observe Alexandre Roulin, de l’Université de Lausanne. >, the quotee was Alexandre Roulin
In < * , souligne Luis San-Jose, premier auteur de l’étude. >, the quotee was Luis San-Jose, premier auteur de l’étude
In < * voler * , raconte Alexandre Roulin. >, the quotee was Alexandre Roulin
In < * , explique le chercheur. >, the quotee was None


In [10]:
# Scratch example: a hand-written sentence with a cue verb followed by "que" and a quoted
# clause, parsed and rendered to inspect its dependency structure.
# NOTE(review): consider a more neutral variable name once no other cell relies on this one.
test_shit = 'Jean-Michel Beauf souligne que "les chercheurs sont tous des cons"'
test_doc = nlp(test_shit)
displacy.render(test_doc, style="dep", options={"compact": True})