# Quotation Extraction: Version 1
## 0 Imports

In [1]:
import numpy
import csv
# spaCy
import spacy
# spaCy Visualizer
from spacy import displacy

## 1 Text processing
### 1.1 Loading and cleaning the text file
Loads the article, replaces every newline character with a space, and replaces all quotation-mark variants with the single straight quote ("). The cleaned text is then run through the NLP pipeline.

I have also noticed that the model does not handle ";" consistently when separating sentences: it sometimes treats a semicolon as the end of a sentence and sometimes not. I therefore replace semicolons with commas (",").

In [2]:
quotes = ["«", "»", "“", "”", "„", "‹", "›", "‟", "〝", "〞"]


def normalize_quotes(text):
    """Return ``text`` with every character listed in ``quotes`` replaced
    by a straight double quote."""
    # Each entry of ``quotes`` is a single character, so one translation pass
    # gives the same result as replacing the variants one after another.
    straight = dict.fromkeys(map(ord, quotes), '"')
    return text.translate(straight)


# Read the article as UTF-8 explicitly: it holds accented French text and
# typographic quotes, which the platform's default encoding may not decode.
with open('../data/article02.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Flatten the text to a single line and unify the quotation marks.
text = normalize_quotes(raw_text.replace('\n', ' '))
# The model splits sentences inconsistently on ";", so use "," instead.
text = text.replace(";", ",")

# Run the cleaned text through the French spaCy pipeline.
nlp = spacy.load("fr_core_news_md")
doc = nlp(text)

### 1.2 Splitting text into sentences
Splits the text into a list of sentences

In [3]:
# Materialize the sentence spans once so that later cells can loop over them.
sentences = list(doc.sents)

## 2 Loading Model Parameters
### 2.1 Loading cue verbs

In [30]:
# The cue verbs are the fields of the first CSV row. The file is read as UTF-8
# because the verbs contain accented characters, and opened with newline=''
# as the csv module documentation asks for file objects.
with open('../data/cue_verbs.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Only the first row is needed; an empty file gives an empty set.
    cue_verbs = set(next(reader, []))

print(cue_verbs)

{'citer', 'élaborer', 'expliquer', 'poursuivre', 'reconnaître', "s'interroger", 'constater', 'analyser', 'annoncer', 'recenser', 'suggérer', 'observer', 'formuler', 'préciser', 'adopter', 'décrire', 'démontrer', 'déclarer', 'relever', 'rapporter', 'considérer', 'étudier', 'mettre en évidence', 'proposer', 'affirmer', 'dénoncer', 'insister', 'ajouter', 'lâcher', 'concevoir', 'commenter', 'déplorer', 'estimer', 'raconter', 'corroborer', 'répondre', 'établir', 'confirmer', 'résumer', 'calculer', 'abonder', 'prétendre', 'identifier', 'montrer', 'présenter', 'sourir', 'donner', 'exprimer', 'murmure', 'prôner', 'soutenir', 'publier', 'remarquer', 'se souvenir', 'définir', "mettre l'accent sur", 'souvenir', 'prouver', 'découvrir', 'ricaner', 'écrire', 'crier', 'proclamer', 'dire', 'souligner', 'noter'}


### 2.2 Loading Quotations Structures
The quotation structures are stored in a CSV file, respecting the following format. All Part-Of-Speech elements are abbreviated as follows.
* RS: Reported Speech
* CV: Cue Verb
* QT: Quotee
* text: distinct words
One structure per line, with each element separated by a comma.

In [31]:
# Each CSV row is one quotation structure (a list of strings). The file is
# read as UTF-8 and opened with newline='' as the csv module documentation
# asks for file objects.
with open('../data/quote_structures.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # csv.reader yields [] for a blank line; drop those so that every
    # structure can safely be indexed later on.
    structures = [row for row in reader if row]

print(structures)

[['parataxis', 'obj', 'ancestor'], ['parataxis', 'nsubj', 'ancestor'], ['ROOT', 'obj', 'advcl']]


## 3 Structure extraction
### 3.1 Finding sentences containing cue verbs

In [32]:
def contains_cue(sentence):
    """Return True iff at least one token of ``sentence`` is a cue verb.

    ``sentence`` is an iterable of tokens exposing ``lemma_``. A token counts
    as a cue verb when its lemma is in the module-level ``cue_verbs`` set.
    The result is always a bool (False, not None, when nothing matches).
    """
    return any(token.lemma_ in cue_verbs for token in sentence)


# Keep only the sentences that contain a cue verb, then display each of them.
cues = [s for s in sentences if contains_cue(s)]

for s in cues:
    print(s, "\n")

Et nous marchons dans le lit du fleuve", fait constater Ousmane, agropasteur malien. 

C’est comme si nous étions au début du cycle agricole, mi-juin", poursuit Ousmane. 

Djehamé, la mère d’Ousmane, se souvient: "A partir de là, les choses ont changé. 

On observe également un accroissement des événements climatiques extrêmes, telles que les sécheresses et les inondations. 

"C’est une véritable compétition autour des ressources naturelles, une spéculation même", résume M. Dolo, ingénieur des Eaux et Forêts et coordinateur du Programme de développement durable pour la plaine du Niger. 

Finalement, Housseyni devra donner deux bêtes à Nafa, immédiatement. 

J’apprends qu’ils sont tous partis, un à un", confirme Arkietou. 

Si tu n’as pas fini l’école et que tu ne peux pas poursuivre des études, ou que tes parents ne peuvent pas t’aider pour étudier, tu n’as pas d’autres possibilités que de chercher des moyens ailleurs. 

Et ils se disent aussi "si moi, je ne le fais pas, d’autres le fe

### 3.2 Sentences with quotes: formats
Our first sentence obviously contains a quote. First we obtain the cue verb's token, as well as its relationships with the other elements of the sentence. Using displaCy, we can get a visual representation of the sentence's dependencies.

In [33]:
# Work on the first sentence that contains a cue verb.
sentence = cues[0]

# Draw the dependency parse of that sentence with displaCy.
displacy.render(sentence, style="dep")

In [34]:
# Scan the whole sentence; when several tokens qualify, the last one is kept.
for token in sentence:
    if token.lemma_ not in cue_verbs:
        continue
    verb = token

print("Verb: ", verb.lemma_, "   Dependance:", verb.dep_)

# Same three-column listing (text, POS, dependency) for both kinds of
# relatives of the verb.
for heading, relatives in (("Ancestors:", verb.ancestors),
                           ("Children:", verb.children)):
    print(heading)
    for token in relatives:
        print(token.text, token.pos_, token.dep_)

Verb:  constater    Dependance: xcomp
Ancestors:
fait VERB acl
nous PRON nsubj
marchons VERB ROOT
Children:
Ousmane PROPN obj


We can now make our first rule to determine if the cue verb is actually used in a quote or not.

In [35]:
# Dependency label the cue verb must carry for the rule to apply.
token_dep = "parataxis"
# Accepted (sorted) lists of dependency labels for the cue verb's children.
token_children = [["obj"], ["obj", "obl"]]


def cue_verb(token):
    """Return True iff ``token`` matches the first quotation rule.

    The rule holds when the token's dependency label equals ``token_dep`` and
    the sorted dependency labels of its children form one of the lists in
    ``token_children``.
    """
    # Inspect the children of the token passed in, not of a global variable.
    child_deps = sorted(child.dep_ for child in token.children)
    return token.dep_ == token_dep and child_deps in token_children

# Apply the rule to the cue verb picked out in the previous cell.
print(cue_verb(verb))

False


### 3.3 Normalizing the quote structure
We want to define the following quote format as a string: 
"Reported speech", "Cue verb" "Quotee" "Optional Obl"
Based on 3.2, we know that in this case the cue verb is the parataxis of the sentence, that the quotee is an object of the parataxis and that it has an oblique nominal, which is optional. In this case, the quote is all that is under the root, except the parataxis. Basically we can model this format of quotes as the list of strings:

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; \["CV=parataxis", "QT=obj", "RS=root"\]

We can then do this for all sentences containing quotes in the article.

In [38]:
def extract_cue(sentence):
    """Return the first token of ``sentence`` whose lemma is in ``cue_verbs``,
    or None when there is no such token."""
    matches = (token for token in sentence if token.lemma_ in cue_verbs)
    return next(matches, None)

def extract_quotee(token):
    """Return the text of every token in ``token.subtree``, in iteration
    order, each followed by its own trailing whitespace."""
    return "".join(node.text + node.whitespace_ for node in token.subtree)

def quote_text(token, cv_dep):
    """Return the text of the dependency subtree rooted at ``token``.

    The subtree is walked in order: left children, the token itself (with its
    trailing whitespace), then right children. Any child whose dependency
    label equals ``cv_dep`` is left out together with everything below it.
    """
    def render(children):
        # Text of each kept child subtree, in the order the children come.
        return [quote_text(child, cv_dep)
                for child in children
                if child.dep_ != cv_dep]

    pieces = render(token.lefts)
    pieces.append(token.text + token.whitespace_)
    pieces.extend(render(token.rights))
    return "".join(pieces)

def _last_with_dep(tokens, dep):
    """Return the last token in ``tokens`` whose ``dep_`` equals ``dep``, or None."""
    found = None
    for tok in tokens:
        if tok.dep_ == dep:
            found = tok
    return found


def is_quote(sentence):
    """Describe the quote found in ``sentence`` as "<quotee> said <quote>".

    The first cue verb of the sentence (as returned by ``extract_cue``) is
    matched against each entry of the module-level ``structures`` list. An
    entry is ``[cv_dep, qt_pos, q_pos]``:

    * ``cv_dep``: dependency label the cue verb must have,
    * ``qt_pos``: dependency label of the cue verb's child that is the quotee,
    * ``q_pos``: where the reported speech is rooted, either the literal
      ``"ancestor"`` (the first ancestor of the cue verb) or the dependency
      label of one of the cue verb's children.

    The first entry that matches completely is used. Returns the string
    "No quote in this sentence" when the sentence has no cue verb or no
    entry matches.
    """
    no_quote = "No quote in this sentence"

    # A single scan: extract_cue returns None when there is no cue verb.
    cv = extract_cue(sentence)
    if cv is None:
        return no_quote

    children = list(cv.children)
    child_deps = [child.dep_ for child in children]

    for struct in structures:
        # Ignore rows that cannot hold the three expected fields.
        if len(struct) < 3:
            continue
        cv_dep, qt_pos, q_pos = struct[0], struct[1], struct[2]

        if cv.dep_ != cv_dep or qt_pos not in child_deps:
            continue

        if q_pos == "ancestor":
            # First ancestor of the cue verb; None when it has no ancestor.
            q_token = next(iter(cv.ancestors), None)
        else:
            q_token = _last_with_dep(children, q_pos)

        # This structure only matched partially: try the remaining ones
        # instead of giving up on the sentence.
        if q_token is None:
            continue

        qt_token = _last_with_dep(children, qt_pos)
        quote = quote_text(q_token, cv_dep)
        quotee = extract_quotee(qt_token)
        return quotee + " said " + quote

    return no_quote



In [39]:
# For every sentence containing a cue verb, print its text and then the
# string that is_quote returns for it.
for s in cues:
    print("Sentence:\n")
    print(s.text, "\n")
    print("The result was:\n")
    print(is_quote(s), "\n\n\n")

Sentence:

Et nous marchons dans le lit du fleuve", fait constater Ousmane, agropasteur malien. 

The result was:

No quote in this sentence 



Sentence:

C’est comme si nous étions au début du cycle agricole, mi-juin", poursuit Ousmane. 

The result was:

Ousmane said C’est comme si nous étions au début du cycle agricole, mi-juin".  



Sentence:

Djehamé, la mère d’Ousmane, se souvient: "A partir de là, les choses ont changé. 

The result was:

No quote in this sentence 



Sentence:

On observe également un accroissement des événements climatiques extrêmes, telles que les sécheresses et les inondations. 

The result was:

No quote in this sentence 



Sentence:

"C’est une véritable compétition autour des ressources naturelles, une spéculation même", résume M. Dolo, ingénieur des Eaux et Forêts et coordinateur du Programme de développement durable pour la plaine du Niger. 

The result was:

du Programme de développement durable pour la plaine du Niger said "C’est une véritable comp