# Extraction of features for quote detection
## Imports

In [1]:
import numpy as np
import spacy
import csv 

# Load the `fr_core_news_md` spaCy pipeline once; the resulting `nlp` object is
# reused by the cells and functions below.
nlp = spacy.load("fr_core_news_md")

## Constants
### SpaCy POS tags

In [2]:
# Part-of-speech labels in a fixed order: the position of a label in this list
# is the integer used for it in `pos_tag_feature` below.
POS_TAGS = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET',
    'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',
    'SCONJ', 'SYM', 'VERB', 'X', 'SPACE',
]

# Named-entity label constant (presumably the spaCy label for persons).
NE_PER = 'PER'

# Sample sentence parsed once and reused by the demo cells.
sentence = (
    'Mais "les nuits de pleine lune, les plus claires d’entre elles resplendissent '
    'comme des soleils", observe Alexandre Roulin, de l’Université de Lausanne.'
)

s = nlp(sentence)

# Coarse POS: label text, spaCy integer id, and index into POS_TAGS.
pos_tags, pos_tags_int, pos_tag_feature = [], [], []
# Fine-grained tag: label text and spaCy integer id.
tag_tags, tag_tags_int = [], []

for token in s:
    pos_tags.append(token.pos_)
    pos_tags_int.append(token.pos)
    # Raises ValueError if spaCy emits a label missing from POS_TAGS.
    pos_tag_feature.append(POS_TAGS.index(token.pos_))
    tag_tags.append(token.tag_)
    tag_tags_int.append(token.tag)

# One value per output line, same text as five separate print calls.
print(
    pos_tags[:10],
    pos_tags_int[:10],
    pos_tag_feature,
    tag_tags[:2],
    tag_tags_int[:2],
    sep='\n',
)

['CCONJ', 'PUNCT', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'DET', 'ADV']
[89, 97, 90, 92, 85, 84, 92, 97, 90, 86]
[5, 13, 6, 8, 1, 0, 8, 13, 6, 2, 0, 5, 1, 11, 16, 1, 6, 8, 13, 13, 16, 12, 12, 13, 1, 6, 8, 1, 12, 13]
['CCONJ___', 'PUNCT___']
[8673025306212932083, 16177087412692307460]


In [3]:
import csv 

# Load the cue verbs from the first row of the CSV file.
# - The encoding is explicit because the verbs contain accented characters and
#   the platform default encoding is not UTF-8 everywhere.
# - newline='' is what the csv module asks for on files handed to csv.reader.
with open('../data/cue_verbs.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    # Only the first row is used; an empty file gives an empty list rather
    # than an IndexError.
    cue_verbs = next(reader, [])

# De-duplicate with dict.fromkeys so the verbs keep their file order: iterating
# a set of strings can change order from one run to the next, which would make
# the printed list and any similarity tie-break non-reproducible.
# Blank cells are skipped because parsing an empty string yields no token.
# NOTE(review): only the first spaCy token of each entry is kept, so a
# multi-word entry (e.g. a reflexive verb) is reduced to its first word -
# confirm this is intended.
cue_verbs = [nlp(verb)[0] for verb in dict.fromkeys(cue_verbs) if verb.strip()]
print(cue_verbs)

[s', murmurer, dénoncer, soutenir, élaborer, montrer, prôner, considérer, poursuivre, préciser, mettre, ajouter, identifier, crier, raconter, estimer, donner, concevoir, déplorer, écrire, corroborer, recenser, souvenir, prétendre, dire, résumer, calculer, commenter, sourir, définir, expliquer, proclamer, affirmer, lâcher, découvrir, prouver, souligner, déclarer, formuler, confirmer, mettre, observer, établir, rapporter, reconnaître, relever, remarquer, constater, annoncer, insister, répondre, abonder, démontrer, ricaner, adopter, étudier, présenter, se, citer, proposer, suggérer, exprimer, noter, décrire, analyser]


## Feature Extraction
To classify a sentence as either a quote or not a quote, the sentence's central verb (called the "target") must first be extracted.

### Verbe Central

In [4]:
def token_in_quotes(sentence):
    """
    Flags, for every token of the sentence, whether it lies between quotation marks.

    Returns a list, the same length as the number of tokens in the sentence,
    where each entry is 1 if the corresponding token is between quotes and 0 if
    it isn't. The quotation marks themselves are flagged 0. If a quotation is
    never closed, every token after the opening mark is flagged 1.

    Besides the straight double quote ("), French guillemets (« ») and
    typographic double quotes (“ ”) are recognised. A quotation is only closed
    by the mark matching the one that opened it, so another kind of mark found
    inside a quotation counts as ordinary quoted content.

    :param sentence: iterable of tokens exposing a `text` attribute
    :return: list of 0/1 flags, one per token
    """
    # Opening mark -> the mark that closes it.
    closing_mark_for = {'"': '"', '«': '»', '“': '”'}
    # Mark that will end the current quotation; None while outside quotes.
    expected_close = None
    in_quotes = []
    for token in sentence:
        text = token.text
        if expected_close is None:
            in_quotes.append(0)
            # Stays None unless this token opens a quotation.
            expected_close = closing_mark_for.get(text)
        elif text == expected_close:
            # The closing mark is not part of the quoted content.
            in_quotes.append(0)
            expected_close = None
        else:
            in_quotes.append(1)
    return in_quotes

# Flag every token of the sample sentence, then show the (token, flag) pairs.
in_quotes = token_in_quotes(s)
print(list(zip(s, in_quotes)))

[(Mais, 0), (", 0), (les, 1), (nuits, 1), (de, 1), (pleine, 1), (lune, 1), (,, 1), (les, 1), (plus, 1), (claires, 1), (d’, 1), (entre, 1), (elles, 1), (resplendissent, 1), (comme, 1), (des, 1), (soleils, 1), (", 0), (,, 0), (observe, 0), (Alexandre, 0), (Roulin, 0), (,, 0), (de, 0), (l’, 0), (Université, 0), (de, 0), (Lausanne, 0), (., 0)]


In [8]:
def verb_similarities(sentence, in_quotes, cue_verbs):
    """
    Finds, for each verb that isn't between quotes, the cue verb it is most similar to.

    A token is considered when its flag in `in_quotes` is 0 and its `pos_` is
    'VERB'. Its lemma is parsed again with the module-level `nlp` pipeline and
    the first resulting token is compared with every cue verb through
    `similarity`; the highest similarity wins, the earliest cue verb winning
    ties.

    :param sentence: the tokens of the sentence (exposing `pos_`, `lemma_` and `text`)
    :param in_quotes: one flag per token, 0 meaning the token is outside quotes
    :param cue_verbs: the cue-verb tokens to compare against
    :return: a list of `(index, text, best_similarity, best_verb)` tuples, one
        per considered verb, in sentence order. When no cue verb has a
        similarity above 0, `best_similarity` is 0 and `best_verb` is ''.
    """
    verbs = []
    # `quoted` is a separate name so the `in_quotes` parameter is not shadowed.
    for index, (quoted, token) in enumerate(zip(in_quotes, sentence)):
        if quoted != 0 or token.pos_ != 'VERB':
            continue
        # Get the token of the base form
        lemma = nlp(token.lemma_)[0]
        best_similarity = 0
        best_verb = ''
        for verb in cue_verbs:
            # Computed once per cue verb and reused for the comparison and the update.
            similarity = lemma.similarity(verb)
            if similarity > best_similarity:
                best_similarity = similarity
                best_verb = verb
        verbs.append((index, token.text, best_similarity, best_verb))
    return verbs

def center_verb(sentence, in_quotes, cue_verbs):
    """
    Determines the verb which isn't between quotes with the highest similarity
    to a cue verb, and returns its position and token.

    A token is considered when its flag in `in_quotes` is 0 and its `pos_` is
    'VERB'. Its lemma is parsed again with the module-level `nlp` pipeline and
    the first resulting token is compared with every cue verb through
    `similarity`. On equal similarity the earliest verb of the sentence is kept.

    :param sentence: the tokens of the sentence (exposing `pos_` and `lemma_`)
    :param in_quotes: one flag per token, 0 meaning the token is outside quotes
    :param cue_verbs: the cue-verb tokens to compare against
    :return: `(index, token)` of the selected verb, or `(0, None)` when no verb
        outside quotes could be compared with a cue verb
    """
    top_index = 0
    top_token = None
    # Starts below 0 so the first comparison above -1 is accepted.
    top_similarity = -1
    # `quoted` is a separate name so the `in_quotes` parameter is not shadowed.
    for index, (quoted, token) in enumerate(zip(in_quotes, sentence)):
        if quoted != 0 or token.pos_ != 'VERB':
            continue
        # Get the token of the base form
        lemma = nlp(token.lemma_)[0]
        for verb in cue_verbs:
            # Computed once per cue verb and reused for the comparison and the update.
            similarity = lemma.similarity(verb)
            if similarity > top_similarity:
                top_similarity = similarity
                top_token = token
                top_index = index
    return top_index, top_token
    
# Demo on the sample sentence: first the best cue-verb match of every verb
# outside quotes, then the (index, token) pair selected as central verb.
print(verb_similarities(s, in_quotes, cue_verbs))
print(center_verb(s, in_quotes, cue_verbs))

[(20, 'observe', 1.0, observer)]
(20, observe)


### Guillemets Features

In [10]:
def guillemet_features(sentence, in_quotes):
    """
    Summarises the quote flags of a sentence as two numbers.

    :param sentence: the sentence the flags belong to (not used by the computation)
    :param in_quotes: one flag per token, 1 for a token between quotes
    :return: `(0, 0)` when the flags sum to 0, otherwise `(1, n)` where `n` is
        the length of the longest run of consecutive flags equal to 1
    """
    if sum(in_quotes) == 0:
        return 0, 0
    longest_run = 0
    run_length = 0
    for flag in in_quotes:
        # Extend the current run on a 1, restart it on anything else.
        run_length = run_length + 1 if flag == 1 else 0
        longest_run = max(longest_run, run_length)
    return 1, longest_run

# Demo on the sample sentence: (contains-quote flag, longest run of quoted tokens).
print(guillemet_features(s, in_quotes))

(1, 16)


### Sentence Features

In [None]:
def sentence_features(sentence):
    """
    Placeholder for the sentence-level features; not implemented yet.

    Intended to produce features indicating whether the sentence contains a
    quotation mark, a named entity, a verb-cue, or a pronoun, as well as a
    sentence length feature. For now the argument is ignored.

    :param sentence: the sentence to extract features from (currently unused)
    :return: always 0
    """
    return 0

### Dependency Features

In [None]:
def dependency_features(target):
    """
    Placeholder for the dependency features; not implemented yet.

    Intended to produce features for the relation with the parent, the relation
    with any dependants, and versions of these that include the head and
    dependant token. For now the argument is ignored.

    :param target: The target verb (currently unused)
    :return: always 0
    """
    return 0

### External Knowledge Features

In [None]:
def external_features(target):
    """
    Placeholder for the external-knowledge features; not implemented yet.

    Intended to produce position-indexed features for whether any of the tokens
    in the sentence match a known role, organisation, or title. For now the
    argument is ignored.

    :param target: presumably the target verb, as in the sibling feature
        functions (currently unused) - TODO confirm, the description above
        talks about the whole sentence
    :return: always 0
    """
    return 0

### Other Features

In [None]:
def other_features(target):
    """
    Placeholder for the remaining features; not implemented yet.

    Intended to produce features for whether the target is within quotation
    marks, and for whether there is a verb-cue near the end of the sentence.
    For now the argument is ignored.

    :param target: The target verb (currently unused)
    :return: always 0
    """
    return 0

## Extracting a word vector

In [19]:
# Inspect the word vector of a token, using the first token of the parsed
# sample sentence.
example_token = s[0]
print(example_token.text)
# Only the first six components are shown to keep the output short...
print(example_token.vector[:6])
# ...followed by the total number of components of the vector.
print(len(example_token.vector))

Mais
[ 0.277226  0.600244 -0.540657 -2.277228 -0.145105 -0.707782]
300
