# Extraction of features for quote detection
## Imports

In [4]:
import numpy as np
import spacy

# Load the spaCy pipeline "fr_core_news_md" once; `nlp` is called below to
# parse the example sentence into tokens.
nlp = spacy.load("fr_core_news_md")

## Constants
### SpaCy POS tags

In [16]:
# Coarse-grained POS tag names. The position of a tag in this list is used as
# its integer feature value (via POS_TAGS.index), so the order must not change.
POS_TAGS = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET',
    'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT',
    'SCONJ', 'SYM', 'VERB', 'X', 'SPACE',
]

# Named-entity label string -- presumably the label for persons; it is not
# used in the visible code (TODO confirm intended use).
NE_PER = 'PER'

# Example sentence: text inside guillemets followed by "observe <name>".
sentence = (
    'Mais «les nuits de pleine lune, les plus claires d’entre elles resplendissent '
    'comme des soleils», observe Alexandre Roulin, de l’Université de Lausanne.'
)

# Parse the sentence with the loaded spaCy pipeline.
s = nlp(sentence)

# One entry per token, in token order. Attributes with a trailing underscore
# are the string forms; those without are the integer IDs.
pos_tags = [tok.pos_ for tok in s]
pos_tags_int = [tok.pos for tok in s]
# Position of each token's POS tag within POS_TAGS, used as the POS feature.
pos_tag_feature = [POS_TAGS.index(tok.pos_) for tok in s]
tag_tags = [tok.tag_ for tok in s]
tag_tags_int = [tok.tag for tok in s]

# Show a sample of each representation.
print(pos_tags[:10])
print(pos_tags_int[:10])
print(pos_tag_feature)
print(tag_tags[:2])
print(tag_tags_int[:2])

['CCONJ', 'VERB', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'DET', 'ADV']
[89, 100, 90, 92, 85, 84, 92, 97, 90, 86]
[5, 16, 6, 8, 1, 0, 8, 13, 6, 2, 0, 5, 1, 11, 16, 1, 6, 8, 13, 13, 16, 12, 12, 13, 1, 6, 8, 1, 12, 13]
['CCONJ___', 'VERB__Mood=Ind|Number=Plur|Person=3|Tense=Imp|VerbForm=Fin']
[8673025306212932083, 12284149216315539529]


## Feature Extraction
Given a sentence that needs to be classified as either a quote or not a quote, the sentence's central verb (called the "target") needs to be extracted.

### Lexical Features

In [3]:
def lexical_features(target):
    """
    Placeholder for the lexical feature extractor.

    Intended features: unigrams and bigrams of the token, the lemma and the
    POS tag, taken from a window of 5 tokens on each side of the target and
    indexed by their position.

    :param target: The target verb
    :return: Currently always 0 (not implemented yet)
    """
    # TODO: implement; the stub ignores `target`.
    return 0

### Sentence Features

In [None]:
def sentence_features(sentence):
    """
    Placeholder for the sentence-level feature extractor.

    Intended features: flags for whether the sentence contains a quotation
    mark, a named entity, a verb-cue or a pronoun, plus a feature for the
    length of the sentence.

    :param sentence: The sentence to extract features from
    :return: Currently always 0 (not implemented yet)
    """
    # TODO: implement; the stub ignores `sentence`.
    return 0

### Dependency Features

In [None]:
def dependency_features(target):
    """
    Placeholder for the dependency feature extractor.

    Intended features: the relation between the target and its parent, the
    relations between the target and its dependants, and variants of these
    that also include the head and dependant tokens.

    :param target: The target verb
    :return: Currently always 0 (not implemented yet)
    """
    # TODO: implement; the stub ignores `target`.
    return 0

### External Knowledge Features

In [None]:
def external_features(target):
    """
    Placeholder for the external-knowledge feature extractor.

    Intended features: position-indexed flags for whether any of the tokens
    in the sentence match a known role, organisation, or title.

    NOTE(review): the description above is sentence-level while the function
    receives ``target``; confirm which input is intended before implementing.

    :param target: The target verb
    :return: Currently always 0 (not implemented yet)
    """
    # TODO: implement; the stub ignores `target`.
    return 0

### Other Features

In [None]:
def other_features(target):
    """
    Placeholder for the remaining, miscellaneous features.

    Intended features: whether the target lies inside quotation marks, and
    whether a verb-cue occurs near the end of the sentence.

    :param target: The target verb
    :return: Currently always 0 (not implemented yet)
    """
    # TODO: implement; the stub ignores `target`.
    return 0

## Extracting a word vector

In [19]:
# Inspect the first token of the parsed example sentence: its text, the first
# six components of its word vector, and the vector's length.
example_token = s[0]
print(
    example_token.text,
    example_token.vector[:6],
    len(example_token.vector),
    sep="\n",
)

Mais
[ 0.277226  0.600244 -0.540657 -2.277228 -0.145105 -0.707782]
300
