# Analyse de texte

Ce notebook va nous servir à illustrer le domaine de la compréhension de texte par les algorithmes. 

Comment mesure-t-on une compréhension de texte ? En faisant lire un texte à notre programme et en lui posant des questions par la suite.

Le jeu de données utilisé dans cet exemple est le bAbI dataset (https://research.fb.com/downloads/babi/), qui contient des textes simples en anglais.

Ce notebook reprend en grande partie le script python : https://github.com/keras-team/keras/blob/master/examples/babi_memnn.py

## Données disponibles

In [None]:
# Root directory of the text datasets. It is joined to relative paths by plain
# string concatenation when the challenge paths are built, so the trailing
# slash is required.
data_dir = 'snowcamp/datasets/text/'

In [None]:
ls snowcamp/datasets/text/tasks_1-20_v1-2/en/

## Single supporting facts

In [None]:
more snowcamp/datasets/text/tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt

## Two supporting facts

In [None]:
more snowcamp/datasets/text/tasks_1-20_v1-2/en/qa2_two-supporting-facts_train.txt

## Compound coreference

In [None]:
more snowcamp/datasets/text/tasks_1-20_v1-2/en/qa13_compound-coreference_train.txt

##  Motivations

In [None]:
more snowcamp/datasets/text/tasks_1-20_v1-2/en/qa20_agents-motivations_train.txt

## Imports

In [None]:
from __future__ import print_function
from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Activation, Dense, Permute, Dropout, add, dot, concatenate
from keras.layers import LSTM
from keras.utils.data_utils import get_file
from keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tarfile
import numpy as np
import re

In [None]:

def tokenize(sent):
    '''Return the tokens of a sentence including punctuation.

    The sentence is split on runs of non-word characters. The capturing
    group keeps those runs in the result so punctuation survives, and every
    piece is stripped so that whitespace-only pieces are dropped.

    >>> tokenize('Bob dropped the apple. Where is the apple?')
    ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
    '''
    # The separator group must not be optional: an optional group lets the
    # pattern match the empty string, and on Python 3.7+ re.split then splits
    # at every position and yields None for the unmatched group, which has no
    # .strip(). A raw string also avoids the invalid-escape warning.
    pieces = (piece.strip() for piece in re.split(r'(\W+)', sent))
    return [piece for piece in pieces if piece]

In [None]:
tokenize('Bienvenue au snowcamp!')

### Fonctions de lecture du dataset

In [None]:
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format.

    Every non-blank line starts with a numeric id followed by a space; an id
    of 1 starts a new story. A line containing tabs is a question of the form
    ``question<TAB>answer<TAB>supporting ids``; any other line is a statement
    of the current story. Blank lines are ignored.

    lines: iterable of text lines.
    only_supporting: if true, only the sentences that support the answer
        are kept in each substory; otherwise every statement seen so far in
        the current story is kept.

    Returns a list of ``(substory, question, answer)`` tuples, where
    substory is a list of tokenized sentences, question is a list of tokens
    and answer is the raw answer string.
    '''
    data = []
    story = []
    for line in lines:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no id and would make the split below fail to unpack.
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1:
            story = []
        if '\t' not in line:
            # Plain statement: remember it as part of the running story.
            story.append(tokenize(line))
            continue
        q, a, supporting = line.split('\t')
        q = tokenize(q)
        if only_supporting:
            # Only select the related substory; the ids are 1-based
            # positions within the current story.
            substory = [story[int(i) - 1] for i in supporting.split()]
        else:
            # Provide all the substories; question lines were stored as ''
            # placeholders and are filtered out here.
            substory = [x for x in story if x]
        data.append((substory, q, a))
        # Placeholder so later supporting ids still line up with positions.
        story.append('')
    return data

In [None]:
from functools import reduce

def get_stories(f, only_supporting=False, max_length=None):
    '''Read a bAbI task file and return one flat token list per question.

    f: open file-like object; its lines are handed to parse_stories.
    only_supporting: forwarded unchanged to parse_stories.
    max_length: if supplied (and non-zero), any story whose flattened
        length is not strictly below max_length tokens is discarded.

    Returns a list of ``(story_tokens, question, answer)`` tuples, where
    story_tokens is the concatenation of the story's tokenized sentences.
    '''
    parsed = parse_stories(f.readlines(), only_supporting=only_supporting)
    stories = []
    for substory, q, answer in parsed:
        # Flatten once per story. Unlike reduce() without an initial value,
        # this also copes with an empty substory (a question that has no
        # preceding statement) by producing an empty token list.
        tokens = [token for sentence in substory for token in sentence]
        if not max_length or len(tokens) < max_length:
            stories.append((tokens, q, answer))
    return stories

In [None]:
def vectorize_stories(data):
    '''Convert (story, query, answer) triples into index arrays.

    Every word is looked up in the module-level ``word_idx`` mapping, so a
    word missing from it raises KeyError. Stories and queries are passed
    through ``pad_sequences`` with ``maxlen=story_maxlen`` and
    ``maxlen=query_maxlen`` respectively; answers become one NumPy array of
    word indices.

    Returns the tuple ``(stories, queries, answers)``.
    '''
    story_rows = []
    query_rows = []
    answer_ids = []
    for story_tokens, query_tokens, answer_word in data:
        story_rows.append([word_idx[token] for token in story_tokens])
        query_rows.append([word_idx[token] for token in query_tokens])
        answer_ids.append(word_idx[answer_word])
    padded_stories = pad_sequences(story_rows, maxlen=story_maxlen)
    padded_queries = pad_sequences(query_rows, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

In [None]:
# Path templates of the available bAbI tasks; '{}' is replaced by 'train' or
# 'test' when the files are opened below.
challenges = {
    # QA1 with 10,000 samples
    'single_supporting_fact_10k': data_dir + 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    # QA2 with 10,000 samples
    'two_supporting_facts_10k': data_dir + 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    # QA20 (agents' motivations)
    'motivations' : data_dir + 'tasks_1-20_v1-2/en-10k/qa20_agents-motivations_{}.txt',
    # QA13 (compound coreference); the key spelling is kept as-is because it
    # is the identifier used to select this task.
    'copound'  : data_dir + 'tasks_1-20_v1-2/en-10k/qa13_compound-coreference_{}.txt'
}

# Tasks whose stories are merged into a single train set and a single test set.
challenge_types = ['single_supporting_fact_10k', 'motivations', 'copound']
train_stories = []
test_stories = []

for challenge_type in challenge_types:
    challenge = challenges[challenge_type]

    print('Extracting stories for the challenge:', challenge_type)
    # Use with-blocks so each file handle is closed as soon as it is read,
    # instead of leaking one open file per task and split.
    with open(challenge.format('train')) as train_file:
        train_stories = train_stories + get_stories(train_file)
    with open(challenge.format('test')) as test_file:
        test_stories = test_stories + get_stories(test_file)

In [None]:
# Concatenate the two splits once; every statistic below is computed over
# both of them.
_all_stories = train_stories + test_stories
# Sorted list of every distinct token found in a story, a question or an answer.
vocab = sorted({word
                for story, q, answer in _all_stories
                for word in story + q + [answer]})
# Reserve 0 for masking via pad_sequences
vocab_size = len(vocab) + 1
# Longest story and longest question, in tokens.
story_maxlen = max(len(story) for story, _, _ in _all_stories)
query_maxlen = max(len(q) for _, q, _ in _all_stories)

## Vocabulaire rencontré

In [None]:
vocab

## Nombre de mots

In [None]:
# Print a summary of the loaded data: vocabulary size, longest story and
# question (in tokens), number of train/test examples, and the first training
# tuple as a sample.
print('-')
print('Vocab size:', vocab_size, 'unique words')
print('Story max length:', story_maxlen, 'words')
print('Query max length:', query_maxlen, 'words')
print('Number of training stories:', len(train_stories))
print('Number of test stories:', len(test_stories))
print('-')
print('Here\'s what a "story" tuple looks like (input, query, answer):')
print(train_stories[0])
print('-')


## Vectorisation des suites de mots (des phrases...)

In [None]:
import numpy as np


# Word -> index table. Indices start at 1, so 0 never maps to a word.
word_idx = {word: position for position, word in enumerate(vocab, start=1)}
# Index -> word table, used to turn class indices back into words.
word_idx_reverse = dict((position, word) for word, position in word_idx.items())
# Turn both splits into arrays of indices.
(inputs_train, queries_train, answers_train) = vectorize_stories(train_stories)
(inputs_test, queries_test, answers_test) = vectorize_stories(test_stories)

In [None]:
# Print the shapes of the vectorized train/test arrays.
print('-')
print('inputs: integer tensor of shape (samples, max_length)')
print('inputs_train shape:', inputs_train.shape)
print('inputs_test shape:', inputs_test.shape)
print('-')
print('queries: integer tensor of shape (samples, max_length)')
print('queries_train shape:', queries_train.shape)
print('queries_test shape:', queries_test.shape)
print('-')
# vectorize_stories returns one word index per example (not a one-hot
# matrix), so the description below states that.
print('answers: integer tensor of shape (samples,) holding vocabulary indices')
print('answers_train shape:', answers_train.shape)
print('answers_test shape:', answers_test.shape)
print('-')

### Création du modèle

In [None]:
def create_model():
    """Build the story/question network and return it uncompiled.

    Reads the module-level ``story_maxlen``, ``query_maxlen`` and
    ``vocab_size``. The returned ``Model`` takes two inputs, in this order:
    the story index sequence and the question index sequence. Its output is
    a softmax over ``vocab_size`` classes. No ``compile`` call is made here.
    """
    input_sequence = Input((story_maxlen,))
    question = Input((query_maxlen,))
    
    # encoders
    # embed the input sequence into a sequence of vectors
    input_encoder_m = Sequential()
    input_encoder_m.add(Embedding(input_dim=vocab_size,
                                  output_dim=64))
    input_encoder_m.add(Dropout(0.3))
    # output: (samples, story_maxlen, embedding_dim)

    # embed the input into a sequence of vectors of size query_maxlen;
    # this width is what lets the result be added to `match` further down
    input_encoder_c = Sequential()
    input_encoder_c.add(Embedding(input_dim=vocab_size,
                                  output_dim=query_maxlen))
    input_encoder_c.add(Dropout(0.3))
    # output: (samples, story_maxlen, query_maxlen)

    # embed the question into a sequence of vectors
    question_encoder = Sequential()
    question_encoder.add(Embedding(input_dim=vocab_size,
                                   output_dim=64,
                                   input_length=query_maxlen))
    question_encoder.add(Dropout(0.3))
    # output: (samples, query_maxlen, embedding_dim)

    # encode input sequence and questions (which are indices)
    # to sequences of dense vectors
    input_encoded_m = input_encoder_m(input_sequence)
    input_encoded_c = input_encoder_c(input_sequence)
    question_encoded = question_encoder(question)

    # compute a 'match' between the first input vector sequence
    # and the question vector sequence (dot product over the embedding axis)
    # shape: `(samples, story_maxlen, query_maxlen)`
    match = dot([input_encoded_m, question_encoded], axes=(2, 2))
    match = Activation('softmax')(match)

    # add the match matrix with the second input vector sequence
    response = add([match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
    response = Permute((2, 1))(response)  # (samples, query_maxlen, story_maxlen)

    # concatenate the response with the question vector sequence
    # (concatenate is called with its default axis)
    answer = concatenate([response, question_encoded])

    # the original paper uses a matrix multiplication for this reduction step.
    # we choose to use a RNN instead.
    answer = LSTM(32)(answer)  # (samples, 32)

    # one regularization layer -- more would probably be needed.
    answer = Dropout(0.3)(answer)
    answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
    # we output a probability distribution over the vocabulary
    answer = Activation('softmax')(answer)

    # build the final model
    model = Model([input_sequence, question], answer)
    return model

In [None]:
model = create_model()

### Compilation

In [None]:
# The answer targets are arrays of integer word indices rather than one-hot
# rows, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

### Train !

In [None]:
# train
model.fit([inputs_train, queries_train], answers_train,
          batch_size=32,
          epochs=1,
validation_data=([inputs_test, queries_test], answers_test))

In [None]:
# Emergency cleanup cell (the note below reads "in case things get stuck"):
# closes an object named `session` when one exists and is not None, and does
# nothing otherwise.
# NOTE(review): `session` is not created anywhere in the visible part of this
# notebook; presumably a backend interactive session -- confirm.
"""
    En cas de blocage ... 
"""
if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()

### Effectuons quelques prédictions

In [None]:
# Index of the test example to inspect.
num_test_story = 1000

# Each entry of test_stories is a (story tokens, question tokens, answer)
# tuple; keep the first two together as the model input and the third as the
# expected answer.
story = [test_stories[num_test_story][0], test_stories[num_test_story][1]]
answer = test_stories[num_test_story][2]

print("Histoire = " + str(story[0]))
print("Question = " + str(story[1]))
print("Réponse attendue = " + str(answer))

In [None]:
def vectorize_user_stories(data):
    """
        Vectorize (story, question) pairs.

        Every word is looked up in the module-level ``word_idx`` mapping, so
        a word missing from it raises KeyError. Both index lists are passed
        through ``pad_sequences``, with ``maxlen=story_maxlen`` for stories
        and ``maxlen=query_maxlen`` for questions.

        Returns the tuple ``(stories, questions)``.
    """
    story_rows = []
    question_rows = []
    for story_tokens, question_tokens in data:
        story_rows.append([word_idx[token] for token in story_tokens])
        question_rows.append([word_idx[token] for token in question_tokens])
    padded_stories = pad_sequences(story_rows, maxlen=story_maxlen)
    padded_questions = pad_sequences(question_rows, maxlen=query_maxlen)
    return (padded_stories, padded_questions)

In [None]:
# vectorize_user_stories expects a list of (story, question) pairs; pass the
# single pair selected above.
vector_story, vector_question = vectorize_user_stories([story])

In [None]:
print("Vector story = " + str(vector_story))
print("Vector question = " + str(vector_question))

In [None]:
# The model takes the story vector first and the question vector second, the
# same order used when it was trained.
result_probas = model.predict([vector_story, vector_question])

In [None]:
print("Result = " + str(result_probas))

In [None]:
result_classes = result_probas.argmax(axis=-1)

In [None]:
print("Result classes = " + str(result_classes))

In [None]:
def get_class_labels(_class):
    """Translate class indices back into vocabulary words.

    _class: iterable of indices that are keys of ``word_idx_reverse``.

    Returns the list of matching words, in the same order. An index that is
    not a key of ``word_idx_reverse`` raises KeyError.
    """
    labels = []
    for class_index in _class:
        labels.append(word_idx_reverse[class_index])
    return labels

In [None]:
get_class_labels(result_classes)

In [None]:
def predict_answer_json(story_idx):
    """
        Build a dictionary with the requested story, its question, the
        expected answer and the answer predicted by the model.

        story_idx: index into the module-level ``test_stories`` list.

        Returns a dict with the keys "story" and "question" (token lists),
        "answer" (expected answer word) and "predicted_answer" (word with the
        highest predicted probability).
    """
    story = [test_stories[story_idx][0], test_stories[story_idx][1]]
    answer = test_stories[story_idx][2]
    vector_story, vector_question = vectorize_user_stories([story])
    result_probas = model.predict([vector_story, vector_question])
    result_classes = result_probas.argmax(axis=-1)
    labels = get_class_labels(result_classes)
    return {
        "story": story[0],
        "question": story[1],
        "answer": answer,
        # A single story was submitted, so there is exactly one label.
        "predicted_answer": labels[0]
    }

In [None]:
predict_answer_json(1000)

### API (avec flask)

In [None]:
import flask

app = flask.Flask(__name__)

In [None]:
@app.route('/get/answer', methods=['GET'])
def get_answer():
    """Return the prediction for one test story as JSON.

    Query parameter ``question_idx``: integer index into ``test_stories``.
    Responds with HTTP 400 when the parameter is missing or not an integer,
    and with HTTP 404 when the index is outside the test set.
    """
    # With type=int, args.get returns None when the parameter is absent or
    # cannot be converted, instead of letting int() raise and produce a 500.
    question_idx = flask.request.args.get('question_idx', type=int)
    if question_idx is None:
        return flask.jsonify(
            {'error': "query parameter 'question_idx' must be an integer"}), 400

    story_count = len(test_stories)
    # Accept the same range plain list indexing accepts (negative values
    # count from the end), and reject anything that would raise IndexError.
    if not -story_count <= question_idx < story_count:
        return flask.jsonify({'error': 'question_idx is out of range'}), 404

    return flask.jsonify(predict_answer_json(question_idx))

In [None]:
app.run()