In [1]:
import os
import numpy as np
from os import path
from urllib.request import urlretrieve
import sklearn
from sklearn.manifold import TSNE
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.tokenize import sent_tokenize
import unidecode
EPSILON = 1e-15

In [2]:
# Fetch the NLTK 'punkt' resource (presumably the data used by sent_tokenize, imported above)
nltk.download('punkt')

### I - Loading and visualizing pre-trained embeddings

$\large cosine\_similarity(w_1, w_2) = \frac{\langle w_1, w_2 \rangle}{||w_1|| \cdot ||w_2||}$

In [3]:
class PretrainedEmbeddings():
    """Pre-trained word embeddings stored as a text `.vec` file under `data/`.

    Typical usage is `download()` followed by `load()`. The lookup methods
    (`word_to_vec`, `vec_to_word`, `most_similar`) are left as exercises.
    """

    def __init__(self, language, embeddings):
        """Select the embeddings file for a language / embeddings-family pair.

        Supported pairs: ('en', 'glove'), ('en', 'ft') and ('fr', 'ft').

        Raises:
            ValueError: if no embeddings file exists for the requested pair.
        """
        self.vec_file = None
        if language == 'en':
            if embeddings == 'glove':
                self.vec_file = 'glove_100k.en.vec'
            elif embeddings == 'ft':
                self.vec_file = 'ft_300k.en.vec'
        elif language == 'fr':
            if embeddings == 'glove':
                # Fail loudly instead of returning a half-initialised object
                raise ValueError('No GloVe french embeddings!')
            elif embeddings == 'ft':
                self.vec_file = 'ft_50k.fr.vec'
        if self.vec_file is None:
            # Without this guard the URL concatenation below would fail with an
            # unrelated TypeError (str + None)
            raise ValueError(
                'Unsupported language/embeddings combination: {0!r}/{1!r}'.format(language, embeddings)
            )
        self.language = language
        self.url = "https://github.com/ECE-Deep-Learning/courses_labs/releases/download/0.1/" + self.vec_file
        self.file_location = os.path.join('data', self.vec_file)
        self.embeddings_index = None
        self.embeddings_index_inversed = None
        self.embeddings_vectors = None
        self.voc_size = None
        self.dim = None

    @staticmethod
    def _normalize(array):
        """Scale vectors to unit L2 norm along the last axis.

        Zero vectors are returned unchanged instead of becoming NaN.
        """
        norms = np.linalg.norm(array, axis=-1, keepdims=True)
        # A zero vector has no direction: divide it by 1 rather than by 0
        norms[norms == 0] = 1
        return array / norms

    def download(self):
        """Download the embeddings file to `self.file_location` if it is missing."""
        if not path.exists(self.file_location):
            target_dir = path.dirname(self.file_location)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            print('Downloading from %s to %s...' % (self.url, self.file_location))
            # Download to a temporary name first so that an interrupted transfer
            # is not mistaken for a complete file on the next call
            partial_location = self.file_location + '.part'
            urlretrieve(self.url, partial_location)
            os.replace(partial_location, self.file_location)
            print('Downloaded embeddings')

    # Note that you can choose to normalize directly the embeddings
    # to make the cosine similarity computation easier afterward
    def load(self, normalize=False):
        """Read the `.vec` file into memory.

        The first line of the file holds the vocabulary size and the embedding
        dimension; every following line is a word followed by its vector.
        Fills `embeddings_index` (word -> row), `embeddings_index_inversed`
        (row -> word) and `embeddings_vectors` (one row per word).

        Args:
            normalize: when True, rows are scaled to unit L2 norm after loading.
        """
        self.embeddings_index, self.embeddings_index_inversed = {}, {}
        self.embeddings_vectors = []
        # The context manager closes the file even if parsing fails
        with open(self.file_location, encoding='utf-8') as file:
            header = next(file)
            self.voc_size, self.dim = [int(token) for token in header.split()]
            print('Vocabulary size: {0}\nEmbeddings dimension: {1}'.format(self.voc_size, self.dim))
            print('Loading embeddings in memory...')
            for line in file:
                values = line.split()
                if not values:
                    # Skip blank lines (e.g. a trailing newline at end of file)
                    continue
                # Row index = number of vectors stored so far, so skipped lines
                # never leave holes in the index
                idx = len(self.embeddings_vectors)
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                self.embeddings_index[word] = idx
                self.embeddings_index_inversed[idx] = word
                self.embeddings_vectors.append(vector)
        self.embeddings_vectors = np.asarray(self.embeddings_vectors)
        print('Embeddings loaded')
        if normalize:
            self.embeddings_vectors = self._normalize(self.embeddings_vectors)
            print('Embeddings normalized')

    # Return an embedding vector associated to a given word
    # Be sure to handle the case where the received word is not in the embeddings' vocabulary
    def word_to_vec(self, word):
        # TODO:
        return None

    # Return the closest word associated to a given embedding vector
    # In other terms, you have to compute every cosine similarity and return the most similar word
    def vec_to_word(self, vec):
        # TODO:
        return None

    # Return the n top similar words from a given string input
    # The similarities are based on the cosine similarities between the embeddings vectors
    # Note that the string could be a full sentence composed of several words
    # Maybe you should split the sentence and average every word embedding in it
    def most_similar(self, query, top=10):
        # TODO:
        return None

    def project_and_visualize(self, sample=1000):
        """Plot a 2D t-SNE projection of the first `sample` embeddings, labelled with their words.

        `sample` is capped at the number of loaded vectors.
        """
        # Cap the sample so the annotation loop never indexes past the vocabulary
        sample = min(sample, len(self.embeddings_vectors))
        embeddings_tsne = TSNE(perplexity=30).fit_transform(self.embeddings_vectors[:sample])
        plt.figure(figsize=(40, 40))
        np.set_printoptions(suppress=True)
        plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], marker=".", s=1)
        for idx in range(sample):
            plt.annotate(
                self.embeddings_index_inversed[idx],
                xy=(embeddings_tsne[idx, 0], embeddings_tsne[idx, 1]),
                xytext=(0, 0), textcoords='offset points'
            )

In [4]:
# English GloVe embeddings: fetch the file if it is not on disk yet,
# then load it with the vectors normalized
pretrained_embeddings = PretrainedEmbeddings(language='en', embeddings='glove')
pretrained_embeddings.download()
pretrained_embeddings.load(normalize=True)

In [5]:
# t-SNE scatter plot of the first embeddings (the method's default is sample=1000)
pretrained_embeddings.project_and_visualize()

In [6]:
# Query with a multi-word string (the result depends on the most_similar exercise being implemented)
pretrained_embeddings.most_similar('french city')

### II - Language modelling

## Sampling random text from the model

The first part of language modelling is about predicting the next character of a finite sequence of characters of size $k$.

Recursively generate one character at a time:

Your model outputs the probability distribution $p_{\theta}(c_{n} | c_{n-1}, \ldots, c_{n-k})$

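Using this probability distribution, a predicted character $c_{n}$ will be sampled. The temperature parameter makes it possible to add or remove entropy in the parameterized multinoulli distribution output by the model: a temperature below $1$ makes the distribution more peaked (sampling is biased toward the most likely characters), while a temperature above $1$ flattens it.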

Then use your prediction $c_{n}$ to compute $c_{n+1}$. Your model outputs:
$p_{\theta}(c_{n+1} | $<span style="color:red">$c_{n}$</span>$, \ldots, c_{n-k+1})$

In [7]:
class LanguageModel():
    """Base class holding a text corpus read from the `data` directory."""

    def __init__(self):
        self.corpus_path = None
        self.corpus = None

    def load_data(self, corpus_path):
        """Read `data/<corpus_path>` into `self.corpus`.

        The text is lower-cased, newlines are replaced by spaces and the result
        is passed through `unidecode.unidecode`.
        """
        self.corpus_path = os.path.join('data', corpus_path)
        # The context manager closes the file even if reading or transliteration fails
        with open(self.corpus_path) as file:
            self.corpus = unidecode.unidecode(file.read().lower().replace("\n", " "))
        print('Corpus length: {0} characters'.format(len(self.corpus)))

    def get_contiguous_sample(self, size):
        """Return a random contiguous slice of `size` characters from the corpus.

        Raises:
            ValueError: if `size` is larger than the corpus.
        """
        last_start = len(self.corpus) - size
        if last_start < 0:
            raise ValueError(
                'Requested sample size {0} exceeds corpus length {1}'.format(size, len(self.corpus))
            )
        # randint's upper bound is exclusive: valid start offsets are 0..last_start,
        # which includes the very beginning of the corpus and allows size == len(corpus)
        index = np.random.randint(0, last_start + 1)
        return self.corpus[index:index+size]

In [8]:
# Number of characters shown in the preview below
sample_size = 500

# Load the corpus from data/rousseau.txt and print one random contiguous excerpt
language_model = LanguageModel()
language_model.load_data('rousseau.txt')
print('Sample of {0} characters:\n{1}'.format(
    sample_size, language_model.get_contiguous_sample(sample_size)
))

### II - a) Character-based language modelling

## Measuring per-character perplexity

To measure the quality of a language model we usually use the perplexity.
(https://en.wikipedia.org/wiki/Perplexity)

Here is how it is defined:

$$perplexity_\theta = 2^{-\frac{1}{n} \sum_{i=1}^{n} log_2 (p_\theta(c_i)^T\cdot y_i)}$$
$p_\theta(c_i)$ is your predicted column vector of probabilities over the possible next characters for the $i^{th}$ sequence.
$y_i$ is the one-hot encoding vector of the answer: the next character of the $i^{th}$ sequence.

You just compute the average negative loglikelihood like you have done previously, only using a log2 logarithm. Then just perform a base $2$ exponentiation of the quantity just computed.

In [9]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam

class CharLanguageModel(LanguageModel):
    """Character-level language model: predicts the next character of a sequence.

    The instance is filled in stages: `extract_vocabulary()` builds the
    character index, `build_dataset()` creates the training matrices and
    `train()` is expected to store a fitted Keras model in `self.model`.
    """

    def __init__(self):
        # Zero-argument super() runs LanguageModel.__init__, so `corpus` and
        # `corpus_path` exist before load_data() is called.
        # super(LanguageModel, self) would skip that initializer.
        super().__init__()
        self.char_index = None
        self.char_index_inversed = None
        self.vocabulary_size = None
        self.max_length_sequence = None
        self.X = None
        self.y = None
        self.model = None

    def extract_vocabulary(self):
        """Index every distinct character of the corpus, in sorted order."""
        chars = sorted(set(self.corpus))
        self.char_index = dict((c, i) for i, c in enumerate(chars))
        self.char_index_inversed = dict((i, c) for i, c in enumerate(chars))
        self.vocabulary_size = len(self.char_index)
        print('Vocabulary size: {0}'.format(self.vocabulary_size))

    def plot_vocabulary_distribution(self):
        """Draw a bar chart of character counts, most frequent first."""
        counter = Counter(self.corpus)
        chars, counts = zip(*counter.most_common())
        indices = np.arange(len(counts))
        plt.figure(figsize=(16, 5))
        plt.bar(indices, counts, 0.8)
        plt.xticks(indices, chars)

    def _one_hot_encoding(self, X, y):
        """Convert X and y into one-hot encoded matrices.

        Important note: if a sequence is shorter than max_length_sequence,
        the input is padded with zero vectors at the beginning of the
        one-hot encoded matrix.

        Returns:
            (X_one_hot, y_one_hot) with shapes
            (len(X), max_length_sequence, vocabulary_size) and (len(X), vocabulary_size).
        """
        X_one_hot = np.zeros(
            (len(X), self.max_length_sequence, self.vocabulary_size),
            dtype=np.float32
        )
        y_one_hot = np.zeros(
            (len(X), self.vocabulary_size),
            dtype=np.float32
        )
        # TODO:

        return X_one_hot, y_one_hot

    def build_dataset(self,
                      max_length_sequence=40, min_length_sentence=5, max_length_sentence=200,
                      step=3):
        """Create the matrices X and y.

        Sentences of the corpus whose length lies in
        [min_length_sentence, max_length_sentence] are cut into windows of at most
        max_length_sequence characters, taken every `step` characters; the
        character following each window is its target.
        """
        self.X, self.y = [], []

        sentences = sent_tokenize(self.corpus)
        sentences = filter(
            lambda x: len(x) >= min_length_sentence and len(x) <= max_length_sentence,
            sentences
        )
        for sentence in sentences:
            # max(..., 1) guarantees one window even for sentences shorter than
            # max_length_sequence
            for i in range(0, max(len(sentence) - max_length_sequence, 1), step):
                # For short sentences the window stops one character before the
                # end so that a target character remains
                last_index = min(i+max_length_sequence, i+len(sentence)-1)
                self.X.append(sentence[i:last_index])
                self.y.append(sentence[last_index])

        self.max_length_sequence = max_length_sequence
        self.X, self.y = sklearn.utils.shuffle(self.X, self.y)
        print('Number of training sequences: {0}'.format(len(self.X)))
        self.X, self.y = self._one_hot_encoding(self.X, self.y)
        print('X shape: {0}\ny shape: {1}'.format(self.X.shape, self.y.shape))

    def train(self, hidden_size=128, batch_size=128, epochs=10):
        """Define, compile, and fit a Keras model on (self.X, self.y).

        It should be composed of:
            - one recurrent LSTM layer projecting into hidden_size dimensions
            - one Dense layer with a softmax activation projecting into vocabulary_size dimensions
        """
        self.model = None
        # TODO:

    def predict(self, X, verbose=1, preprocessed=True):
        """Return the model prediction (next-token distribution) for input sequences.

        If preprocessed is False, X is treated as an array of strings that must be
        converted to a one-hot encoded matrix first.
        Important note: if a sequence is shorter than max_length_sequence,
        the input is padded with zero vectors at the beginning of the
        one-hot encoded matrix.

        If preprocessed is True, the model is applied to X as it is.
        """
        if not preprocessed:
            X_one_hot = np.zeros(
                (len(X), self.max_length_sequence, self.vocabulary_size), dtype=np.float32
            )
            # TODO:
        else:
            X_one_hot = X
        return self.model.predict(X_one_hot, verbose=verbose)

    def perplexity(self, y_true, y_pred):
        """Perplexity metric used to assess the performance of the model.

        The element-wise product with `y_true` followed by a sum over axis 1 keeps,
        for each row, the predicted probability of the true character.
        """
        likelihoods = np.sum(y_pred * y_true, axis=1)
        # EPSILON keeps log2 finite when a likelihood is exactly 0
        return 2 ** -np.mean(np.log2(likelihoods + EPSILON))

    def _sample_next_char(self, preds, temperature=1.0):
        """Sample the index of the next character according to the predictions.

        Use a lower temperature to force the model to output more
        confident predictions: more peaky distribution.
        """
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds + EPSILON) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds + EPSILON)
        probs = np.random.multinomial(1, preds, size=1)
        return np.argmax(probs)

    def generate_text(self, seed_string, length=300, temperature=1.0):
        """Extend `seed_string` by `length` sampled characters.

        Returns None (after printing a message) when no model has been trained.
        """
        if self.model is None:
            print('The language model has not been trained yet!')
            return None
        generated = seed_string
        prefix = seed_string
        for i in range(length):
            predictions = np.ravel(self.predict([prefix], verbose=0, preprocessed=False))
            next_index = self._sample_next_char(predictions, temperature)
            next_char = self.char_index_inversed[next_index]
            generated += next_char
            # Slide the context window: drop the oldest character, append the new one
            prefix = prefix[1:] + next_char
        return generated

In [10]:
# Rebind language_model to a character-level model and load the same corpus
language_model = CharLanguageModel()
language_model.load_data('rousseau.txt')

In [11]:
# Build the character index, then display the character -> index mapping as the cell output
language_model.extract_vocabulary()
language_model.char_index

In [12]:
# Bar chart of character frequencies in the corpus
language_model.plot_vocabulary_distribution()

In [13]:
# Build the training matrices X and y with the method's default window settings
language_model.build_dataset()

In [14]:
# Train, then report the perplexity measured on the training data itself
epochs = 5
language_model.train(epochs=epochs)
# model stays None until the train() exercise is implemented, hence the guard
if language_model.model is not None:
    print('Perplexity after {0} epochs: {1}'.format(
        epochs, language_model.perplexity(language_model.y, language_model.model.predict(language_model.X))
    ))

In [15]:
# Generate text from a seed sentence with a low sampling temperature
language_model.generate_text("l'etat n'est pas au-dessus de la loi", temperature=0.25)

In [16]:
# Same generation with a shorter seed
language_model.generate_text("la republique", temperature=0.25)

### II - b) Word-based language modelling

In [17]:
from spacy.lang.fr import French
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import Adam

class WordLanguageModel(LanguageModel):
    """Word-level language model fed with pre-trained French fastText embeddings.

    Creating an instance downloads (if needed) and loads the embeddings.
    `extract_vocabulary()`, `build_dataset()` and `train()` are then expected
    to be called in that order.
    """

    def __init__(self):
        # Zero-argument super() runs LanguageModel.__init__, so `corpus` and
        # `corpus_path` exist before load_data() is called.
        # super(LanguageModel, self) would skip that initializer.
        super().__init__()
        self.pretrained_embeddings = PretrainedEmbeddings(language='fr', embeddings='ft')
        self.pretrained_embeddings.download()
        self.pretrained_embeddings.load()
        self.parser = None
        self.word_index = None
        self.word_index_inversed = None
        self.vocabulary_size = None
        self.max_length_sequence = None
        self.tokens = None
        self.X = None
        self.y = None
        self.model = None

    def extract_vocabulary(self, max_vocabulary=1500000):
        """Tokenize the corpus and index its distinct alphabetic tokens.

        Args:
            max_vocabulary: forwarded to the spaCy `French` parser as `max_length`.
        """
        self.parser = French(max_length=max_vocabulary)
        self.tokens = [token.orth_ for token in self.parser(self.corpus) if token.is_alpha]
        # Sort the distinct tokens so that word indices are reproducible across
        # runs (set iteration order depends on string hashing)
        unique_tokens = sorted(set(self.tokens))
        self.word_index = dict((w, i) for i, w in enumerate(unique_tokens))
        self.word_index_inversed = dict((i, w) for i, w in enumerate(unique_tokens))
        self.vocabulary_size = len(self.word_index)
        print('Vocabulary size: {0}'.format(self.vocabulary_size))

    def _token_embedding(self, X, y):
        """Convert X into an embedded matrix and y into a one-hot matrix.

        Hint: use the self.pretrained_embeddings.word_to_vec method for each token found.

        Important note: if a sequence is shorter than max_length_sequence,
        the input is padded with zero vectors at the beginning of the embedded matrix.

        Returns:
            (X_embedding, y_one_hot) with shapes
            (len(X), max_length_sequence, embeddings dim) and (len(X), vocabulary_size).
        """
        X_embedding = np.zeros(
            (len(X), self.max_length_sequence, self.pretrained_embeddings.dim),
            dtype=np.float32
        )
        y_one_hot = np.zeros(
            (len(X), self.vocabulary_size),
            dtype=np.float32
        )
        # TODO:

        return X_embedding, y_one_hot

    def build_dataset(self, max_length_sequence=40, step=3):
        """Create the matrices X and y from sliding windows over the token list.

        Each window holds max_length_sequence consecutive tokens, taken every
        `step` tokens; the token following the window is its target.
        """
        self.X, self.y = [], []
        for i in range(0, len(self.tokens) - max_length_sequence, step):
            self.X.append(self.tokens[i:i+max_length_sequence])
            self.y.append(self.tokens[i+max_length_sequence])
        self.max_length_sequence = max_length_sequence
        self.X, self.y = sklearn.utils.shuffle(self.X, self.y)
        print('Number of training sequences: {0}'.format(len(self.X)))
        self.X, self.y = self._token_embedding(self.X, self.y)
        print('X shape: {0}\ny shape: {1}'.format(self.X.shape, self.y.shape))

    def train(self, hidden_size=128, batch_size=128, epochs=10):
        """Define, compile, and fit a Keras model on (self.X, self.y).

        It should be composed of:
            - one or many recurrent LSTM layers projecting into hidden_size dimensions
            - one Dense layer with a relu activation projecting into hidden_size dimensions
            - one Dense layer with a softmax activation projecting into vocabulary_size dimensions
        """
        self.model = None
        # TODO:

    def predict(self, X, verbose=1, preprocessed=True):
        """Return the model prediction (next-token distribution) for input sequences.

        If preprocessed is False, X is treated as an array of token lists that must
        be converted to an embedded matrix first.
        Important note: if a sequence is shorter than max_length_sequence,
        the input is padded with zero vectors at the beginning of the embedded matrix.

        If preprocessed is True, the model is applied to X as it is.
        """
        if not preprocessed:
            X_embedding = np.zeros(
                (len(X), self.max_length_sequence, self.pretrained_embeddings.dim),
                dtype=np.float32
            )
            # TODO:
        else:
            X_embedding = X
        return self.model.predict(X_embedding, verbose=verbose)

    def _sample_next_word(self, preds, temperature=1.0):
        """Sample the index of the next word according to the predictions.

        Use a lower temperature to force the model to output more
        confident predictions: more peaky distribution.
        """
        preds = np.asarray(preds).astype('float64')
        preds = np.log(preds + EPSILON) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds + EPSILON)
        probs = np.random.multinomial(1, preds, size=1)
        return np.argmax(probs)

    def generate_text(self, seed_string, length=50, temperature=1.0):
        """Extend `seed_string` by `length` sampled words, joined with spaces.

        Returns None (after printing a message) when no model has been trained.
        """
        if self.model is None:
            print('The language model has not been trained yet!')
            return None
        seed_tokens = [token.orth_ for token in self.parser(seed_string) if token.is_alpha]
        prefix = seed_tokens
        # Work on a copy: `generated` grows in place, and sharing one list with
        # `prefix` would insert the first sampled word into the context twice
        generated = list(seed_tokens)
        for _ in range(length):
            predictions = np.ravel(self.predict([prefix], verbose=0, preprocessed=False))
            # Forward the caller's temperature instead of always sampling at 1.0
            next_index = self._sample_next_word(predictions, temperature)
            next_word = self.word_index_inversed[next_index]
            generated.append(next_word)
            # Slide the context window: drop the oldest word, append the new one
            prefix = prefix[1:] + [next_word]
        return " ".join(generated)

In [18]:
# Rebind language_model to a word-level model (its constructor downloads and loads
# the French fastText embeddings), then load the corpus
language_model = WordLanguageModel()
language_model.load_data('rousseau.txt')

In [19]:
# Tokenize the corpus and index its distinct alphabetic tokens
language_model.extract_vocabulary()

In [20]:
# Build the training matrices X and y from sliding windows of tokens
language_model.build_dataset()

In [21]:
# Train the word-level model for a small number of epochs
epochs = 5
language_model.train(epochs=epochs)

In [22]:
# Generate words from a French seed sentence
language_model.generate_text("un état ne saurait réussir à", temperature=0.5)

In [23]:
# Same generation with another seed sentence
language_model.generate_text("la république ne doit pas", temperature=1)

### III - Supervised text classification

In [24]:
from sklearn.datasets import fetch_20newsgroups
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

In this third part, we will:
- Train a simple network to learn embeddings on a classification task
- Use pre-trained embeddings like GloVe or FastText and see the difference
- Train a recurrent neural network to handle the text structure

However keep in mind:
- We are here to learn deep learning methods for NLP tasks, but simple sparse TF-IDF bigram features fed to a Logistic Regression, without any embedding, are often competitive on small to medium datasets for text classification.

#### 20 Newsgroups Dataset

The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

### III - a) Load, handle, and preprocess the data

In [25]:
# Fetch the train and test subsets of the 20 Newsgroups dataset through scikit-learn
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

In [26]:
# Inspect the returned object: its type, then each key it exposes when iterated
print("sklearn object type : {}".format(type(newsgroups_train)))
print("sklearn object keys :")
for k in newsgroups_train:
    print(k)

In [27]:
# Class names, one per line, followed by the integer-encoded targets of the training set
print("Classes to predict : {}".format(os.linesep.join(newsgroups_train['target_names'])))
print()
print("Integer mapped-classes to predict :")
print(newsgroups_train['target'])

In [28]:
# Map each integer class id to its newsgroup name (ids follow the order of target_names)
class_int_str = dict(enumerate(newsgroups_train['target_names']))

In [29]:
# Display the id -> class name mapping as the cell output
class_int_str

In [30]:
# Print one randomly chosen training document together with its class name
print("Example of document in dataset:", os.linesep)
sample_idx = np.random.randint(len(newsgroups_train["data"]))
print(newsgroups_train["data"][sample_idx])
# Translate the integer target of that document into its class name
sample_idx_class = class_int_str[newsgroups_train["target"][sample_idx]]
print("Example class to predict : {}".format(sample_idx_class))

#### Preprocessing text for the (supervised) CBOW model

We will implement a simple classification model in Keras. 
Also we will have to perform preprocessing on raw text.

The following cells use Keras to preprocess text.

- Use a tokenizer: https://keras.io/preprocessing/text/#tokenizer
   - This converts the texts into sequences of integers representing the MAX_NB_WORDS most frequent words
   
- The following methods from the Tokenizer object should be useful:
   - tokenizer.fit_on_texts(corpus)
   - tokenizer.texts_to_sequences(corpus)

In [31]:
MAX_NB_WORDS = 20000  # number of different integers mapping our vocabulary

# Get the raw text data
texts_train = newsgroups_train["data"]
texts_test = newsgroups_test["data"]

# Finally, vectorize the text samples into a 2D integer tensor
# (exercise: the three placeholders below are to be replaced)
# TODO:
tokenizer = None
sequences_train = None
sequences_test = None

# Skipped while the tokenizer placeholder above is still None
if tokenizer is not None:
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

- Each text has been converted to a list of token_ids
- Each token_id represents $1$ of MAX_NB_WORDS most frequent words in train dataset

In [32]:
# Show the first raw training document
print("First raw text example: ", os.linesep, texts_train[0])

In [33]:
# Show the token ids produced for the first document and how many there are
# (skipped while sequences_train is still the None placeholder)
if sequences_train is not None:
    print("First text conversion to token_ids: ", os.linesep, sequences_train[0])
    print("First text number of token_ids: {}".format(len(sequences_train[0])))

The tokenizer object stores a mapping (vocabulary) from word strings to token ids that can be inverted to reconstruct the original message:

In [34]:
# Invert the tokenizer vocabulary: token id -> word string
if tokenizer is not None:
    word_to_index = tokenizer.word_index.items()
    index_to_word = {index: word for word, index in word_to_index}

In [35]:
# Rebuild the first document from its token ids using the inverted vocabulary
if sequences_train is not None:
    print("Original sentence retrieved :", os.linesep)
    print(" ".join([index_to_word[i] for i in sequences_train[0]]))

Let's truncate and pad all the sequences to `MAX_SEQUENCE_LENGTH` ($200$) symbols to build the training set.

Use a padding function: https://keras.io/preprocessing/sequence/#pad_sequences
   - Function actually also truncates sequences longer than max length
   - Default mode is to remove first elems of sequences longer than max length or pad with $0$s the beginning of sequences shorter than max length

In [36]:
# Target length of every padded / truncated sequence
MAX_SEQUENCE_LENGTH = 200

# Pad sequences with 0s
# Use the pad_sequences method on your sequences
# TODO:
x_train = None
x_test = None
# Skipped while the placeholders above are still None
if x_train is not None and x_test is not None:
    print('Shape of data tensor:', x_train.shape)
    print('Shape of data test tensor:', x_test.shape)

In [37]:
# Show the first training row after padding/truncating
if x_train is not None:
    print("Example of tensor after padding/truncating : ", os.linesep, x_train[0])

In [38]:
# Integer class ids for both subsets
y_train = newsgroups_train["target"]
y_test = newsgroups_test["target"]

# One-hot encode integer-mapped classes (training targets only; y_test stays as integer ids)
y_train_onehot = to_categorical(np.asarray(y_train))
print('Shape of train target tensor:', y_train_onehot.shape)

### III - b) Simple classification model

The following computes a very simple model:

<img src="../images/supervised_text_classification.png" style="width: 600px;" />

Use either Sequential or Functional Keras API:

- Embedding() Layer: build an embedding layer mapping each word to a vector representation
- GlobalAveragePooling1D() Layer: average the vector representation of all words in each sequence
- Dense Layer(): end with a dense layer with softmax to output 20 classes

In [39]:
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout
from keras.optimizers import Adam

# Hyper-parameters for the model to build; the number of classes is read from
# the width of the one-hot target matrix
EMBEDDING_DIM = 50
N_CLASSES = y_train_onehot.shape[1]

# Placeholder: the model definition is left as an exercise
model = None
# TODO:

In [40]:
# Print the architecture only once the model has been defined
if model is not None:
    model.summary()

In [41]:
# Fit on the padded token-id sequences, holding out 10% of the training data for validation
if model is not None and x_train is not None:
    model.fit(x_train, y_train_onehot, validation_split=0.1,
              epochs=150, batch_size=128)

In [42]:
# Test accuracy: fraction of documents whose arg-max predicted class equals the integer label
if model is not None and x_test is not None:
    print("test accuracy:", np.mean(model.predict(x_test).argmax(axis=-1) == y_test))

### III - c) Simple classification model with pre-trained embeddings

In [43]:
"""
Get an input tensor and replace the word->integer mapping with pretrained embeddings
Be sure that the word is existing (not a 0 padding) and is in the embeddings' vocabulary
"""

def preprocess_with_pretrained_embeddings(X, language, embeddings):
    """Allocate the embedded counterpart of the 2D tensor `X` (the lookup itself is a TODO).

    Loads the pre-trained embeddings selected by `language` / `embeddings` and
    returns a zero-filled array of shape (X.shape[0], X.shape[1], embeddings dim).
    Until the TODO is implemented the returned array contains only zeros.
    """
    pretrained_embeddings = PretrainedEmbeddings(language=language, embeddings=embeddings)
    pretrained_embeddings.download()
    pretrained_embeddings.load()
    # One embedding vector per (row, position) of X; rows left at zero act as padding
    X_embedding = np.zeros((X.shape[0], X.shape[1], pretrained_embeddings.dim))
    # TODO:
    return X_embedding

In [44]:
# Embed both padded tensors with the English GloVe vectors
# (each call loads the embeddings file again)
if x_train is not None and x_test is not None:
    x_train_embedding = preprocess_with_pretrained_embeddings(x_train, language='en', embeddings='glove')
    x_test_embedding = preprocess_with_pretrained_embeddings(x_test, language='en', embeddings='glove')
    print('Embedded training matrix shape:', x_train_embedding.shape)
    print('Embedded test matrix shape:', x_test_embedding.shape)

Use either Sequential or Functional Keras API:

- GlobalAveragePooling1D() Layer: average the vector representation of all words in each sequence
- Dense Layer(): end with a dense layer with softmax to output 20 classes

In [45]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from keras.optimizers import Adam

# Number of classes, read from the width of the one-hot target matrix
N_CLASSES = y_train_onehot.shape[1]

# Placeholder: the model definition is left as an exercise
model = None
# TODO:

In [46]:
# Print the architecture only once the model has been defined
if model is not None:
    model.summary()

In [47]:
# Fit on the pre-embedded inputs, holding out 10% of the training data for validation
if model is not None and x_train_embedding is not None:
    model.fit(x_train_embedding, y_train_onehot, validation_split=0.1, 
              epochs=200, batch_size=128)

In [48]:
# Test accuracy: fraction of documents whose arg-max predicted class equals the integer label
if model is not None and x_test_embedding is not None:
    print("test accuracy:", np.mean(model.predict(x_test_embedding).argmax(axis=-1) == y_test))

### III - d) Recurrent classification model

Now we are going to enrich the previous model with recurrent LSTM layers:

<img src="../images/rnn.png" style="width: 600px;" />

Use either Sequential or Functional Keras API:

- Embedding() Layer: build an embedding layer mapping each word to a vector representation
- MaxPooling1D() Layer: add a MaxPooling1D layer in your sequence to reduce the dimension
- LSTM() Layer: add a LSTM layer to extract information from the reduced sequence
- Dense Layer(): end with a dense layer with softmax to output 20 classes

N.B. you can either use the pretrained embeddings or recompute them with a Embedding layer here

In [49]:
from keras.models import Sequential
from keras.layers import Embedding, MaxPooling1D, LSTM, Dense
from keras.optimizers import Adam

# Hyper-parameters for the recurrent model to build
EMBEDDING_DIM = 50
N_CLASSES = y_train_onehot.shape[1]
pooling_size = 5
hidden_size = 64

# Placeholder: the model definition is left as an exercise
model = None
# TODO:

In [50]:
# Print the architecture only once the model has been defined
if model is not None:
    model.summary()

In [51]:
# Fit on the padded token-id sequences, holding out 10% of the training data for validation
if model is not None and x_train is not None:
    model.fit(x_train, y_train_onehot, validation_split=0.1, 
              epochs=25, batch_size=128)

In [52]:
# Test accuracy: fraction of documents whose arg-max predicted class equals the integer label
if model is not None and x_test is not None:
    print("test accuracy:", np.mean(model.predict(x_test).argmax(axis=-1) == y_test))