# RNN

1. Text classification
2. Language Model

In [1]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Input, Add, Concatenate,\
    Bidirectional, SimpleRNN, LSTM, GRU, TimeDistributed
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


# Rebind `stopwords` from the nltk corpus reader imported above to a plain set
# of English stop words (the import in this same cell restores the reader on re-run).
stopwords = set(stopwords.words("english"))
# Shared Porter stemmer instance; `stem` below reads it as a global.
ps = PorterStemmer()

In [2]:
def load_data(file_name):
    """
    Read a CSV file and split out its id, text and label columns.

    :param file_name: path of the CSV file to read, type: str
    return the "id", "text" and "label" columns, in that order (each a pandas Series)
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    table = pd.read_csv(file_name)
    ids, texts, labels = (table[column] for column in ("id", "text", "label"))
    return ids, texts, labels

def load_labels(file_name):
    """
    Read only the label column of a CSV file.

    :param file_name: path of the CSV file to read, type: str
    return the "label" column (a pandas Series)
    """
    table = pd.read_csv(file_name)
    return table["label"]

def write_predictions(file_name, pred):
    """
    Save predictions as a two-column CSV file with the header "id,label".

    :param file_name: destination path (or a writable text buffer), type: str
    :param pred: predicted labels, one per example, type: list
    Row i of the output holds the id i (0-based position in pred) and pred[i].
    """
    labels = list(pred)
    # Build the frame from named columns so that an empty `pred` still yields a
    # valid header-only file instead of failing when the columns are renamed.
    df = pd.DataFrame({"id": list(range(len(labels))), "label": labels})
    df.to_csv(file_name, index=False)
def tokenize(text):
    """
    Split a document into word-level tokens with nltk.

    :param text: a doc with multiple sentences, type: str
    return the tokens of the doc, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    tokens = nltk.word_tokenize(text)
    return tokens

def stem(tokens):
    """
    Stem every token with the shared Porter stemmer and lower-case the result.

    :param tokens: a list of tokens, type: list
    return the stemmed, lower-cased tokens in the same order, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    stemmed = []
    for word in tokens:
        stemmed.append(ps.stem(word).lower())
    return stemmed
def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    Build a vocabulary that maps every kept feature to an integer index.

    :param feats: an iterable of features (e.g. all tokens of the corpus), type: iterable
    :param min_freq: features that occur fewer times than this are dropped; -1 disables the check, type: int
    :param max_freq: features that occur more times than this are dropped; -1 disables the check, type: int
    :param max_size: the max size of the feature dict, counting "<pad>" and "<unk>"; values <= 0 mean no limit, type: int
    return a dict from features to indices: "<pad>" -> 0, "<unk>" -> 1, then the kept features by descending frequency
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    specials = ["<pad>", "<unk>"]
    counts = Counter(feats)  # ["text", "text", "mine"] --> {"text": 2, "mine": 1}

    no_freq_filter = (min_freq == -1 and max_freq == -1)
    if max_size > 0 and no_freq_filter:
        # only a size limit: take the most frequent features directly
        kept = [feat for feat, _ in counts.most_common(max_size - 2)]
    else:
        # walk the features from most to least frequent and apply the frequency bounds
        kept = [feat for feat, freq in counts.most_common()
                if not (min_freq != -1 and freq < min_freq)
                and not (max_freq != -1 and freq > max_freq)]

    vocab = specials + kept
    if 0 < max_size < len(vocab):
        vocab = vocab[:max_size]
    print("Size of features:", len(vocab))

    # position in the list is the index of the feature
    return {feat: idx for idx, feat in enumerate(vocab)}

def get_index_vector(feats, feats_dict, max_len):
    """
    Encode one example as a fixed-length vector of feature indices.

    :param feats: the features (tokens) of one example, type: list
    :param feats_dict: a dict from features to indices, type: dict
    :param max_len: the length of the returned vector, type: int
    return an int64 vector of length max_len: examples longer than max_len are
    truncated, shorter ones are padded with 0 (<pad>), unknown features map to 1 (<unk>)
    """
    unk_idx = 1
    indices = np.zeros(max_len, dtype=np.int64)
    # zip stops after max_len positions, which truncates long examples
    for pos, feat in zip(range(max_len), feats):
        indices[pos] = feats_dict.get(feat, unk_idx)
    return indices

In [3]:
# input / output locations, relative to the notebook's working directory
train_file = "data/train.csv"
test_file = "data/test.csv"
ans_file = "data/ans.csv"
pred_file = "data/pred.csv"
# minimum frequency passed to get_feats_dict when building the vocabulary
min_freq = 3

# load data
train_ids, train_texts, train_labels = load_data(train_file)
# the label column of the test file is discarded; test labels are read from ans_file
test_ids, test_texts, _ = load_data(test_file)
test_labels = load_labels(ans_file)

# extract features
# step 1: split every document into tokens
train_tokens = [tokenize(text) for text in train_texts]
test_tokens = [tokenize(text) for text in test_texts]

# step 2: stem and lower-case every token
train_stemmed = [stem(tokens) for tokens in train_tokens]
test_stemmed = [stem(tokens) for tokens in test_tokens]

# the stemmed tokens are used directly as the model features
train_feats = train_stemmed
test_feats = test_stemmed

# build a mapping from features to indices
# (the vocabulary is built from the training tokens only; all examples are
# flattened into one token stream for counting)
feats_dict = get_feats_dict(
    chain.from_iterable(train_feats),
    min_freq=min_freq)

Size of features: 3978


## RNN architecture

In this tutorial, we will try to use the recurrent neural network for text classification.

![RNN for Text](rnn_for_text.png)

The RNN consists of three parts: (1) the word representation part, (2) the recurrent part, and (3) the fully connected part. The word representation part is the word embedding layer; the recurrent part includes multiple (bi-directional) recurrent layers to memorize and summarize contextualized word features; the fully connected part utilizes a multi-layer perceptron to make predictions.


### Formula

Input: $[w_1, w_2, \cdots, w_n]$

Model: 
1. Embedding layer: $[e_1, e_2, \cdots, e_n]$
2. RNN -> $[h_1, h_2, \cdots, h_n]$
3. Retrieve the last hidden state $h_n$ as the output embedding for the whole sentence.

Output layer:
1. Dense layer for classification

In [4]:
def build_RNN(input_length, vocab_size, embedding_size,
              hidden_size, output_size,
              num_rnn_layers, num_mlp_layers,
              rnn_type="lstm",
              bidirectional=False,
              activation="tanh",
              dropout_rate=0.0,
              batch_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.001,
              metric="accuracy"):
    """
    Build and compile an RNN text classifier: embedding -> recurrent layers -> MLP -> softmax.

    :param input_length: the maximum length of sentences, type: int
    :param vocab_size: the vocabulary size, type: int
    :param embedding_size: the dimension of word representations, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_rnn_layers: the number of layers of the RNN, type: int
    :param num_mlp_layers: the number of layers of the MLP (the softmax output layer counts as one), type: int
    :param rnn_type: the type of RNN, one of "rnn", "lstm", "gru", type: str
    :param bidirectional: whether to use bidirectional rnn, type: bool
    :param activation: the activation type of the hidden MLP layers, type: str
    :param dropout_rate: the probability of dropout applied after each recurrent layer, type: float
    :param batch_norm: whether to enable batch normalization in the hidden MLP layers, type: bool
    :param l2_reg: the weight for the L2 regularizer of the hidden MLP layers, type: float
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, one of "SGD", "RMSprop", "Adam", type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a compiled RNN model for text classification
    raise NotImplementedError if rnn_type or optimizer is not one of the supported names
    # activation document: https://keras.io/activations/
    # dropout document: https://keras.io/layers/core/#dropout
    # embedding document: https://keras.io/layers/embeddings/#embedding
    # recurrent layers document: https://keras.io/layers/recurrent
    # batch normalization document: https://keras.io/layers/normalization/
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_length,))

    ################################
    ###### Word Representation #####
    ################################
    # word representation layer
    emb = Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    input_length=input_length,
                    embeddings_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=0))(x)

    ################################
    ####### Recurrent Layers #######
    ################################
    # recurrent layers
    # Reference: https://keras.io/api/layers/#recurrent-layers
    if rnn_type == "rnn":
        fn = SimpleRNN
    elif rnn_type == "lstm":
        fn = LSTM
    elif rnn_type == "gru":
        fn = GRU
    else:
        raise NotImplementedError("unsupported rnn_type: %r" % (rnn_type,))

    h = emb
    for i in range(num_rnn_layers):
        is_last = (i == num_rnn_layers-1)
        # return_sequences:
        # Boolean. Whether to return the full output sequence [h_1, h_2, ..., h_n]
        # or only the last output h_n. Intermediate layers must return the full
        # sequence so that the next recurrent layer has a sequence to read.
        layer = fn(hidden_size,
                   kernel_initializer=keras.initializers.glorot_uniform(seed=0),
                   recurrent_initializer=keras.initializers.Orthogonal(gain=1.0, seed=0),
                   return_sequences=not is_last)
        if bidirectional:
            layer = Bidirectional(layer)
        h = layer(h)
        h = Dropout(dropout_rate, seed=0)(h)

    ################################
    #### Fully Connected Layers ####
    ################################
    # multi-layer perceptron: num_mlp_layers-1 hidden layers plus the output layer
    for i in range(num_mlp_layers-1):
        new_h = Dense(hidden_size,
                      kernel_initializer=keras.initializers.he_normal(seed=0),
                      bias_initializer="zeros",
                      kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
        # add batch normalization layer
        if batch_norm:
            new_h = BatchNormalization()(new_h)
        # add residual connection (skipped for the first hidden layer, whose
        # input comes from the recurrent part and may have a different width)
        if i == 0:
            h = new_h
        else:
            h = Add()([h, new_h])
        # add activation
        h = Activation(activation)(h)
    y = Dense(output_size,
              activation="softmax",
              kernel_initializer=keras.initializers.he_normal(seed=0),
              bias_initializer="zeros")(h)

    # set the loss, the optimizer, and the metric
    # (the optimizer object, not its name, is passed to compile so that
    # learning_rate actually takes effect)
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    elif optimizer == "RMSprop":
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError("unsupported optimizer: %r" % (optimizer,))
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    return model

In [5]:
# number of tokens kept per example (get_index_vector truncates or zero-pads to this length)
max_len = 50

# build the feats_matrix
# convert each example to an index vector, and then stack vectors as a matrix
train_feats_matrix = np.vstack(
    [get_index_vector(f, feats_dict, max_len) for f in train_feats])
test_feats_matrix = np.vstack(
    [get_index_vector(f, feats_dict, max_len) for f in test_feats])

# convert labels to label_matrix
# NOTE(review): taking the largest training label as the number of classes
# assumes the labels are the integers 1..num_classes -- confirm against the data
num_classes = max(train_labels)
# convert each label to a one-hot vector (labels are shifted by -1 so that they
# start at 0), and then stack vectors as a matrix
train_label_matrix = keras.utils.to_categorical(train_labels-1, num_classes=num_classes)
test_label_matrix = keras.utils.to_categorical(test_labels-1, num_classes=num_classes)

In [7]:
# model hyper-parameters
embedding_size = 50
hidden_size = 100 
num_rnn_layers = 1
# with a single MLP layer build_RNN adds no hidden dense layer, only the softmax output
num_mlp_layers = 1
# directory that receives the checkpoint file written during training
os.makedirs("models", exist_ok=True)
model = build_RNN(max_len, len(feats_dict), embedding_size,
              hidden_size, num_classes,
              num_rnn_layers, num_mlp_layers,
              rnn_type="lstm",
              bidirectional=True,
              activation="tanh",
              dropout_rate=0.0,
              batch_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.001,
              metric="accuracy")

# keep only the checkpoint of the epoch with the best validation accuracy
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)
# stop training once the validation loss has not improved for 5 epochs
# (note: a different quantity than the one the checkpoint monitors)
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0)

# seed numpy and tensorflow before fitting; the layers built by build_RNN
# already carry their own fixed initializer seeds
np.random.seed(0)
tf.random.set_seed(0)
# the last 10% of the training data is held out for validation
rnn_history = model.fit(train_feats_matrix, train_label_matrix,
                    validation_split=0.1,
                    epochs=100, batch_size=100, verbose=0,
                    callbacks=[checkpointer, earlystopping])
# replace the final-epoch model by the best checkpoint saved during training
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

# evaluate returns [loss, metric]
train_score = model.evaluate(train_feats_matrix, train_label_matrix,
                             batch_size=100)
test_score = model.evaluate(test_feats_matrix, test_label_matrix,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

training loss: 0.8068780303001404 training accuracy 0.6779999732971191
test loss: 1.4762108325958252 test accuracy 0.45750001072883606


# Language Model

An RNN Language model is provided here.

Input:
- word tokens $[w_1, w_2, \cdots, w_n]$

Model:
- embedding layer: get the representation of all the words as $[e_1, e_2, \cdots, e_n]$.
- RNN: get the hidden representation of the sentence $[h_1, h_2, \cdots, h_n]$.
- Objective: Maximize the log probability of the sentence, i.e. minimize its negative log-likelihood (the per-token cross-entropy).


**Chain rule:**

$P(w_1w_2\cdots w_n) = P(w_1)P(w_2|w_1)P(w_3|w_1w_2)\cdots$

**Markov approximation**

$P(w_1w_2\cdots w_n) \approx P(w_1|w_0)P(w_2|w_1)P(w_3|w_2)\cdots$

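**RNN language model** (no Markov assumption: the hidden state $h_{i-1}$ summarizes the whole prefix $w_1 \cdots w_{i-1}$)

$P(w_1w_2\cdots w_n) \approx \prod_{i=1}^{n}P(w_i|h_{i-1})$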

In [8]:
from ptb_loader import load_data
from keras.callbacks import ModelCheckpoint, Callback

class TestCallback(Callback):
    """
    Calculate Perplexity

    Keras callback that prints the perplexity of a held-out set after every
    epoch. Perplexity is the exponential of the mean negative log-probability
    that the model assigns to the target tokens.
    """
    def __init__(self, test_data, model):
        """
        :param test_data: a pair (x, y) of model inputs and target token indices
        :param model: the model whose predictions are evaluated
        """
        # initialise the Keras callback state before adding our own attributes
        super().__init__()
        self.test_data = test_data
        self.model = model

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print the resulting perplexity."""
        x, y = self.test_data
        x_probs = self.model.predict(x)
        ppl = self.evaluate_batch_ppl(x_probs, y)
        print('\nValidation Set Perplexity: {0:.2f} \n'.format(ppl))

    @staticmethod
    def _target_log_probs(x, y):
        """
        Gather the log-probability of every target token.

        :param x: predicted distributions, last axis is the vocabulary, type: np.ndarray
        :param y: target token indices (integer dtype), type: np.ndarray
        return a 1-D array with log(x[i, y[i]]) for each flattened position i
        """
        x = x.reshape(-1, x.shape[-1])
        y = y.reshape(-1)
        # Pair positions one-to-one. If the two inputs disagree in length, only
        # the common prefix is scored (same as taking the main diagonal of
        # x[:, y], but without materialising a len(x) x len(y) matrix).
        n = min(len(x), len(y))
        return np.log(x[np.arange(n), y[:n]])

    @staticmethod
    def evaluate_ppl(x, y):
        """
        Perplexity of the targets y under the predicted distributions x.

        :param x: predicted distributions, last axis is the vocabulary, type: np.ndarray
        :param y: target token indices, type: np.ndarray
        return exp(mean negative log-probability of the scored targets)
        """
        return np.exp(np.mean(-TestCallback._target_log_probs(x, y)))

    def evaluate_batch_ppl(self, x, y):
        """
        Perplexity over a whole data set, normalised by the number of predicted positions.

        :param x: predicted distributions, last axis is the vocabulary, type: np.ndarray
        :param y: target token indices, type: np.ndarray
        return exp(-(sum of target log-probabilities) / number of rows of flattened x)
        """
        log_probs = TestCallback._target_log_probs(x, y)
        num_positions = x.reshape(-1, x.shape[-1]).shape[0]
        # accumulate in float64 so that summing many float32 terms stays accurate
        total = np.sum(log_probs, dtype=np.float64)
        return np.exp(-total/num_positions)

print('Loading data')
# NOTE: `load_data` here is the function imported from ptb_loader in this cell;
# it shadows the CSV `load_data` defined earlier in the notebook.
x_train, y_train, x_valid, y_valid, vocabulary_size, vocab = load_data()

# show the array shapes of the training inputs and targets
print(x_train.shape)
print(y_train.shape)

# first axis: number of training sequences; second axis: tokens per sequence
num_training_data = x_train.shape[0]
sequence_length = x_train.shape[1]

print('Vocab Size',vocabulary_size)

Loading data
(32389, 30)
(32389, 30, 1)
Vocab Size 9860


In [9]:
# training parameters
drop = 0.5
epochs = 10
batch_size = 8
embedding_dim = 10

# lstm parameters
hidden_size = 10

inputs = Input(shape=(sequence_length,), dtype='int32')
# inputs -> [batch_size, sequence_length]

emb_layer = Embedding(input_dim=vocabulary_size, 
                    output_dim=embedding_dim, 
                    input_length=sequence_length)
# emb_layer.trainable = False
# if you uncomment this line, the embeddings will be untrainable

embedding = emb_layer(inputs)
# embedding -> [batch_size, sequence_length, embedding_dim]

drop_embed = Dropout(drop)(embedding) 
# dropout at embedding layer

# add a LSTM here, set units=hidden_size, return_sequences=True
# Boolean. Whether to return the last output. in the output sequence, or the full sequence.
lstm_out_1 = LSTM(units=hidden_size, return_sequences=True)(drop_embed)
# NER [tag1, tag2, tag3, ...]
# output: lstm_out_1 -> [batch_size, sequence_length, hidden_size]


# add a TimeDistributed here, set layer = Dense(units=vocabulary_size,activation='softmax')
# please read  https://keras.io/layers/wrappers/
# output: outputs -> [batch_size, sequence_length, vocabulary_size]
outputs = TimeDistributed(Dense(units=vocabulary_size,
    activation='softmax'))(lstm_out_1)
# [batch_size, sequence_length, output_size]

# End of Model Architecture
# ----------------------------------------#

In [10]:
# wrap the layer graph defined above into a trainable model
# (this rebinds `model`, which previously held the text classifier)
model = Model(inputs=inputs, outputs=outputs)

# the targets are integer word indices, hence the sparse variant of cross-entropy
adam = keras.optimizers.Adam()
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

print("Traning Model...")
# the callback prints the validation perplexity after every epoch
history = model.fit(
        x_train, 
        y_train, 
        batch_size=batch_size, 
        epochs=epochs,
        verbose=1,
        callbacks=[TestCallback((x_valid,y_valid),model=model)])

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 10)            98600     
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 10)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 10)            840       
_________________________________________________________________
time_distributed (TimeDistri (None, 30, 9860)          108460    
Total params: 207,900
Trainable params: 207,900
Non-trainable params: 0
_________________________________________________________________
None
Traning Model...
Epoch 1/10
Validation Set Perplexity: 450.09 

Epoch 2/10
Validation Set Perplexity: 365.04 

## Test perplexity

In [11]:
# look up the index of every whitespace-separated word
# (assumes `vocab` maps word -> index and contains all of these words -- TODO confirm)
sent_1 = [vocab[s] for s in "i visited the campus last monday".split()]
sent_2 = [vocab[s] for s in "i visited the campus last pizza".split()]
# right-pad with 0 up to the training sequence length and add a batch axis -> [1, sequence_length]
sent_1_input = np.expand_dims(np.array(sent_1 + [0] * (x_train.shape[1]-len(sent_1))), 0)
sent_2_input = np.expand_dims(np.array(sent_2 + [0] * (x_train.shape[1]-len(sent_2))), 0)
# targets: the sentence shifted left by one word -> [1, len(sentence), 1]
# NOTE(review): the target of the last position wraps around to the first word
# of the sentence; an end-of-sentence / padding index may be intended -- confirm
sent_1_y = np.expand_dims([sent_1[1:]+[sent_1[0]]], -1)
sent_2_y = np.expand_dims([sent_2[1:]+[sent_2[0]]], -1)

In [12]:
# model.predict returns [1, sequence_length, vocabulary_size]; slice the time axis
# (not the batch axis) so that only the positions of the real, unpadded words of
# each sentence are scored against its targets
print("perplexity of sentence 1:", TestCallback.evaluate_ppl(model.predict(sent_1_input)[:, :len(sent_1)], 
                          sent_1_y))
print("perplexity of sentence 2:", TestCallback.evaluate_ppl(model.predict(sent_2_input)[:, :len(sent_2)], 
                          sent_2_y))

perplexity of sentence 1: 2119.398
perplexity of sentence 2: 3703.2559
