# Session 8 - Language modelling with RNNs (Text Generation)

In [3]:
# data processing tools
import string, os 
import pandas as pd
import numpy as np
np.random.seed(42)

# keras module for building LSTM 
import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.utils as ku 
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# surpress warnings
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

2023-03-22 10:22:47.865926: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Some helper functions

In [4]:
def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

def get_sequence_of_tokens(tokenizer, corpus):
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences

def generate_padded_sequences(input_sequences):
    # get the length of the longest sequence
    max_sequence_len = max([len(x) for x in input_sequences])
    # make every sequence the length of the longest on
    input_sequences = np.array(pad_sequences(input_sequences, 
                                            maxlen=max_sequence_len, 
                                            padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, 
                            num_classes=total_words)
    return predictors, label, max_sequence_len

def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 
                        10, 
                        input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, 
                    activation='softmax'))

    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], 
                                    maxlen=max_sequence_len-1, 
                                    padding='pre')
        predicted = np.argmax(model.predict(token_list),
                                            axis=1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

## Load the data

In [5]:
data_dir = os.path.join("..", "..", "..", "431868", "news_data")

We're then going to load the data one at a time and append *only* the headlines to our list of data.

In [7]:
all_headlines = []
for filename in os.listdir(data_dir):
    if 'Articles' in filename:
        article_df = pd.read_csv(data_dir + "/" + filename)
        all_headlines.extend(list(article_df["headline"].values))

We then clean up a little bit and see how many data points we have.

In [8]:
all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)

8603

We call out ```clean_text()``` function and then inspect the first 10 texts.

In [9]:
corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

['my beijing the sacred city',
 '6 million riders a day 1930s technology',
 'seeking a crossborder conference',
 'questions for despite the yuck factor leeches are big in russian medicine',
 'who is a criminal',
 'an antidote to europes populism',
 'the cost of a speech',
 'degradation of the language',
 'on the power of being awful',
 'trump garbles pitch on a revised health bill']

## Tokenize

We're then going to tokenize our data, using the ```Tokenizer()``` class from ```TensorFlow```, about which you can read more [here](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer).

We then use the ```get_sequence_of_tokens()``` function we defined above, which turns every text into a sequence of tokens based on the vocabulary from the tokenizer.

In [10]:
tokenizer = Tokenizer()
## tokenization
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1 # account for the fact that we also have a "<unk>" token for words that aren't present in our tokenizer vocabulary

In [20]:
inp_sequences = get_sequence_of_tokens(tokenizer, corpus)
inp_sequences[:10] # returning lots of the same document with varying n-gram lengths. Making more data ! Teaching the model how to deal with input sentences of different lengths 

[[46, 1601],
 [46, 1601, 1],
 [46, 1601, 1, 1951],
 [46, 1601, 1, 1951, 120],
 [122, 331],
 [122, 331, 1952],
 [122, 331, 1952, 2],
 [122, 331, 1952, 2, 125],
 [122, 331, 1952, 2, 125, 2484],
 [122, 331, 1952, 2, 125, 2484, 812]]

We then want to *pad* our input sequences to make them all the same length.

In [13]:
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

## Create model

We then use the ```create_model()``` function created above to initialize a model, telling the model the length of sequences and the total size of the vocabulary.

In [15]:
model = create_model(max_sequence_len, total_words)
model.summary()

2023-03-22 10:25:19.197510: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 23, 10)            112650    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 11265)             1137765   
                                                                 
Total params: 1,294,815
Trainable params: 1,294,815
Non-trainable params: 0
_________________________________________________________________


Model training is exactly the same as last week, but instead of document labels, we're fitting the model to predict next word.

*NB!* This will take some time to train! It took me 35 minutes on UCloud 32xCPU.

In [38]:
history = model.fit(predictors, 
                    label, 
                    epochs=30,
                    batch_size=128, 
                    verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


When the model has trained, we can then use this to generate *new text*.

In [61]:
print (generate_text("apple iphone", 10, model, max_sequence_len))

Apple Iphone Is Bad For Puppies Every Sneakers In The Dryer Losses


## Using pre-trained word embeddings

Instead of having the embedding layer as a trainable parameter, we can instead using a *pretrained word embedding* model like ```word2vec```.

In the following examples, we're using [GloVe embeddings](https://nlp.stanford.edu/projects/glove/). These are trained a little differently from ```word2vec``` but they behave in the same way.

In [64]:
path_to_glove_file = os.path.join("..", "..", "..", "431868", "glove_models", "glove.6B.300d.txt")

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


We can define some variables that we're going to use later.

With hits and misses, we're counting how many words in the corpus vocabulary have a corresponding GloVe embedding; misses are the words which appear in our vocabulary but which do not have a GloVe embedding.

In [65]:
num_tokens = total_words
embedding_dim = 100
hits = 0
misses = 0

In [66]:
# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

ValueError: could not broadcast input array from shape (300,) into shape (100,)

In [67]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer - notice that this is different
    model.add(Embedding(
            total_words,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
            input_length=input_len)
    )
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(500))
    model.add(Dropout(0.2))
    
    # Add Output Layer
    model.add(Dense(total_words, 
                    activation='softmax'))

    model.compile(loss='categorical_crossentropy', 
                    optimizer='adam')
    
    return model

In [68]:
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 23, 100)           1126500   
                                                                 
 lstm_1 (LSTM)               (None, 500)               1202000   
                                                                 
 dropout_1 (Dropout)         (None, 500)               0         
                                                                 
 dense_1 (Dense)             (None, 11265)             5643765   
                                                                 
Total params: 7,972,265
Trainable params: 6,845,765
Non-trainable params: 1,126,500
_________________________________________________________________


In [69]:
history = model.fit(predictors, 
                    label, 
                    epochs=10,
                    batch_size=128, 
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
print (generate_text("china", 30, model, max_sequence_len))