<a href="https://colab.research.google.com/github/MelqonyanG/ML_Intro/blob/master/ACA_ML_8_3_Language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ACA ML 8.3 - Language modelling

This notebook is part of the [Machine Learning Course](https://aca.am/en/machine-learning/) at Armenian Code Academy, and accompanies the lecture slides and lecture notes for the 2-week block on NLP. It covers language modelling, with a particular focus on Recurrent Neural Networks.

Created: March 25, 2021 | Last major update: March 25, 2021

Author: [Vahe Tshitoyan](https://vtshitoyan.github.io) (please reach out with any questions or suggestions)

In [None]:
import numpy as np
import tensorflow as tf

import gensim
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


You can get the corpus_100k file from [here](https://storage.googleapis.com/allnews_am/corpus_100k.zip). It is already pre-processed and tokenized.

In [None]:
# Read the pre-tokenized corpus: one sentence per line, tokens separated by
# single spaces. The encoding is passed explicitly because the corpus is
# Armenian text and the platform default encoding is not always UTF-8.
with open('corpus_100k', 'r', encoding='utf-8') as f:
  sentences = [s.strip().split(' ') for s in f]
print(f'Number of sentences: {len(sentences)}')
# Keep only the first sentences so that training stays fast.
use_first_n = 2000
sentences = sentences[:use_first_n]
print(f'Using: {len(sentences)}')

Number of sentences: 1163126
Using: 2000


In [None]:
# Build a vocabulary from sentences.
dct = gensim.corpora.Dictionary(sentences)

# Index 0 is reserved for padding and index 1 for unknown words, so every
# corpus word ends up at (its gensim id + 2). word2idx is therefore derived
# from the final idx2word list rather than taken from dct.token2id, whose ids
# do not include that offset.
idx2word = ['PAD', 'UNK'] + [dct[i] for i in range(len(dct.token2id))]
word2idx = {w: i for i, w in enumerate(idx2word)}

print(f'Unique words: {len(idx2word)}')
print(idx2word[:30])

# Persist the vocabulary (one word per line, line number == word index) so
# that text generation can be run later without rebuilding it.
with open('idx2word.txt', 'w', encoding='utf-8') as idx2word_file:
  idx2word_file.write('\n'.join(idx2word))

Unique words: 9195
['PAD', 'UNK', ',', '.', 'ՀՀ', 'Հիմա', 'Փաշինյան', 'ամիս', 'այդպես', 'անզգույշ', 'արտահայտեմ', 'բառ', 'ես', 'երեք', 'եւ', 'է', 'էլ', 'կարող', 'կյանքն', 'մի', 'ներքաղաքական', 'չխաղաղվի', 'քիչ', 'Մենք', 'առաջ', 'առումով', 'գնում', 'ենք', 'ճիշտ', 'վեկտորի']


In [None]:
def sentence_batches(sentences, batch_size=32, shuffle_buffer_size=10000):
    """Produces training batches for the language model from sentences.

    Every sentence [w1, w2, w3, ...] is expanded into next-word prediction
    examples ([w1], w2), ([w1, w2], w3), etc. An extra example whose context
    is the full stop token '։' is added to predict the first word of each
    sentence.

    Args:
      sentences: The sentences as a sequence of sequences of words.
      batch_size: The size of the training batch.
      shuffle_buffer_size: Number of examples held in the shuffle buffer.

    Returns:
      A tf.data.Dataset of (contexts, targets) batches. Contexts are int32
      word indices padded at the end to the longest context in the batch
      (padded_batch pads with 0, which is the index of 'PAD'); targets are
      the int32 indices of the words to predict.
    """
    unk_index = 1  # Index of 'UNK' in the vocabulary.

    # Convert each sentence from a list of tokens (words) to a list of word
    # indices in the vocabulary. Empty sentences are skipped because they
    # contain no word to predict.
    encoded_sentences = [
        [word2idx.get(w, unk_index) for w in s]
        for s in sentences
        if s
    ]

    # Fall back to 'UNK' when the full stop is not in the vocabulary, instead
    # of failing with a KeyError in the middle of training.
    sentence_end_index = word2idx.get('։', unk_index)

    def data_generator():
      for encoded_sentence in encoded_sentences:
        # Add an artificial example for the first word after the end of the
        # previous sentence.
        yield ([sentence_end_index], encoded_sentence[0])
        for target_word_index in range(1, len(encoded_sentence)):
          # Create the rest of the examples: all words before the target are
          # the context.
          yield (encoded_sentence[:target_word_index],
                 encoded_sentence[target_word_index])

    output_signature = (
        tf.TensorSpec(shape=(None,), dtype=tf.int32),  # Variable-length context.
        tf.TensorSpec(shape=(), dtype=tf.int32)  # Single target word.
    )

    data = tf.data.Dataset.from_generator(
        data_generator,
        output_signature=output_signature
    ).shuffle(shuffle_buffer_size).padded_batch(batch_size)
    return data

In [None]:
default_embedding_size = 50  # Used when there are no pre-trained embeddings.

def rnn_model(pretrained_embeddings_file=None, rnn_units=50):
  """Builds a next-word prediction model: Embedding -> SimpleRNN -> softmax.

  Args:
    pretrained_embeddings_file: Optional path to a saved gensim FastText
      model. When given, the embedding layer is initialised from its vectors
      and frozen; otherwise a trainable embedding of size
      default_embedding_size is used.
    rnn_units: The size of the hidden state of the recurrent layer.

  Returns:
    An uncompiled tf.keras Model mapping sequences of word indices to a
    probability distribution over the vocabulary for the next word.
  """
  word_in = tf.keras.layers.Input(shape=(None,))  # The index of the word
  if pretrained_embeddings_file is not None:
    emb_model = gensim.models.fasttext.FastText.load(
        pretrained_embeddings_file)
    word_vectors = emb_model.wv
    # gensim >= 4.0 replaced `KeyedVectors.vocab` with `key_to_index`
    # (accessing `.vocab` raises there); older versions only have `.vocab`.
    known_words = getattr(word_vectors, 'key_to_index', None)
    if known_words is None:
      known_words = word_vectors.vocab
    # Rows of words missing from the pre-trained model stay all 0s.
    embedding_matrix = np.zeros((len(idx2word), word_vectors.vector_size))
    for i, word in enumerate(idx2word):
      if word in known_words:
        embedding_matrix[i] = word_vectors.get_vector(word)
    emb_word = tf.keras.layers.Embedding(
        input_dim=len(word2idx),
        output_dim=word_vectors.vector_size,
        trainable=False,  # <- You can set this to True to fine-tune the embeddings.
        weights=[embedding_matrix],  # <- Load pre-trained embeddings
        mask_zero=True)(word_in)  # Index 0 ('PAD') is ignored by the RNN.
  else:
    emb_word = tf.keras.layers.Embedding(
        input_dim=len(word2idx),
        output_dim=default_embedding_size,
        mask_zero=True)(word_in)  # Index 0 ('PAD') is ignored by the RNN.
  rnn_out = tf.keras.layers.SimpleRNN(rnn_units)(emb_word)
  predicted_word = tf.keras.layers.Dense(
      len(idx2word), activation='softmax')(rnn_out)
  return tf.keras.models.Model(word_in, predicted_word)

In [None]:
# Build the network on top of the pre-trained FastText embeddings. The targets
# produced by sentence_batches are integer word indices, hence the sparse
# variant of the cross-entropy loss.
model = rnn_model(
    pretrained_embeddings_file='ft_50_1679k_and_wiki_lr0025_cn36_ss000001.model')
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy())

In [None]:
# Train the language model. The value returned by fit() is bound to `_` so
# that the notebook does not echo it as the cell output.
_ = model.fit(x=sentence_batches(sentences, batch_size=32), epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [None]:
# Set to False to skip writing the trained model to disk.
save_model = True
if save_model:
  model.save(filepath='arm_rnn_language_model.h5')

# Text generation using the trained model
You can run this without the training part above if you have already saved idx2word.txt (the vocabulary) and arm_rnn_language_model.h5 (the language model); you only need to run the imports cell at the top of the notebook first.

In [None]:
# Restore the trained network and the vocabulary it was trained with. Line
# number N of idx2word.txt holds the word whose index is N.
model = tf.keras.models.load_model('arm_rnn_language_model.h5')
with open('idx2word.txt', 'r', encoding='utf-8') as idx2word_file:
  idx2word = [line.strip() for line in idx2word_file]
word2idx = dict(zip(idx2word, range(len(idx2word))))

In [None]:
def generate_text(input_sentence, nr_extra_words):
  """Continues a sentence by repeatedly sampling the next word from the model.

  Args:
    input_sentence: The beginning of the text as a sequence of words. Words
      that are not in the vocabulary are replaced with 'UNK'.
    nr_extra_words: How many words to generate after the input.

  Returns:
    The input words (with unknown ones shown as 'UNK') followed by the
    generated words, as a list of strings.

  Raises:
    ValueError: If words should be generated but input_sentence is empty, as
      the model needs at least one word of context.
  """
  unk_index = 1  # Index of 'UNK' in the vocabulary.
  # Converts words to their indexes in the vocabulary.
  encoded_input = [word2idx.get(w, unk_index) for w in input_sentence]
  if nr_extra_words > 0 and not encoded_input:
    raise ValueError('input_sentence must contain at least one word.')
  for _ in range(nr_extra_words):
    # A batch with a single sequence; verbose=0 avoids printing a progress
    # bar for every generated word.
    next_word_distribution = model.predict(
        np.array([encoded_input]), verbose=0)[0]
    # The softmax output is float32 and its sum can differ from 1 by more
    # than np.random.choice tolerates, so re-normalise it in float64.
    probabilities = np.asarray(next_word_distribution, dtype=np.float64)
    probabilities = probabilities / probabilities.sum()
    # Sample a word using the output distribution.
    next_word = np.random.choice(len(idx2word), p=probabilities)
    # Add the new word to the sequence to continue generating text.
    encoded_input.append(int(next_word))
  return [idx2word[encoded_word] for encoded_word in encoded_input]

In [None]:
# Continue a two-word prompt with 50 sampled words and show it as one string.
' '.join(generate_text(input_sentence=['ՀՀ', 'քաղաքացիները'], nr_extra_words=50))

'ՀՀ քաղաքացիները , անհանդուրժողականությունը . ՀՔԾ օրենսգրքի 178-րդ հոդվածի 2-րդ մասի հատկանիշներով 5 մահապարտ են տարել ։ Սարավանի լեռնանցքում գետնաբուք այսօրվա . Ռոբերտ Քոչարյանի պարագայում մերսման համար ։ Ու թող դրան ընթացքում ՝ նախարարի առաջին պայթյունը մի քանի օր է տեղում ։ 00-ը - նշեց արդարադատության նախարարի նորանշանակ աշխատակազմի ղեկավարի Վլադիմիր Զելենսկին'

# Improvement ideas
- Use more RNN layers
- Use more modern Recurrent layers such as LSTMs.
- Use beam search instead of sampling.
- Increase the dimensions of the hidden layer.