<a href="https://colab.research.google.com/github/GreihMurray/NLP-4/blob/master/anything_goes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.models import Sequential
import keras
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from math import log2

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


Heavily based on https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/#h2_7

In [3]:
def encode(grams, raw_data):
    chars = sorted(list(set(raw_data)))
    mapping = dict((c, i) for i, c in enumerate(chars))

    sequences = list()
    for line in tqdm(grams, desc='Encoding'):
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    return sequences, mapping

Based on code from https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/#h2_7

In [4]:
def build_model(vocab):
    model = Sequential()
    model.add(Embedding(vocab, 20, input_length=GRAMS-1, trainable=True))
    model.add(GRU(25, recurrent_dropout=0.1, dropout=0.1))
    model.add(Dense(vocab, activation='softmax'))

    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

    return model

original

In [16]:
def read_file(file_name):
    f = open(file_name, "r")

    full_text = f.read().split(" ")
    split_data = []

    for line in full_text: #tqdm(full_text, desc='Splitting words'):
        split_data.append(line.lower().strip())

    train = ' '.join(split_data[:int(len(split_data) * 0.3)])
    test = ' '.join(split_data[int(len(split_data) * 0.5):])

    return train, test

original

In [6]:
def read_test_data(file_name):
    f = open(file_name, "r")

    full_text = f.read().split(" ")
    split_data = []

    for line in full_text: #tqdm(full_text, desc='Splitting words'):
        split_data.append(line.lower().strip())

    return ' '.join(split_data)

original

In [7]:
def gen_n_grams(data, n=3):
    descript = "Generating " + str(n) + " Grams:"

    n_grams = [''.join(data[i:i+n]) for i in tqdm(range(len(data) - n + 1), desc=descript)]

    return n_grams

In [8]:
def main():
    data, hold_out = read_file('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/sw-train.txt')

    n_grams = gen_n_grams(data, GRAMS)
    n_grams, mapping = encode(n_grams, data)

  # Below code from https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/#h2_7
    vocab = len(mapping)
    sequences = np.array(n_grams)
    # create X and y
    x, y = sequences[:, :-1], sequences[:, -1]
    # one hot encode y
    y = to_categorical(y, num_classes=vocab)
    # create train and validation sets
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

    print('Train shape:', x_tr.shape, 'Val shape:', x_val.shape)

    model = build_model(vocab)

  # Original addition
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=5)

    model.fit(x_tr, y_tr, epochs=10, verbose=1, validation_data=(x_val, y_val), callbacks=stop_early, batch_size=500)
    model.save('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/sw_act_model500')


In [9]:
GRAMS = 6

In [17]:
main()

Generating 6 Grams:: 100%|██████████| 11771996/11771996 [00:09<00:00, 1228130.01it/s]
Encoding: 100%|██████████| 11771996/11771996 [00:23<00:00, 502319.74it/s]


Train shape: (10594796, 5) Val shape: (1177200, 5)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Original

In [11]:
def calc_entropy(preds):
    entropy = 0

    count = 0
    pred_len = len(preds)

    for row in tqdm(preds, desc='Calculating Entropy'):
      entropy -= (1/(pred_len)) * log2(max(row))

    return entropy

Original

In [12]:
def load_my_model():
    model = keras.models.load_model('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/sw_act_model500')

    data = read_test_data('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/sw-train.txt')

    data = data[int(len(data)*0.7):]

    n_grams = gen_n_grams(data, GRAMS-1)
    n_grams, mapping = encode(n_grams, data)

    vocab = len(mapping)
    sequences = np.array(n_grams)

    preds = model.predict(sequences)
    
    entropy = calc_entropy(preds)
    print('\n', entropy)
    

In [18]:
load_my_model()

Generating 5 Grams:: 100%|██████████| 11778315/11778315 [00:09<00:00, 1239915.58it/s]
Encoding: 100%|██████████| 11778315/11778315 [00:24<00:00, 485425.36it/s]




Calculating Entropy: 100%|██████████| 11778315/11778315 [00:58<00:00, 200654.96it/s]



 1.4849363199958634


In [14]:
# List of models & Performance (KWERE) (12 grams)
  # act_model - 1.235 entropy (Batch size 250)
  # act_model500 - 1.282 entropy (Batch size 500)
  # act_model125 - 1.233 entropy (Batch size 125)
  # act_model50 - 1.201 entropy (Batch size 50)

In [15]:
# List of models & Performance (SWAHILI) (6 grams)
  # act_model - 
  # sw_act_model500 - 1.474 entropy (Batch size 500)
  # act_model125 - 
  # act_model50 - 