<a href="https://colab.research.google.com/github/GreihMurray/NLP-4/blob/master/anything_goes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.models import Sequential
import keras
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from math import log2

In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [20]:
def encode(grams, raw_data):
    chars = sorted(list(set(raw_data)))
    mapping = dict((c, i) for i, c in enumerate(chars))

    sequences = list()
    for line in tqdm(grams, desc='Encoding'):
        # integer encode line
        encoded_seq = [mapping[char] for char in line]
        # store
        sequences.append(encoded_seq)
    return sequences, mapping

In [16]:
def build_model(vocab):
    model = Sequential()
    model.add(Embedding(vocab, 20, input_length=GRAMS-1, trainable=True))
    model.add(GRU(25, recurrent_dropout=0.1, dropout=0.1))
    model.add(Dense(vocab, activation='softmax'))

    model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')

    return model

In [6]:
def read_file(file_name):
    f = open(file_name, "r")

    full_text = f.read().split(" ")
    split_data = []

    for line in full_text: #tqdm(full_text, desc='Splitting words'):
        split_data.append(line.lower().strip())

    train = ' '.join(split_data[:int(len(split_data) * 0.8)])
    test = ' '.join(split_data[int(len(split_data) * 0.8):])

    return train, test

In [27]:
def read_test_data(file_name):
    f = open(file_name, "r")

    full_text = f.read().split(" ")
    split_data = []

    for line in full_text: #tqdm(full_text, desc='Splitting words'):
        split_data.append(line.lower().strip())

    return ' '.join(split_data)

In [7]:
def gen_n_grams(data, n=3):
    descript = "Generating " + str(n) + " Grams:"

    n_grams = [''.join(data[i:i+n]) for i in tqdm(range(len(data) - n + 1), desc=descript)]

    return n_grams

In [25]:
def main():
    data, hold_out = read_file('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/cwe-train.txt')

    n_grams = gen_n_grams(data, GRAMS)
    n_grams, mapping = encode(n_grams, data)

    vocab = len(mapping)
    sequences = np.array(n_grams)
    # create X and y
    x, y = sequences[:, :-1], sequences[:, -1]
    # one hot encode y
    y = to_categorical(y, num_classes=vocab)
    # create train and validation sets
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

    print('Train shape:', x_tr.shape, 'Val shape:', x_val.shape)

    model = build_model(vocab)

    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_acc', patience=5)

    model.fit(x_tr, y_tr, epochs=10, verbose=1, validation_data=(x_val, y_val), callbacks=stop_early, batch_size=250)
    model.save('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/act_model')


In [18]:
GRAMS = 12

In [26]:
main()

Generating 12 Grams:: 100%|██████████| 482805/482805 [00:00<00:00, 1027639.76it/s]
Encoding: 100%|██████████| 482805/482805 [00:01<00:00, 309078.63it/s]


Train shape: (434524, 11) Val shape: (48281, 11)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [54]:
def calc_entropy(preds):
    entropy = 0

    count = 0
    pred_len = len(preds)

    for row in tqdm(preds, desc='Calculating Entropy'):
      entropy -= (1/(pred_len)) * log2(max(row))

    return entropy

In [51]:
def load_my_model():
    model = keras.models.load_model('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/act_model')

    data = read_test_data('/content/gdrive/MyDrive/Colab_Notebooks/NLP/swahili/cwe-train.txt')

    data = data[int(len(data)*0.7):]

    n_grams = gen_n_grams(data, GRAMS-1)
    n_grams, mapping = encode(n_grams, data)

    vocab = len(mapping)
    sequences = np.array(n_grams)

    preds = model.predict(sequences)
    
    entropy = calc_entropy(preds)
    print('\n', entropy)
    

In [55]:
load_my_model()

Generating 11 Grams:: 100%|██████████| 181020/181020 [00:00<00:00, 1032077.41it/s]
Encoding: 100%|██████████| 181020/181020 [00:00<00:00, 299971.68it/s]




Calculating Entropy: 100%|██████████| 181020/181020 [00:00<00:00, 248707.71it/s]


 1.2353199400424082



