<a href="https://colab.research.google.com/github/GreihMurray/NLP-5/blob/master/Copy_of_anything_goes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.models import Sequential
import keras
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from math import log2
import json

In [None]:
from google.colab import drive

# Mount Google Drive so the '/content/gdrive/...' paths used by the cells below resolve (Colab only).
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Heavily based on https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/#h2_7

In [None]:
def split_sents(data):
    """Split every row of ``data`` on single spaces.

    Args:
        data: Iterable of strings.

    Returns:
        A list with one entry per row; each entry is ``row.split(' ')``, so
        consecutive spaces produce empty-string tokens.
    """
    return [sentence.split(' ') for sentence in tqdm(data, desc='splitting data')]

In [None]:
def encode(grams, raw_data, loader=False,
           mapping_path='/content/gdrive/MyDrive/Colab_Notebooks/NLP/translation/encode.json'):
    """Integer-encode every n-gram, one id per symbol.

    Args:
        grams: Iterable of n-grams; each n-gram is iterated symbol by symbol.
        raw_data: Text whose distinct symbols define the vocabulary. Only used
            when ``loader`` is false.
        loader: When true, the symbol -> id mapping is read from
            ``mapping_path`` instead of being built from ``raw_data``.
        mapping_path: JSON file holding a previously saved mapping.

    Returns:
        ``(sequences, mapping)`` where ``sequences`` is a list of lists of ids
        and ``mapping`` is the symbol -> id dict that was used.

    Raises:
        KeyError: if a symbol is unknown and the mapping has no ``'<UNK>'`` entry.
    """
    if loader:
        with open(mapping_path) as infile:
            mapping = json.load(infile)
    else:
        # Sorted so ids are deterministic for a given text; '<UNK>' gets the last id.
        chars = sorted(set(raw_data))
        chars.append('<UNK>')
        mapping = {c: i for i, c in enumerate(chars)}

    sequences = []
    for line in tqdm(grams, desc='Encoding'):
        # Fall back to '<UNK>' per symbol. The previous try/except replaced the
        # WHOLE n-gram with '<UNK>' ids as soon as one symbol was unknown.
        sequences.append(
            [mapping[char] if char in mapping else mapping['<UNK>'] for char in line]
        )
    return sequences, mapping

Based on code from https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/#h2_7

In [None]:
def build_model(vocab):
    """Assemble and compile the Embedding -> GRU -> softmax network.

    Args:
        vocab: Number of distinct ids; used as both the embedding input size
            and the width of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model whose input length is ``GRAMS-1``
        (``GRAMS`` is read from module scope at call time).
    """
    layers = [
        Embedding(vocab, 20, input_length=GRAMS-1, trainable=True),
        GRU(75, recurrent_dropout=0.1, dropout=0.1),
        Dense(vocab, activation='softmax'),
    ]
    network = Sequential(layers)

    # Targets are one-hot encoded by the caller, hence categorical cross-entropy.
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

    return network

Original code (not adapted from the tutorial linked above).

In [None]:
def read_train_file(file_name, max_len=50):
    """Read a one-token-per-line corpus delimited by ``<s>`` / ``</s>`` lines.

    Args:
        file_name: Path of the UTF-8 text file to read.
        max_len: Sentences with more than this many tokens are dropped.
            Pass ``None`` to keep every sentence.

    Returns:
        A list of sentences; each sentence is a list of lower-cased tokens.
        Blank lines and lines that are exactly ``(``, ``)`` or ``()`` are skipped.
    """
    descript = 'Reading ' + file_name

    # Context manager so the handle is closed (it was previously leaked).
    with open(file_name, 'r', encoding='utf-8') as f:
        full_text = f.read()

    # Spelled out explicitly: the old `line in '()'` substring test matched
    # exactly these four values, including the empty line.
    skipped = {'', '(', ')', '()'}

    all_data = []
    cur_sent = []

    for line in tqdm(full_text.split('\n'), desc=descript):
        if line == '<s>':
            cur_sent = []
        elif line in skipped:
            continue
        elif line == '</s>':
            if max_len is None or len(cur_sent) <= max_len:
                all_data.append(cur_sent)
            # Start a fresh list so stray lines between '</s>' and the next
            # '<s>' cannot mutate the sentence that was just stored.
            cur_sent = []
        else:
            cur_sent.append(line.lower())

    return all_data

In [None]:
def read_lang_train(source_file, targ_file, train_frac=0.8):
    """Build the training and hold-out text from parallel source/target files.

    Each sentence pair becomes ``'<OGA> ' + source + ' <NGA> ' + target``; the
    pairs are then joined with single spaces.

    Args:
        source_file: Path passed to ``read_train_file`` for the source side.
        targ_file: Path passed to ``read_train_file`` for the target side.
        train_frac: Fraction of the pairs (taken from the start) that goes
            into the training text; the remainder is the hold-out text.

    Returns:
        ``(data, hold_out)``, two strings.
    """
    import warnings

    source_data = read_train_file(source_file)
    target_data = read_train_file(targ_file)

    if len(source_data) != len(target_data):
        # NOTE(review): read_train_file drops over-long sentences per file, so
        # unequal counts probably mean the pairs are no longer aligned.
        warnings.warn(
            'source has %d sentences but target has %d; pairing only the first %d'
            % (len(source_data), len(target_data),
               min(len(source_data), len(target_data)))
        )

    # zip() stops at the shorter side, so a short target can no longer raise IndexError.
    pairs = zip(source_data, target_data)
    pair_count = min(len(source_data), len(target_data))
    all_data = [
        '<OGA> ' + ' '.join(src) + ' <NGA> ' + ' '.join(tgt)
        for src, tgt in tqdm(pairs, total=pair_count, desc='Cleaning data')
    ]

    split_at = int(len(all_data) * train_frac)
    data = ' '.join(all_data[:split_at])
    hold_out = ' '.join(all_data[split_at:])

    return data, hold_out

Original code (not adapted from the tutorial linked above).

In [None]:
def read_test_data(file_name):
    """Wrap every sentence of ``file_name`` in the ``<OGA> ... <NGA> `` markers.

    Args:
        file_name: Path handed to ``read_train_file``.

    Returns:
        A list of strings, one per sentence, each of the form
        ``'<OGA> ' + tokens joined by spaces + ' <NGA> '``.
    """
    sentences = read_train_file(file_name)

    return [
        '<OGA> ' + ' '.join(tokens) + ' <NGA> '
        for tokens in tqdm(sentences, desc='Cleaning data')
    ]

Original code (not adapted from the tutorial linked above).

In [None]:
def gen_n_grams(data, n=3):
    """Return every window of ``n`` consecutive items of ``data``, concatenated.

    Args:
        data: A string, or a sequence of strings.
        n: Window length.

    Returns:
        A list of ``len(data) - n + 1`` strings (empty when ``data`` is shorter
        than ``n``); window ``i`` is ``''.join(data[i:i+n])``.
    """
    label = "Generating " + str(n) + " Grams:"

    windows = []
    for start in tqdm(range(len(data) - n + 1), desc=label):
        windows.append(''.join(data[start:start + n]))

    return windows

In [None]:
# def gen_n_grams(data, n=3):
#     descript = "Generating and counting " + str(n) + " Grams:"
#     counts = {}
#     n_grams = []

#     for row in tqdm(data, desc=descript):
#         row = row.split(' ')
#         gram_list = [' '.join(row[i:i+n]) for i in range(len(row) - n + 1)]
        
#         for gram in gram_list:
#           if gram in counts.keys():
#             counts[gram] += 1
#           else:
#             counts[gram] = 1

#     #n_grams = [''.join(data[i:i+n]) for i in tqdm(range(len(data) - n + 1), desc=descript)]

#     return counts

In [None]:
def main():
    """Train the n-gram model and save it together with its encoding map.

    Reads the parallel training files, keeps the first half of the training
    text, encodes it as ``GRAMS``-long id sequences, fits the network from
    ``build_model`` and writes ``encode.json`` and ``lang_model`` to Drive.
    ``GRAMS`` is read from module scope and must be set before calling.
    """
    base_dir = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/translation/'

    data, _hold_out = read_lang_train(base_dir + 'train-source.txt', base_dir + 'train-target.txt')

    # Only the first half of the training text is used.
    data = data[:int(len(data) * 0.5)]

    n_grams = gen_n_grams(data, GRAMS)

    n_grams, mapping = encode(n_grams, data)

    print(mapping)

    # Persist the mapping so load_my_model() can encode with the same ids.
    with open(base_dir + 'encode.json', "w") as outfile:
        json.dump(mapping, outfile)

    # Below code from https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/#h2_7
    vocab = len(mapping)
    sequences = np.array(n_grams)

    print(vocab)

    # create X and y: every column but the last is input, the last is the target
    x, y = sequences[:, :-1], sequences[:, -1]
    # one hot encode y
    y = to_categorical(y, num_classes=vocab)
    # create train and validation sets
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

    print('Train shape:', x_tr.shape, 'Val shape:', x_val.shape)

    model = build_model(vocab)

    # summary() prints by itself; wrapping it in print() also emitted "None".
    model.summary()

    # Original addition
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

    # `callbacks` is documented as a list of callbacks, so wrap the single one.
    model.fit(x_tr, y_tr, epochs=4, verbose=1, validation_data=(x_val, y_val),
              callbacks=[stop_early], batch_size=125)
    model.save(base_dir + 'lang_model')


In [None]:
# Length of each n-gram built by gen_n_grams; the model takes the first GRAMS-1 ids as input and the last as target.
GRAMS = 10

In [None]:
# Train the model and write encode.json and lang_model to Drive (GRAMS must already be defined).
main()

Reading /content/gdrive/MyDrive/Colab_Notebooks/NLP/translation/train-source.txt: 100%|██████████| 925535/925535 [00:00<00:00, 1581813.17it/s]
Reading /content/gdrive/MyDrive/Colab_Notebooks/NLP/translation/train-target.txt: 100%|██████████| 910805/910805 [00:00<00:00, 1610360.19it/s]
Cleaning data: 100%|██████████| 44544/44544 [00:00<00:00, 433275.69it/s]
Generating 10 Grams:: 100%|██████████| 3357516/3357516 [00:03<00:00, 1048334.64it/s]
Encoding: 100%|██████████| 3357516/3357516 [00:10<00:00, 335307.31it/s]


{' ': 0, '!': 1, '"': 2, "'": 3, ',': 4, '-': 5, '.': 6, '1': 7, '2': 8, '3': 9, '4': 10, '5': 11, '6': 12, '7': 13, '8': 14, '9': 15, ':': 16, ';': 17, '<': 18, '>': 19, '?': 20, 'A': 21, 'G': 22, 'N': 23, 'O': 24, '[': 25, ']': 26, '^': 27, '_': 28, '`': 29, 'a': 30, 'b': 31, 'c': 32, 'd': 33, 'e': 34, 'f': 35, 'g': 36, 'h': 37, 'i': 38, 'j': 39, 'k': 40, 'l': 41, 'm': 42, 'n': 43, 'o': 44, 'p': 45, 'q': 46, 'r': 47, 's': 48, 't': 49, 'u': 50, 'v': 51, 'w': 52, 'x': 53, 'y': 54, 'z': 55, '£': 56, '«': 57, '»': 58, 'á': 59, 'ã': 60, 'é': 61, 'í': 62, 'ó': 63, 'õ': 64, 'ú': 65, '‑': 66, '—': 67, '‘': 68, '’': 69, '“': 70, '”': 71, '…': 72, '<UNK>': 73}
74
Train shape: (3021764, 9) Val shape: (335752, 9)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 9, 20)             1480      
                                                                 
 

Original code (not adapted from the tutorial linked above).

In [None]:
def calc_entropy(preds, mapping, sequences, vocab):
    """Average negative log2-probability assigned to each sequence's last id.

    Args:
        preds: 2-D array; ``preds[i][k]`` is the predicted probability of id
            ``k`` for row ``i``.
        mapping: Unused; kept so existing callers keep working.
        sequences: 2-D integer array; the last column holds the true id.
        vocab: Unused; kept so existing callers keep working.

    Returns:
        ``-sum(log2(preds[i][sequences[i, -1]])) / len(preds)`` as a float.
        A zero probability for a true id gives ``inf`` (previously
        ``math.log2`` raised ``ValueError``).
    """
    preds = np.asarray(preds)
    pred_len = len(preds)
    if pred_len == 0:
        return 0.0

    targets = np.asarray(sequences)[:, -1].astype(np.intp)

    # Pick the probability of the true id on every row in one indexing step,
    # then widen to float64 so the log is not computed in reduced precision.
    picked = preds[np.arange(len(targets)), targets].astype(np.float64)

    # log2(0) is -inf; silence the divide warning and let it propagate as inf.
    with np.errstate(divide='ignore'):
        return float(-np.sum(np.log2(picked)) / pred_len)

Original code (not adapted from the tutorial linked above).

In [None]:
def load_my_model():
    """Reload the saved model and print its entropy on the encoded text.

    Reads ``train-source.txt`` through ``read_test_data``, encodes it with the
    mapping saved in ``encode.json`` (``encode(..., loader=True)``), predicts
    the last id of every ``GRAMS``-long window and prints the result of
    ``calc_entropy``. ``GRAMS`` is read from module scope.
    """
    base_dir = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/translation/'

    model = keras.models.load_model(base_dir + 'lang_model')

    # NOTE(review): this evaluates on the training source file, not on held-out
    # data, and the sentences carry no target side. Confirm that is intended.
    sentences = read_test_data(base_dir + 'train-source.txt')

    # gen_n_grams must slide over ONE string, as it does during training.
    # Passing the list of sentences produced ragged sequences, and
    # `sequences[:, :-1]` then failed with IndexError. Each sentence already
    # ends with a space, so no extra separator is inserted.
    data = ''.join(sentences)

    n_grams = gen_n_grams(data, GRAMS)
    n_grams, mapping = encode(n_grams, data, loader=True)

    vocab = len(mapping)
    sequences = np.array(n_grams)

    # Was `print(se)`, an undefined name.
    print(sequences.shape)

    # Every column but the last is model input; the last is the true id.
    seqs = sequences[:, :-1]

    preds = model.predict(seqs)
    print(preds[:5])

    entropy = calc_entropy(preds, mapping, sequences, vocab)
    print('\n', entropy)
    

In [None]:
# Reload the saved model and print its entropy (needs lang_model and encode.json written by main()).
load_my_model()

Reading /content/gdrive/MyDrive/Colab_Notebooks/NLP/translation/train-source.txt: 100%|██████████| 925535/925535 [00:03<00:00, 288219.05it/s]
Cleaning data: 100%|██████████| 44544/44544 [00:00<00:00, 644154.02it/s]
Generating 10 Grams:: 100%|██████████| 44535/44535 [00:00<00:00, 648011.94it/s]
Encoding: 100%|██████████| 44535/44535 [00:03<00:00, 11354.02it/s]
  # This is added back by InteractiveShellApp.init_path()


IndexError: ignored

In [None]:
# List of models & Performance (KWERE) (12 grams)
  # act_model - 1.235 entropy (Batch size 250)
  # act_model500 - 1.282 entropy (Batch size 500)
  # act_model125 - 1.233 entropy (Batch size 125)
  # act_model50 - 1.201 entropy (Batch size 50)

In [None]:
# List of models & Performance (SWAHILI) (6 grams)
  # act_model - 
  # sw_act_model500 - 1.474 entropy (Batch size 500)
  # act_model125 - 
  # act_model50 - 