<a href="https://colab.research.google.com/github/GreihMurray/NLP-3/blob/Super_Murray/supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import joblib
from sklearn.metrics import accuracy_score
import pickle
import nltk
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

Original

In [39]:
# Paths of the JSON files that store the character->index and tag->index mappings.
# encode() writes them and the decoding helpers read them back, so the pair selected
# here must be the pair that was saved together with the model being loaded.
# Other pairs noted by the author:
#   /content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/word2indextest.json | /content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/tag2indextest.json -> pair with LOW_DIM
#   /content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/word2indexdectree.json | /content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/tag2indextestdectree.json -> pair with DEC TREE


word2index_file = "/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/word2indextestNEW.json"
tag2index_file = "/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/tag2indextestNEW.json"

In [3]:
from google.colab import drive

# Mount Google Drive so that the /content/gdrive/... paths used by this notebook resolve.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Original

In [4]:
def read_file_to_sents(path="/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/train.tsv"):
    """Read the tab-separated training file and tag every character with B or I.

    The second column of each row holds a word split into graphemes joined by '-'.
    The first character of every grapheme is tagged 'B'; each further character of
    the same grapheme is tagged 'I'. The first column is not used.

    Args:
        path: Location of the TSV file. Defaults to the Drive path used originally.

    Returns:
        A list with one entry per row; each entry is a list of (character, tag) tuples.
    """
    all_data = []
    with open(path, encoding="utf-8") as file:
        rows = csv.reader(file, delimiter="\t")
        for line in tqdm(rows, desc="Reading data..."):
            # csv.reader yields [] for a blank line; there is nothing to tag there.
            if not line:
                continue

            cur_word = [
                (char, 'B' if pos == 0 else 'I')
                for grapheme in line[1].split('-')
                for pos, char in enumerate(grapheme)
            ]
            all_data.append(cur_word)

    return all_data

In [51]:
def read_test_data(path="/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/test.txt"):
    """Read the test words and split each one into a list of characters.

    Only the first tab-separated column of every row is used. The number of words
    read is printed before returning.

    Args:
        path: Location of the test file. Defaults to the Drive path used originally.

    Returns:
        A list with one list of single-character strings per word.
    """
    all_data = []
    with open(path, encoding="utf-8") as file:
        rows = csv.reader(file, delimiter="\t")
        for line in tqdm(rows, desc="Reading data..."):
            # A blank line (e.g. a trailing newline) comes back as [] and holds no word.
            if not line:
                continue

            all_data.append(list(line[0]))

    print(len(all_data))

    return all_data

From Challenge 2 https://github.com/GreihMurray/NLP-2

In [54]:
def pad(data):
  """Collect the character vocabulary and the tag set of tagged data.

  Both lists are sorted so that the index later assigned to each entry is the same
  on every run; iterating a set of strings directly gives an order that can change
  between interpreter sessions. '<PAD>' is appended last to each list.

  Args:
    data: Iterable of sequences of (character, tag) pairs.

  Returns:
    (vocab, tags): two lists of strings, each ending with '<PAD>'.
  """
  vocab = sorted({w for sent in data for (w, t) in sent})
  vocab.append('<PAD>')
  tags = sorted({t for sent in data for (w, t) in sent})
  tags.append('<PAD>')

  return vocab, tags

In [53]:
def pad_test(data):
    """Collect the character vocabulary of untagged data.

    The characters are sorted so the resulting order is reproducible across runs
    (set iteration order for strings is not), and '<PAD>' is appended last.

    Args:
        data: Iterable of sequences of single characters.

    Returns:
        List of distinct characters followed by '<PAD>'.
    """
    vocab = sorted({w for sent in data for w in sent})
    vocab.append('<PAD>')

    return vocab

Combination of code from Challenge 2 (https://github.com/GreihMurray/NLP-2) and custom

In [6]:
def encode(vocab, tags, data, load=False):
  """Turn tagged character sequences into padded index / one-hot arrays.

  Args:
    vocab: List of characters; with load=False its order defines the character indices.
    tags: List of tags; with load=False its order defines the tag indices. Its length
      is the number of one-hot classes.
    data: List of sequences of (character, tag) pairs.
    load: If False, build the mappings from `vocab`/`tags` and save them to
      `word2index_file` / `tag2index_file`. If True, read the mappings from those
      files instead so the data is encoded exactly like the data a saved model saw.

  Returns:
    (X, y, max_len): padded character indices, one-hot padded tags, and the length
    of the longest sequence in `data`.

  Raises:
    KeyError: if a character or tag in `data` is missing from the mapping in use.
  """
  max_len = max([len(i) for i in data])

  if load is False:
      word2index = {w: i for i, w in enumerate(vocab)}
      tag2index = {t: i for i, t in enumerate(tags)}
  else:
      with open(word2index_file) as infile:
          word2index = json.load(infile)

      with open(tag2index_file) as infile:
          tag2index = json.load(infile)

  # Pad with the mapping's own <PAD> index. With a loaded mapping, len(vocab)-1 is only
  # correct when the current vocabulary happens to have the same size as the saved one.
  x_pad = word2index.get("<PAD>", len(vocab) - 1)

  onehot = [[word2index[w[0]] for w in s] for s in data]
  X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=x_pad)

  onehot_y = [[tag2index[w[1]] for w in s] for s in data]
  y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
  y = to_categorical(y, num_classes=len(tags))

  # Save freshly built mappings so additional data can be encoded in the same manner.
  # A loaded mapping is already on disk, so it is not rewritten.
  if load is False:
      with open(word2index_file, "w") as outfile:
          json.dump(word2index, outfile)

      with open(tag2index_file, "w") as outfile:
          json.dump(tag2index, outfile)

  return X, y, max_len

In [71]:
def encode_test(vocab, data, load=True, max_len=18):
  """Turn untagged character sequences into a padded index array.

  Args:
    vocab: List of characters; only used to build the mapping when load is False
      (and as a fallback pad value if the mapping has no '<PAD>' entry).
    data: List of sequences of single characters.
    load: If True (default), read the character mapping from `word2index_file`;
      if False, build it from `vocab`.
    max_len: Length every sequence is padded to. Defaults to 18, the value that
      was previously hard-coded.
      NOTE(review): presumably this must equal the input length of the saved model,
      and pad_sequences truncates longer sequences — confirm no test word exceeds it.

  Returns:
    (X, max_len): the padded index array and the padding length used.

  Raises:
    KeyError: if a character in `data` is missing from the mapping in use.
  """
  if load is False:
      word2index = {w: i for i, w in enumerate(vocab)}
  else:
      with open(word2index_file) as infile:
          word2index = json.load(infile)

  # Pad with the mapping's <PAD> index rather than the size of the test vocabulary,
  # which only matches by coincidence.
  pad_index = word2index.get("<PAD>", len(vocab))

  onehot = [[word2index[w[0]] for w in s] for s in data]
  X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=pad_index)

  return X, max_len

Code from Challenge 2 (https://github.com/GreihMurray/NLP-2) and Dr. Scannell

In [45]:
def seq_model(data):
    """Build and train the bidirectional-LSTM character tagger.

    The data is encoded with pad()/encode(), 10% is held out as a test split, and the
    network is trained with early stopping on validation accuracy.

    Args:
        data: List of sequences of (character, tag) pairs.

    Returns:
        (model, X_test, y_test): the trained model and the held-out split.
    """
    # Original
    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

    # Dr. Scannell
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab), output_dim=15, input_length=max_len))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.01)))
    model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
    # NOTE(review): the targets are one-hot over len(tags) classes with a softmax output;
    # categorical_crossentropy is the usual pairing — confirm binary_crossentropy is intended.
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    # From https://towardsdatascience.com/hyperparameter-tuning-with-kerastuner-and-tensorflow-c4a4d690b31a
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

    print("[INFO] training network...")
    # The model is compiled with "adam"; the SGD optimizer that used to be created here
    # was never passed to the model and has been removed.
    model.fit(X_train, y_train, batch_size=16, epochs=50, validation_split=0.15, verbose=1, callbacks=[stop_early])

    return model, X_test, y_test

Original

In [8]:
def eval_model(model, x_test, y_test):
    """Print whatever `model.evaluate` returns for the given test split."""
    print(model.evaluate(x_test, y_test))

Original

In [33]:
def supervised():
    """Train the sequence model, evaluate it, save it, reload it and re-evaluate.

    The reloaded copy is scored with custom_eval on the same held-out split that
    eval_model used for the in-memory model.
    """
    # Current best adamPoisson32_seq_model_low_dim
    model_path = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/adamPoisson32_seq_model_low_dimTESTNEW'

    sents = read_file_to_sents()
    print(sents[:5])

    model, x_test, y_test = seq_model(sents)

    # Eval sequential model
    eval_model(model, x_test, y_test)

    model.save(model_path)
    reloaded = load_model(model_path)

    custom_eval(reloaded, x_test, y_test)

Original

In [10]:
def load_and_eval_model():
    """Score a saved model on the full training file and write its segmentations.

    The data is encoded with the saved mappings (load=True), evaluated both with
    Keras' evaluate and with custom_eval, and the predicted graphemes are written
    out through print_results_to_file.
    """
    sents = read_file_to_sents()
    vocab, tags = pad(sents)
    x, y, _ = encode(vocab, tags, sents, load=True)

    saved_model = load_model('/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/adamPoisson32_seq_model_low_dim')

    metrics = saved_model.evaluate(x, y)
    print('Default Accuracy: ', (metrics[1] * 100))

    tag_strings = custom_eval(saved_model, x, y)
    words = undo_encode_x(x)

    # words + predicted tags -> (word, hyphenated graphemes) rows -> results file
    word_tag_pairs = recombine(words, tag_strings)
    print_results_to_file(to_graphemes(word_tag_pairs))


In [72]:
def load_eval_test():
    """Run the saved model on the test words and write the predicted graphemes.

    Progress messages and a few sample rows are printed along the way.
    """
    print("Reading data")
    words = read_test_data()

    print("Encoding data")
    x, _ = encode_test(pad_test(words), words)

    print("Loading model")
    saved_model = load_model('/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/adamPoisson32_seq_model_low_dimTESTNEW')

    print("Predicting")
    raw_preds = saved_model.predict(x)

    print("Preparing predictions")
    tag_strings = undo_encode_y(raw_preds)
    decoded_words = undo_encode_x(x)

    # Spot check: encoded rows next to their decoded words.
    print(x[25:30])
    print(decoded_words[25:30])

    word_tag_pairs = recombine(decoded_words, tag_strings)
    print(word_tag_pairs[:5])

    print("Converting to graphemes")
    grapheme_rows = to_graphemes(word_tag_pairs)

    print("Printing to file")
    print_results_to_file(grapheme_rows)

Original

In [28]:
def print_results_to_file(data, path='/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/supervised_resultsTEST.tsv'):
    """Write result rows to a tab-separated file, one row per line.

    The file is written as UTF-8 explicitly, matching the encoding the input files
    are read with, so accented characters do not depend on the platform default.

    Args:
        data: Iterable of rows (e.g. (word, graphemes) tuples).
        path: Output file. Defaults to the Drive path used originally.
    """
    with open(path, 'w', newline='', encoding='utf-8') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
        writer.writerows(data)

Original

In [12]:
def recombine(words, preds):
    """Pair every word with the prediction at the same position.

    Returns a list of (word, prediction) tuples, one per entry of `words`.
    """
    return [(word, preds[pos]) for pos, word in enumerate(words)]

Original

In [64]:
def undo_encode_x(x, test=False):
    """Decode rows of character indices back into words.

    Decoding of a row stops at the first padding index.

    Args:
        x: Iterable of rows of character indices.
        test: Kept for backward compatibility. When True, index 50 is treated as the
            padding value, exactly as before. Otherwise the '<PAD>' index of the
            saved mapping is used (falling back to the former constant 51).

    Returns:
        List of decoded words (strings).

    Raises:
        ValueError: if an index is not present in the saved mapping.
    """
    with open(word2index_file) as infile:
        word2index = json.load(infile)

    if test == True:
        pad_val = 50
    else:
        pad_val = word2index.get("<PAD>", 51)

    # Invert the mapping once instead of scanning all values for every character.
    # setdefault keeps the first character for an index, as list.index() did.
    index2word = {}
    for char, idx in word2index.items():
        index2word.setdefault(idx, char)

    all_words = []

    for word in x:
        cur_word = []
        for letter in word:
            if letter == pad_val:
                break
            try:
                cur_word.append(index2word[letter])
            except KeyError:
                raise ValueError("{} is not in the saved word2index mapping".format(letter))

        all_words.append(''.join(cur_word))

    return all_words

Original

In [14]:
def undo_encode(x, y):
    """Decode encoded words and one-hot tags back into (word, tag string) pairs.

    The one-hot columns that stand for 'I' and 'B' are taken from the saved
    tag2index mapping; the former hard-coded positions (I=0, B=1) are only used if
    the mapping lacks those keys. Positions where neither column equals 1
    (e.g. padding) contribute no tag.

    Args:
        x: Iterable of rows of character indices.
        y: Iterable of rows of one-hot tag vectors.

    Returns:
        List of (word, tags) tuples, one per row of `x`.

    Raises:
        ValueError: if a character index is not present in the saved mapping.
    """
    with open(word2index_file) as infile:
        word2index = json.load(infile)

    with open(tag2index_file) as infile:
        tag2index = json.load(infile)

    pad_val = word2index.get("<PAD>", 51)
    i_col = tag2index.get('I', 0)
    b_col = tag2index.get('B', 1)

    # Invert the mapping once instead of scanning all values for every character.
    index2word = {}
    for char, idx in word2index.items():
        index2word.setdefault(idx, char)

    all_words = []
    for word in x:
        cur_word = []
        for letter in word:
            if letter == pad_val:
                break
            try:
                cur_word.append(index2word[letter])
            except KeyError:
                raise ValueError("{} is not in the saved word2index mapping".format(letter))

        all_words.append(''.join(cur_word))

    all_tags = []
    for tags in y:
        cur_tags = []
        for tag in tags:
            if tag[i_col] == 1:
                cur_tags.append('I')
            elif tag[b_col] == 1:
                cur_tags.append('B')

        all_tags.append(''.join(cur_tags))

    return [(all_words[i], all_tags[i]) for i in range(len(all_words))]

Original

In [15]:
def undo_encode_y(y):
    """Decode rows of per-position tag scores into strings of 'B' / 'I'.

    A position yields 'B' when the score in the 'B' column rounds to 1, otherwise 'I'
    when the score in the 'I' column rounds to 1; any other position is skipped.
    The columns are looked up in the saved tag2index mapping; the former hard-coded
    positions (B=0, I=1) are used only if the mapping lacks those keys.

    NOTE(review): because positions are skipped, a returned tag string can be shorter
    than the word it belongs to.

    Args:
        y: Iterable of rows of tag score vectors (one-hot targets or model predictions).

    Returns:
        List of tag strings, one per row.
    """
    with open(tag2index_file) as infile:
        tag2index = json.load(infile)

    b_col = tag2index.get('B', 0)
    i_col = tag2index.get('I', 1)

    all_tags = []

    for tags in y:
        cur_tags = []
        for tag in tags:
            if round(tag[b_col]) == 1:
                cur_tags.append('B')
            elif round(tag[i_col]) == 1:
                cur_tags.append('I')

        all_tags.append(''.join(cur_tags))

    return all_tags

Original

In [16]:
def calc_accuracy(preds, y_test):
    """Percentage of positions where the prediction equals the reference exactly.

    Args:
        preds: Sequence of predictions (e.g. tag strings, one per word).
        y_test: Sequence of references, at least as long as `preds`.

    Returns:
        Accuracy in percent (0-100); 0.0 when `preds` is empty.
    """
    # len() rather than truthiness so numpy arrays are accepted as well.
    if len(preds) == 0:
        return 0.0

    total_right = sum(1 for i in range(len(preds)) if preds[i] == y_test[i])

    return 100 * (total_right / len(preds))

Original

In [17]:
def calc_precision(preds, y_test):
    """Precision of the 'I' tag, in percent.

    Tags are compared position by position. If a predicted tag string is longer than
    its reference, the extra positions are ignored instead of raising IndexError.

    Args:
        preds: Sequence of predicted tag strings.
        y_test: Sequence of reference tag strings, at least as long as `preds`.

    Returns:
        100 * TP / (TP + FP) for tag 'I'; 0.01 when 'I' was never predicted at a
        compared position (this non-zero value is the existing behavior).
    """
    true_pos = 0
    false_pos = 0

    for i in range(0, len(preds)):
        for pred_tag, true_tag in zip(preds[i], y_test[i]):
            if pred_tag != 'I':
                continue
            if true_tag == 'I':
                true_pos += 1
            elif true_tag == 'B':
                false_pos += 1

    if (true_pos + false_pos) == 0:
        return 0.01

    return 100 * (true_pos / (true_pos + false_pos))

Original

In [18]:
def calc_recall(preds, y_test):
    """Recall of the 'I' tag, in percent.

    Tags are compared position by position. If a predicted tag string is longer than
    its reference, the extra positions are ignored instead of raising IndexError.

    Args:
        preds: Sequence of predicted tag strings.
        y_test: Sequence of reference tag strings, at least as long as `preds`.

    Returns:
        100 * TP / (TP + FN) for tag 'I'; 0 when no compared reference position is 'I'.
    """
    true_pos = 0
    false_neg = 0

    for i in range(0, len(preds)):
        for pred_tag, true_tag in zip(preds[i], y_test[i]):
            if true_tag != 'I':
                continue
            if pred_tag == 'I':
                true_pos += 1
            elif pred_tag == 'B':
                false_neg += 1

    if true_pos + false_neg == 0:
        return 0

    return 100 * (true_pos / (true_pos + false_neg))

Original

In [19]:
def custom_eval(model, x, y):
    """Predict with `model` and print accuracy, precision, recall and F-score.

    Predictions and references are both decoded to tag strings with undo_encode_y
    before the metrics are computed.

    Args:
        model: Object with a `predict(x)` method.
        x: Encoded inputs passed to `model.predict`.
        y: Encoded reference tags.

    Returns:
        The decoded predictions (list of tag strings).
    """
    preds = undo_encode_y(model.predict(x))
    y_clean = undo_encode_y(y)

    acc = calc_accuracy(preds, y_clean)

    print("Custom calculated Accuracy: ", acc)

    prec = calc_precision(preds, y_clean)

    print("Precision: ", prec)

    recall = calc_recall(preds, y_clean)

    print("Recall: ", recall)

    # Precision and recall can both be 0; report an F-score of 0 instead of dividing by zero.
    denominator = prec + recall
    fscore = (2 * (prec * recall)) / denominator if denominator else 0.0

    print("Fscore: ", fscore)

    return preds

Original

In [20]:
def dec_tree_eval(preds, y_test):
    """Print accuracy, precision, recall and F-score for already-decoded predictions.

    Args:
        preds: Sequence of predicted tags.
        y_test: Sequence of reference tags.
    """
    acc = calc_accuracy(preds, y_test)

    print("Custom calculated Accuracy: ", acc)

    prec = calc_precision(preds, y_test)

    print("Precision: ", prec)

    recall = calc_recall(preds, y_test)

    print("Recall: ", recall)

    # Precision and recall can both be 0; report an F-score of 0 instead of dividing by zero.
    denominator = prec + recall
    fscore = (2 * (prec * recall)) / denominator if denominator else 0.0

    print("Fscore: ", fscore)

Original

In [21]:
def to_graphemes(data):
    """Insert '-' between graphemes according to per-character B/I tags.

    A character is followed by '-' unless it is the last character of the word or
    the tag of the next character is 'I'. If the tag string is shorter than the
    word, the missing tags are treated as boundaries instead of raising IndexError.

    Args:
        data: Iterable of (word, tags) pairs, where tags[i] belongs to word[i].

    Returns:
        List of (word, hyphenated word) tuples.
    """
    graph_data = []

    for word_pair in data:
        word = word_pair[0]
        grap = word_pair[1]

        last = len(word) - 1
        pieces = []

        for i, char in enumerate(word):
            next_continues = i + 1 < len(grap) and grap[i + 1] == 'I'
            if i == last or next_continues:
                pieces.append(char)
            else:
                pieces.append(char + '-')

        graph_data.append((word, ''.join(pieces)))

    return graph_data


Original

In [22]:
def split_x_y(data):
    """Flatten tagged words into two parallel arrays.

    Every (character, tag) pair of every word contributes one entry; word
    boundaries are not kept.

    Returns:
        (characters, tags) as two numpy arrays of equal length.
    """
    pairs = [pair for word in data for pair in word]
    chars = [pair[0] for pair in pairs]
    labels = [pair[1] for pair in pairs]

    return np.asarray(chars), np.asarray(labels)

Mostly original, some work from https://nlpforhackers.io/training-pos-tagger/amp/

In [23]:
def dec_tree():
    """Train a decision tree on single label-encoded characters and print its metrics.

    Each character is an independent sample whose only feature is its encoded
    identity. 20% of the samples are held out (random_state=50) for evaluation.
    """
    data = read_file_to_sents()
    chars, labels = split_x_y(data)

    x_train, x_test, y_train, y_test = train_test_split(chars, labels, test_size=0.2, random_state=50)

    # Keep the string-valued reference tags for the final evaluation.
    y_test_clean = y_test

    # Characters -> integer codes, shaped as a single feature column.
    char_encoder = LabelEncoder()
    char_encoder.fit(x_train)
    x_train = char_encoder.transform(x_train).reshape(-1, 1)
    x_test = char_encoder.transform(x_test).reshape(-1, 1)

    # Tags -> integer codes.
    tag_encoder = LabelEncoder()
    tag_encoder.fit(y_train)
    y_train = tag_encoder.transform(y_train).reshape(-1, 1)
    y_test = tag_encoder.transform(y_test).reshape(-1, 1)

    clf = Pipeline([
        ('classifier', DecisionTreeClassifier(criterion='entropy')),
    ], verbose=1)
    clf.fit(x_train, y_train)

    # Map the predicted codes back to tag strings before scoring.
    clean_preds = tag_encoder.inverse_transform(clf.predict(x_test))

    dec_tree_eval(clean_preds, y_test_clean)

Original

In [24]:
def hmm_eval(model, test, model_type):
    """Print accuracy, precision, recall and the F-score of tag 'I' for a tagger.

    Args:
        model: Object exposing `accuracy(test)`, `precision(test)` and `recall(test)`;
            the latter two must return mappings that contain the key 'I'.
        test: Evaluation data passed unchanged to those methods.
        model_type: Label printed in front of every metric.
    """
    acc = model.accuracy(test)

    print(model_type, " Accuracy: ", acc)

    prec = model.precision(test)

    print(model_type, ' Precision: ', prec)

    rec = model.recall(test)

    print(model_type, ' Recall: ', rec)

    # Precision and recall of 'I' can both be 0; report 0 instead of dividing by zero.
    denominator = prec['I'] + rec['I']
    fscore = (2 * (prec['I'] * rec['I'])) / denominator if denominator else 0.0

    print(model_type, ' FScore (I): ', fscore)

    print('\n\n')

Original

In [None]:
def HMMs():
    """Train NLTK unigram, bigram and trigram taggers and print their metrics.

    The first 75% of the tagged words are used for training and the rest for
    evaluation. Each tagger is trained and evaluated before the next one is built.
    """
    data = read_file_to_sents()

    # Loosely based on work from https://nlpforhackers.io/training-pos-tagger/amp/
    cutoff = int(.75 * len(data))
    train, test = data[:cutoff], data[cutoff:]

    # Original
    taggers = (
        ('Unigram', nltk.UnigramTagger),
        ('Bigram', nltk.BigramTagger),
        ('Trigram', nltk.TrigramTagger),
    )
    for label, tagger_cls in taggers:
        hmm_eval(tagger_cls(train), test, label)

Original

In [None]:
# Train and score the unigram/bigram/trigram taggers. HMMs() and the functions it calls must already be defined in the kernel.
HMMs()

NameError: ignored

Original

In [None]:
# Train the per-character decision tree and print its metrics.
dec_tree()

Reading data...: 12812it [00:00, 74704.00it/s]


[Pipeline] ........ (step 1 of 1) Processing classifier, total=   0.0s
Custom calculated Accuracy:  97.15938376954355
Precision:  75.51637279596977
Recall:  99.33730947647449
Fscore:  85.80423583285634


Original

In [38]:
# Score the saved sequence model on the training file and write its segmentations to the results file.
load_and_eval_model()

Reading data...: 12812it [00:00, 147273.83it/s]


Default Accuracy:  99.98655915260315
Custom calculated Accuracy:  99.77364970340305
Precision:  99.77663907502298
Recall:  99.81598317560463
Fscore:  99.79630724751954


In [73]:
# Run the saved sequence model on the test words and write the predicted graphemes to the results file.
load_eval_test()

Reading data


Reading data...: 1427it [00:00, 144799.12it/s]

1427
Encoding data
Loading model





Predicting
Preparing predictions
[[20 34 47 23 21 15 18 51 51 51 51 51 51 51 51 51 51 51]
 [20 13 28 13 21 15  0 34 51 51 51 51 51 51 51 51 51 51]
 [50 23  3 15  2 20 36 18 51 51 51 51 51 51 51 51 51 51]
 [34  2 34 51 51 51 51 51 51 51 51 51 51 51 51 51 51 51]
 [18 23 47 23  3 24  2 51 51 51 51 51 51 51 51 51 51 51]]
['seramik', 'sodomize', 'Ravinsèk', 'ene', 'karavàn']
[('Oradye', 'BBBBBB'), ('bous', 'BBIB'), ('titan', 'BBBBI'), ('refi', 'BBBB'), ('netwayè', 'BBBBBBB')]
Converting to graphemes
Printing to file


Currently, the best performance is with an output dim of 10, which is better than an output dim of 50 by approx. 0.4%.

Original

In [46]:
# Train the sequence model, evaluate it, save it to Drive, then reload it and print the custom metrics.
supervised()

Reading data...: 12812it [00:00, 151729.48it/s]


[[('k', 'B'), ('o', 'B'), ('n', 'I'), ('s', 'B'), ('i', 'B'), ('l', 'B'), ('t', 'B'), ('a', 'B'), ('n', 'I')], [('d', 'B'), ('e', 'B'), ('p', 'B'), ('o', 'B'), ('t', 'B'), ('w', 'B'), ('a', 'B')], [('s', 'B'), ('o', 'B'), ('s', 'B'), ('y', 'B'), ('o', 'B'), ('p', 'B'), ('w', 'B'), ('o', 'B'), ('f', 'B'), ('e', 'B'), ('s', 'B'), ('y', 'B'), ('o', 'B'), ('n', 'B'), ('è', 'B'), ('l', 'B')], [('v', 'B'), ('e', 'B'), ('j', 'B'), ('e', 'B'), ('t', 'B'), ('a', 'B'), ('l', 'B')], [('r', 'B'), ('e', 'B'), ('p', 'B'), ('i', 'B'), ('b', 'B'), ('l', 'B'), ('i', 'B'), ('y', 'B'), ('e', 'B')]]
[INFO] training network...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
[0.0008808380807749927, 0.999696671962738]




Custom calculated Accuracy:  99.53198127925117
Precision:  99.19354838709677
Recall:  99.86468200270636
Fscore:  99.52798381658799
