<a href="https://colab.research.google.com/github/GreihMurray/NLP-3/blob/Super_Murray/supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import joblib
from sklearn.metrics import accuracy_score
import pickle
import nltk
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

In [2]:
# Colab-only setup: mount Google Drive at /content/gdrive so the dataset,
# the saved index JSON files and the saved model referenced by the functions
# below (all under /content/gdrive/MyDrive/...) can be read and written.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [170]:
def read_file_to_sents(path="/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/train.tsv"):
    """Read a tab-separated training file and tag every character as B or I.

    Each row is expected to hold the plain word in column 0 and the same word
    with its segments separated by '-' in column 1. Only column 1 is used.

    Args:
        path: Location of the TSV file. Defaults to the training file on the
            mounted Google Drive, so existing zero-argument calls still work.

    Returns:
        A list with one entry per row. Each entry is a list of
        ``(character, tag)`` tuples where the first character of every
        '-'-separated segment is tagged 'B' and every other character 'I'.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    all_data = []
    # newline="" is what the csv module asks for so that it can handle line
    # endings itself.
    with open(path, encoding="utf-8", newline="") as file:
        reader = csv.reader(file, delimiter="\t")
        for line in tqdm(reader, desc="Reading data..."):
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # skip them instead of crashing on line[1].
            if not line:
                continue

            cur_word = []
            for grapheme in line[1].split('-'):
                for position, char in enumerate(grapheme):
                    cur_word.append((char, 'B' if position == 0 else 'I'))

            all_data.append(cur_word)

    return all_data

In [13]:
def pad(data):
    """Collect the character vocabulary and the tag set, each ending in '<PAD>'.

    The entries are sorted before '<PAD>' is appended. Iterating a ``set`` of
    strings directly gives an order that can change between interpreter runs,
    which would make the integer encoding built from these lists
    non-reproducible.

    Args:
        data: List of words, each a list of ``(character, tag)`` tuples.

    Returns:
        ``(vocab, tags)``: two lists of unique values in sorted order, each
        with '<PAD>' appended as the last element.
    """
    vocab = sorted({w for sent in data for (w, t) in sent})
    vocab.append('<PAD>')
    tags = sorted({t for sent in data for (w, t) in sent})
    tags.append('<PAD>')

    return vocab, tags

In [127]:
def encode(vocab, tags, data, load=False):
    """Turn tagged words into padded index sequences and one-hot tag sequences.

    Args:
        vocab: List of characters (including '<PAD>') used to build the
            character-to-index mapping when ``load`` is False.
        tags: List of tags (including '<PAD>') used to build the tag-to-index
            mapping when ``load`` is False.
        data: List of words, each a list of ``(character, tag)`` tuples.
        load: When False, build the mappings from ``vocab``/``tags`` and save
            them as JSON on Google Drive. Otherwise read the previously saved
            mappings so new data is encoded the same way.

    Returns:
        ``(X, y, max_len)`` where ``X`` holds the character indices of every
        word post-padded to ``max_len``, ``y`` holds the matching one-hot
        encoded tags, and ``max_len`` is the length of the longest word in
        ``data``.

    Raises:
        KeyError: If ``data`` contains a character or tag missing from the
            mapping, or the mapping has no '<PAD>' entry.
        ValueError: If ``data`` is empty.
    """
    word2index_path = "/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/word2index.json"
    tag2index_path = "/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/tag2index.json"

    max_len = max([len(i) for i in data])

    if load is False:
        word2index = {w: i for i, w in enumerate(vocab)}
        tag2index = {t: i for i, t in enumerate(tags)}
    else:
        with open(word2index_path) as infile:
            word2index = json.load(infile)

        with open(tag2index_path) as infile:
            tag2index = json.load(infile)

    # Pad with the mapping's own '<PAD>' index (not len(vocab) - 1) so the
    # padding stays correct when the mapping was loaded from disk.
    encoded_x = [[word2index[w[0]] for w in s] for s in data]
    X = pad_sequences(maxlen=max_len, sequences=encoded_x, padding="post", value=word2index["<PAD>"])

    encoded_y = [[tag2index[w[1]] for w in s] for s in data]
    y = pad_sequences(maxlen=max_len, sequences=encoded_y, padding="post", value=tag2index["<PAD>"])
    # Size the one-hot axis from the mapping actually used for encoding.
    y = to_categorical(y, num_classes=len(tag2index))

    # Save the mappings so additional data can be encoded in the same manner.
    # Only done for freshly built mappings; in load mode the files on disk are
    # the source of truth and are left untouched.
    if load is False:
        with open(word2index_path, "w") as outfile:
            json.dump(word2index, outfile)

        with open(tag2index_path, "w") as outfile:
            json.dump(tag2index, outfile)

    return X, y, max_len

In [43]:
def seq_model(data, random_state=None):
    """Build and train a bidirectional LSTM character tagger.

    Args:
        data: List of words, each a list of ``(character, tag)`` tuples.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            held-out split can be reproduced. ``None`` keeps the original
            unseeded behaviour.

    Returns:
        ``(model, X_test, y_test)``: the trained model and the 10% of the
        encoded data that was held out from training.
    """
    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data)

    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, random_state=random_state
    )

    # Architecture credited to "Dr. Scannell" in the original notebook.
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.05)))
    model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
    # NOTE(review): a "poisson" loss on a softmax output is unusual
    # (categorical_crossentropy is the conventional choice). Left unchanged
    # because the saved model name suggests it was a deliberate experiment.
    model.compile(optimizer="adam", loss="poisson", metrics=["accuracy"])

    # From https://towardsdatascience.com/hyperparameter-tuning-with-kerastuner-and-tensorflow-c4a4d690b31a
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

    print("[INFO] training network...")
    # Keras documents `callbacks` as a list of callback instances.
    model.fit(
        X_train,
        y_train,
        batch_size=32,
        epochs=50,
        validation_split=0.15,
        verbose=1,
        callbacks=[stop_early],
    )

    return model, X_test, y_test

In [7]:
def calc_precision(preds, y_test):
    """Return the precision of the 'B' tag as a percentage.

    Args:
        preds: Sequence of predicted tags; each is compared via ``str()``.
        y_test: Sequence of gold tags aligned with ``preds``.

    Returns:
        ``100 * TP / (TP + FP)`` where a true positive is a predicted 'B'
        whose gold tag is 'B' and a false positive is a predicted 'B' whose
        gold tag is 'I'. If neither occurs, 0.01 is returned instead.
    """
    hits = 0
    misses = 0

    for idx, predicted in enumerate(preds):
        # Only predicted 'B' tags take part in precision.
        if str(predicted) != 'B':
            continue

        actual = str(y_test[idx])
        if actual == 'B':
            hits += 1
        elif actual == 'I':
            misses += 1

    predicted_boundaries = hits + misses
    if predicted_boundaries == 0:
        # Small non-zero placeholder kept from the original implementation.
        return 0.01

    return 100 * (hits / predicted_boundaries)

In [8]:
def calc_recall(preds, y_test):
    """Return the recall of the 'B' tag as a percentage.

    Args:
        preds: Sequence of predicted tags; each is compared via ``str()``.
        y_test: Sequence of gold tags aligned with ``preds``.

    Returns:
        ``100 * TP / (TP + FN)`` where a true positive is a gold 'B' predicted
        as 'B' and a false negative is a gold 'B' predicted as 'I'. Returns 0
        when neither occurs.
    """
    found = 0
    missed = 0

    for idx, predicted in enumerate(preds):
        guess = str(predicted)
        # Predictions other than 'B'/'I' are ignored entirely.
        if guess not in ('B', 'I'):
            continue
        # Only gold 'B' positions take part in recall.
        if str(y_test[idx]) != 'B':
            continue

        if guess == 'B':
            found += 1
        else:
            missed += 1

    gold_boundaries = found + missed
    if gold_boundaries == 0:
        return 0

    return 100 * (found / gold_boundaries)

In [9]:
def eval_model(model, x_test, y_test):
    """Evaluate ``model`` on the given data, print the result and return it.

    Args:
        model: Object exposing ``evaluate(x, y)`` (e.g. a Keras model).
        x_test: Inputs passed through to ``model.evaluate``.
        y_test: Targets passed through to ``model.evaluate``.

    Returns:
        Whatever ``model.evaluate`` returned, so callers can use the metrics
        instead of only seeing them printed.
    """
    # Named `metrics` rather than `eval` to avoid shadowing the builtin.
    metrics = model.evaluate(x_test, y_test)
    print(metrics)
    return metrics

In [104]:
def supervised(model_path='/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/adamPoisson32TEST_seq_model'):
    """Train the sequence model, evaluate it, then save, reload and re-evaluate.

    The second evaluation runs on the reloaded copy with the same held-out
    data, which shows whether the model survives a save/load round trip.

    Args:
        model_path: Where the trained model is saved and then reloaded from.
            Defaults to the original location on the mounted Google Drive. A
            single parameter keeps the save and load paths from drifting apart.
    """
    data = read_file_to_sents()

    model, x_test, y_test = seq_model(data)

    eval_model(model, x_test, y_test)  # Eval freshly trained model

    model.save(model_path)

    new_model = load_model(model_path)

    eval_model(new_model, x_test, y_test)  # Eval the reloaded copy

In [174]:
def load_and_eval_model(model_path='/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/adamPoisson32TEST_seq_model'):
    """Load a previously saved model and evaluate it on the whole training file.

    The data is encoded with the index mappings saved on disk (``load=True``)
    so that it matches the encoding the saved model was trained with. Note
    that the evaluation covers every row of the training file, not only a
    held-out split.

    Args:
        model_path: Location of the saved model. Defaults to the original
            location on the mounted Google Drive.
    """
    data = read_file_to_sents()

    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data, load=True)

    # Sanity check: prints the first few words decoded back from the indices.
    undo_encode(x, y)

    new_model = load_model(model_path)

    eval_model(new_model, x, y)

In [175]:
def undo_encode(x, y, word2index=None):
    """Decode padded index sequences back into words and print the first five.

    Args:
        x: Iterable of index sequences, one per word.
        y: Unused; kept so existing ``undo_encode(x, y)`` calls keep working.
        word2index: Optional character-to-index mapping. When omitted, the
            mapping saved on Google Drive is loaded.

    Returns:
        List of decoded words. Decoding of a word stops at the first index
        equal to the mapping's '<PAD>' entry.

    Raises:
        KeyError: If a sequence contains an index that is not in the mapping.
    """
    if word2index is None:
        with open("/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/word2index.json") as infile:
            word2index = json.load(infile)

    # Invert the mapping once instead of scanning it for every character.
    index2word = {}
    for char, index in word2index.items():
        index2word.setdefault(index, char)

    # Look the padding index up instead of hard-coding it (was the literal 51).
    pad_index = word2index.get("<PAD>")

    all_data = []
    for word in x:
        cur_word = []
        for letter in word:
            letter = int(letter)
            if letter == pad_index:
                break
            cur_word.append(index2word[letter])

        all_data.append(''.join(cur_word))

    print(all_data[:5])

    return all_data

In [176]:
# Needs artefacts already on Drive: the saved model that supervised() writes
# and the word2index/tag2index JSON files that encode() writes.
load_and_eval_model()

Reading data...: 12812it [00:00, 152955.14it/s]


<class 'dict'>
<class 'dict'>
['konsiltan', 'depotwa', 'sosyopwofesyonèl', 'vejetal', 'repibliye']


KeyboardInterrupt: ignored

In [103]:
# Full pipeline: read data, train, evaluate, save the model, reload it and
# evaluate the reloaded copy on the same held-out split.
supervised()

Reading data...: 12812it [00:00, 176605.76it/s]


[INFO] training network...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
[0.3346741795539856, 0.9983532428741455]
None




[0.3346741795539856, 0.9983532428741455]
