<a href="https://colab.research.google.com/github/GreihMurray/NLP-3/blob/Super_Murray/supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import joblib
from sklearn.metrics import accuracy_score
import pickle
import nltk
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
def read_file_to_sents():
    all_data = []
    with open("/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/train.tsv", encoding="utf-8") as file:
        f = csv.reader(file, delimiter="\t")
        for line in tqdm(f, desc="Reading data..."):
            word = line[0]
            graphemes = line[1].split('-')

            cur_word = []

            for i in range(0, len(graphemes)):
                if len(graphemes[i]) == 1:
                    cur_word.append((word[i], 'B'))
                else:
                    cur_word.append((word[i], 'B'))
                    for j in range(i+1, (i + len(graphemes[i]))):
                        cur_word.append((word[j], 'I'))
                        i += j

            all_data.append(cur_word)

    return all_data

In [13]:
def pad(data):
  vocab = list(set([w for sent in data for (w,t) in sent]))
  vocab.append('<PAD>')
  tags = list(set([t for sent in data for (w,t) in sent]))
  tags.append('<PAD>')

  return vocab, tags

In [100]:
def encode(vocab, tags, data):
  max_len = max([len(i) for i in data])
  word2index = {w: i for i, w in enumerate(vocab)}
  tag2index = {t: i for i, t in enumerate(tags)}
  onehot = [[word2index[w[0]] for w in s] for s in data]
  X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)  

  onehot_y = [[tag2index[w[1]] for w in s] for s in data]
  y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
  y = to_categorical(y, num_classes=len(tags))


  # Used for saving word2index and tag2index in order to encode additional data in the same manner
  # Currently commented out due to issues with loading model
  # with open("/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/word2index.json", "w") as outfile:
  #   json.dump(word2index, outfile)

  # with open("/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/tag2index.json", "w") as outfile:
  #   json.dump(tag2index, outfile)

  return X, y, max_len

In [43]:
def seq_model(data):
  #Original
    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
  
  
  # Dr. Scannell
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.05)))
    model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
    model.compile(optimizer="adam", loss="poisson", metrics=["accuracy"])
  # From https://towardsdatascience.com/hyperparameter-tuning-with-kerastuner-and-tensorflow-c4a4d690b31a
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

    print("[INFO] training network...")
    sgd = SGD(0.05)
    history = model.fit(X_train, y_train, batch_size=32, epochs=50, validation_split=0.15, verbose=1, callbacks=stop_early)

    return model, X_test, y_test

In [7]:
def calc_precision(preds, y_test):
    true_pos = 0
    false_pos = 0

    for i in range(0, len(preds)):
        if str(preds[i]) == 'B' and str(y_test[i]) == 'B':
            true_pos += 1
        if str(preds[i]) == 'B' and str(y_test[i]) == 'I':
            false_pos += 1

    if (true_pos + false_pos) == 0:
        return 0.01

    precision = 100 * (true_pos / (true_pos + false_pos))

    return precision

In [8]:
def calc_recall(preds, y_test):
    true_pos = 0
    false_neg = 0

    for i in range(0, len(preds)):
        if str(preds[i]) == 'B' and str(y_test[i]) == 'B':
            true_pos += 1
        if str(preds[i]) == 'I' and str(y_test[i]) == 'B':
            false_neg += 1

    if true_pos + false_neg == 0:
        return 0
        
    recall = 100 * (true_pos / (true_pos + false_neg))

    return recall

In [9]:
def eval_model(model, x_test, y_test):
    eval = model.evaluate(x_test, y_test)
    print(eval)

In [10]:
def supervised():
    data = read_file_to_sents()

    model, x_test, y_test = seq_model(data)

    print(eval_model(model, x_test, y_test)) # Eval sequential model

    #model.save('/content/gdrive/MyDrive/Colab_Notebooks/NLP/kreole/adamPoisson32_seq_model')

In [49]:
supervised()

Reading data...: 12812it [00:00, 33530.58it/s]


{'F': 0, 'e': 1, 'y': 2, 'd': 3, 'c': 4, 'V': 5, 'O': 6, 'S': 7, 'D': 8, 'E': 9, 'ò': 10, 'p': 11, 'A': 12, 'j': 13, 'k': 14, 'M': 15, 'T': 16, 'o': 17, 'u': 18, 'G': 19, 'r': 20, 'R': 21, 'P': 22, 'i': 23, 'v': 24, 'l': 25, 'è': 26, 'a': 27, 'W': 28, 'f': 29, 'B': 30, 's': 31, 'Z': 32, 'b': 33, 'C': 34, 'L': 35, 'z': 36, 'Y': 37, 'N': 38, 'w': 39, 'h': 40, 'J': 41, 'I': 42, 'È': 43, 'n': 44, 'à': 45, 'Ò': 46, 't': 47, 'm': 48, 'g': 49, 'K': 50, '<PAD>': 51}
[[14 17 44 44 31 23 25 47 27 51 51 51 51 51 51 51 51 51]
 [ 3  1 11 17 47 39 27 51 51 51 51 51 51 51 51 51 51 51]
 [31 17 31  2 17 11 39 17 29  1 31  2 17 44 26 25 51 51]
 [24  1 13  1 47 27 25 51 51 51 51 51 51 51 51 51 51 51]
 [20  1 11 23 33 25 23  2  1 51 51 51 51 51 51 51 51 51]]
[[14, 17, 44, 44, 31, 23, 25, 47, 27], [3, 1, 11, 17, 47, 39, 27], [31, 17, 31, 2, 17, 11, 39, 17, 29, 1, 31, 2, 17, 44, 26, 25], [24, 1, 13, 1, 47, 27, 25], [20, 1, 11, 23, 33, 25, 23, 2, 1], [48, 27, 31, 17, 44], [27, 14, 47], [1, 39, 17, 11, 1, 2, 

KeyboardInterrupt: ignored