<a href="https://colab.research.google.com/github/GreihMurray/NLP-2/blob/master/irish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [275]:
# Install/upgrade KerasTuner (imported in the next cell as `keras_tuner`); -q keeps pip's output quiet.
!pip install -q -U keras-tuner

In [276]:
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import joblib
from sklearn.metrics import accuracy_score
import pickle
import nltk
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt

In [277]:
# Colab-only: mount Google Drive at /content/gdrive so the /content/gdrive/MyDrive/... paths
# used by the functions below resolve.
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [278]:
#Original
def read_file_to_sents(path="train.tsv"):
    """Read a tab-separated (token, tag) file and group its rows into sentences.

    A row whose first column is ``<S>`` marks a sentence boundary. Every other
    row contributes one ``(token, tag)`` tuple to the current sentence.

    Args:
        path: File to read. Defaults to ``"train.tsv"`` in the working directory.

    Returns:
        A list of sentences, each a non-empty list of ``(token, tag)`` tuples.

    Notes:
        * Blank rows are skipped; rows with fewer than two columns are printed
          together with their 1-based row number and skipped (both previously
          raised ``IndexError``).
        * The sentence still open at end-of-file is kept; previously it was
          silently dropped unless the file ended with an ``<S>`` row.
    """
    all_sents = []
    cur_sent = []

    with open(path, encoding="utf-8") as file:
        reader = csv.reader(file, delimiter="\t")

        for line_no, line in enumerate(tqdm(reader, desc="Reading data..."), start=1):
            if not line:
                # csv.reader yields [] for an empty line; nothing to record.
                continue

            # A first column whose first two characters strip down to 'N' is
            # normalised to the bare token 'N' (the tag column is left as-is).
            if line[0][0:2].strip() == 'N':
                line[0] = 'N'

            if line[0] == "<S>":
                if len(cur_sent) >= 1:
                    all_sents.append(cur_sent)
                cur_sent = []
                continue

            if len(line) < 2:
                # Malformed row (no tag column): report it and move on.
                print(line, line_no)
                continue

            cur_sent.append((line[0], line[1]))

    # Flush the trailing sentence when the file does not end with "<S>".
    if cur_sent:
        all_sents.append(cur_sent)

    return all_sents

In [279]:
#Original
def string_to_num(data):
    """Encode the values in `data` as integer labels.

    A fresh ``LabelEncoder`` is fitted on `data` and immediately applied to it;
    the encoder itself is discarded, so the mapping cannot be inverted later.
    """
    return LabelEncoder().fit_transform(data)

In [280]:
#Original
def defeature(data):
    """Return the ``'word'`` entry of every feature dict in `data`, in order."""
    return [entry['word'] for entry in data]

In [281]:
#From Challenge 1 https://github.com/GreihMurray/NLP
def pad(data):
    """Build the word vocabulary and the tag set for a tagged corpus.

    Args:
        data: Iterable of sentences, each an iterable of ``(word, tag)`` pairs.

    Returns:
        ``(vocab, tags)``: two lists of the distinct words / tags in `data`,
        each in sorted order and each followed by a final ``'<PAD>'`` entry.

    The entries are sorted so that the position of every word and tag is the
    same on every run. Iterating a plain ``set`` of strings gives an order
    that can change between interpreter sessions, which would make the
    indices derived from these lists disagree with a model saved earlier.

    Side effect: prints the vocabulary size (including ``'<PAD>'``).
    """
    vocab = sorted({w for sent in data for (w, t) in sent})
    vocab.append('<PAD>')
    print(len(vocab))

    tags = sorted({t for sent in data for (w, t) in sent})
    tags.append('<PAD>')

    return vocab, tags

In [282]:
#From Challenge 1 https://github.com/GreihMurray/NLP
def encode(vocab, tags, data, max_len=60):
    """Turn tagged sentences into padded index arrays for the sequence model.

    Args:
        vocab: List of words; a word's position is its integer id. The last
            entry is used as the padding id for the inputs.
        tags: List of tags; must contain ``'<PAD>'``, whose position is used
            as the padding id for the targets.
        data: Iterable of sentences, each a sequence of ``(word, tag)`` pairs.
            Every word/tag must be present in `vocab`/`tags` (``KeyError``
            otherwise).
        max_len: Length every sentence is padded (at the end) or truncated to.
            Defaults to 60, the value that used to be hard-coded.

    Returns:
        ``(X, y, max_len)`` where ``X`` is the padded word-id output of
        ``pad_sequences`` and ``y`` is the padded tag-id output passed through
        ``to_categorical`` with ``len(tags)`` classes.
    """
    word2index = {w: i for i, w in enumerate(vocab)}
    tag2index = {t: i for i, t in enumerate(tags)}

    # Integer ids per token (these are indices, not one-hot vectors).
    word_ids = [[word2index[pair[0]] for pair in sent] for sent in data]
    X = pad_sequences(maxlen=max_len, sequences=word_ids, padding="post", value=len(vocab) - 1)

    tag_ids = [[tag2index[pair[1]] for pair in sent] for sent in data]
    y = pad_sequences(maxlen=max_len, sequences=tag_ids, padding="post", value=tag2index["<PAD>"])
    y = to_categorical(y, num_classes=len(tags))

    return X, y, max_len

In [283]:
#Part original partly from Dr. Scannell
def seq_model(data, test_len):
    """Train a bidirectional-LSTM tagger and return it with the held-out split.

    Args:
        data: List of sentences (lists of ``(word, tag)`` pairs). The last
            `test_len` sentences are held out; the rest are used for training.
        test_len: Number of trailing sentences to hold out.

    Returns:
        ``(model, X_test, y_test)``: the fitted Keras model and the encoded
        held-out inputs/targets.

    Notes:
        The vocabulary and tag set are built from all of `data`, including the
        held-out sentences. 15% of the training portion is used as validation
        data, and training stops early when ``val_accuracy`` has not improved
        for 5 epochs.
    """
    # Original
    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data)

    # Everything before `split` is training data, the remainder is held out.
    split = len(data) - test_len
    X_train, X_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    # Dr. Scannell
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.05)))
    model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
    model.compile(optimizer="adam", loss="poisson", metrics=["accuracy"])

    # From https://towardsdatascience.com/hyperparameter-tuning-with-kerastuner-and-tensorflow-c4a4d690b31a
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

    print("[INFO] training network...")
    # `callbacks` is documented as a list of callback instances.
    model.fit(X_train, y_train, batch_size=1024, epochs=50, validation_split=0.15, verbose=1, callbacks=[stop_early])

    return model, X_test, y_test

In [284]:
#Original
def eval_model(model, x_test, y_test):
    """Evaluate `model` on ``(x_test, y_test)`` and return its metrics as a dict."""
    metrics = model.evaluate(x_test, y_test, return_dict=True)
    return metrics

Below work partially from https://nlpforhackers.io/training-pos-tagger/amp/

In [285]:
def dec_tree(X, y, chunk_size=31649, n_chunks=115,
             model_dir='/content/gdrive/MyDrive/Colab_Notebooks/NLP'):
    """Train one decision-tree pipeline per consecutive chunk of the data.

    Args:
        X: Sequence of feature dicts (input to ``DictVectorizer``).
        y: Sequence of labels aligned with `X`.
        chunk_size: Number of samples per chunk (default: the original 31649).
        n_chunks: Maximum number of chunks/models (default: the original 115).
        model_dir: Directory the pickled pipelines are written to, as
            ``model<i>.h5`` (a pickle despite the extension; the name is kept
            so that ``load_models`` finds the files).

    Returns:
        List of fitted pipelines, one per chunk, in chunk order.

    Notes:
        A new ``Pipeline`` is built for every chunk. Previously a single
        pipeline object was re-fitted and appended each time, so every entry
        of the returned list was the same object holding only the last fit.
        Training stops early if `X` runs out before `n_chunks` chunks.
    """
    print("Training Started")

    # Custom work below
    all_clfs = []

    for i in tqdm(range(0, n_chunks), desc="Training"):
        start, stop = i * chunk_size, (i + 1) * chunk_size
        X_chunk, y_chunk = X[start:stop], y[start:stop]

        if len(X_chunk) == 0:
            # No data left for this (or any later) chunk.
            break

        cur_clf = Pipeline([
            ('vectorizer', DictVectorizer(sparse=False)),
            ('classifier', DecisionTreeClassifier(criterion='entropy')),
        ], verbose=1)
        cur_clf.fit(X_chunk, y_chunk)

        all_clfs.append(cur_clf)

        file_loc = model_dir + '/model' + str(i) + ".h5"

        with open(file_loc, 'wb') as pickle_file:
            pickle.dump(cur_clf, pickle_file)

    print('Training completed')

    return all_clfs

Custom work below


In [286]:
def acc_score(all_clfs, x_test, y_test, n_slices=250, n_eval=50):
    """Print and return the mean accuracy of several classifiers.

    The test set is cut into `n_slices` equal-width slices and each classifier
    is scored (``clf.score``) on the first `n_eval` of them. A classifier's
    accuracy is the mean of its slice scores; the result is the mean over
    classifiers.

    Args:
        all_clfs: Iterable of fitted classifiers exposing ``score(X, y)``.
        x_test: Test inputs (sliceable, supports ``len``).
        y_test: Test labels aligned with `x_test`.
        n_slices: Number of slices the test set is divided into (default 250).
        n_eval: Number of leading slices actually scored (default 50).
            NOTE(review): with the defaults only the first fifth of the test
            set is scored, as in the original constants -- confirm intended.

    Returns:
        The overall mean accuracy (also printed).

    Notes:
        Previously the per-classifier mean was appended inside the slice loop,
        so the reported figure was an average of running means that
        over-weighted the earliest slices.
    """
    slice_len = len(x_test) / n_slices
    all_scores = []

    for clf in tqdm(all_clfs, desc="evaluating..."):
        temp_scores = []
        for i in range(0, n_eval):
            lo, hi = int(i * slice_len), int((i + 1) * slice_len)
            temp_scores.append(clf.score(x_test[lo:hi], y_test[lo:hi]))

        # One mean per classifier, recorded after all its slices are scored.
        all_scores.append(sum(temp_scores) / len(temp_scores))

    accuracy = sum(all_scores) / len(all_scores)

    print("Accuracy:", accuracy)

    return accuracy

In [287]:
# Below work comes from https://nlpforhackers.io/training-pos-tagger/amp/

In [288]:
def features(sentence, index):
    """Build the feature dict for one token.

    sentence: [w1, w2, ...], index: the index of the word.

    The dict describes the token itself (text, 1-3 character prefixes and
    suffixes, casing, hyphen, digits), its position in the sentence, and its
    immediate neighbours ('' when there is none). An empty-string token
    raises ``IndexError`` because its first/last character is read.
    """
    word = sentence[index]
    last_index = len(sentence) - 1
    at_start = index == 0
    at_end = index == last_index
    first_char = word[0]
    rest = word[1:]

    if at_start:
        previous = ''
    else:
        previous = sentence[index - 1]

    if at_end:
        following = ''
    else:
        following = sentence[index + 1]

    return {
        'word': word,
        'is_first': at_start,
        'is_last': at_end,
        # True whenever upper-casing the first character leaves it unchanged,
        # which includes digits and punctuation.
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,
        'is_all_lower': word.lower() == word,
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-1': word[-1],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'prev_word': previous,
        'next_word': following,
        'has_hyphen': '-' in word,
        'is_numeric': word.isdigit(),
        'capitals_inside': rest.lower() != rest
    }

In [289]:
# Below work comes from https://nlpforhackers.io/training-pos-tagger/amp/

In [290]:
def untag(tagged_sentence):
    """Strip the tags from a sentence of ``(word, tag)`` pairs, keeping the words."""
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

In [291]:
# Below work comes from https://nlpforhackers.io/training-pos-tagger/amp/

In [292]:
def transform_to_dataset(tagged_sentences):
    """Flatten tagged sentences into per-token feature dicts and labels.

    Args:
        tagged_sentences: Iterable of sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        ``(X, y)``: `X` holds one ``features(...)`` dict per token and `y` the
        matching tag, both in corpus order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Strip the tags once per sentence; doing it once per token made the
        # work quadratic in sentence length.
        words = untag(tagged)

        for index in range(len(tagged)):
            X.append(features(words, index))
            y.append(tagged[index][1])

    return X, y

Custom work below

In [293]:
def load_models(n_models=115, model_dir='/content/gdrive/MyDrive/Colab_Notebooks/NLP'):
    """Load the pickled pipelines written by ``dec_tree``.

    Args:
        n_models: Number of files to load, ``model0.h5`` .. ``model<n-1>.h5``
            (default 115, the number ``dec_tree`` writes by default).
        model_dir: Directory containing the files.

    Returns:
        List of unpickled objects in index order.

    Notes:
        Loading now starts at index 0; the loop used to start at 1 and so
        skipped ``model0.h5`` even though ``dec_tree`` saves it.

        SECURITY: ``pickle.load`` executes arbitrary code contained in the
        file -- only point this at files you created yourself.
    """
    all_clfs = []

    for i in range(0, n_models):
        file_loc = model_dir + '/model' + str(i) + ".h5"

        with open(file_loc, 'rb') as pickle_file:
            all_clfs.append(pickle.load(pickle_file))

    return all_clfs



Custom work below

In [294]:
def load_seq_model(filename):
    """Load and return the Keras model stored at `filename`."""
    return load_model(filename)

Below based on code provided by Dr. Scannell

In [295]:
def HMM_uni(train, test):
    """Fit an NLTK unigram tagger on `train` and print its accuracy on `test`."""
    tagger = nltk.UnigramTagger(train)

    print("Unigram tagger accuracy: ", tagger.accuracy(test))

Influenced by Dr. Scannell but largely original

In [296]:
def gen_rule_tag():
    """Build the regular-expression backoff tagger.

    Each pattern is written against a "previous-word current-word" string
    (note the space after the first word) and maps it to a tag.

    Returns:
        An ``nltk.RegexpTagger`` built from the pattern list.

    Notes:
        ``RegexpTagger`` assigns the tag of the first pattern that matches, so
        the specific ``an d...`` / ``an t...`` rules are listed before the
        general ``an ...`` rule. In the previous order the general rule always
        matched first and the two specific rules could never fire.
    """
    patterns = [
        # Specific exceptions first (see Notes).
        (r'an d.*', 'N'),
        (r'an t.*', 'N'),
        (r'a .*', 'H'),
        (r'an .*', 'H'),
        (r'mo .*', 'H'),
        (r'do .*', 'H'),
        (r'don .*', 'H'),
        (r'den .*', 'H'),
        (r'faoi .*', 'N')
    ]

    rule_tagger = nltk.RegexpTagger(patterns)

    return rule_tagger

Below code influenced by code provided by Dr. Scannell and adapted

In [297]:
def HMM_bi(train, test):
    """Evaluate a tagger over word-pair tokens with a rule-based backoff.

    Both corpora are converted with ``gen_grams``; an NLTK ``UnigramTagger``
    is fitted on the converted training data, falling back to the tagger from
    ``gen_rule_tag``. The accuracy on the converted test data is printed.
    """
    fallback = gen_rule_tag()

    train_grams = gen_grams(train, 1)
    test_grams = gen_grams(test)

    tagger = nltk.UnigramTagger(train_grams, backoff=fallback)

    print("Bigram tagger accuracy: ", tagger.accuracy(test_grams))

Original code below

In [298]:
def HMM_tri(train, test):
    """Fit an NLTK trigram tagger on `train` and print its accuracy on `test`."""
    tagger = nltk.TrigramTagger(train)

    print("Trigram tagger accuracy: ", tagger.accuracy(test))

Original code below

In [299]:
def sent_to_list(data):
    """Flatten a list of sentences into one list of their items, in order."""
    return [tup for entry in data for tup in entry]

Original code below (Was also used in Challenge 1 https://github.com/GreihMurray/NLP)

In [300]:
def gen_grams(data, n = 1):
    """Rewrite each sentence as word-pair tokens tagged with the second word's tag.

    For every adjacent pair ``(w_i, w_{i+1})`` the output holds
    ``("w_i w_{i+1}", tag_{i+1})``. The sentence's last ``(word, tag)`` pair is
    then appended unchanged.

    Args:
        data: Iterable of sentences, each a list of ``(word, tag)`` pairs.
        n: Currently unused; kept so existing calls such as
            ``gen_grams(train, 1)`` keep working.

    Returns:
        One list per input sentence, in order. An empty sentence yields an
        empty list (it used to raise ``IndexError``).

    NOTE(review): the appended unigram is the *last* token, which the final
    pair already covers, while the first token never gets an entry of its
    own -- confirm whether ``sent[0]`` was intended.
    """
    all_grams = []

    for sent in data:
        if not sent:
            # Nothing to pair up; keep the output aligned with the input.
            all_grams.append([])
            continue

        cur_grams = [
            (' '.join([prev[0], cur[0]]), cur[1])
            for prev, cur in zip(sent, sent[1:])
        ]

        cur_grams.append(sent[-1])

        all_grams.append(cur_grams)

    return all_grams

Original code below

In [301]:
def prep_data(data):
    """Encode `data` for the sequence model and return ``(x, y)``.

    The vocabulary and tag set are derived from `data` itself via ``pad`` and
    then used by ``encode``; the maximum length reported by ``encode`` is
    discarded.
    """
    vocab, tags = pad(data)
    encoded = encode(vocab, tags, data)

    return encoded[0], encoded[1]

Original code below

In [302]:
def test_whole_seq(data):
  #'/content/gdrive/MyDrive/Colab_Notebooks/NLP/fancy_seq_model.h5'
  #'/content/gdrive/MyDrive/Colab_Notebooks/NLP/adamPoisson_seq_model.h5'
  #'/content/gdrive/MyDrive/Colab_Notebooks/NLP/adaGradPoisson_seq_model.h5'

    model = load_seq_model('/content/gdrive/MyDrive/Colab_Notebooks/NLP/adamPoisson1024_seq_model.h5')
    x_test, y_test = prep_data(data)

    print('Changed:', eval_model(model, x_test, y_test))

In [303]:
def read_test_data(path="test.tsv"):
    """Read the tab-separated (token, tag) test file and group rows into sentences.

    A row whose first column is ``<S>`` marks a sentence boundary. Every other
    row contributes one ``(token, tag)`` tuple to the current sentence.

    Args:
        path: File to read. Defaults to ``"test.tsv"`` in the working directory.

    Returns:
        A list of sentences, each a non-empty list of ``(token, tag)`` tuples.

    Notes:
        * A row whose first column's first two characters strip down to 'N'
          is replaced by ``('N', 'N')``.
          NOTE(review): this also overwrites the tag, whereas
          ``read_file_to_sents`` only rewrites the token -- confirm intended.
        * Rows with fewer than two columns are printed with their 1-based row
          number and skipped. The old report line used an undefined ``i``
          (``NameError``) and then fell through to ``line[1]``.
        * Blank rows are skipped, and the sentence still open at end-of-file
          is kept instead of being dropped.
    """
    all_sents = []
    cur_sent = []

    with open(path, encoding="utf-8") as file:
        reader = csv.reader(file, delimiter="\t")

        for line_no, line in enumerate(tqdm(reader, desc="Reading data..."), start=1):
            if not line:
                # csv.reader yields [] for an empty line; nothing to record.
                continue

            if line[0][0:2].strip() == 'N':
                line = ('N', 'N')

            if line[0] == "<S>":
                if len(cur_sent) >= 1:
                    all_sents.append(cur_sent)
                cur_sent = []
                continue

            if len(line) < 2:
                # Malformed row (no tag column): report it and move on.
                print(line, line_no)
                continue

            cur_sent.append((line[0], line[1]))

    # Flush the trailing sentence when the file does not end with "<S>".
    if cur_sent:
        all_sents.append(cur_sent)

    return all_sents

Original code below except where marked

In [310]:
def anything():
    """Run the experiment: read both data files, train the sequence model, print its metrics.

    The test sentences are appended to the training sentences so that
    ``seq_model`` can hold out exactly the last ``test_len`` of them. The
    commented-out lines are the alternative experiments (decision trees, NLTK
    taggers, evaluating a saved model) that can be switched in instead.
    """
    data = read_file_to_sents()
    test_data = read_test_data()
    test_len = len(test_data)

    # Training sentences first, test sentences last.
    data += test_data

    print(data[:5])

    print('Splitting data')

    # from https://nlpforhackers.io/training-pos-tagger/amp/
    # 75/25 split; only the commented-out experiments below consume it.
    cutoff = int(.75 * len(data))
    training_sentences, test_sentences = data[:cutoff], data[cutoff:]

    # Original
    # all_clfs = dec_tree(X, y)             # Training Decision Tree Models
    # all_clfs = load_models()              # Loading models
    # acc_score(all_clfs, x_test, y_test)   # Calculating accuracy

    # HMM_uni(training_sentences, test_sentences)
    # HMM_bi(training_sentences, test_sentences)
    # HMM_tri(training_sentences, test_sentences)

    # Train sequential model
    model, x_test, y_test = seq_model(data, test_len)

    # test_whole_seq(data)

    # x_test, y_test = read_test_data()

    # Eval sequential model
    metrics = eval_model(model, x_test, y_test)
    print(metrics)

    # model.save('/content/gdrive/MyDrive/Colab_Notebooks/NLP/adamPoisson1024_seq_model.h5')

In [311]:
# Run the experiment defined above: read the data, train the sequence model, print its evaluation.
anything()

Reading data...: 5057059it [00:07, 668049.50it/s]
Reading data...: 473951it [00:00, 667946.37it/s]


[[('ansin', 'N'), (')', 'N'), ('tá', 'N'), ('níos', 'N'), ('lú', 'N'), ('gaeilge', 'N'), ('ag', 'N'), ('na', 'N'), ('gardaí', 'N'), ('ná', 'N'), ('bí', 'S'), ('ariamh', 'N'), ('ainneoin', 'N'), ('na', 'N'), ('cearta', 'U'), ('.', 'N'), ('níl', 'N'), ('sé', 'N'), ('ach', 'N'), ('roinnt', 'N'), ('seachtainí', 'N'), ('ó', 'N'), ('sin', 'S'), ('a', 'N'), ('tógadh', 'N'), ('fear', 'N'), ('bocht', 'N'), ('a', 'N'), ('tug', 'S'), ('ainm', 'N'), ('gaeilge', 'N'), ('dóibh', 'N'), ('.', 'N')], [('socraíodh', 'N'), ('go', 'N'), ('raibh', 'N'), ('gá', 'N'), ('lena', 'N'), ('leithéid', 'N'), (',', 'N'), ('mar', 'N'), ('go', 'N'), ('bíonn', 'U'), ('na', 'N')], [('tá', 'N'), ('an', 'N'), ('córas', 'N'), ('bainistíochta', 'N'), ('tar', 'N'), ('éis', 'N'), ('freastal', 'N'), ('go', 'N'), ('maith', 'N'), ('ar', 'N'), ('rialtas', 'N'), ('áitiúil', 'N'), ('na', 'N'), ('éireann', 'H'), ('agus', 'N'), ('leanfaidh', 'N'), ('údaráis', 'N'), ('áitiúla', 'N'), ('ar', 'N'), ('aghaidh', 'N'), ('ag', 'N'), ('brath



[INFO] training network...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
{'loss': 0.1730583906173706, 'accuracy': 0.9890205264091492}


ADAM Poisson Accuracy: 99.371

ADAM Cross Entropy Accuracy: 99.370

ADAGRAD Poisson Accuracy: 94.01
