<a href="https://colab.research.google.com/github/GreihMurray/NLP-2/blob/AG_Murray/irish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U keras-tuner

[K     |████████████████████████████████| 135 kB 7.4 MB/s 
[K     |████████████████████████████████| 1.6 MB 57.8 MB/s 
[?25h

In [None]:
import csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Reshape
from tensorflow.keras.optimizers import SGD
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
import joblib
import pickle
import nltk
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import keras_tuner as kt

In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def read_file_to_sents():
    with open("train.tsv", encoding="utf-8") as file:
        f = csv.reader(file, delimiter="\t")

        cur_sent = []
        all_sents = []

        for line in tqdm(f, desc="Reading data..."):
            if line[0][0:2].strip() == 'N':
                line[0] = 'N'

            if line[0] == "<S>":
                if len(cur_sent) >= 1:
                    all_sents.append(cur_sent)
                cur_sent = []
                continue

            cur_sent.append((line[0], line[1]))

    return all_sents

In [None]:
def string_to_num(data):
    le = LabelEncoder()

    label = le.fit_transform(data)

    return label

In [None]:
def defeature(data):
    unfeatured = []

    for dict in data:
        unfeatured.append(dict['word'])

    return unfeatured

In [None]:
def pad(data):
  vocab = list(set([w for sent in data for (w,t) in sent]))
  vocab.append('<PAD>')
  print(len(vocab))
  tags = list(set([t for sent in data for (w,t) in sent]))
  tags.append('<PAD>')

  return vocab, tags

In [None]:
def encode(vocab, tags, data):
  max_len = 60
  word2index = {w: i for i, w in enumerate(vocab)}
  tag2index = {t: i for i, t in enumerate(tags)}
  onehot = [[word2index[w[0]] for w in s] for s in data]
  X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)

  onehot_y = [[tag2index[w[1]] for w in s] for s in data]
  y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
  y = to_categorical(y, num_classes=len(tags))

  return X, y, max_len

Based on code provided by Dr. Scannell in BiLTSM notebook

In [None]:
def seq_model(data):
    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data)

    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

    model = Sequential()
    model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
    model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
    model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
    model.compile(optimizer="adam", loss="poisson", metrics=["accuracy"])
  # From https://towardsdatascience.com/hyperparameter-tuning-with-kerastuner-and-tensorflow-c4a4d690b31a
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

    print("[INFO] training network...")
    sgd = SGD(0.01)
    history = model.fit(X_train, y_train, batch_size=512, epochs=50, validation_split=0.2, verbose=1, callbacks=stop_early)

    return model, X_test, y_test

In [None]:
def eval_model(model, x_test, y_test):
  #x_test = defeature(x_test)
  #_test = string_to_num(x_test)
  #y_test = string_to_num(y_test)
  return model.evaluate(x_test[:5000], y_test[:5000], return_dict=True)

Below work partially https://nlpforhackers.io/training-pos-tagger/amp/

In [None]:
def dec_tree(X, y):
    clf = Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', DecisionTreeClassifier(criterion='entropy')),
    ], verbose=1)

    print("Training Started")

    # Custom work below
    all_clfs = []

    for i in tqdm(range(0, 115), desc="Training"):
        cur_clf = clf
        cur_clf.fit(X[(i * 31649): ((i + 1) * 31649)], y[(i * 31649): ((i + 1) * 31649)])  # Use only the first 10K samples if you're running it multiple times. It takes a fair bit :)

        all_clfs.append(cur_clf)

        file_loc = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/model' + str(i) + ".h5"
        file_loc_2 = '/gdrive/MyDrive/Colab_Notebooks/NLP/vector' + str(i) + ".pkl"

        # cur_clf.named_steps['classifier'].save(file_loc)

        with open(file_loc, 'wb') as pickle_file:
          pickle.dump(cur_clf, pickle_file)

        # cur_clf.named_steps['estimator'].model = None

        # joblib.dump(cur_clf, file_loc_2)

    print('Training completed')

    return all_clfs

In [None]:
def acc_score(all_clfs, x_test, y_test):
    all_scores = []

    for clf in tqdm(all_clfs, desc="evaluating..."):
      temp_scores = []
      for i in range(0,50):
        temp_scores.append(clf.score(x_test[int(i * (len(x_test) / 250)):int( (i + 1) * (len(x_test)/250))], y_test[int(i * (len(x_test) / 250)):int( (i + 1) * (len(x_test)/250))]))

        all_scores.append(sum(temp_scores)/ len(temp_scores))

    total_acc = 0

    for score in all_scores:
        total_acc += score

    print("Accuracy:", total_acc / len(all_scores))

In [None]:
# Below work comes from https://nlpforhackers.io/training-pos-tagger/amp/

In [None]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }

In [None]:
# Below work comes from https://nlpforhackers.io/training-pos-tagger/amp/

In [None]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]

In [None]:
# Below work comes from https://nlpforhackers.io/training-pos-tagger/amp/

In [None]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []

    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])

    return X, y

In [None]:
def load_models():
  all_clfs = []

  for i in range(1, 115):
    file_loc = '/content/gdrive/MyDrive/Colab_Notebooks/NLP/model' + str(i) + ".h5"

        # cur_clf.named_steps['classifier'].save(file_loc)

    with open(file_loc, 'rb') as pickle_file:
      cur_clf = pickle.load(pickle_file)

      all_clfs.append(cur_clf)

  return all_clfs



In [47]:
def load_seq_model(filename):
    model = load_model(filename)

    return model

Below based on code provided by Dr. Scannell

In [None]:
def HMM_uni(train, test):
    uni_tag = nltk.UnigramTagger(train)
    acc = uni_tag.accuracy(test)

    print("Unigram tagger accuracy: ", acc)

In [None]:
def gen_rule_tag():
    patterns = [
        (r'a .*', 'H'),
        (r'an .*', 'H'),
        (r'mo .*', 'H'),
        (r'do .*', 'H'),
        (r'don .*', 'H'),
        (r'den .*', 'H'),
        (r'an d.*', 'N'),
        (r'an t.*', 'N'),
        (r'faoi .*', 'N')
    ]

    rule_tagger = nltk.RegexpTagger(patterns)

    return rule_tagger

Below based on code provided by Dr. Scannell

In [None]:
def HMM_bi(train, test):
    rule_tagger = gen_rule_tag()

    bigrams = gen_grams(train, 1)
    test = gen_grams(test)

    bitag = nltk.UnigramTagger(bigrams, backoff=rule_tagger)
    acc = bitag.accuracy(test)

    print("Bigram tagger accuracy: ", acc)

In [None]:
def HMM_tri(train, test):
    tritag = nltk.TrigramTagger(train)

    acc = tritag.accuracy(test)

    print("Trigram tagger accuracy: ", acc)

In [None]:
def sent_to_list(data):
    all_data = []

    for entry in data:
      for tup in entry:
        all_data.append(tup)

    return all_data

In [None]:
def gen_grams(data, n = 1):
    all_grams = []
    
    for sent in data:
      cur_grams = [(' '.join([sent[i][0], sent[i+1][0]]), sent[i+1][1]) for i in range(len(sent)-1)]

      cur_grams.append(sent[len(sent)-1])

      all_grams.append(cur_grams)

    return all_grams

In [None]:
def prep_data(data):
    vocab, tags = pad(data)

    x, y, max_len = encode(vocab, tags, data)

    return x, y

In [51]:
def test_whole_seq(data):
  #'/content/gdrive/MyDrive/Colab_Notebooks/NLP/fancy_seq_model.h5'
  #'/content/gdrive/MyDrive/Colab_Notebooks/NLP/adamPoisson_seq_model.h5'

    model = load_seq_model('/content/gdrive/MyDrive/Colab_Notebooks/NLP/adamPoisson_seq_model.h5')
    x_test, y_test = prep_data(data)

    print(eval_model(model, x_test, y_test))

In [44]:
def anything():
    data = read_file_to_sents()

    print('Splitting data')

  # from https://nlpforhackers.io/training-pos-tagger/amp/
    cutoff = int(.75 * len(data))
    training_sentences = data[:cutoff]
    test_sentences = data[cutoff:]

  # Original
    #all_clfs = dec_tree(X, y) # Training Decision Tree Models
    #all_clfs = load_models() # Loading models
    #acc_score(all_clfs, x_test, y_test) # Calculating accuracy

    # HMM_uni(training_sentences, test_sentences)

    # HMM_bi(training_sentences, test_sentences)

    # HMM_tri(training_sentences, test_sentences)

    #model, x_test, y_test = seq_model(data) # Train sequential model

    test_whole_seq(data)

    #print(eval_model(model, x_test, y_test)) # Eval sequential model

    #model.save('/content/gdrive/MyDrive/Colab_Notebooks/NLP/adamPoisson_seq_model.h5')

    del training_sentences, test_sentences

In [52]:
anything()

Reading data...: 5057059it [00:05, 874600.50it/s]


Splitting data
115827
{'loss': 0.021168744191527367, 'accuracy': 0.9937133193016052}


ADAM Poisson Accuracy: 99.371

ADAM Cross Entropy Accuracy: 99.370

