In [103]:
# Necessary Imports
import os
from sklearn import metrics
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import nltk
from nltk.metrics import ConfusionMatrix

import warnings
warnings.filterwarnings('always')

In [126]:
# Functions from starter_code.py
def evaluate(test_sentences, tagged_test_sentences, output_dict=False):
    """Score predicted tags against gold tags, token by token.

    Args:
        test_sentences: Gold data; an iterable of sentences, each an iterable
            of ``(token, tag)`` pairs.
        tagged_test_sentences: Predictions in the same nested layout.
        output_dict: Forwarded unchanged to ``metrics.classification_report``.

    Returns:
        Whatever ``metrics.classification_report`` returns for the two
        flattened tag sequences.
    """

    def _flat_tags(sentences):
        # Flatten to one label per token; labels are stringified so that
        # non-string tags are handled uniformly.
        labels = []
        for sentence in sentences:
            for _token, tag in sentence:
                labels.append(str(tag))
        return labels

    gold = _flat_tags(test_sentences)
    pred = _flat_tags(tagged_test_sentences)
    return metrics.classification_report(gold, pred, output_dict=output_dict)

def get_token_tag_tuples(sent):
    """Convert a whitespace-separated tagged sentence string into a list.

    Each whitespace-delimited chunk of ``sent`` is passed to
    ``nltk.tag.str2tuple`` and the results are returned in order
    (e.g. ``"The/DT House/NNP"`` -> ``[('The', 'DT'), ('House', 'NNP')]``).
    """
    pairs = []
    for chunk in sent.split():
        pairs.append(nltk.tag.str2tuple(chunk))
    return pairs

def get_tagged_sentences(text):
    """Split the raw text of a treebank ``.pos`` file into sentence strings.

    The text is first cut at the ``=====...`` separator lines, then each piece
    is cut at blank lines. Within every resulting chunk, newlines and the
    ``[`` / ``]`` bracket characters are removed. Chunks that end up
    empty are dropped.

    Args:
        text: Full contents of one ``.pos`` file.

    Returns:
        list[str]: One string per non-empty chunk, in file order. Surrounding
        whitespace is not stripped.
    """
    sentences = []

    blocks = text.split("======================================")
    for block in blocks:
        for sent in block.split("\n\n"):
            sent = sent.replace("\n", "").replace("[", "").replace("]", "")
            # Test emptiness by value. The previous `sent is not ""` compared
            # object identity with a literal, which only works by accident of
            # string interning and triggers a SyntaxWarning.
            if sent:
                sentences.append(sent)
    return sentences

def load_treebank_splits(datadir):
    """Load every ``.pos`` file under ``datadir`` and split by section number.

    A file's section is the integer name of the directory that contains it:

    * sections 0-18  -> train
    * sections 19-21 -> dev
    * sections 22-24 -> test

    Files in directories whose name is not an integer, or whose section is
    outside 0-24, are ignored. Directories and files are visited in sorted
    order so the returned lists are identical across platforms and runs.

    Args:
        datadir: Root directory to walk.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(train, dev, test)`` lists of
        sentence strings as produced by ``get_tagged_sentences``.
    """
    train = []
    dev = []
    test = []

    print("Loading treebank data...")

    for subdir, dirs, files in os.walk(datadir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()

        # Parse the section once per directory instead of three times per
        # file, and skip directories that are not numbered sections rather
        # than crashing with ValueError on the first stray .pos file.
        try:
            section = int(os.path.basename(subdir))
        except ValueError:
            continue

        for filename in sorted(files):
            if not filename.endswith(".pos"):
                continue
            filepath = os.path.join(subdir, filename)
            with open(filepath, "r") as fh:
                text = fh.read()

            if section in range(0, 19):
                train += get_tagged_sentences(text)
            elif section in range(19, 22):
                dev += get_tagged_sentences(text)
            elif section in range(22, 25):
                test += get_tagged_sentences(text)

    print("Train set size: ", len(train))
    print("Dev set size: ", len(dev))
    print("Test set size: ", len(test))

    return train, dev, test

[('The', 'DT'), ('House', 'NNP'), ('joined', 'VBD'), ('the', 'DT'), ('Senate', 'NNP'), ('in', 'IN'), ('making', 'VBG'), ('federal', 'JJ'), ('reparations', 'NNS'), ('for', 'IN'), ('Japanese-Americans', 'NNPS')]


  if sent is not "":


In [116]:
# Print Helper Function
def print_result(result, name=""):
    """Print ``result`` beneath a banner made of ``name`` between two rules.

    Args:
        result: Object to print after the banner.
        name: Heading printed between the two rule lines (empty by default).
    """
    rule = "=================="
    for line in (rule, name, rule, result):
        print(line)

In [144]:
class PreProcess():
    """Helpers for wrapping, unwrapping and projecting tagged sentences."""

    def __init__(self):
        # Sentinel tokens: sentence start, sentence end, unknown word.
        self.SOS, self.EOS, self.UNK = '<START>', '<STOP>', '<UNK>'

    def preprocess_dataset(self, dataset):
        """Turn each tagged sentence string into a list of (token, tag) pairs.

        Every sentence is parsed with ``get_token_tag_tuples`` and wrapped in a
        leading ``(SOS, SOS)`` pair and a trailing ``(EOS, EOS)`` pair.
        """
        opening = (self.SOS, self.SOS)
        closing = (self.EOS, self.EOS)
        wrapped = []
        for sentence in dataset:
            wrapped.append([opening] + get_token_tag_tuples(sentence) + [closing])
        return wrapped

    def remove_start_stop(self, dataset):
        """Return each sentence without its first and last element."""
        trimmed = []
        for sentence in dataset:
            trimmed.append(sentence[1:-1])
        return trimmed

    def get_words_tags(self, dataset, words):
        """Project every (word, tag) sentence onto its words or onto its tags.

        Words are returned when ``words == True``; any other value selects the
        tags. The equality test (rather than plain truthiness) is intentional
        here so existing callers see no change.
        """
        want_words = (words == True)  # noqa: E712 - equality semantics kept
        projected = []
        for sentence in dataset:
            if want_words:
                projected.append([word for word, _tag in sentence])
            else:
                projected.append([tag for _word, tag in sentence])
        return projected

    def preprocess_flatten_corpus(self, dataset, preprocess):
        """Apply ``preprocess`` to each sentence and concatenate the results."""
        flat = []
        for sent in dataset:
            flat.extend(preprocess(sent))
        return flat

In [146]:
# Location of the WSJ treebank sections, relative to the notebook's working
# directory. Loading returns three lists of raw tagged sentence strings.
datadir = os.path.join("data", "penn-treebank3-wsj", "wsj")
train, dev, test = load_treebank_splits(datadir)

Loading treebank data...
Train set size:  51681
Dev set size:  7863
Test set size:  9046


In [150]:
pp = PreProcess()

# Gold dev/test sentences as lists of (word, tag) pairs, each wrapped in
# (<START>, <START>) / (<STOP>, <STOP>) marker pairs.
gold_dev = pp.preprocess_dataset(dev)
gold_test = pp.preprocess_dataset(test)

# Training data stays as raw tagged strings; the taggers' train() methods
# run their own preprocessing on it.
train_corpus = train

# Tagger inputs: the words only (tags dropped), still including the
# <START>/<STOP> markers.
dev_corpus = pp.get_words_tags(gold_dev, True)
test_corpus = pp.get_words_tags(gold_test, True)

# Now that the inputs are derived, drop the marker pairs from the gold data.
# NOTE: gold_dev/gold_test are rebound here, so this cell must be re-run from
# its first line (not partially) to stay consistent.
gold_dev = pp.remove_start_stop(gold_dev)
gold_test = pp.remove_start_stop(gold_test)

In [151]:
class BaselineTagger:
    """Most-frequent-tag baseline.

    Each known word is tagged with the tag it carried most often in the
    training data; every unknown word gets the single most frequent tag
    overall.
    """

    def __init__(self):
        self.corpus = []  # preprocessed training corpus

        # word -> tag seen most often with that word in training
        self.most_frequent_table = {}
        # the most frequent tag regardless of word (fallback for unknown words)
        self.most_common_tag = ""

    def train(self, corpus):
        """Fit the lookup table from labelled sentences.

        Args:
            corpus: Iterable of labelled sentences; each one is passed through
                ``preprocess.preprocess_labeled_text``, which must yield
                ``(word, tag)`` pairs.
                NOTE(review): ``preprocess`` is not defined in the visible
                cells - confirm it is imported before this class is used.

        Raises:
            ValueError: If the corpus contains no tagged tokens.
        """
        self.corpus = [preprocess.preprocess_labeled_text(sent) for sent in corpus]

        word_tag_counts = {}  # word -> {tag: count}
        tag_counts = {}       # tag -> count

        for sent in self.corpus:
            for word, tag in sent:
                per_word = word_tag_counts.setdefault(word, {})
                per_word[tag] = per_word.get(tag, 0) + 1
                tag_counts[tag] = tag_counts.get(tag, 0) + 1

        if not tag_counts:
            raise ValueError("BaselineTagger.train() needs at least one tagged token")

        # Rebuild the table from scratch so that retraining an existing
        # instance does not keep words from a previous training run.
        # max() returns the first maximum, so ties go to the tag seen first.
        self.most_frequent_table = {
            word: max(tag_freq.items(), key=lambda x: x[1])[0]
            for word, tag_freq in word_tag_counts.items()
        }
        self.most_common_tag = max(tag_counts.items(), key=lambda x: x[1])[0]

    def predict(self, corpus):
        """Tag every word of every sentence.

        Args:
            corpus: Iterable of sentences, each an iterable of words.

        Returns:
            list[list[tuple]]: One list of ``(word, tag)`` pairs per sentence.
        """
        tags = []
        for sent in corpus:
            # Known words get their most frequent tag; unknown words fall back
            # to the globally most common tag.
            tags.append(
                [
                    (word, self.most_frequent_table.get(word, self.most_common_tag))
                    for word in sent
                ]
            )
        return tags

In [152]:
class HMMTagger:
    def __init__(self):
        # preprocessed and labeled corpus for the POS task
        self.corpus = []
        # frequency of word tokens
        self.vocab = {}
        # all type of tags
        self.tag_list = []
        # table for the transition prob of all combination of tag->tag in the corpus
        self.transition_table = []
        # table for the emission prob of all combination of word:tag in the corpus
        self.emission_table = []
        # the emission probability for <UNK>:tag => actually it is just a prob for every single tag
        self.unknown_emission_prob = {}

    def train(self, corpus, alpha=1):
        self.corpus = [preprocess.preprocess_labeled_text(sent) for sent in corpus]
        self.vocab = self.get_vocab(self.corpus)
        (
            self.transition_table,
            self.emission_table,
            self.unknown_emission_prob,
            self.tag_list,
        ) = self.build_tables(alpha)

        self.word2idx = {
            word: idx for idx, word in enumerate(self.vocab.keys())
        }
        self.tag2idx = {tag: idx for idx, tag in enumerate(self.tag_list)}
        self.idx2word = {
            idx: word for idx, word in enumerate(self.vocab.keys())
        }
        self.idx2tag = {idx: tag for idx, tag in enumerate(self.tag_list)}

    def build_tables(self, alpha=1):
        # key is prev tag and current tag, value is count
        transitions = self.get_transitions(self.corpus)
        # key is tag, token, value is count
        emissions = self.get_emissions(self.corpus)

        # store frequency of each tag
        tag_dict = self.get_tag_freq(self.corpus)
        tag_list = tag_dict.keys()

        transition_table = self.create_transition_table(
            transitions, tag_dict, tag_list, alpha
        )
        emission_table, unknown_emission_prob = self.create_emission_table(
            emissions, tag_dict, tag_list, self.vocab, alpha
        )

        return (
            transition_table,
            emission_table,
            unknown_emission_prob,
            tag_list,
        )

    def get_vocab(self, sents):
        vocab = {}
        for sent in sents:
            for token in sent:
                word = token[0]
                vocab[word] = vocab.get(word, 0) + 1
        return vocab

    def get_transitions(self, sents):
        transitions = {}
        for sent in sents:
            for i in range(1, len(sent)):
                bigram_tags = (sent[i - 1][1], sent[i][1])
                transitions[bigram_tags] = transitions.get(bigram_tags, 0) + 1
        return transitions

    def get_emissions(self, sents):
        emissions = {}
        for sent in sents:
            for token_tag_pair in sent:
                emissions[token_tag_pair] = (
                    emissions.get(token_tag_pair, 0) + 1
                )
        return emissions

    def get_tag_freq(self, sents):
        tag_dict = {}
        for sent in sents:
            for _, tag in sent:
                tag_dict[tag] = tag_dict.get(tag, 0) + 1
        return tag_dict

    def create_transition_table(self, transitions, tag_dict, tags, alpha=1):
        transition_table = []  # 2-dim list
        for prev_tag in tags:
            prob_list = []
            for current_tag in tags:
                prev_count = tag_dict.get(prev_tag, 0)
                bigram_count = transitions.get((prev_tag, current_tag), 0)
                prob = (bigram_count + alpha) / (
                    prev_count + (alpha * len(tags))
                )
                prob_list.append(np.log(prob))
            transition_table.append(prob_list)
        return transition_table

    def create_emission_table(self, emissions, tag_dict, tags, vocab, alpha):
        emission_table = []  # 2-dim list
        unknown_emission_prob = {}
        total_tag_counts = sum(tag_dict.values())
        for tag in tags:
            prob_list = []
            tag_count = tag_dict.get(tag, 0)
            for word in vocab.keys():
                word_tag_count = emissions.get((word, tag), 0)
                prob = (word_tag_count + alpha) / (
                    tag_count + (alpha * len(tags))
                )
                prob_list.append(np.log(prob))
            emission_table.append(prob_list)
            unknown_emission_prob[tag] = (tag_count + alpha) / (
                total_tag_counts + (alpha * len(tags))
            )
        return emission_table, unknown_emission_prob

    def viterbi_decode(self, sent):
        tags = []
        viterbi_matrix = []

        # Initial step
        initial = []  # empty array for start token
        viterbi_matrix.append(initial)
        first_token = sent[1]
        first_token_scores = []
        for i, tag in enumerate(self.tag_list):
            transition_prob = self.transition_table[self.tag2idx[preprocess.SOS]][
                i
            ]
            emission_prob = self.unknown_emission_prob[tag]
            if first_token in self.word2idx.keys():
                emission_prob = self.emission_table[i][
                    self.word2idx[first_token]
                ]
            # calculate all the tag start from the start token
            first_token_scores.append(
                (self.tag2idx[preprocess.SOS], transition_prob + emission_prob)
            )
        viterbi_matrix.append(first_token_scores)

        # recursive step
        for t, token in enumerate(sent):
            if t <= 1:
                continue
            max_scores = []
            for i, tag in enumerate(self.tag_list):
                max_score = float("-inf")
                candidate = None
                emission_prob = self.unknown_emission_prob[tag]
                if token in self.word2idx.keys():
                    emission_prob = self.emission_table[i][
                        self.word2idx[token]
                    ]
                # go through every previous score that already be calculated in the viterbi matrix
                for j, score in enumerate(viterbi_matrix[t - 1]):
                    _, prev_max_log_prob = score
                    transition_prob = self.transition_table[j][i]
                    new_score = (
                        emission_prob + transition_prob + prev_max_log_prob
                    )
                    if new_score > max_score:
                        max_score = new_score
                        candidate = j
                max_scores.append((candidate, max_score))
            viterbi_matrix.append(max_scores)

        # start with the stop tag
        max_tag = self.tag2idx[preprocess.EOS]
        tags.append((preprocess.EOS, self.idx2tag[max_tag]))

        # find best path in viterbi matrix
        for i in reversed(range(1, len(viterbi_matrix))):
            max_tag = viterbi_matrix[i][max_tag][0]
            tags.append((sent[i - 1], self.idx2tag[max_tag]))

        # since it is found backward, we need to reverse it
        tags.reverse()
        return tags

    def predict(self, corpus):
        all_tags = []
        for sent in tqdm(corpus):
            prediction_tags = self.viterbi_decode(sent)
            all_tags.append(prediction_tags)
        return all_tags

In [154]:
# using small amount of samples for training and testing
def experiment_debug(alpha=1):
    """Smoke-test both taggers on a handful of hand-written sentences.

    Trains the HMM tagger and the baseline tagger on one labelled sentence,
    tags three short test sentences with each, and prints a classification
    report that treats the HMM output as reference and the baseline output as
    prediction.

    Args:
        alpha: Smoothing constant passed to ``HMMTagger.train``.
    """
    train_samples = [
        "The/DT House/NNP joined/VBD  the/DT Senate/NNP in/IN making/VBG  federal/JJ reparations/NNS for/IN Japanese-Americans/NNPS"
    ]
    raw_test_samples = [
        "alsjfla the askdmc and djdsaas in making",
        "vsacs the House and djdsaas in giuhun",
        "The House joined  the Senate in making  federal reparations for Japanese-Americans",
    ]
    # Both taggers iterate a sentence element by element, and the Viterbi
    # decoder expects the start marker first and the stop marker last. Passing
    # the raw strings would make them tag individual characters.
    test_samples = [
        [preprocess.SOS] + sample.split() + [preprocess.EOS]
        for sample in raw_test_samples
    ]

    tagger = HMMTagger()
    tagger.train(train_samples, alpha)
    hmm_y = tagger.predict(test_samples)

    # Train the baseline on the same small sample so this stays a quick check
    # that does not depend on the full training corpus being loaded.
    b_tagger = BaselineTagger()
    b_tagger.train(train_samples)
    b_y = b_tagger.predict(test_samples)

    hmm_y = preprocess.preprocess_corpus(hmm_y, preprocess.preprocess_remove_start_stop_tokens)
    b_y = preprocess.preprocess_corpus(b_y, preprocess.preprocess_remove_start_stop_tokens)
    print_result(evaluate(hmm_y, b_y), "debug")


def experiment_baseline():
    """Train the baseline tagger and print its dev and test reports.

    Reads the module-level ``train_corpus``, ``dev_corpus``, ``test_corpus``,
    ``gold_dev`` and ``gold_test``.
    """
    tagger = BaselineTagger()
    tagger.train(train_corpus)

    def _report(corpus, gold, label):
        # Tag, strip the start/stop markers, then score against the gold data.
        tagged = tagger.predict(corpus)
        tagged = preprocess.preprocess_corpus(
            tagged, preprocess.preprocess_remove_start_stop_tokens
        )
        print_result(evaluate(gold, tagged), label)

    _report(dev_corpus, gold_dev, "Baseline_Dev")
    _report(test_corpus, gold_test, "Baseline_Test")


def experiment_hmm(alpha=1):
    """Train the HMM tagger and print dev/test reports plus a confusion matrix.

    Reads the module-level ``train_corpus``, ``dev_corpus``, ``test_corpus``,
    ``gold_dev`` and ``gold_test``.

    Args:
        alpha: Smoothing constant passed to ``HMMTagger.train``; also shown in
            the printed headings.
    """
    tagger = HMMTagger()
    tagger.train(train_corpus, alpha)

    def _tag_and_strip(corpus):
        # Decode, then drop the start/stop markers from each prediction.
        tagged = tagger.predict(corpus)
        return preprocess.preprocess_corpus(
            tagged, preprocess.preprocess_remove_start_stop_tokens
        )

    # dev
    dev_tagged = _tag_and_strip(dev_corpus)
    dev_report = evaluate(gold_dev, dev_tagged)
    print_result(dev_report, f"HMM_Dev with alpha={alpha}")

    # test
    test_tagged = _tag_and_strip(test_corpus)
    test_report = evaluate(gold_test, test_tagged)
    print_result(test_report, f"HMM_Test with alpha={alpha}")

    # confusion matrix over the flattened test tags
    gold_tags = preprocess.preprocess_flatten_corpus(
        gold_test, preprocess.preprocess_extract_tags
    )
    predicted_tags = preprocess.preprocess_flatten_corpus(
        test_tagged, preprocess.preprocess_extract_tags
    )
    matrix = ConfusionMatrix(gold_tags, predicted_tags)
    print_result(matrix, f"Confusion Matrix with alpha={alpha}")


def experiment_hmm_alpha(alpha=1):
    """Train the HMM tagger with ``alpha`` and return its dev-set report.

    Unlike ``experiment_hmm`` nothing is printed; the report is requested with
    ``output_dict=True`` and returned to the caller.

    Args:
        alpha: Smoothing constant passed to ``HMMTagger.train``.

    Returns:
        The value of ``evaluate(gold_dev, <dev predictions>, output_dict=True)``.
    """
    tagger = HMMTagger()
    tagger.train(train_corpus, alpha)

    # dev: decode, then drop the start/stop markers before scoring
    raw_dev_tags = tagger.predict(dev_corpus)
    dev_tags = preprocess.preprocess_corpus(
        raw_dev_tags, preprocess.preprocess_remove_start_stop_tokens
    )
    report = evaluate(gold_dev, dev_tags, output_dict=True)
    return report

# Run the baseline and the HMM tagger (default alpha=1) on the dev and test
# splits; each call prints its own reports.
experiment_baseline()
experiment_hmm()
# Alternative HMM run with a much smaller smoothing constant:
# experiment_hmm(alpha=pow(10, -5))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Baseline_Dev
              precision    recall  f1-score   support

           #       1.00      1.00      1.00        31
           $       1.00      1.00      1.00      1248
          ''       1.00      0.98      0.99      1168
           (       1.00      1.00      1.00       244
           )       1.00      1.00      1.00       244
           ,       1.00      1.00      1.00      7931
           .       1.00      1.00      1.00      6125
           :       1.00      1.00      1.00       775
          CC       0.99      0.99      0.99      3777
          CD       0.99      0.90      0.94      5766
          DT       0.99      0.98      0.99     12639
          EX       0.86      1.00      0.92       133
          FW       0.53      0.40      0.45        25
          IN       0.95      0.98      0.96     15497
       IN|RB       0.00      0.00      0.00         1
          JJ       0.88      0.84      0.86      9014
         JJR       0.72      0.92      0.81       506
         JJS  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Baseline_Test
              precision    recall  f1-score   support

           #       1.00      1.00      1.00        22
           $       1.00      1.00      1.00      1138
          ''       1.00      0.99      1.00      1423
           (       1.00      1.00      1.00       249
           )       1.00      1.00      1.00       252
           ,       1.00      1.00      1.00      9056
           .       1.00      1.00      1.00      7035
           :       1.00      1.00      1.00       983
          CC       1.00      1.00      1.00      4289
          CD       0.99      0.90      0.94      6023
          DT       0.99      0.99      0.99     14946
          EX       0.89      1.00      0.94       174
          FW       0.35      0.21      0.26        38
          IN       0.94      0.98      0.96     18147
          JJ       0.88      0.86      0.87     10704
         JJR       0.66      0.95      0.78       581
     JJR|RBR       0.00      0.00      0.00         4
         JJS 

  3%|█▏                                      | 233/7863 [00:03<01:48, 70.57it/s]


KeyboardInterrupt: 

In [None]:
## For evaluation against the default NLTK POS tagger

# test_sentences = [get_token_tag_tuples(sent) for sent in test]
# tagged_test_sentences = [nltk.pos_tag([token for token, tag in sentence]) for sentence in test_sentences]
# evaluate(test_sentences, tagged_test_sentences)