In [1]:
#Importing libraries
import math
import sys
import numpy as np
import pandas as pd
import random
import os

In [2]:
# settings: corpus file locations (relative paths, resolved against the
# working directory when the files are opened)
TRAIN_PATH = 'WSJ_02-21.pos'   # tagged training corpus
X_TEST_PATH = 'WSJ_24.words'   # words to tag for evaluation
Y_TEST_PATH = 'WSJ_24.pos'     # answer key for X_TEST_PATH
TEST_PATH = 'WSJ_23.words'     # additional word list (loaded into `test`)

# Placeholders for corpus statistics; the cells below fill them in.
word_set = set()     # distinct words seen in the training corpus
tag_set = set()      # distinct tags seen in the training corpus
len_word_set = 0     # size of word_set (an int, so it starts at 0 rather than as a set)
len_tag_set = 0      # size of tag_set
tag_count = {}       # tag -> number of training tokens carrying that tag

In [3]:
# Load the tagged training corpus.
#   total   -> list of (word, tag) tuples in file order
#   per_pos -> tag -> list of the split lines that carry that tag
total = []
per_pos = {}
with open(TRAIN_PATH, 'r') as f:
    for line in f:
        token = line.split()
        if not token:
            # blank lines carry no token; skip them
            continue
        total.append((token[0], token[1]))
        per_pos.setdefault(token[1], []).append(token)
word_set = {word for word, _ in total}
tag_set = {tag for _, tag in total}
len_word_set = len(word_set)
len_tag_set = len(tag_set)

In [4]:
# Peek at the first ten (word, tag) pairs read from the training corpus.
print(total[:10])

[('In', 'IN'), ('an', 'DT'), ('Oct.', 'NNP'), ('19', 'CD'), ('review', 'NN'), ('of', 'IN'), ('``', '``'), ('The', 'DT'), ('Misanthrope', 'NN'), ("''", "''")]


In [5]:
# Count how many training tokens carry each tag.
# The dict is rebuilt from scratch here so that re-running this cell does not
# add the corpus on top of counts left over from a previous run.
tag_count = {}
with open(TRAIN_PATH, 'r') as f:
    for line in f:
        broken = line.split()
        if not broken:
            continue  # blank line, no token on it
        tag_count[broken[1]] = tag_count.get(broken[1], 0) + 1

In [6]:
# Display the tag -> token-count mapping built from the training corpus.
tag_count

{'IN': 98554,
 'DT': 81842,
 'NNP': 91466,
 'CD': 36568,
 'NN': 132935,
 '``': 7092,
 "''": 6919,
 'POS': 8701,
 '(': 1366,
 'VBN': 20024,
 'NNS': 59856,
 'VBP': 12491,
 ',': 48727,
 'CC': 23947,
 ')': 1376,
 'VBD': 29889,
 'RB': 30970,
 'TO': 22357,
 '.': 39478,
 'VBZ': 21672,
 'NNPS': 2673,
 'PRP': 17436,
 'PRP$': 8407,
 'VB': 26438,
 'JJ': 61217,
 'MD': 9803,
 'VBG': 14846,
 'RBR': 1768,
 ':': 4772,
 'WP': 2363,
 'WDT': 4294,
 'JJR': 3238,
 'PDT': 370,
 'RBS': 451,
 'WRB': 2143,
 'JJS': 1947,
 '$': 7372,
 'RP': 2662,
 'FW': 234,
 'EX': 863,
 'SYM': 58,
 '#': 142,
 'LS': 36,
 'UH': 97,
 'WP$': 168}

In [7]:
# grab and convert x_test -> (words,)
x_test = []
with open(X_TEST_PATH, 'r') as f:
    for line in f:
        if len(line.split()) != 0:
            token = line.split()
            x_test.append(token[0])

In [8]:
# Peek at the first ten words read from X_TEST_PATH.
print(x_test[:10])

['The', 'economy', "'s", 'temperature', 'will', 'be', 'taken', 'from', 'several', 'vantage']


In [9]:
# grab test answers (words, pos)
y_test = []
with open(Y_TEST_PATH, 'r') as f:
    for line in f:
        token = line.split()
        if len(token) != 0:
            y_test.append((token[0], token[1]))

In [10]:
# Show the first ten (word, tag) answer pairs and the number of tokens in the key.
print(y_test[:10])
print(len(y_test))

[('The', 'DT'), ('economy', 'NN'), ("'s", 'POS'), ('temperature', 'NN'), ('will', 'MD'), ('be', 'VB'), ('taken', 'VBN'), ('from', 'IN'), ('several', 'JJ'), ('vantage', 'NN')]
32853


In [11]:
# Read the words listed in TEST_PATH: the first field of every non-blank line.
test = []
with open(TEST_PATH, 'r') as f:
    for line in f:
        fields = line.split()
        if fields:
            test.append(fields[0])

In [12]:
# Peek at the first ten words read from TEST_PATH.
print(test[:10])

['No', ',', 'it', 'was', "n't", 'Black', 'Monday', '.', 'But', 'while']


In [13]:
def p_w_given_t(word, tag, train_bag = total, per_pos = per_pos):
    """Return the two counts behind the emission estimate P(word | tag).

    Parameters
    ----------
    word : the word to look up.
    tag : the tag to condition on; must be a key of ``per_pos``.
    train_bag : not used by this function.
    per_pos : mapping from tag to the entries recorded for that tag, where
        each entry holds the word at index 0.

    Returns
    -------
    (matches, tag_total) : how many entries of ``tag`` have ``word`` at
        index 0, and how many entries ``tag`` has in total. The caller
        divides the first by the second to get the probability.

    Note: the default arguments are bound when this function is defined, so
    rebinding the module-level ``total`` / ``per_pos`` afterwards has no
    effect until the definition is executed again.
    """
    entries = per_pos[tag]
    matches = sum(1 for entry in entries if entry[0] == word)
    return (matches, len(entries))

In [15]:
def transition_probabilties(curr, prev, train_bag = total):
    """Return the two counts behind the transition estimate P(curr | prev).

    Parameters
    ----------
    curr : the tag in the second position of the pair.
    prev : the tag in the first position of the pair.
    train_bag : sequence of pairs whose element at index 1 is the tag.

    Returns
    -------
    (pair_count, prev_count) : the number of positions where ``prev`` is
        immediately followed by ``curr``, and the number of occurrences of
        ``prev`` anywhere in the tag sequence (the final position included).
    """
    tags = [pair[1] for pair in train_bag]
    prev_count = sum(1 for tag in tags if tag == prev)
    # Pair every tag with its successor; the last tag has none.
    pair_count = sum(
        1
        for first, second in zip(tags, tags[1:])
        if first == prev and second == curr
    )
    return (pair_count, prev_count)

In [17]:
# transition matrix
# Row = previous tag, column = current tag; each entry is
#   count(prev immediately followed by curr) / count(prev)
# over the tag sequence of `total`. All counts are gathered in one pass over
# the corpus instead of re-scanning it for every (prev, curr) pair.
# Assumes tag_set is exactly the set of tags occurring in `total`.
tag_order = list(tag_set)  # one fixed ordering shared by rows, columns and labels
tag_to_row = {tag: row for row, tag in enumerate(tag_order)}
tag_ids = np.array([tag_to_row[pair[1]] for pair in total], dtype=np.intp)

bigram_counts = np.zeros((len(tag_order), len(tag_order)), dtype=np.int64)
# np.add.at accumulates correctly when the same (prev, curr) index repeats,
# which plain fancy-index assignment would not.
np.add.at(bigram_counts, (tag_ids[:-1], tag_ids[1:]), 1)
prev_counts = np.bincount(tag_ids, minlength=len(tag_order))

transition_matrix = (bigram_counts / prev_counts[:, None]).astype('float32')
df_transition = pd.DataFrame(transition_matrix, columns = tag_order, index = tag_order)

In [18]:
def predict(df = df_transition,test = x_test, train_bag = total, tag_set = tag_set):
    """Tag ``test`` left to right, picking the best tag for each word greedily.

    For every word, each candidate tag is scored as
    emission(word | tag) * transition(tag | previously chosen tag) and the
    highest-scoring tag is kept. Only the single tag chosen for the previous
    word is carried forward; earlier choices are never revisited.

    Parameters
    ----------
    df : transition table looked up as ``df.loc[previous_tag, tag]``.
    test : sequence of words to tag.
    train_bag : not used by this function.
    tag_set : the candidate tags.

    Returns
    -------
    list of (word, tag) tuples, one per element of ``test``.

    Notes
    -----
    * Emission counts come from ``p_w_given_t`` called with its own defaults.
    * The first word is scored as if the previous tag were '.'.
    * When every score is 0 (e.g. the word never occurs with any tag), the tag
      with the highest transition probability alone is chosen.
    * Progress is printed every 100 words.
    * Default arguments are bound when the function is defined.
    """
    state = []
    T = list(tag_set)
    for k, w in enumerate(test):
        if k % 100 == 0: print('Predict Progress: ', k/len(test))
        prev_tag = '.' if k == 0 else state[-1]
        # Fetch the row for the previous tag once instead of once per candidate.
        transition_row = df.loc[prev_tag]
        p = []
        p_transition = []
        for tag in T:
            transition_p = transition_row[tag]
            # One call yields both counts; each call scans the tag's entries.
            matches, tag_total = p_w_given_t(w, tag)
            emission_p = matches / tag_total
            p.append(emission_p * transition_p)
            p_transition.append(transition_p)
        pmax = max(p)
        if pmax == 0:
            # OOV handling : possibility 1 from slides -> only using transition probability
            state_max = T[p_transition.index(max(p_transition))]
        else:
            state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(test, state))

In [None]:
# Tag the x_test words using predict()'s default arguments (the transition
# table, x_test, training pairs and tag set as bound when predict was defined).
# res is a list of (word, predicted_tag) tuples.
res = predict()

In [4]:
def score (keyFileName, responseFileName):
    """Compare a tagged response file with its answer key and print the accuracy.

    Both files are read line by line and compared position by position. A
    non-empty line must hold exactly two tab-separated fields (token, tag);
    an empty line is a sentence break and is not scored.

    Parameters
    ----------
    keyFileName : path of the answer-key file.
    responseFileName : path of the file to evaluate.

    Returns
    -------
    None. The number of correct tags and the accuracy (as a percentage) are
    printed. If no token is scored, the accuracy is reported as 0.

    Raises
    ------
    ValueError
        If the files have different numbers of lines, the response lacks a
        sentence break where the key has one, a line does not have exactly
        two tab-separated fields, or the tokens on a line differ. An
        exception is used rather than ``exit()`` so that execution reliably
        stops at the first problem, including inside a notebook kernel.
    """
    # Context managers make sure both handles are closed.
    with open(keyFileName, 'r') as keyFile:
        key = keyFile.readlines()
    with open(responseFileName, 'r') as responseFile:
        response = responseFile.readlines()
    if len(key) != len(response):
        # Report the difference in line counts (not in file-name lengths).
        raise ValueError("length mismatch between key and submitted file of: "
                         + str(len(key) - len(response)))
    correct = 0
    incorrect = 0
    for i, (keyLine, responseLine) in enumerate(zip(key, response)):
        keyLine = keyLine.rstrip('\r\n')
        responseLine = responseLine.rstrip('\r\n')
        if keyLine == "":
            if responseLine == "":
                continue
            raise ValueError("sentence break expected at line " + str(i))
        keyFields = keyLine.split('\t')
        if len(keyFields) != 2:
            raise ValueError("format error in key at line " + str(i) + ":" + keyLine)
        keyToken, keyPos = keyFields
        responseFields = responseLine.split('\t')
        if len(responseFields) != 2:
            raise ValueError("format error at line " + str(i))
        responseToken, responsePos = responseFields
        if responseToken != keyToken:
            raise ValueError("token mismatch at line " + str(i))
        if responsePos == keyPos:
            correct = correct + 1
        else:
            incorrect = incorrect + 1
    scored = correct + incorrect
    print (str(correct) + " out of " + str(scored) + " tags correct")
    # Guard against files that contain no scored tokens at all.
    accuracy = 100.0 * correct / scored if scored else 0.0
    print("  accuracy: %f" % accuracy)

In [5]:
# Token-level accuracy of the predictions in res against the answer key y_test.
# Both lists hold (word, tag) tuples, so a position counts as correct only when
# the whole tuple matches.
# NOTE(review): the printed label says "Viterbi", but res comes from predict(),
# which keeps one tag per word as it goes; confirm the label is intended.
check = [predicted for predicted, gold in zip(res, y_test) if predicted == gold]
accuracy = len(check) / len(res)
print('Viterbi Algorithm Accuracy: ', accuracy*100)

Viterbi Algorithm Accuracy:  93.76921437920433


In [23]:
# Write the predicted tags in score.py format: one "word<TAB>tag" line per
# token, with an empty line wherever Y_TEST_PATH has an empty line.
# The key file is read first, so the output file is only created (or
# truncated) once the input has been read successfully.
with open(Y_TEST_PATH, 'r') as f:
    dummy = [line.split() for line in f]
with open('training.txt','w') as outf:
    count = 0  # blank lines seen so far; res has no entries for them
    for i, fields in enumerate(dummy):
        if len(fields) == 0:
            outf.write('\n')
            count += 1
        else:
            # Every token is written the same way, whatever its tag in the key.
            word, tag = res[i - count]
            outf.write(word + '\t' + tag + '\n')
            

In [21]:
# Score the written predictions ('training.txt') against the key file using
# score(), i.e. the score.py submission format.
# Beware of extra sentence breaks in the key file.
score(Y_TEST_PATH, 'training.txt')

30806 out of 32853 tags correct
  accuracy: 93.769214
