### Parts-of-Speech Tagging 

### IMPORTING LIBRARIES

In [1]:
import string
from collections import defaultdict
import pandas as pd
from collections import defaultdict
import math
import numpy as np

### TRAINING DATA 

In [2]:
#Here will use two tagged data sets collected from the Wall Street Journal (WSJ).
with open("C:part2_2_a.pos.txt", 'r') as f:
    lines = f.readlines()
training_corpus = lines

print("\t\tWord", "\tTag\n")

for i in range(5):
    print(f'line number {i+1}: {lines[i]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [3]:
words = [line.split('\t')[0] for line in lines]

#### Training Vocabulary Data

In [4]:
with open("tags vocab.txt", 'r') as f:
    voc_l = f.read().split('\n')

print("A few items of the vocabulary list")
print(voc_l[0:50])
print()
print("A few items at the end of the vocabulary list")
print(voc_l[-50:])

A few items of the vocabulary list
['!', '#', '$', '%', '&', "'", "''", "'40s", "'60s", "'70s", "'80s", "'86", "'90s", "'N", "'S", "'d", "'em", "'ll", "'m", "'n'", "'re", "'s", "'til", "'ve", '(', ')', ',', '-', '--', '--n--', '--unk--', '--unk_adj--', '--unk_adv--', '--unk_digit--', '--unk_noun--', '--unk_punct--', '--unk_upper--', '--unk_verb--', '.', '...', '0.01', '0.0108', '0.02', '0.03', '0.05', '0.1', '0.10', '0.12', '0.13', '0.15']

A few items at the end of the vocabulary list
['yard', 'yards', 'yardstick', 'year', 'year-ago', 'year-before', 'year-earlier', 'year-end', 'year-on-year', 'year-round', 'year-to-date', 'year-to-year', 'yearlong', 'yearly', 'years', 'yeast', 'yelled', 'yelling', 'yellow', 'yen', 'yes', 'yesterday', 'yet', 'yield', 'yielded', 'yielding', 'yields', 'you', 'young', 'younger', 'youngest', 'youngsters', 'your', 'yourself', 'youth', 'youthful', 'yuppie', 'yuppies', 'zero', 'zero-coupon', 'zeroing', 'zeros', 'zinc', 'zip', 'zombie', 'zone', 'zones', 'zonin

In [5]:
vocab = {} 

for i, word in enumerate(sorted(voc_l)): 
    vocab[word] = i       
    
print("Vocabulary dictionary, key is the word, value is a unique integer")
cnt = 0
for k,v in vocab.items():
    print(f"{k}:{v}")
    cnt += 1
    if cnt > 20:
        break

Vocabulary dictionary, key is the word, value is a unique integer
!:0
#:1
$:2
%:3
&:4
':5
'':6
'40s:7
'60s:8
'70s:9
'80s:10
'86:11
'90s:12
'N:13
'S:14
'd:15
'em:16
'll:17
'm:18
'n':19
're:20


#### Frequency Count

In [6]:
#defaultdicts they are a special kind of dictionaries that return the "zero" value of a type if you try to access a key that does not exist.

freq = defaultdict(int)

for word in words:
    freq[word] += 1

In [7]:
vocab = [k for k, v in freq.items() if (v > 1 and k != '\n')]

In [8]:
vocab.sort()

for i in range(4010,4020 ):
    print(vocab[i], end=" ")

Eaton Eaux Eavesdropping Ebensburg Echo Echoing Eckenfelder Eclipse Economic Economics 

#### Dealing With Unknown Words

In [9]:
def assign_unk(word):

    punct = set(string.punctuation)
    
    # Suffixes
    noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffix = ["ate", "ify", "ise", "ize"]
    adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffix = ["ward", "wards", "wise"]


    if any(char.isdigit() for char in word):
        return "--unk_digit--"

    elif any(char in punct for char in word):
        return "--unk_punct--"

    elif any(char.isupper() for char in word):
        return "--unk_upper--"

    elif any(word.endswith(suffix) for suffix in noun_suffix):
        return "--unk_noun--"

    elif any(word.endswith(suffix) for suffix in verb_suffix):
        return "--unk_verb--"

    elif any(word.endswith(suffix) for suffix in adj_suffix):
        return "--unk_adj--"

    elif any(word.endswith(suffix) for suffix in adv_suffix):
        return "--unk_adv--"
    
    return "--unk--"

In [10]:
def get_word_tag(line, vocab):

    if not line.split():
        word = "--n--"
        tag = "--s--"
    else:
        word, tag = line.split()

        if word not in vocab: 

            word = assign_unk(word)
    return word, tag

#### Getting The Tags For Words (Checking)

In [11]:
get_word_tag('\n', vocab)

('--n--', '--s--')

In [12]:
get_word_tag('In\tIN\n', vocab)

('In', 'IN')

In [13]:
get_word_tag('tardigrade\tNN', vocab)

('--unk--', 'NN')

In [14]:
get_word_tag('scrutinize\tVB\n', vocab)

('--unk_verb--', 'VB')

### PREPROCESSING : Merging All Above in Single Function

In [15]:
def preprocess(vocab, data_fp):

    orig = []
    prep = []

    with open(data_fp, "r") as data_file:

        for cnt, word in enumerate(data_file):

            if not word.split():
                orig.append(word.strip())
                word = "--n--"
                prep.append(word)
                continue

            # Handle unknown words
            elif word.strip() not in vocab:
                orig.append(word.strip())
                word = assign_unk(word)
                prep.append(word)
                continue

            else:
                orig.append(word.strip())
                prep.append(word.strip())

    assert(len(orig) == len(open(data_fp, "r").readlines()))
    assert(len(prep) == len(open(data_fp, "r").readlines()))

    return orig, prep


### TESTING DATA 

In [16]:
with open("test_corpus 2_2.txt", 'r') as f:
    y = f.readlines()

print("A sample of the test corpus", len(y))
print(y[0:10])

A sample of the test corpus 34198
['The\tDT\n', 'economy\tNN\n', "'s\tPOS\n", 'temperature\tNN\n', 'will\tMD\n', 'be\tVB\n', 'taken\tVBN\n', 'from\tIN\n', 'several\tJJ\n', 'vantage\tNN\n']


In [17]:
#preprocessing the test corpus without data 
_, prep = preprocess(vocab, "test words.txt")     

print('The length of the preprocessed test corpus: ', len(prep))
print('This is a sample of the test_corpus: ')
print(prep[0:10])

The length of the preprocessed test corpus:  34198
This is a sample of the test_corpus: 
['The', 'economy', "'s", 'temperature', 'will', 'be', 'taken', 'from', 'several', '--unk--']


### Creating The Dictionaries :- Tranition, Emission and Tag Counts 

In [18]:
def create_dictionaries(training_corpus, vocab):
    
    emission_counts = defaultdict(int)
    transition_counts = defaultdict(int)
    tag_counts = defaultdict(int)
    
    prev_tag = '--s--' 
    
    i = 0 
    
    # Each item in the training corpus contains a word and its POS tag
    for word_tag in training_corpus:
        
        i += 1

        word, tag = get_word_tag(word_tag,vocab) 

        transition_counts[(prev_tag, tag)] += 1

        emission_counts[(tag, word)] += 1

        tag_counts[tag] += 1

        prev_tag = tag
        
        
    return emission_counts, transition_counts, tag_counts

In [20]:
emission_counts, transition_counts, tag_counts = create_dictionaries(training_corpus, vocab)

### POS STATES

In [21]:
# get all the POS states
states = sorted(tag_counts.keys())
print(f"Number of POS tags (number of 'states'): {len(states)}")
print("View these POS tags (states)")
print(states)

Number of POS tags (number of 'states'): 46
View these POS tags (states)
['#', '$', "''", '(', ')', ',', '--s--', '.', ':', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``']


The 'states' are the Parts-of-speech designations found in the training data. They will also be referred to as 'tags' or POS in this assignment.

"NN" is noun, singular,
'NNS' is noun, plural.
In addition, there are helpful tags like '--s--' which indicate a start of a sentence.

#### Examples

In [22]:
print("transition examples: ")
for ex in list(transition_counts.items())[:3]:
    print(ex)
print()

print("emission examples: ")
for ex in list(emission_counts.items())[200:203]:
    print (ex)
print()

print("ambiguous word example: ")
for tup,cnt in emission_counts.items():
    if tup[1] == 'back': print (tup, cnt)

transition examples: 
(('--s--', 'IN'), 5050)
(('IN', 'DT'), 32364)
(('DT', 'NNP'), 9044)

emission examples: 
(('DT', 'any'), 721)
(('NN', 'decrease'), 7)
(('NN', 'insider-trading'), 5)

ambiguous word example: 
('RB', 'back') 304
('VB', 'back') 20
('RP', 'back') 84
('JJ', 'back') 25
('NN', 'back') 29
('VBP', 'back') 4


### PREDICTION POS OF TESTING DATA AND CHECKING ACCURACY

In [23]:
def predict_pos(prep, y, emission_counts, vocab, states):
 
    num_correct = 0
    
    # (tag, word) tuples, stored as a set
    all_words = set(emission_counts.keys())
    
    # number of (word, POS) tuples in the corpus 'y'
    total = len(y)
    for word, y_tup in zip(prep, y): 

        # Split the (word, POS) string into a list of two items
        y_tup_l = y_tup.split()
        
        # Verify that y_tup contain both word and POS
        if len(y_tup_l) == 2:

            # Set the true POS label for this word
            true_label = y_tup_l[1]

        else:
            # If the y_tup didn't contain word and POS, go to next word
            continue
    
        count_final = 0
        pos_final = ''
        
        # If the word is in the vocabulary...
        if word in vocab:
            for pos in states:

                # defining the key as the tuple containing the POS and word
                key = (pos,word)

                # checking if the (pos, word) key exists in the emission_counts dictionary
                if key in emission_counts: 

                # geting the emission count of the (pos,word) tuple 
                    count = emission_counts[key]

                    # keeping track of the POS with the largest count
                    if count>count_final: 

                        # updating the final count (largest count)
                        count_final = count

                        # updating the final POS
                        pos_final = pos

            # If the final POS (with the largest count) matches the true POS:
            if pos_final == true_label: 
                
                # Update the number of correct predictions
                num_correct += 1

    accuracy = num_correct / total
    
    return accuracy

In [24]:
accuracy_predict_pos = predict_pos(prep, y, emission_counts, vocab, states)
print(f"Accuracy of prediction using predict_pos is {accuracy_predict_pos:.4f}")

Accuracy of prediction using predict_pos is 0.8658
