In [1]:
import pickle
import math
import sys
import os.path

## Creating Word and POS Elements

In [2]:
class Word_POS:
    """One corpus token: a word and, for tagged data, its POS tag.

    Tagged tokens are written ``word<delim>tag``.  Only the LAST delimiter
    separates the tag, so the word itself may contain the delimiter
    (``"1/2/CD"`` gives word ``"1/2"`` and tag ``"CD"``).
    """

    # Separator between the word and its tag in a tagged token.
    delim = "/"

    def __init__(self, data, is_training):
        """Build a token from one whitespace-separated corpus item.

        Args:
            data: The raw token text.
            is_training: True when ``data`` carries a tag (``word<delim>tag``),
                False when it is a bare word.
        """
        if is_training:
            # rpartition splits on the last delimiter.  A token with no
            # delimiter at all yields an empty word and the whole text as the
            # tag, which is what the previous split/join code produced too.
            self.word, _, self.tag = data.rpartition(self.delim)
        else:
            self.word = data
            # Always define `tag` so untagged tokens can be inspected without
            # raising AttributeError.
            self.tag = None

In [3]:
class DataParser:
    """Reads a whitespace-tokenised corpus file into lists of Word_POS tokens.

    Only the first ``(path, is_tagged)`` entry of ``corpus_files`` is read.
    """

    def __init__(self, corpus_files):
        """Parse the first corpus file.

        Args:
            corpus_files: Sequence of ``(path, is_tagged)`` pairs; only
                element 0 is used.

        Raises:
            FileNotFoundError: If the path is empty or is not an existing file.
        """
        self.train_tagged, is_train_1 = corpus_files[0]

        self.train_sentences = []

        self._parse_file(self.train_tagged, self.train_sentences, is_training=is_train_1)

    def get_training_data(self):
        """Return the parsed sentences (the internal list itself, not a copy)."""
        return self.train_sentences

    def _parse_file(self, filename, lst, is_training):
        """Append one list of Word_POS tokens per line of ``filename`` to ``lst``.

        Blank lines are kept as empty sentences so that sentence indices stay
        aligned with line numbers in the file.

        Raises:
            FileNotFoundError: If ``filename`` is empty or not an existing file.
        """
        if not filename or not os.path.isfile(filename):
            # FileNotFoundError is an Exception subclass, so callers that
            # caught the previous bare Exception keep working.
            raise FileNotFoundError("File not Found: {!r}".format(filename))

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which differs between operating systems.
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                # str.split() with no arguments already ignores leading and
                # trailing whitespace, including the newline.
                lst.append([Word_POS(token, is_training) for token in line.split()])

In [4]:
class K_Fold_Cross_Validation:
    """Splits training sentences into consecutive folds for cross-validation.

    Every fold holds ``len(training_data) // k`` sentences (at least one).
    When the data does not divide evenly, the leftover sentences form one
    extra, shorter fold, so ``portions`` may contain ``k + 1`` folds.
    """

    def __init__(self, k, training_data):
        """Partition ``training_data`` into folds, preserving sentence order.

        Args:
            k: Desired number of folds; must be positive.
            training_data: Iterable of sentences.

        Raises:
            ValueError: If ``k`` is not positive.
        """
        if k <= 0:
            raise ValueError("k must be a positive number of folds")

        sentences = list(training_data)
        # Never let the fold size drop to 0 (fewer sentences than folds):
        # that would make the chunking below impossible.
        fold_size = max(1, int(len(sentences) // k))
        self.portions = [
            sentences[start:start + fold_size]
            for start in range(0, len(sentences), fold_size)
        ]

    def get_datas(self, index=0):
        """Return ``(held_out_fold, list_of_other_folds, index + 1)``.

        The other folds are returned as a list of folds, not flattened.
        """
        held_out = self.portions[index]
        remaining = [portion for i, portion in enumerate(self.portions) if i != index]
        return held_out, remaining, index + 1

In [7]:
class HMM:
    def __init__(self, corpus_files):
        """Load the training corpus and create the empty model tables.

        Args:
            corpus_files: Passed unchanged to DataParser.
        """
        # Corpus access.
        self.parser = DataParser(corpus_files)

        # Raw counts, filled by populate_dictionaries().
        self.num = 0                  # total number of tokens counted
        self.tags = set()             # distinct tags
        self.words = set()            # distinct words
        self.word_to_tag = {}         # word -> set of tags seen with it
        self.word_tag_count = {}      # (word, tag) -> count
        self.tag_count = {}           # tag -> count
        self.bigrams = {}             # (tag, tag) context -> count
        self.trigrams = {}            # (tag, tag, tag) -> count

        # Unsmoothed / add-one log-probabilities.
        self.words_given_pos = {}             # filled by ProbWordGivenTag()
        self.pos3_given_pos2_and_pos1 = {}    # filled by ProbTrigramTags()

        # Backoff tables: counts first, turned into probabilities by
        # BackoffProbabilities().
        self.transition_backoff = {}
        self.emission_backoff = {}

        # Singleton counts, filled by SingletonCounts().
        self.transition_singleton = {}
        self.emission_singleton = {}

        # Smoothed log-probabilities, filled by SmoothedProbabilities().
        self.transition_one_count = {}
        self.emission_smoothed = {}

    def calculate_probabilities(self):
        self.populate_dictionaries()
        self.ProbWordGivenTag()
        self.ProbTrigramTags()
        self.BackoffProbabilities()
        self.SingletonCounts()
        self.SmoothedProbabilities()
        self._save()

    def populate_dictionaries(self):
        self.pos_tags = set()
        for sentence in self.parser.get_training_data():

            sentence.insert(0, Word_POS('$^START^$' + Word_POS.delim + '$^START^$', is_training=True ))
            sentence.insert(0, Word_POS('$^START^$' + Word_POS.delim + '$^START^$', is_training=True ))

            start_index = 2
            for i in range(start_index, len(sentence)):

                trigram_triplet = (( sentence[i - 2]).tag, (sentence[i - 1]).tag, (sentence[i]).tag )
                bigram_tuple = (( sentence[i - 2]).tag, (sentence[i - 1]).tag )
                self.trigrams[trigram_triplet] = self.trigrams.get(trigram_triplet, 0) + 1
                self.bigrams[bigram_tuple] = self.bigrams.get(bigram_tuple, 0) + 1

            for i, atom in enumerate(sentence):

                word = atom.word
                tag = atom.tag
                self.num += 1
                self.transition_backoff[tag] = self.transition_backoff.get(tag, 0) + 1
                self.emission_backoff[word] = self.emission_backoff.get(word, 0) + 1
                self.tags.add(tag)
                self.words.add(word)
                self.word_tag_count[ (word, tag) ] =  self.word_tag_count.get((word, tag), 0) + 1
                self.tag_count[ tag ] = self.tag_count.get(tag, 0) + 1
                if word not in self.word_to_tag:
                    self.word_to_tag[ word ] = set()
                self.word_to_tag[ word ].add(tag)

        print(self.bigrams)
        print(self.trigrams)

    def BackoffProbabilities(self):
        V = len(self.tags)
        print(self.num, V)
        for word in self.emission_backoff:
            self.emission_backoff[word] = float(1 + self.emission_backoff[word]) / float(self.num + V)
        for tag in self.transition_backoff:
            self.transition_backoff[tag] = float(self.transition_backoff[tag]) / float(self.num)

    def SmoothedProbabilities(self):
        start_index = 2
        for sentence in self.parser.get_training_data():
            for i in range(start_index, len(sentence)):
                trigram_triplet = ( (sentence[i - 2]).tag , (sentence[i- 1]).tag, (sentence[i]).tag)
                bigram_tuple = ( (sentence[i - 2]).tag, (sentence[i- 1]).tag )
                lamda = self.transition_singleton.get(bigram_tuple, 0) + 1
                self.transition_one_count[trigram_triplet] = math.log(float(self.trigrams[trigram_triplet] + lamda * self.transition_backoff[sentence[i].tag]) / float(self.bigrams[bigram_tuple] + lamda))

        for word, tags_set in self.word_to_tag.items():
            for tag in tags_set:
                lamda = 1 + self.emission_singleton.get(tag, 0)
                self.emission_smoothed[(word, tag)] = math.log(float(self.word_tag_count[(word, tag)] + lamda * self.emission_backoff[word]) / float(self.tag_count[tag] + lamda))
                
    def SingletonCounts(self):
        for i, tag_1 in enumerate(self.tags):
            for j, tag_2 in enumerate(self.tags):
                for k, tag_3 in enumerate(self.tags):
                    if i != j and i != k and j != k:
                        triplet = (tag_3, tag_2, tag_1)
                        if triplet in self.trigrams and self.trigrams[triplet] == 1:
                            self.transition_singleton[(tag_3, tag_2)] = self.transition_singleton.get((tag_3, tag_2), 0) + 1
        for word in self.words:
            for tag in self.tags:
                word_tag = (word, tag)
                if word_tag in self.word_tag_count and self.word_tag_count[word_tag] == 1:
                    self.emission_singleton[tag] = self.emission_singleton.get(tag, 0) + 1


    def _save(self):
        dictionaries = {"unique_tags" : self.tags, "bigram" : self.bigrams, "transmission" : self.pos3_given_pos2_and_pos1, "emission" : self.words_given_pos, "word2tag" : self.word_to_tag,
                        "transition_backoff" : self.transition_backoff, "emission_backoff" : self.emission_backoff,
                        "transition_singleton" : self.transition_singleton, "emission_singleton" : self.emission_singleton,
                        "transition_smoothed" : self.transition_one_count, "emission_smoothed" : self.emission_smoothed,
                        "tag_count" : self.tag_count, "n" : self.num}
        output = open('hmmmodel.txt', 'wb')
        pickle.dump(dictionaries, output)
        output.close()

    def ProbWordGivenTag(self):
        for word, tags_set in self.word_to_tag.items():
            for tag in tags_set:
                self.words_given_pos[(word, tag)] = math.log(float(self.word_tag_count[(word, tag)]) / float(self.tag_count[tag]))

    def ProbTrigramTags(self):
        start_index = 2
        V = len(self.tags)
        for sentence in self.parser.get_training_data():
            for i in range(start_index, len(sentence)):
                trigram_triplet = ((sentence[i- 2]).tag, (sentence[i- 1]).tag, (sentence[i]).tag)
                bigram_tuple = ((sentence[i- 2]).tag, (sentence[i- 1]).tag)
                self.pos3_given_pos2_and_pos1[trigram_triplet] = math.log(float(1 + self.trigrams[trigram_triplet]) / float(V + self.bigrams[bigram_tuple]))

In [8]:
# Train the tagger on the tagged corpus; calculate_probabilities() also
# writes the model file.
filename = "./en_train_tagged.txt"
corpus_spec = (filename, True)  # (path, is_tagged)
hmm = HMM([corpus_spec, corpus_spec, corpus_spec])
hmm.calculate_probabilities()

{('$^START^$', '$^START^$', 'NNP'): 1455, ('$^START^$', 'NNP', 'HYPH'): 11, ('NNP', 'HYPH', 'NNP'): 132, ('HYPH', 'NNP', ':'): 3, ('NNP', ':', 'JJ'): 5, (':', 'JJ', 'NNS'): 3, ('JJ', 'NNS', 'VBD'): 51, ('NNS', 'VBD', 'NNP'): 6, ('VBD', 'NNP', 'NNP'): 35, ('NNP', 'NNP', 'NNP'): 682, ('NNP', 'NNP', 'HYPH'): 31, ('HYPH', 'NNP', ','): 21, ('NNP', ',', 'DT'): 121, (',', 'DT', 'NN'): 262, ('DT', 'NN', 'IN'): 2315, ('NN', 'IN', 'DT'): 2057, ('IN', 'DT', 'NN'): 3472, ('NN', 'IN', 'NNP'): 888, ('IN', 'NNP', ','): 356, ('NNP', ',', 'IN'): 73, (',', 'IN', 'DT'): 168, ('IN', 'DT', 'JJ'): 1245, ('DT', 'JJ', 'NN'): 2218, ('JJ', 'NN', '.'): 747, ('$^START^$', '$^START^$', '-LRB-'): 137, ('$^START^$', '-LRB-', 'DT'): 18, ('-LRB-', 'DT', 'NN'): 28, ('JJ', 'NN', 'MD'): 67, ('NN', 'MD', 'VB'): 332, ('MD', 'VB', 'VBG'): 82, ('VB', 'VBG', 'PRP'): 11, ('VBG', 'PRP', 'NN'): 5, ('PRP', 'NN', 'IN'): 7, ('NN', 'IN', 'NNS'): 386, ('IN', 'NNS', 'TO'): 18, ('NNS', 'TO', 'VB'): 166, ('TO', 'VB', '.'): 115, ('VB', '