In [8]:
import numpy as np
from collections import Counter
import codecs

In [9]:
class language_model:
    """Unigram language model with perplexity and sparsity evaluation.

    Text is lower-cased, stripped of punctuation and split into sentences on
    ``.``, ``!``, ``:`` and ``?``. Every sentence is wrapped in ``<s>`` /
    ``</s>`` markers, and those markers are counted like any other token.

    Only ``ngram == 1`` is implemented by ``train`` and ``test``.

    Attributes set by ``train`` (unigram case):
        c: ``Counter`` of token occurrences in the training text.
        uni_data: dict mapping each training token to its relative frequency.
        uni_count: total number of training tokens.
        uni_data_count: number of distinct training tokens.

    Attributes set by ``test``:
        zero_count: number of test tokens that were not seen in training.
        sparsity: ``zero_count`` divided by the number of test tokens
            (``0`` when every test token was seen).
    """

    # Punctuation deleted outright by text_clean. The backslash is escaped
    # explicitly; a bare "\|" is an invalid escape sequence.
    _REMOVED = "\"'@#$%^&*()[]{};,/<>\\|`~=_+"
    # One table for the whole clean-up: "!", ":" and "?" become ".", "-"
    # becomes a space, and everything in _REMOVED is dropped.
    _TRANSLATION = str.maketrans("!:?-", "... ", _REMOVED)

    def __init__(self, ngram=1):
        """Store the n-gram order; nothing is trained yet."""
        self.ngram = ngram

    def unigram(self, text):
        """Estimate unigram relative frequencies.

        Args:
            text: iterable of sentences (whitespace-separated token strings).

        Returns:
            ``(data, total)`` where ``data`` maps each token to
            ``count / total`` and ``total`` is the number of tokens, so the
            values of ``data`` sum to 1. Empty input gives ``({}, 0)``.

        Side effects:
            Stores the raw token counts in ``self.c``.
        """
        words = [word for sentence in text for word in sentence.split()]
        self.c = Counter(words)
        total = len(words)
        # Normalise by the token count (not the vocabulary size) so the
        # result is a probability distribution.
        data = {word: count / total for word, count in self.c.items()}
        return data, total

    def text_clean(self, text):
        """Normalise raw text into a list of marked-up sentences.

        Args:
            text: raw text as a single string.

        Returns:
            List of strings of the form ``'<s> tok tok ... </s>'``, one per
            non-empty sentence, with tokens separated by single spaces.
        """
        normalised = text.strip('\n').lower().replace('\n', ' ')
        normalised = normalised.translate(self._TRANSLATION)

        sentences = []
        for fragment in normalised.split('.'):
            tokens = fragment.split()
            # Skip empty and whitespace-only fragments; otherwise they would
            # become "<s>  </s>" and inflate the marker counts.
            if tokens:
                sentences.append('<s> ' + ' '.join(tokens) + ' </s>')
        return sentences

    def ngram_generation(self, sentence, n):
        """Return the consecutive n-grams of ``sentence`` as a list of tuples.

        Args:
            sentence: whitespace-separated token string.
            n: n-gram order.

        Returns:
            List of ``n``-tuples of tokens, in order of appearance. The list
            is empty when the sentence has fewer than ``n`` tokens.
        """
        tokens = sentence.split()
        return list(zip(*(tokens[i:] for i in range(n))))

    def perplexity(self, data):
        """Compute ``2 ** (-mean(log2(p)))`` over the given probabilities.

        Args:
            data: either a mapping whose values are probabilities, or an
                iterable of probabilities.

        Returns:
            The perplexity as a NumPy float.

        Raises:
            ValueError: if ``data`` contains no probabilities.
        """
        probabilities = list(data.values()) if hasattr(data, "values") else list(data)
        if not probabilities:
            raise ValueError("perplexity is undefined for an empty set of probabilities")
        log_probs = np.log2(np.asarray(probabilities, dtype=float))
        return np.power(2, -log_probs.mean())

    def train(self, file_name):
        """Fit the model on the text stored in ``file_name``.

        For ``ngram == 1`` this sets ``uni_data``, ``uni_count`` and
        ``uni_data_count`` (plus ``c`` via ``unigram``). Other orders are not
        implemented and leave the model untouched.
        """
        with open(file_name, 'r') as f:
            text = f.read()
        clean_text = self.text_clean(text)

        if self.ngram == 1:
            # Keys stay plain token strings: that is what test() looks up.
            self.uni_data, self.uni_count = self.unigram(clean_text)
            self.uni_data_count = len(self.uni_data)

    def test(self, file_name):
        """Evaluate the trained model on the text stored in ``file_name``.

        Every token of the test text contributes one probability. A token not
        seen in training is given ``1 / uni_count`` (the probability of a
        token seen exactly once) and is counted in ``zero_count``.

        Returns:
            The perplexity of the test text.

        Raises:
            NotImplementedError: if ``ngram != 1``.
            RuntimeError: if ``train`` has not been called.
            ValueError: if the training text or the test text had no tokens.
        """
        if self.ngram != 1:
            raise NotImplementedError("only unigram models (ngram=1) are supported")
        if not hasattr(self, "uni_data"):
            raise RuntimeError("train() must be called before test()")
        if not self.uni_count:
            raise ValueError("the model was trained on empty text")

        with open(file_name, 'r') as f:
            test_text = f.read()
        clean_test_text = self.text_clean(test_text)

        self.zero_count = 0
        self.sparsity = 0
        unseen_probability = 1 / self.uni_count

        # One entry per token (not per distinct word), so repeated words
        # weigh in as often as they occur.
        probabilities = []
        for sentence in clean_test_text:
            for word in sentence.split():
                if word in self.uni_data:
                    probabilities.append(self.uni_data[word])
                else:
                    probabilities.append(unseen_probability)
                    self.zero_count += 1

        if self.zero_count:
            self.sparsity = self.zero_count / len(probabilities)
        return self.perplexity(probabilities)


In [None]:
# Build a model with ngram=1, train it on ted.txt and evaluate it on
# test.ted.txt. The value returned by test() is shown as the cell output.
lm1 = language_model(ngram=1)
lm1.train("ted.txt")
lm1.test("test.ted.txt")

In [None]:
# Train lm1 on ted.txt again, then evaluate it on test.news.txt.
news_test_path = "test.news.txt"
lm1.train("ted.txt")
lm1.test(news_test_path)

In [None]:
# Train lm1 on ted.txt again, then evaluate it on test.reddit.txt.
reddit_test_path = "test.reddit.txt"
lm1.train("ted.txt")
lm1.test(reddit_test_path)