Language Models and Smoothing

In [None]:
import os.path
import sys
import random
from operator import itemgetter
from collections import defaultdict
from collections import Counter
import random
import math

In [None]:
def load_corpus(file_path):
    """Read a whitespace-tokenized corpus with one sentence per line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of sentences, each a list of the whitespace-separated tokens
        of one line (an empty line yields an empty list).

    Raises:
        SystemExit: If ``file_path`` is not an existing regular file (an
            error message is printed first).
    """
    if not os.path.isfile(file_path):
        print("Error: corpus file ", file_path, " does not exist")
        sys.exit()

    print("Reading file ", file_path)
    corpus = []
    # The context manager guarantees the handle is closed, even if reading
    # fails part-way through.
    with open(file_path, "r") as corpus_file:
        for count, line in enumerate(corpus_file, start=1):
            corpus.append(line.split())
            # Progress goes to stderr so it does not mix with regular output.
            if count % 1000 == 0:
                sys.stderr.write("Reading sentence " + str(count) + "\n")
    return corpus


def preprocess_corpus(corpus):
    """Prepare a training corpus in place and return it.

    Tokens that occur fewer than two times across the whole corpus are
    replaced by ``UNK``; afterwards every sentence is wrapped with the
    ``START`` and ``END`` markers.

    Args:
        corpus: List of sentences, each a mutable list of tokens. The lists
            are modified in place.

    Returns:
        The same ``corpus`` object that was passed in.
    """
    frequencies = Counter(token for line in corpus for token in line)

    # First pass: collapse rare tokens into the unknown-word symbol.
    for line in corpus:
        for position, token in enumerate(line):
            if frequencies[token] <= 1:
                line[position] = UNK

    # Second pass: add the sentence boundary markers.
    for line in corpus:
        line.insert(0, START)
        line.append(END)

    return corpus


def preprocess_test_data(vocabulary, test_corpus):
    """Prepare a test corpus in place and return it.

    Every token that is not a member of ``vocabulary`` is replaced by
    ``UNK``; afterwards every sentence is wrapped with the ``START`` and
    ``END`` markers.

    Args:
        vocabulary: Container supporting ``in`` that holds the known tokens.
        test_corpus: List of sentences, each a mutable list of tokens. The
            lists are modified in place.

    Returns:
        The same ``test_corpus`` object that was passed in.
    """
    # First pass: map out-of-vocabulary tokens to the unknown-word symbol.
    for line in test_corpus:
        for position, token in enumerate(line):
            if token in vocabulary:
                continue
            line[position] = UNK

    # Second pass: add the sentence boundary markers.
    for line in test_corpus:
        line.insert(0, START)
        line.append(END)

    return test_corpus

# Special tokens shared by the preprocessing functions.
# UNK stands in for training words seen fewer than two times and for test
# words that are missing from the vocabulary.
UNK = "UNK"
# START is inserted at the front of every preprocessed sentence.
START = "<s>"
# END is appended to every preprocessed sentence.
END = "</s>"



In [None]:
# Load the training sentences, then replace rare words with UNK and add the
# sentence boundary markers.
trainCorpus = load_corpus('train.txt')
trainCorpus = preprocess_corpus(trainCorpus)

# Load the two held-out test sets.
posTestCorpus = load_corpus('pos_test.txt')
negTestCorpus = load_corpus('neg_test.txt')
# The vocabulary is built from the already preprocessed training data, so it
# also contains the boundary markers (and UNK whenever a rare word was replaced).
vocab = set(word for sentence in trainCorpus for word in sentence)

# Map out-of-vocabulary test words to UNK and add the boundary markers.
posTestCorpus = preprocess_test_data(vocab, posTestCorpus)
negTestCorpus = preprocess_test_data(vocab, negTestCorpus)


Reading file  train.txt


Reading sentence 1000
Reading sentence 2000
Reading sentence 3000
Reading sentence 4000
Reading sentence 5000
Reading sentence 6000
Reading sentence 7000
Reading sentence 8000
Reading sentence 9000
Reading sentence 10000
Reading sentence 11000
Reading sentence 12000
Reading sentence 13000
Reading sentence 14000
Reading sentence 15000
Reading sentence 16000
Reading sentence 17000
Reading sentence 18000
Reading sentence 19000
Reading sentence 20000
Reading sentence 21000
Reading sentence 22000
Reading sentence 23000
Reading sentence 24000
Reading sentence 25000
Reading sentence 26000
Reading sentence 27000
Reading sentence 28000
Reading sentence 29000
Reading sentence 30000


Reading file  pos_test.txt
Reading file  neg_test.txt


Reading sentence 1000
Reading sentence 1000


Unsmoothed Unigram

In [None]:
class UnigramModel:
    """Unsmoothed (maximum-likelihood) unigram language model.

    Each token is scored independently as ``count(token) / total_count``
    using the counts gathered by ``train_model``.
    """

    def __init__(self):
        self.vocab = set()
        self.word_counts = defaultdict(int)
        self.total_count = 0

    def train_model(self, corpus):
        """Accumulate token counts from ``corpus`` (an iterable of token lists)."""
        for sentence in corpus:
            for word in sentence:
                self.word_counts[word] += 1
                self.total_count += 1
        self.vocab = set(self.word_counts)

    def generate_sentence(self):
        """Sample tokens in proportion to their counts until "</s>" is drawn.

        Returns:
            A token list that starts with "<s>" and ends with "</s>".
        """
        # Build the candidate lists once instead of once per sampled token.
        # "<s>" is left out so the start marker cannot reappear mid-sentence.
        candidates = [
            word for word, count in self.word_counts.items()
            if word != "<s>" and count > 0
        ]
        weights = [self.word_counts[word] for word in candidates]

        sentence = ["<s>"]
        while True:
            word = random.choices(candidates, weights=weights)[0]
            sentence.append(word)
            if word == "</s>":
                break
        return sentence

    def get_sentence_probability(self, sentence):
        """Return the product of the unigram probabilities of ``sentence``.

        Unseen tokens give a probability of 0. Counts are read with ``.get``
        so that scoring never inserts new keys into ``word_counts``.
        """
        probability = 1.0
        for word in sentence:
            count = self.word_counts.get(word, 0)
            if count == 0:
                return 0.0
            probability *= count / self.total_count
        return probability

    def get_sentence_log_probability(self, sentence):
        """Return the natural-log probability of ``sentence``.

        Summing logs avoids the floating-point underflow that the plain
        product suffers on long sentences. Returns ``-math.inf`` when any
        token is unseen.
        """
        log_probability = 0.0
        for word in sentence:
            count = self.word_counts.get(word, 0)
            if count == 0:
                return -math.inf
            log_probability += math.log(count / self.total_count)
        return log_probability

    def generate_sentences_to_file(self, file_name, num_sentences):
        """Write ``num_sentences`` sampled sentences to ``file_name``.

        Each line holds the sentence probability followed by its tokens.
        """
        with open(file_name, 'w') as file_pointer:
            for _ in range(num_sentences):
                sen = self.generate_sentence()
                prob = self.get_sentence_probability(sen)
                string_generated = str(prob) + " " + " ".join(sen)
                print(string_generated, end="\n", file=file_pointer)

    def compute_perplexity(self, test_corpus):
        """Return the per-token perplexity of ``test_corpus``.

        The token count is the sum of ``len(sentence)``, i.e. it includes
        whatever boundary markers the sentences carry.

        Returns:
            ``exp(-sum(log P(sentence)) / token_count)``, or ``math.inf``
            when at least one sentence has probability zero.

        Raises:
            ZeroDivisionError: If ``test_corpus`` contains no tokens.
        """
        total_log_probability = 0.0
        total_words = 0

        for sentence in test_corpus:
            total_words += len(sentence)
            total_log_probability += self.get_sentence_log_probability(sentence)

        # A zero-probability sentence makes the perplexity infinite; dropping
        # it while still counting its tokens would understate the result.
        if total_log_probability == -math.inf:
            return math.inf

        return math.exp(-total_log_probability / total_words)


In [None]:
# Train the unsmoothed unigram model, write 20 sampled sentences to a file,
# and report the perplexity on both test sets.
model = UnigramModel()
model.train_model(trainCorpus)
model.generate_sentences_to_file('unigram output.txt',20)
print(model.compute_perplexity(posTestCorpus))
print(model.compute_perplexity(negTestCorpus))

579.0175572073653
561.6252784519507


Smoothed Unigram

In [None]:
class SmoothedUnigramModel:
    """Unigram language model with add-one (Laplace) smoothing.

    Each token is scored as ``(count(token) + 1) / (total_count + V)`` where
    ``V`` is the number of distinct tokens seen by ``train_model``.
    """

    def __init__(self):
        self.vocab = set()
        self.word_counts = defaultdict(int)
        self.total_count = 0

    def train_model(self, corpus):
        """Accumulate token counts from ``corpus`` and record the vocabulary."""
        for sentence in corpus:
            for word in sentence:
                self.word_counts[word] += 1
                self.total_count += 1

        self.vocab = set(self.word_counts.keys())

    def generate_sentence(self):
        """Sample tokens in proportion to their raw counts until "</s>" is drawn.

        Returns:
            A token list that starts with "<s>" and ends with "</s>".
        """
        # Build the candidate lists once instead of once per sampled token.
        # "<s>" is left out so the start marker cannot reappear mid-sentence.
        candidates = [
            word for word, count in self.word_counts.items()
            if word != "<s>" and count > 0
        ]
        weights = [self.word_counts[word] for word in candidates]

        sentence = ["<s>"]
        while True:
            word = random.choices(candidates, weights=weights)[0]
            sentence.append(word)
            if word == "</s>":
                break
        return sentence

    def _smoothed_probability(self, word, denominator):
        """Add-one probability of ``word``; never inserts keys into ``word_counts``."""
        return (self.word_counts.get(word, 0) + 1) / denominator

    def get_sentence_probability(self, sentence):
        """Return the product of the add-one probabilities of ``sentence``."""
        probability = 1.0
        denominator = self.total_count + len(self.vocab)
        for word in sentence:
            probability *= self._smoothed_probability(word, denominator)
        return probability

    def get_sentence_log_probability(self, sentence):
        """Return the natural-log probability of ``sentence``.

        Summing logs avoids the floating-point underflow that the plain
        product suffers on long sentences.
        """
        denominator = self.total_count + len(self.vocab)
        return sum(
            math.log(self._smoothed_probability(word, denominator))
            for word in sentence
        )

    def generate_sentences_to_file(self, file_name, num_sentences):
        """Write ``num_sentences`` sampled sentences to ``file_name``.

        Each line holds the sentence probability followed by its tokens.
        """
        with open(file_name, 'w') as file_pointer:
            for _ in range(num_sentences):
                sen = self.generate_sentence()
                prob = self.get_sentence_probability(sen)
                string_generated = str(prob) + " " + " ".join(sen)
                print(string_generated, end="\n", file=file_pointer)

    def compute_perplexity(self, test_corpus):
        """Return the per-token perplexity of ``test_corpus``.

        The token count is the sum of ``len(sentence)``, i.e. it includes
        whatever boundary markers the sentences carry.

        Raises:
            ZeroDivisionError: If ``test_corpus`` contains no tokens.
        """
        total_log_probability = 0.0
        total_words = 0

        for sentence in test_corpus:
            total_words += len(sentence)
            # Log space keeps long sentences from underflowing to zero and
            # being silently dropped from the sum.
            total_log_probability += self.get_sentence_log_probability(sentence)

        return math.exp(-total_log_probability / total_words)

In [None]:
# Train the add-one smoothed unigram model, write 20 sampled sentences to a
# file, and report the perplexity on both test sets.
model = SmoothedUnigramModel()
model.train_model(trainCorpus)
model.generate_sentences_to_file('smooth unigram output.txt',20)
print(model.compute_perplexity(posTestCorpus))
print(model.compute_perplexity(negTestCorpus))

581.5700463677365
564.3740498208495


Unsmoothed Bigram

In [None]:
class UnsmoothedBigramModel:
    """Unsmoothed (maximum-likelihood) bigram language model.

    A token is scored given its predecessor as
    ``count(prev, token) / count(prev)``.
    """

    def __init__(self):
        self.vocab = set()
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.total_count = 0

    def train_model(self, corpus):
        """Accumulate bigram and unigram counts from ``corpus``.

        Empty sentences are skipped because they contribute no tokens.
        """
        for sentence in corpus:
            if not sentence:
                continue
            for i in range(len(sentence) - 1):
                word1, word2 = sentence[i], sentence[i + 1]
                self.bigram_counts[word1][word2] += 1
                self.unigram_counts[word1] += 1
                self.total_count += 1
            # The last token is never a bigram context; count it separately.
            self.unigram_counts[sentence[-1]] += 1
            self.total_count += 1
        self.vocab = set(self.unigram_counts.keys())

    def generate_sentence(self):
        """Sample a sentence by walking the bigram chain from "<s>" to "</s>".

        Successors are drawn in proportion to their bigram counts.
        """
        sentence = ["<s>"]
        while True:
            prev_word = sentence[-1]
            # .get avoids creating an empty entry for an unseen context.
            successors = self.bigram_counts.get(prev_word, {})
            next_word = random.choices(
                list(successors.keys()), weights=list(successors.values())
            )[0]
            sentence.append(next_word)
            if next_word == "</s>":
                break
        return sentence

    def _bigram_probability(self, word1, word2):
        """MLE probability of ``word2`` after ``word1`` (0.0 if the context is unseen).

        Counts are read with ``.get`` so that scoring never inserts zero-count
        entries into the count tables.
        """
        unigram_count = self.unigram_counts.get(word1, 0)
        if unigram_count == 0:
            return 0.0
        bigram_count = self.bigram_counts.get(word1, {}).get(word2, 0)
        return bigram_count / unigram_count

    def get_sentence_probability(self, sentence):
        """Return the product of the bigram probabilities along ``sentence``.

        The first token is only used as context. Returns 0 as soon as an
        unseen context or unseen bigram is met.
        """
        probability = 1.0
        for i in range(len(sentence) - 1):
            factor = self._bigram_probability(sentence[i], sentence[i + 1])
            if factor == 0:
                return 0.0
            probability *= factor
        return probability

    def get_sentence_log_probability(self, sentence):
        """Return the natural-log probability of ``sentence``.

        Summing logs avoids floating-point underflow on long sentences.
        Returns ``-math.inf`` when any bigram has probability zero.
        """
        log_probability = 0.0
        for i in range(len(sentence) - 1):
            factor = self._bigram_probability(sentence[i], sentence[i + 1])
            if factor == 0:
                return -math.inf
            log_probability += math.log(factor)
        return log_probability

    def generate_sentences_to_file(self, file_name, num_sentences):
        """Write ``num_sentences`` sampled sentences to ``file_name``.

        Each line holds the sentence probability followed by its tokens.
        """
        with open(file_name, 'w') as file_pointer:
            for _ in range(num_sentences):
                sen = self.generate_sentence()
                prob = self.get_sentence_probability(sen)
                string_generated = str(prob) + " " + " ".join(sen)
                print(string_generated, end="\n", file=file_pointer)

    def compute_perplexity(self, test_corpus):
        """Return the per-token perplexity of ``test_corpus``.

        The token count is the sum of ``len(sentence)``, so it includes the
        leading context token of every sentence.

        Returns:
            ``exp(-sum(log P(sentence)) / token_count)``, or ``math.inf``
            when at least one sentence has probability zero (expected for an
            unsmoothed model as soon as a test bigram was never seen).

        Raises:
            ZeroDivisionError: If ``test_corpus`` contains no tokens.
        """
        total_log_probability = 0.0
        total_words = 0

        for sentence in test_corpus:
            total_words += len(sentence)
            total_log_probability += self.get_sentence_log_probability(sentence)

        # Dropping zero-probability sentences while still counting their
        # tokens would report a misleadingly low perplexity.
        if total_log_probability == -math.inf:
            return math.inf

        return math.exp(-total_log_probability / total_words)

In [None]:
# Train the unsmoothed bigram model, write 20 sampled sentences to a file,
# and report the perplexity on both test sets.
model = UnsmoothedBigramModel()
model.train_model(trainCorpus)
model.generate_sentences_to_file('bigram output.txt',20)
print(model.compute_perplexity(posTestCorpus))
print(model.compute_perplexity(negTestCorpus))

1.0870673514925697
1.107624965768465


Smoothed Bigram

In [None]:
class SmoothedBigramModel:
    """Bigram language model smoothed by linear interpolation with a unigram model.

    A token is scored given its predecessor as
    ``lambda * P_bigram(token | prev) + (1 - lambda) * P_unigram(token)``
    where ``lambda`` is ``smoothing_parameter``.
    """

    def __init__(self, smoothing_parameter=0.5):
        self.vocab = set()
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.unigram_counts = defaultdict(int)
        self.total_count = 0
        self.smoothing_parameter = smoothing_parameter

    def train_model(self, corpus):
        """Accumulate bigram and unigram counts from ``corpus``.

        Empty sentences are skipped because they contribute no tokens.
        """
        for sentence in corpus:
            if not sentence:
                continue
            for i in range(len(sentence) - 1):
                word1, word2 = sentence[i], sentence[i + 1]
                self.bigram_counts[word1][word2] += 1
                self.unigram_counts[word1] += 1
                self.total_count += 1
            # The last token is never a bigram context; count it separately.
            self.unigram_counts[sentence[-1]] += 1
            self.total_count += 1
        self.vocab = set(self.unigram_counts.keys())

    def generate_sentence(self, max_length=20):
        """Sample a sentence by walking the bigram chain starting at "<s>".

        Successors are drawn in proportion to their bigram counts.

        Args:
            max_length: Sampling stops once the sentence holds more than this
                many tokens, even if "</s>" has not been drawn.

        Returns:
            A token list starting with "<s>"; it ends with "</s>" unless the
            length cap was hit first.
        """
        sentence = ["<s>"]
        while True:
            prev_word = sentence[-1]
            # .get avoids creating an empty entry for an unseen context.
            successors = self.bigram_counts.get(prev_word, {})
            next_word = random.choices(
                list(successors.keys()), weights=list(successors.values())
            )[0]
            sentence.append(next_word)
            if next_word == "</s>" or len(sentence) > max_length:
                break
        return sentence

    def _interpolated_probability(self, word1, word2):
        """Interpolated probability of ``word2`` after ``word1``.

        Returns 0.0 when ``word1`` was never seen in training. Counts are read
        with ``.get`` so that scoring never inserts zero-count entries.
        """
        unigram_count = self.unigram_counts.get(word1, 0)
        if unigram_count == 0:
            return 0.0

        bigram_count = self.bigram_counts.get(word1, {}).get(word2, 0)
        bigram_prob = bigram_count / unigram_count
        unigram_prob = self.unigram_counts.get(word2, 0) / self.total_count
        return (self.smoothing_parameter * bigram_prob
                + (1 - self.smoothing_parameter) * unigram_prob)

    def get_sentence_probability(self, sentence):
        """Return the product of the interpolated probabilities along ``sentence``.

        The first token is only used as context. Returns 0 as soon as a
        zero-probability step is met.
        """
        probability = 1.0
        for i in range(len(sentence) - 1):
            factor = self._interpolated_probability(sentence[i], sentence[i + 1])
            if factor == 0:
                return 0.0
            probability *= factor
        return probability

    def get_sentence_log_probability(self, sentence):
        """Return the natural-log probability of ``sentence``.

        Summing logs avoids floating-point underflow on long sentences.
        Returns ``-math.inf`` when any step has probability zero.
        """
        log_probability = 0.0
        for i in range(len(sentence) - 1):
            factor = self._interpolated_probability(sentence[i], sentence[i + 1])
            if factor == 0:
                return -math.inf
            log_probability += math.log(factor)
        return log_probability

    def generate_sentences_to_file(self, file_name, num_sentences):
        """Write ``num_sentences`` sampled sentences to ``file_name``.

        Each line holds the sentence probability followed by its tokens.
        """
        with open(file_name, 'w') as file_pointer:
            for _ in range(num_sentences):
                sen = self.generate_sentence()
                prob = self.get_sentence_probability(sen)
                string_generated = str(prob) + " " + " ".join(sen)
                print(string_generated, end="\n", file=file_pointer)

    def compute_perplexity(self, test_corpus):
        """Return the per-token perplexity of ``test_corpus``.

        The token count is the sum of ``len(sentence)``, so it includes the
        leading context token of every sentence.

        Returns:
            ``exp(-sum(log P(sentence)) / token_count)``, or ``math.inf``
            when at least one sentence has probability zero.

        Raises:
            ZeroDivisionError: If ``test_corpus`` contains no tokens.
        """
        total_log_probability = 0.0
        total_words = 0

        for sentence in test_corpus:
            total_words += len(sentence)
            total_log_probability += self.get_sentence_log_probability(sentence)

        # Dropping zero-probability sentences while still counting their
        # tokens would report a misleadingly low perplexity.
        if total_log_probability == -math.inf:
            return math.inf

        return math.exp(-total_log_probability / total_words)

In [None]:
# Train the interpolated bigram model with its default smoothing parameter,
# write 20 sampled sentences to a file, and report the perplexity on both
# test sets.
model = SmoothedBigramModel()
model.train_model(trainCorpus)
model.generate_sentences_to_file('smoothed bigram output.txt',20)
print(model.compute_perplexity(posTestCorpus))
print(model.compute_perplexity(negTestCorpus))

200.96697839328772
205.37508966186493
