# `521H0489 - Hồ Hữu An`

# `Unigram - Bigram`

In [2]:
import re
import math

UNK = None
SENTENCE_START = "<s>"
SENTENCE_END = "</s>"

In [3]:
def read_sentences_from_file(file_path):
    with open(file_path, "r") as f:
        return [re.split("\s+", line.rstrip('\n')) for line in f]

# `Unigram model`

In [4]:
class UnigramLanguageModel:
    def __init__(self, sentences, smoothing=False):
        self.unigram_frequencies = dict()
        self.corpus_length = 0
        for sentence in sentences:
            for word in sentence:
                self.unigram_frequencies[word] = self.unigram_frequencies.get(word, 0) + 1
                if word != SENTENCE_START and word != SENTENCE_END:
                    self.corpus_length += 1
        # subtract 2 because unigram_frequencies dictionary contains values for SENTENCE_START and SENTENCE_END
        self.unique_words = len(self.unigram_frequencies) - 2
        self.smoothing = smoothing

    def calculate_unigram_probability(self, word):
            word_probability_numerator = self.unigram_frequencies.get(word, 0)
            word_probability_denominator = self.corpus_length
            if self.smoothing:
                word_probability_numerator += 1
                # add one more to total number of seen unique words for UNK - unseen events
                word_probability_denominator += self.unique_words + 1
            return float(word_probability_numerator) / float(word_probability_denominator)

    def calculate_sentence_probability(self, sentence, normalize_probability=True):
        sentence_probability_log_sum = 0
        for word in sentence:
            if word != SENTENCE_START and word != SENTENCE_END:
                word_probability = self.calculate_unigram_probability(word)
                sentence_probability_log_sum += math.log(word_probability, 2)
        return math.pow(2, sentence_probability_log_sum) if normalize_probability else sentence_probability_log_sum                

    def sorted_vocabulary(self):
        full_vocab = list(self.unigram_frequencies.keys())
        full_vocab.remove(SENTENCE_START)
        full_vocab.remove(SENTENCE_END)
        full_vocab.sort()
        full_vocab.append(UNK)
        full_vocab.append(SENTENCE_START)
        full_vocab.append(SENTENCE_END)
        return full_vocab

# `Bigram model`

In [5]:
class BigramLanguageModel(UnigramLanguageModel):
    def __init__(self, sentences, smoothing=False):
        UnigramLanguageModel.__init__(self, sentences, smoothing)
        self.bigram_frequencies = dict()
        self.unique_bigrams = set()
        for sentence in sentences:
            previous_word = None
            for word in sentence:
                if previous_word != None:
                    self.bigram_frequencies[(previous_word, word)] = self.bigram_frequencies.get((previous_word, word),
                                                                                                 0) + 1
                    if previous_word != SENTENCE_START and word != SENTENCE_END:
                        self.unique_bigrams.add((previous_word, word))
                previous_word = word
        # we subtracted two for the Unigram model as the unigram_frequencies dictionary
        # contains values for SENTENCE_START and SENTENCE_END but these need to be included in Bigram
        self.unique__bigram_words = len(self.unigram_frequencies)

    def calculate_bigram_probabilty(self, previous_word, word):
        bigram_word_probability_numerator = self.bigram_frequencies.get((previous_word, word), 0)
        bigram_word_probability_denominator = self.unigram_frequencies.get(previous_word, 0)
        if self.smoothing:
            bigram_word_probability_numerator += 1
            bigram_word_probability_denominator += self.unique__bigram_words
        return 0.0 if bigram_word_probability_numerator == 0 or bigram_word_probability_denominator == 0 else float(
            bigram_word_probability_numerator) / float(bigram_word_probability_denominator)

    def calculate_bigram_sentence_probability(self, sentence, normalize_probability=True):
        bigram_sentence_probability_log_sum = 0
        previous_word = None
        for word in sentence:
            if previous_word != None:
                bigram_word_probability = self.calculate_bigram_probabilty(previous_word, word)
                bigram_sentence_probability_log_sum += math.log(bigram_word_probability, 2)
            previous_word = word
        return math.pow(2,
                        bigram_sentence_probability_log_sum) if normalize_probability else bigram_sentence_probability_log_sum

In [6]:
# calculate number of unigrams & bigrams
def calculate_number_of_unigrams(sentences):
    unigram_count = 0
    for sentence in sentences:
        # remove two for <s> and </s>
        unigram_count += len(sentence) - 2
    return unigram_count

In [7]:
def calculate_number_of_bigrams(sentences):
        bigram_count = 0
        for sentence in sentences:
            # remove one for number of bigrams in sentence
            bigram_count += len(sentence) - 1
        return bigram_count

# `Unigram and Bigram probs`

In [8]:
def print_unigram_probs(sorted_vocab_keys, model):
    for vocab_key in sorted_vocab_keys:
        if vocab_key != SENTENCE_START and vocab_key != SENTENCE_END:
            print("{}: {}".format(vocab_key if vocab_key != UNK else "UNK",
                                       model.calculate_unigram_probability(vocab_key)), end=" ")
    print("")

In [9]:
def print_bigram_probs(sorted_vocab_keys, model):
    print("\t\t", end="")
    for vocab_key in sorted_vocab_keys:
        if vocab_key != SENTENCE_START:
            print(vocab_key if vocab_key != UNK else "UNK", end="\t\t")
    print("")
    for vocab_key in sorted_vocab_keys:
        if vocab_key != SENTENCE_END:
            print(vocab_key if vocab_key != UNK else "UNK", end="\t\t")
            for vocab_key_second in sorted_vocab_keys:
                if vocab_key_second != SENTENCE_START:
                    print("{0:.5f}".format(model.calculate_bigram_probabilty(vocab_key, vocab_key_second)), end="\t\t")
            print("")
    print("")

# `Calculate perplexty`

In [10]:
def calculate_unigram_perplexity(model, sentences):
    unigram_count = calculate_number_of_unigrams(sentences)
    sentence_probability_log_sum = 0
    for sentence in sentences:
        try:
            sentence_probability_log_sum -= math.log(model.calculate_sentence_probability(sentence), 2)
        except:
            sentence_probability_log_sum -= float('inf')
    return math.pow(2, sentence_probability_log_sum / unigram_count)

In [11]:
def calculate_bigram_perplexity(model, sentences):
    number_of_bigrams = calculate_number_of_bigrams(sentences)
    bigram_sentence_probability_log_sum = 0
    for sentence in sentences:
        try:
            bigram_sentence_probability_log_sum -= math.log(model.calculate_bigram_sentence_probability(sentence), 2)
        except:
            bigram_sentence_probability_log_sum -= float('inf')
    return math.pow(2, bigram_sentence_probability_log_sum / number_of_bigrams)

# `Run`

## `Sample data`

In [12]:
# toy_dataset = read_sentences_from_file("./sampledata.txt")
# toy_dataset_test = read_sentences_from_file("./sampletest.txt")

# toy_dataset_model_unsmoothed = BigramLanguageModel(toy_dataset)
# toy_dataset_model_smoothed = BigramLanguageModel(toy_dataset, smoothing=True)

# sorted_vocab_keys = toy_dataset_model_unsmoothed.sorted_vocabulary()

# print("---------------- Sample dataset ---------------\n")
# print("=== UNIGRAM MODEL ===")
# print("- Unsmoothed  -")
# print_unigram_probs(sorted_vocab_keys, toy_dataset_model_unsmoothed)
# print("\n- Smoothed  -")
# print_unigram_probs(sorted_vocab_keys, toy_dataset_model_smoothed)

# print("")

# print("=== BIGRAM MODEL ===")
# print("- Unsmoothed  -")
# print_bigram_probs(sorted_vocab_keys, toy_dataset_model_unsmoothed)
# print("- Smoothed  -")
# print_bigram_probs(sorted_vocab_keys, toy_dataset_model_smoothed)

# print("")

# print("== SENTENCE PROBABILITIES == ")
# longest_sentence_len = max([len(" ".join(sentence)) for sentence in toy_dataset_test]) + 5
# print("sent", " " * (longest_sentence_len - len("sent") - 2), "uprob\t\tbiprob")
# for sentence in toy_dataset_test:
#     sentence_string = " ".join(sentence)
#     print(sentence_string, end=" " * (longest_sentence_len - len(sentence_string)))
#     print("{0:.5f}".format(toy_dataset_model_smoothed.calculate_sentence_probability(sentence)), end="\t\t")
#     print("{0:.5f}".format(toy_dataset_model_smoothed.calculate_bigram_sentence_probability(sentence)))        
    
# print("")

# print("== TEST PERPLEXITY == ")
# print("unigram: ", calculate_unigram_perplexity(toy_dataset_model_smoothed, toy_dataset_test))
# print("bigram: ", calculate_bigram_perplexity(toy_dataset_model_smoothed, toy_dataset_test))

# print("")

## `Actual data`

In [20]:
actual_dataset = read_sentences_from_file("train.txt")
actual_dataset_test = read_sentences_from_file("test.txt")

uigramLanguageModel= UnigramLanguageModel(actual_dataset)
bigramLanguageModel= BigramLanguageModel(actual_dataset)

In [21]:
print("PERPLEXITY of train.txt")
print("unigram: ", calculate_unigram_perplexity(uigramLanguageModel, actual_dataset))
print("bigram: ", calculate_bigram_perplexity(bigramLanguageModel, actual_dataset))

PERPLEXITY of train.txt
unigram:  753.8242377461315
bigram:  16.05992418507916


In [22]:
print("PERPLEXITY of test.txt")
print("unigram: ", calculate_unigram_perplexity(uigramLanguageModel, actual_dataset_test))
print("bigram: ", calculate_bigram_perplexity(bigramLanguageModel, actual_dataset_test))

PERPLEXITY of test.txt
unigram:  0.0
bigram:  0.0


In [19]:
print("PROBABILITY")
print(uigramLanguageModel.calculate_sentence_probability(actual_dataset_test[0]))
print(bigramLanguageModel.calculate_sentence_probability(actual_dataset_test[0]))

PROBABILITY
1.6638831424153624e-83
1.6638831424153624e-83


In [17]:
# print(actual_dataset_test[0])

['<s>', 'the', 'website', 'and', 'monthly', 'newsletter', 'is', 'run', 'by', 'a', 'sub-committee', 'that', 'is', 'independent', 'to', 'the', 'parish', 'council', 'and', 'is', 'financed', 'through', 'selling', 'advertisement', 'space', 'to', 'local', 'businesses', '</s>']


In [24]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable

DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330
ERROR: Could not install packages due to an OSError: HTTPSConnectionPool(host='files.pythonhosted.org', port=443): Max retries exceeded with url: /packages/5a/d7/f7f93c41fde5b8c1f9d52cc0f9a104a56eca13dc6876c6d2f967ddef88d7/fst-pso-1.8.1.tar.gz (Caused by ConnectTimeoutError(<pip._vendor.urllib3.connection.HTTPSConnection object at 0x0000022BACF22150>, 'Connection to files.pythonhosted.org timed out. (connect timeout=15)'))




Collecting FuzzyTM>=0.4.0 (from gensim)
  Using cached FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim)
  Using cached pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)
Collecting scipy>=1.7.0 (from gensim)
  Using cached scipy-1.10.1-cp311-cp311-win_amd64.whl.metadata (58 kB)
Collecting numpy>=1.18.5 (from gensim)
  Using cached numpy-1.24.4-cp311-cp311-win_amd64.whl.metadata (5.6 kB)
Collecting simpful==2.12.0 (from pyfume->FuzzyTM>=0.4.0->gensim)
  Using cached simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso==1.8.1 (from pyfume->FuzzyTM>=0.4.0->gensim)


In [25]:
import gensim.downloader as api

# check available models and datasets
info_datasets = api.info()
print(info_datasets)
#>{'corpora': 
#> {'semeval-2016-2017-task3-subtaskBC': 
#>	 {'num_records': -1, 'record_format': 'dict', 'file_size': 6344358, ....}

# information of a particular dataset
dataset_info = api.info("text8")

# load the "text8" dataset
dataset = api.load("text8")

# load a pre-trained model
word2vec_model = api.load('word2vec-google-news-300')


ValueError: unable to read local cache 'C:\\Users\\An/gensim-data\\information.json' during fallback, connect to the Internet and retry