# Statistical Natural Language Processing (WS 20/21)
## Exercise Sheet 1 - Manuel Hettich

Instructions: Run all the cells above the headline "Example function calls" to be able to use those examples below.

In [1]:
import string
import numpy as np

word_frequency_unigram = dict()
word_frequency_bigram = dict()
word_frequency_trigram = dict()

In [2]:
# Exercise 1a)
def process_line(line):
    # Remove whitespace at the end and beginning of the sentence
    line = line.strip()
    # Make sentence lowercase
    line = line.lower()
    # Remove punctuation (Source: https://stackoverflow.com/a/60725620)
    line = line.translate(str.maketrans('', '', string.punctuation))
    # Tokenize sentence into a list and add keywords for the beginning and the end of the sentence
    return ["[SOS]"] + line.split() + ["[EOS]"]

processed_corpus = []
with open("corpus.txt", "rt") as infile:
    for line in infile:
        # Add processed sentence to the list of all sentences in memory
        processed_corpus += process_line(line)

In [3]:
# Exercise 1b)
# Calculate frequency of each word and sequences of up to three words in the corpus
size_of_processed_corpus = len(processed_corpus)
for idx, word in enumerate(processed_corpus):
    # Unigram frequency
    if word in word_frequency_unigram:
        word_frequency_unigram[word] += 1
    else:
        word_frequency_unigram[word] = 1
    # Bigram frequency
    if idx <= size_of_processed_corpus - 2:
        if (word, processed_corpus[idx + 1]) in word_frequency_bigram:
            word_frequency_bigram[(word, processed_corpus[idx + 1])] += 1
        else:
            word_frequency_bigram[(word, processed_corpus[idx + 1])] = 1
    # Trigram frequency
    if idx <= size_of_processed_corpus - 3:
        if (word, processed_corpus[idx + 1], processed_corpus[idx + 2]) in word_frequency_trigram:
            word_frequency_trigram[(word, processed_corpus[idx + 1], processed_corpus[idx + 2])] += 1
        else:
            word_frequency_trigram[(word, processed_corpus[idx + 1], processed_corpus[idx + 2])] = 1

# Calculate unigram probabilities
def word_prob_unigram(word_i):
    # Calculate sum of all unigram word frequencies
    sum_of_frequencies = sum(word_frequency_unigram.values())
    return word_frequency_unigram[word_i] / sum_of_frequencies


# Calculate bigram probabilities
def word_prob_bigram(word_i, word_i1):
    if (word_i1, word_i) in word_frequency_bigram:
        return word_frequency_bigram[(word_i1, word_i)] / word_frequency_unigram[word_i1]
    else:
        return 0


# Calculate trigram probabilities
def word_prob_trigram(word_i, word_i1, word_i2):
    if (word_i2, word_i1, word_i) in word_frequency_trigram:
        return word_frequency_trigram[(word_i2, word_i1, word_i)] / word_frequency_bigram[(word_i2, word_i1)]
    else:
        return 0

#### Exercise 1c)

The numbers of parameters of these distributions scale with the number of different words N in the corpus accordingly:

Unigram Model: Linear scaling because we need to calculate a probability for each word in the vocabulary.

Bigram Model: # of parameters is proportional to N x N due to each possible combination of unique words although most of these will result in a 0 because most combination have never appeared in the corpus.

Trigram Model: # of parameters is proportional to N x N x N with the same reasoning as for the bigram model but only for each possible triplet taken from the vocabulary.

In [4]:
# Exercise 2a)
def sample_unigram(*args):
    if not args:
        all_words_prob_unigram = [word_prob_unigram(word) for word in word_frequency_unigram.keys()]
    else:
        all_words_prob_unigram = args[0]
    return np.random.choice([*word_frequency_unigram],
                            p=all_words_prob_unigram)


def sample_bigram(word_i1):
    matching_bigrams = [(w_i1, w_i)
                        for (w_i1, w_i)
                        in word_frequency_bigram
                        if w_i1 == word_i1]
    return np.random.choice([w_i for (_, w_i) in matching_bigrams],
                            p=[word_prob_bigram(w_i, word_i1)
                               for (_, w_i)
                               in matching_bigrams])


def sample_trigram(word_i2, word_i1):
    matching_trigrams = [(w_i2, w_i1, w_i)
                         for (w_i2, w_i1, w_i)
                         in word_frequency_trigram
                         if w_i2 == word_i2 and w_i1 == word_i1]
    return np.random.choice([w_i for (_, _, w_i) in matching_trigrams],
                            p=[word_prob_trigram(w_i, word_i1, word_i2)
                               for (_, _, w_i)
                               in matching_trigrams])

In [5]:
# Exercise 2b)
def gen_sentence_unigram():
    all_words_prob_unigram = [word_prob_unigram(word) for word in word_frequency_unigram.keys()]

    sentence = list()
    while True:
        word = sample_unigram(all_words_prob_unigram)
        if word == '[EOS]':
            del sentence[0]
            return ' '.join(sentence) + '.'
        else:
            sentence.append(word)


def gen_sentence_bigram():
    i = 1
    sentence = ['[SOS]']
    while True:
        word = sample_bigram(sentence[i - 1])
        if word == '[EOS]':
            del sentence[0]
            return ' '.join(sentence) + '.'
        else:
            sentence.append(word)
            i += 1


def gen_sentence_trigram():
    i = 1
    sentence = ['[SOS]']
    while True:
        if i >= 2:
            word = sample_trigram(sentence[i - 2], sentence[i - 1])
        if i == 1:
            word = sample_bigram(sentence[i - 1])
        if word == '[EOS]':
            del sentence[0]
            return ' '.join(sentence) + '.'
        else:
            sentence.append(word)
            i += 1

#### Exercise 2c)

The generated sentences based on the unigram model do not make any sense because this model generates each new word completely independent of its context. This approach does not provide a good basis for generating sentences.

The bigram model delivers sentences which look like some reasonable text in English but it's still hard to find a meaningful sentence in the output. The context with only looking at a single previous word is very limited.

The trigram model is sometimes able to produce meaningful sentences. The longer the sentences get, the more random they are and it gets harder to accept them as a correct sentence (as soon as we go beyond the first three words).

## Example function calls

### Word probabilities

In [6]:
# Calculate the probability for a single word
word_prob_unigram('the')

0.06420179970450697

In [7]:
# Calculate the probability for a word given a single previous word
# Here: probability for the word 'not' given the previous word 'does'
word_prob_bigram('not', 'does')

0.5212765957446809

In [8]:
# Calculate the probability for a word given the two previous words
# Here: probability for the word 'the' given the two previous words 'up to'
word_prob_trigram('the', 'to', 'up')

0.3081081081081081

### Samples

In [9]:
# Get a sample based on unigram model
# Caution: takes a long computation time due to the calculation of unigram probabilities for each word in the corpus
sample_unigram()

'kind'

In [10]:
# Get a sample based on bigram model (given a single previous word)
sample_bigram('in')

'this'

In [11]:
# Get a sample based on trigram model (given the two previous words)
# Here: Get a sample based on the two previous words 'up to'
sample_trigram('up', 'to')

'150000000'

### Sentence generators

In [12]:
# Generate a sentence based on unigram model
print(gen_sentence_unigram())

[SOS] the of farmhouse took.


In [13]:
# Generate a sentence based on bigram model
print(gen_sentence_bigram())

353 u s phosphor screens for such unity has 43 hole crosswise through contact with television in labor unions who was.


In [14]:
# Generate a sentence based on trigram model
print(gen_sentence_trigram())

we walked down the temptation to flng himself out of its body in an areawide effort better results can be obtained from the publisher for 1680 and 2800 respectively.
