# Assignment 3

### Imports

In [178]:
import math
from collections import Counter
import numpy as np
import pandas as pd

### Preprocessing

In [179]:
def preProcess(file_in, file_out, train_words):
    read = open(file_in, "r")    # Open the input file in read mode
    
    text = ""
    
    # Loop through each sentence in the input file, convert to lowercase and pad
    for sentence in read.readlines():
        text += " <s> " + sentence.lower() + " </s> "
        
   
    text = text.split()         # Split the processed text into individual words
    
    words = {}
    
    # Count the frequency of each word in the text
    for word in text:
        if word in words:
            words[word] += 1
        else:
            words[word] = 1
            
    # If train_words is None, replace single occurrence words with <unk>
    if train_words is None:
        for i in range(len(text)):
            if words[text[i]] == 1:
                text[i] = "<unk>"
    else: 
        # If train_words is provided, replace unseen words with <unk>
        for i in range(len(text)):
            if text[i] not in train_words:
                text[i] = "<unk>"
                
    # Count the frequency of <unk> words in the text
    unk_words = {}
    for word in text:
        if word in unk_words:
            unk_words[word] += 1
        else:
            unk_words[word] = 1
    
    out = open(file_out, "w")         # Open the output file in write mode
    txt = ' '.join(text)              # Convert the list of words back to text
    out.write(txt)                    # Write the processed text to the output file
    out.close()
    
    return [words, unk_words, text]


### Preprocess the various txt files

In [180]:
[train_count, train_unk_count, train_text] = preProcess("brown.train.txt", "trainOut.txt", None)
[test_count, test_unk_count, test_text] = preProcess("brown.test.txt", "testOut.txt", train_unk_count)
[dev_count, dev_unk_count, dev_text] = preProcess("brown.dev.txt", "devOut.txt", train_unk_count)

## Question 1

In [181]:
print('Number of unique words ' + str(len(train_unk_count)))

Number of unique words 24796


## Question 2

In [182]:
print('Number of word tokens ' + str(sum(train_unk_count.values())))

Number of word tokens 1018784


### Unigram Model

In [183]:
def unigram(train_dict):
    unigram = train_dict.copy()
    unigram.pop('<s>', None)
    n = sum(unigram.values())
    
    unigram_result = {}
    for word, count in unigram.items():
        unigram_result[word] = count / n
    
    return unigram_result


### Bigram Model

In [184]:
# Function to generate a count matrix from the training text based on bigram counts
def count_matrix(train_text, word_freq_dict, add_one):
    # Extract the unique vocabulary words from the word frequency dictionary
    vocabulary = list(set(word_freq_dict))

    # Generate bigrams from the training text
    bigrams = generate_bigrams(train_text)
    
    # Count the occurrences of each bigram
    bigram_counts = Counter(bigrams)

    # Create column index dictionary for the vocabulary words
    col_index = {}
    for i, word in enumerate(vocabulary):
        col_index[word] = i

    # Create row index dictionary, which is a copy of the column index
    row_index = col_index.copy()

    # Get the number of rows and columns for the count matrix
    nrow = len(row_index)
    ncol = len(col_index)
    
    # Initialize a count matrix with zeros
    count_matrix = np.zeros((nrow, ncol))

    # Populate the count matrix with bigram counts
    for bigram, count in bigram_counts.items():
        prev_word, curr_word = bigram
        i = row_index[prev_word]
        j = col_index[curr_word]
        count_matrix[i, j] = count

    # Add-one smoothing if required
    if add_one:
        for i in range(nrow):
            for j in range(ncol):
                count_matrix[i, j] += 1

    # Convert the count matrix into a DataFrame with row and column indices
    count_matrix = pd.DataFrame(count_matrix, index=row_index, columns=vocabulary)
    return count_matrix


In [185]:
def bigram(train_text, word_freq_dict, add_one_smoothing):
    count_matrix = make_count_matrix(train_text, word_freq_dict, add_one_smoothing)
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    return prob_matrix

### Word Percentage

In [186]:
def perc_words(train, test):
    # Initialize variables to store counts and percentages
    sum_tokens = 0
    num_types = 0
    test_tokens = sum(test.values())
    test_type = len(test)
    
    # Count the number of unseen words in the test set
    for word in test:
        if word not in train:
            sum_tokens += test[word]
            num_types += 1
            
    # Calculate percentages
    types_perc = num_types / test_type * 100
    tokens_perc = sum_tokens / test_tokens * 100
    
    # Print the percentages
    print('Percentage of word types ' + str(types_perc))
    print('Percentage of word tokens ' + str(tokens_perc))
    
    return


## Question 3

### Test

In [187]:
perc_words(train_count, test_count)

Percentage of word types 15.623797922810617
Percentage of word tokens 1.9957318074153982


### Dev

In [188]:
perc_words(train_count, dev_count)

Percentage of word types 16.283174683225116
Percentage of word tokens 2.0942205705880075


## Training

In [189]:
unigram = unigram(train_unk_count)
bigram = bigram(train_text, train_unk_count, False)
#bigram_add1 = bigram(train_text, train_unk_count, True)

In [190]:
print(sum(train_unk_count.values()))

1018784


### Set-up scentences

In [191]:
end_symbol = '</s>'

scentences = ["He was laughed off the screen . ",
             "There was no compulsion behind them . ",
             "I look forward to hearing your reply . "]

i =0

# Pad scentences
scentences [0] = scentences[0].lower() + '</s>'
scentences [1] = scentences[1].lower() + '</s>'
scentences [2] = scentences[2].lower() + '</s>'


# Add them to a list
words = []
for sentence in scentences:
    for word in scentence.split():
        words.append(word)
        
# <unk> words
for i in range(len(words)):
    if words[i] not in unigram:
        words[i] = '<unk>'

### Unigram Log Probability

In [192]:
subset = {}
for word in words:
    subset[word] = unigram[word]

In [193]:
# Compute log probabilities of words
log_prob = {}
for word, count in subset.items():
    log_prob[word] = math.log(count, 2)

In [194]:
# Sum the log probabilities
sum_log_prob = 0
for word in words:
    sum_log_prob += log_prob[word]

### Question 4a 

In [211]:
avg_log_prob = sum_log_prob/ len(words)
print('Unigram Log Probability ' + str(sum_log_prob))

Unigram Log Probability -9.218361943610052


### Question 5a

In [196]:
perplexity = 2 ** (-avg_log_prob)
print('Unigram perplexity ' + str(perplexity))

Unigram perplexity 433.71684018402664


In [201]:
def perplex_uni(text, unigram):
    # Create a subset dictionary to store log probabilities of each word in the text
    subset = {}
    
    # Iterate over each word in the text
    for word in text:
        # Exclude the start symbol "<s>" from consideration
        if word != "<s>":
            # Calculate and store the log probability of the word using the unigram model
            subset[word] = math.log(unigram[word], 2)

    # Initialize variables to calculate the average log probability
    sum_log_prob = 0 
    m = 0  
    
    # Iterate over each word in the text, exclude symbols and accumulate log probs
    for word in text:
        if word != "<s>":
            sum_log_prob += subset[word]
            m += 1

    # Calculate the average log probability
    avg_log_prob = sum_log_prob / m
    
    # Calculate perplexity using the average log probability
    perplexity = 2 ** (-avg_log_prob)
    
    return perplexity

### Question 6a

In [202]:
print('Test unigram perplexiy ' + str(perplex_uni(test_text, unigram)))
print('Dev unigram perplexiy ' + str(perplex_uni(dev_text, unigram)))

Test unigram perplexiy 696.3522405243207
Dev unigram perplexiy 692.1274970283


### Bigram Log Probability/Perplexity

In [203]:
for i in range(len(scentences)):
    if scentences[i] not in bigram:
        scentences[i] = '<unk>'

In [205]:
subset = set()
sum_log_prob = 0
has_zero = False

# Iterate over the indices of the sentences list, excluding the last index
for i in range(len(sentences) - 1):
    # Add the current bigram (current word and the next word) along with its count to the subset set
    subset.add((sentences[i], sentences[i + 1], bigram[sentences[i]][sentences[i + 1]]))
    
    # Check if the count of the current bigram is zero
    if bigram[sentences[i]][sentences[i + 1]] == 0:
        # Set the flag indicating that at least one bigram has a zero count
        has_zero = True
    else:
        # Accumulate the log probability of the current bigram
        sum_log_prob += math.log(bigram[sentences[i]][sentences[i + 1]], 2)


In [206]:
log_prob = set()

# Iterate over each bigram in the subset set
for bigram in subset:
    # Calculate the log probability of the current bigram and add it to the log_prob set
    log_prob.add((bigram[0], bigram[1], math.log(bigram[2], 2)))


In [207]:
m = len(scentences)
avg_log_prob = sum_log_prob / m

### Question 4b

In [208]:
print('Bigram average Log Probability ' + str(sum_log_prob))

Bigram average Log Probability -9.218361943610052


### Question 5b

In [209]:
bigram_perplexity = 2 ** -avg_log_prob

In [210]:
print('Bigram perplexity ' + str(bigram_perplexity))

Bigram perplexity 8.41397373458845
