# HW02: N-gram models

In [None]:
# Imports requiered libraries
import os

# Impports minidom for XML file processing
from lxml import etree

# Imports NLTK for tokenizing
import nltk

# Imports counter for term frequency
from collections import Counter

# Imports time function for performance measurements
from time import time

## Merging datasets into a single file

In [None]:
# Sets prefix where the dataset is
NEWS_PREFIX = "./resources/news/"

# Sets the news consolidate file
NEWS_CONSOLIDATE = "./resources/news_consolidate.txt"

# Counts the number of processed files
n = 0

# Sets the initial time
initial_time = time()

# Opens the consolidate file
with open(NEWS_CONSOLIDATE, 'wb') as consolidate:

    # Iterates over every folder in dataset
    for folder in os.listdir(NEWS_PREFIX):
        
        # Iterates over every file in subfolders
        for filename in os.listdir(NEWS_PREFIX + folder):
            
            # Opens file
            file = open(NEWS_PREFIX + folder + '/' + filename, 'rb').read()
            
            # Writes the file content into consolidate
            consolidate.write(file)
            
            # Increments number of processed files
            n += 1
            
# Calculates execution time
final_time = time() - initial_time

# Prints relevant information
print("Processed: {} files in {:4.2f} seconds".format(n, final_time))


In [None]:
# Sets prefix where the dataset is
BLOGS_PREFIX = "./resources/blogs/"

# Sets the blog consolidate file
BLOGS_CONSOLIDATE = "./resources/blogs_consolidate.txt"

# Counts the number of processed files
n = 0

# Sets the initial time
initial_time = time()

# Opens consolidate file
with open(BLOGS_CONSOLIDATE, 'w') as consolidate:

    # Iterates over files in dataset
    for filename in os.listdir(BLOGS_PREFIX):

        # Opens the file
        text = open(BLOGS_PREFIX + filename, 'r', encoding="latin-1").read()

        # Parses the file and retrieves the element tree
        parser = etree.XMLParser(recover=True)
        tree = etree.fromstring(text, parser=parser)
        
        # Writes content to consolidate file
        consolidate.write(tree[1].text)
        
        # Increments number of processed files
        n += 1
        
# Calculates execution time
final_time = time() - initial_time

# Prints relevant information
print("Processed: {} files in {:4.2f} seconds".format(n, final_time))


## Tokenize

In [None]:
def process_text(text):
    
    # Imports GENSIM PorterStemmer
    from gensim.parsing.porter import PorterStemmer
    
    # Imports regular expressions substitution
    from re import sub, match
    
    # Tokenizes text into sentences
    text = nltk.sent_tokenize(text)
    
    # Creates the stemmer instance
    stemmer = PorterStemmer()
    
    # List to store processed sentences
    processed_text = []
    
    # Iterates over sentences
    for sentence in text:
    
        # Sets text in lower case
        sentence = sentence.lower()
        
        # Replaces numbers
        sentence = sub(r'[0-9]+', 'NUM', sentence)
        
        # Stems text
        sentence = stemmer.stem_sentence(sentence)
        
        # Tokenizes text
        sentence = nltk.wordpunct_tokenize(sentence)
        
        # Removes punctuation marks
        sentence = [ token for token in sentence if not match(r'[^\w\s]', token) ]
        
        # Adds tags to sentence
        sentence.insert(0, '<s>')
        sentence.append("</s>")
        
        # Appends sentence to list
        processed_text.append(sentence)
    
    # Creates a single list with all tokens
    processed_text = [ token for sentence in processed_text for token in sentence ]
    
    # Counts token frequency
    token_freq = dict(Counter(processed_text))
    
    # Gets the unique tokens from dictionary
    unique_tokens = [ token for token in token_freq.keys() if token_freq[token] == 1 ]

    # Replaces unique tokens in corpus
    for i, token in enumerate(unique_tokens):
        processed_text[processed_text.index(token)] = "<UNK>"
        
    # Creates lists to store corpus and current sentence
    corpus = []
    sentence = []
        
    # Returns corpus to sentences
    for i, token in enumerate(processed_text):
        
        # Appends token to current sentence
        sentence.append(token)
        
        # Resets sentence if end-of-sentence token is found
        if token == "</s>":
            corpus.append(sentence)
            sentence = []
            
    # Returns corpus 
    return corpus

In [None]:
# Opens both datasets
news_text = open(NEWS_CONSOLIDATE, 'rb').read().decode('utf-8', 'ignore')
blogs_text = open(BLOGS_CONSOLIDATE, 'rb').read().decode('utf-8', 'ignore')

# Process news text files
initial_time = time()
news_processed_text = process_text(news_text)
final_time = time() - initial_time
print("Processed news text in {:4.2f} seconds".format(final_time))

# Process blogs text files
initial_time = time()
blogs_processed_text = process_text(blogs_text)
final_time = time() - initial_time
print("Processed blogs text in {:4.2f} seconds".format(final_time))

In [None]:
# Function to save dataset on disk
def save_dataset(data, filename):
    
    # Imports csv to store data in such format
    import csv

    # Opens target file to write
    with open(filename, 'w') as file:

        # Creates a writer for the file
        csv_writer = csv.writer(file)

        # Stores each sentence in file
        for sentence in data:
            csv_writer.writerow(sentence)


# Defines filenames for processed text
news_filename = './resources/20N_7_data.csv'
blogs_filename = './resources/BAC_7_data.csv'

# Stores processed data
save_dataset(news_processed_text, news_filename)
save_dataset(blogs_processed_text, blogs_filename)

## Splits data

In [None]:
# Function to open the processed datasets
def open_dataset(filename):
    
    # Imports csv to read data in such format
    import csv
    
    # Opens target file to read
    with open(filename, 'r') as file:
        
        # Creates a reader for the file
        csv_reader = csv.reader(file)
        
        # List to store the corpus of the collection
        corpus = []
        
        # Reads rows in file
        for row in csv_reader:
            corpus.append(list(row))
            
    # Returns text corpus
    return corpus


# Defines filenames for processed text
news_filename = './resources/20N_7_data.csv'
blogs_filename = './resources/BAC_7_data.csv'

# Reads corpus from CSV files
news_corpus = open_dataset(news_filename)
blogs_corpus = open_dataset(blogs_filename)

In [None]:
# Imports splitter for data
from sklearn.model_selection import train_test_split

# Splits news data
news_train_data, news_test_data = train_test_split(news_corpus, test_size=0.2)

# Splits blog data
blogs_train_data, blogs_test_data = train_test_split(blogs_corpus, test_size=0.2)

In [None]:
# Defines news dataset filenames
news_train_filename = "./resources/20N_7_training.csv"
news_test_filename = "./resources/20N_7_testing.csv"

# Defines blogs dataset filenames
blogs_train_filename = "./resources/BAC_7_training.csv"
blogs_test_filename = "./resources/BAC_7_testing.csv"

# Saves splitted news dataset
save_dataset(news_train_data, news_train_filename)
save_dataset(news_test_data, news_test_filename)

# Saves splitted blogs dataset
save_dataset(blogs_train_data, blogs_train_filename)
save_dataset(blogs_test_data, blogs_test_filename)

## N-Grams

In [1]:
# Function to open the processed datasets
def open_dataset(filename):
    
    # Imports csv to read data in such format
    import csv
    
    # Opens target file to read
    with open(filename, 'r') as file:
        
        # Creates a reader for the file
        csv_reader = csv.reader(file)
        
        # List to store the corpus of the collection
        corpus = []
        
        # Reads rows in file
        for row in csv_reader:
            corpus.append(list(row))
            
    # Returns text corpus
    return corpus

# Defines news dataset filenames
news_train_filename = "./resources/20N_7_training.csv"
news_test_filename = "./resources/20N_7_testing.csv"

# Defines blogs dataset filenames
blogs_train_filename = "./resources/BAC_7_training.csv"
blogs_test_filename = "./resources/BAC_7_testing.csv"

# Opens train datasets
news_train_data = open_dataset(news_train_filename)
blogs_train_data = open_dataset(blogs_train_filename)

# Opens test datasets
news_test_data = open_dataset(news_test_filename)
blogs_test_data = open_dataset(blogs_test_filename)

In [2]:
# Function to count unigrams
def count_unigrams(data):
    
    # Imports utility to create N-Grams
    from nltk.util import ngrams
    
    # Imports default dict from collections
    from collections import defaultdict
    
    # Creates a defaultdict to store unigrams count
    unigrams_count = defaultdict(lambda: 0)
    
    # Iterates over data
    for sentence in data:
        
        # Iterates over unigrams
        for w1, in ngrams(sentence, 1):
            
            # Increments count for given unigram
            unigrams_count[w1] += 1
            
    # Returns the count
    return unigrams_count


# Function to count bigrams
def count_bigrams(data):
    
    # Imports utility to create N-Grams
    from nltk.util import ngrams
    
    # Imports default dict from collections
    from collections import defaultdict
    
    # Defaultdict to store bigrams count
    bigrams_count = defaultdict(lambda: 0)
    
    # Iterates over data
    for sentence in data:
        
        # Iterates over bigrams
        for w1, w2 in ngrams(sentence, 2):
            
            # Increments count for given bigram
            bigrams_count[(w1, w2)] += 1
    
    # Returns the count
    return bigrams_count


# Function to count trigrams
def count_trigrams(data):
    
    # Imports utility to create N-Grams
    from nltk.util import ngrams
    
    # Imports default dict from collections
    from collections import defaultdict
    
    # Defaultdict to store bigrams count
    trigrams_count = defaultdict(lambda: 0)
    
    # Iterates over data
    for sentence in data:
        
        # Iterates over trigrams
        for w1, w2, w3 in ngrams(sentence, 3):
            
            # Increments count for given trigrams
            trigrams_count[(w1, w2, w3)] += 1
    
    # Returns the count
    return trigrams_count

In [82]:
# Function to get unigrams with add-1 smoothing
def laplace_unigrams(data):
    
    # Gets the counting for data
    unigrams_count = count_unigrams(data)
    
    # Calculates coefficient for OOV tokens
    coeff = 1.0/(sum(unigrams_count.values()) + len(unigrams_count.keys()))
    
    # Creates a dict for the unigrams
    unigrams = {}
    
    # Calculates probability using Laplace smoothing
    for unigram in unigrams_count.keys():
        
        # Calculates unigram count plus k
        upper = unigrams_count[unigram] + 1
        
        # Calculates word count plus Laplace term
        lower = sum(unigrams_count.values()) + len(unigrams_count.keys())
        
        # Calculates Laplace probability
        unigrams[unigram] = upper / lower
    
    # Returns unigrams model
    return unigrams, coeff


# Function to get bigrams with add-1 smoothing
def laplace_bigrams(data):
    
    # Gets counts for unigrams
    unigrams_count = count_unigrams(data)
    
    # Gets counts for bigrams
    bigrams_count = count_bigrams(data)
    
    # Calculates OOV coefficient
    coeff = 1.0/(sum(unigrams_count.values()) + len(unigrams_count.keys()))
    
    # Creates a dict to store bigrams
    bigrams = {}
    
    # Iterates over bigrams
    for w1, w2 in bigrams_count.keys():
        
        # Counts occurence of bigram 
        upper = bigrams_count[(w1, w2)] + 1

        # Calculates unigram count plus vocabulary size
        lower = unigrams_count[w1] + len(unigrams_count.keys())

        # Calculates conditional probability
        bigrams[(w1, w2)] = upper / lower
    
    # Returns bigrams model
    return bigrams, coeff


# Function to get trigrams with add-1 smoothing
def laplace_trigrams(data):
    
    # Gets counts for trigrams
    trigrams_count = count_trigrams(data)
    
    # Gets counts for bigrams
    bigrams_count = count_bigrams(data)
    
    # Gets count for unigrams
    unigrams_count = count_unigrams(data)
    
    # Calculates OOV coefficient
    coeff = 1.0/(sum(bigrams_count.values()) + len(unigrams_count.keys()))
    
    # Creates a dict to store trigrams
    trigrams = {}
    
    # Iterates over trigrams
    for w1, w2, w3 in trigrams_count.keys():
        
        # Counts occurence of trigram
        upper = trigrams_count[(w1, w2, w3)] + 1

        # Calculates bigram count plus vocabulary size
        lower = bigrams_count[(w1, w2)] + len(unigrams_count.keys())

        # Calculates conditional probability
        trigrams[(w1, w2, w2)] = upper / lower
    
    # Returns trigrams model
    return trigrams, coeff

In [151]:
def store_model(model, coeff, filename, n=1):
     
    from csv import writer
    
    with open(filename, 'w') as csv_file:
        
        csv_writer = writer(csv_file)
        
        csv_writer.writerow([coeff])
        
        for key in model.keys():
        
            if n == 1:
            
                row = [ key , model[key] ]
                
                csv_writer.writerow(row)
            
            else:
                
                row = list(key)
                
                row.append(model[key])
                
                csv_writer.writerow(row)


from collections import defaultdict
                
news_uni_dict, news_uni_coeff = laplace_unigrams(news_train_data)
news_unigram_model = defaultdict(lambda: news_uni_coeff, news_uni_dict)
news_unigram_model_filename = "./results/ngrams/models/20N_7_unigrams.csv"
store_model(news_uni_dict, news_uni_coeff, news_unigram_model_filename, n=1)
print("Finished 20N unigram model")

news_bi_dict, news_bi_coeff = laplace_bigrams(news_train_data)
news_bigram_model = defaultdict(lambda: news_bi_coeff, news_bi_dict)
news_bigram_model_filename = "./results/ngrams/models/20N_7_bigrams.csv"
store_model(news_bi_dict, news_bi_coeff, news_bigram_model_filename, n=2)
print("Finished 20N bigram model")


news_tri_dict, news_tri_coeff = laplace_trigrams(news_train_data)
news_trigram_model = defaultdict(lambda: news_tri_coeff, news_tri_dict)
news_trigram_model_filename = "./results/ngrams/models/20N_7_trigrams.csv"
store_model(news_tri_dict, news_tri_coeff, news_trigram_model_filename, n=3)
print("Finished 20N trigram model")

blogs_uni_dict, blogs_uni_coeff = laplace_unigrams(blogs_train_data)
blogs_unigram_model = defaultdict(lambda: blogs_uni_coeff, blogs_uni_dict)
blogs_unigram_model_filename = "./results/ngrams/models/BAC_7_unigrams.csv"
store_model(blogs_uni_dict, blogs_uni_coeff, blogs_unigram_model_filename, n=1)
print("Finished BAC unigram model")

blogs_bi_dict, blogs_bi_coeff = laplace_bigrams(blogs_train_data)
blogs_bigram_model = defaultdict(lambda: blogs_bi_coeff, blogs_bi_dict)
blogs_bigram_model_filename = "./results/ngrams/models/BAC_7_bigrams.csv"
store_model(blogs_bi_dict, blogs_bi_coeff, blogs_bigram_model_filename, n=2)
print("Finished BAC bigram model")


blogs_tri_dict, blogs_tri_coeff = laplace_trigrams(blogs_train_data)
blogs_trigram_model = defaultdict(lambda: blogs_tri_coeff, blogs_tri_dict)
blogs_trigram_model_filename = "./results/ngrams/models/BAC_7_trigrams.csv"
store_model(blogs_tri_dict, blogs_tri_coeff, news_trigram_model_filename, n=3)
print("Finished BAC trigram model")


Finished 20N unigram model
Finished 20N bigram model
Finished 20N trigram model
Finished BAC unigram model
Finished BAC bigram model
Finished BAC trigram model


In [159]:
# Opens model
def read_model(filename):
    
    import csv
    
    coeff = 0
    dictionary = {}
    
    with open(filename, 'r') as file:
        
        csv_reader = csv.reader(file)
        
        row_indx = 0
        
        for row in csv_reader:
            
            if row_indx == 0:
                coeff = float(row[0])
                row_indx += 1
                
            else:
                _len = len(row)
                value = float(row[_len-1])
                
                if _len == 2:
                    keys = row[0]
                    
                else:
                    keys = tuple(row[0:_len-1])
                    
                dictionary[keys] = value
                row_indx += 1
                
    return dictionary, coeff


from collections import defaultdict

# Opens news unigram dictionary as example
dictionary, coeff = read_model("./results/ngrams/models/20N_7_unigrams.csv")
news_unigram_model = defaultdict(lambda: coeff, dictionary)

In [93]:
# Calculates perplexity for a sentence and a model
def perplexity(sentence, model, n=1):
    
    # Imports function to generate ngrams
    from nltk.util import ngrams
    
    # List to store ngrams probabilities from sentence
    sentence_ngrams = []
    
    # Iterates over ngrams in sentence
    for n_gram in ngrams(sentence, n):
        
        # If unigrams
        if n == 1:
            
            # Appends ngram probability
            sentence_ngrams.append(model[n_gram[0]])
        
        # If not unigrams
        else:
            
            # Appends ngram probability
            sentence_ngrams.append(model[n_gram])
    
    # Calculates the number of ngrams
    N = len(sentence_ngrams)
    
    # Returns zero if no ngrams are found
    if N == 0:
        return 0
    
    # Imports methods to calculate log2 and power
    from numpy import log2, power
    
    # Calculates log2 for each ngram probability
    l = [ log2(item) for item in sentence_ngrams ]
    
    # Sums and averages the log2-probabilities
    l = sum(l) / N
    
    # Returns the perplexity calculation
    return power(2, -l)
    

In [140]:
# Calculates perplexity for news unigram model
perplexity_sum = 0
for sentence in news_test_data:
    perplexity_sum += perplexity(sentence, news_unigram_model, n=1)  
print("Perplexity: {:4.2f}".format(perplexity_sum / len(news_test_data)))

Perplexity: 1289.55


In [141]:
# Calculates perplexity for news bigram model
perplexity_sum = 0
for sentence in news_test_data:
    perplexity_sum += perplexity(sentence, news_bigram_model, n=2)
print("Perplexity: {:4.2f}".format(perplexity_sum / len(news_test_data)))

Perplexity: 16230.65


In [142]:
# Calculates perplexity for news trigram model
perplexity_sum = 0
for sentence in news_test_data:
    perplexity_sum += perplexity(sentence, news_trigram_model, n=3)   
print("Perplexity: {:4.2f}".format(perplexity_sum / len(news_test_data)))

Perplexity: 4588633.63


In [143]:
# Calculates perplexity for blogs unigram model
perplexity_sum = 0
for sentence in blogs_test_data:
    perplexity_sum += perplexity(sentence, blogs_unigram_model, n=1)
print("Perplexity: {:4.2f}".format(perplexity_sum / len(blogs_test_data)))

Perplexity: 806.15


In [144]:
# Calculates perplexity for blogs bigram model
perplexity_sum = 0
for sentence in blogs_test_data:
    perplexity_sum += perplexity(sentence, blogs_bigram_model, n=2)
print("Perplexity: {:4.2f}".format(perplexity_sum / len(blogs_test_data)))

Perplexity: 18975.60


In [145]:
# Calculates perplexity for blogs trigram model
perplexity_sum = 0
for sentence in blogs_test_data:
    perplexity_sum += perplexity(sentence, blogs_trigram_model, n=3)
print("Perplexity: {:4.2f}".format(perplexity_sum / len(blogs_test_data)))

Perplexity: 4846085.30


In [88]:
# Prints perplexity for ten sentences and three models
for i in range(10):
    print(news_test_data[i])
    print("Unigram perplexity: {:6.2f}".format(perplexity(news_test_data[i], news_unigram_model, n=1)))
    print("Bigram perplexity: {:6.2f}".format(perplexity(news_test_data[i], news_bigram_model, n=2)))
    print("Trigram perplexity: {:6.2f}".format(perplexity(news_test_data[i], news_trigram_model, n=3)))

['<s>', 'i', 'am', 'consid', 'the', 'mx', 'num', 'probe', 'accord', 'corolla', 'and', 'the', 'numsx', '</s>']
Unigram perplexity: 712.25
Bigram perplexity: 2172.43
Trigram perplexity: 4828531.00
['<s>', 'from', 'gsnum', 'prism', 'gatech', 'edu', 'glenn', 'r', 'stone', 'subject', 're', 'atf', 'burn', 'dividian', 'ranch', '</s>']
Unigram perplexity: 2828.66
Bigram perplexity: 1044.94
Trigram perplexity: 4828531.00
['<s>', 'in', 'other', 'words', 'thei', 'let', 'the', 'best', 'honda', 'play', 'but', 'not', 'the', 'best', 'saturn', '</s>']
Unigram perplexity: 642.48
Bigram perplexity: 1565.45
Trigram perplexity: 4828531.00
['<s>', 'he', 'wa', 'on', 'of', 'the', 'lead', 'scorer', 'on', 'a', 'mediocr', 'team', 'when', 'he', 'wa', 'trade', 'awai', 'in', 'num', '</s>']
Unigram perplexity: 458.12
Bigram perplexity: 761.32
Trigram perplexity: 4828531.00
['<s>', 'pagliarulo', 'mike', 'num', 'num', 'num', 'num', 'num', 'thi', 'is', 'an', 'interest', 'line', '</s>']
Unigram perplexity: 234.76
Bigra

In [110]:
# Generates a linear interpolation model
def linear_ngram(l_a, l_b, unigram, bigram):
    linear = {}
    for w1, w2 in bigram.keys():
        linear[(w1, w2)] = l_a * bigram[(w1, w2)] + l_b * unigram[w1]
    return linear

In [133]:
# Calculates the best model using SGD 
def sgd_linear_ngram(validation, unigram, bigram, n=30, alpha=1e-3):
    
    lambdas = [0.1, 0.9]
    
    P_prev = 0
    
    for indx in range(n):
        
        linear = linear_ngram(lambdas[0], lambdas[1], unigram, bigram)
        
        p_sum = 0
        
        for sentence in validation:
            
            p_sum += perplexity(sentence, linear, n=2)
            
        P = p_sum / len(validation)
            
        dP = P - P_prev
        P_prev = P
        
        lambdas[0] -= alpha * dP
        lambdas[1] -= alpha * dP
        
        sum_l = sum(lambdas)
        
        lambdas[0] /= sum_l
        lambdas[1] /= sum_l
    
    return linear, lambdas

In [134]:
# Calculates the best linear model for news dataset
news_linear, news_lambdas = sgd_linear_ngram(news_test_data, news_unigram_model, news_bigram_model)

In [135]:
# Calculates the best linear model for blogs dataset
blogs_linear, blogs_lambdas = sgd_linear_ngram(blogs_test_data, blogs_unigram_model, blogs_bigram_model)

In [138]:
news_lambdas

[0.43810584607794384, 0.5618941539220561]

In [139]:
blogs_lambdas

[0.39353838332215585, 0.6064616166778443]

In [146]:
# Calculates perplexity for news linear model
perplexity_sum = 0
for sentence in news_test_data:
    perplexity_sum += perplexity(sentence, news_linear, n=2)
print("Perplexity: {:4.2f}".format(perplexity_sum / len(news_test_data)))

Perplexity: 1258.02


In [148]:
# Calculates perplexity for blogs linear model
perplexity_sum = 0
for sentence in blogs_test_data:
    perplexity_sum += perplexity(sentence, blogs_linear, n=2)
print("Perplexity: {:4.2f}".format(perplexity_sum / len(blogs_test_data)))

Perplexity: 967.98
