In [1]:
import pandas as pd
import math
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from tokenizers import CustomTokenizer

## Import dataset

In [2]:
df = pd.read_csv('../data/dataset.csv', index_col=0)

## Tokenize

In [3]:
tokenizer = CustomTokenizer(remove_stopwords=False, to_lower=True)

In [4]:
sample_df = df.sample(20000, random_state=42)
train_df = sample_df.sample(frac=0.8, random_state=42)
test_df = sample_df.drop(train_df.index)

train_list = list(train_df['text'])
train_string = " ".join(train_list)
train_tokens = tokenizer.encode(train_string)

test_list = list(test_df['text'])
test_string = " ".join(test_list)
test_tokens = tokenizer.encode(test_string)

vocab = set(train_tokens)

## N-grams

In [5]:
def count_ngrams(tokens, n):
    """Counts n-grams."""
    
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    
    return Counter(ngrams)

def calculate_ngram_probabilities(train_tokens, n, test_tokens, k=0.00001):
    """Calculates n-gram probabilities."""
    
    vocab = set(train_tokens)
    V = len(vocab)
    ngram_counts = count_ngrams(train_tokens, n)
    n_minus_one_gram_counts = count_ngrams(train_tokens, n-1)
    ngram_probabilities = defaultdict(float)
    
    for ngram in ngram_counts:
        prefix = ngram[:-1]
        ngram_counts[ngram] += k
        n_minus_one_gram_counts[prefix] += k
        ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)

    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        if ngram not in ngram_counts:
            ngram_counts[ngram] = k
            prefix = ngram[:-1]
            if prefix not in n_minus_one_gram_counts:
                n_minus_one_gram_counts[prefix] = k
            ngram_probabilities[ngram] = (ngram_counts[ngram] + k) / (n_minus_one_gram_counts[prefix] + k*V)
    
    return ngram_probabilities

In [6]:
n = 5

ngram_prob = calculate_ngram_probabilities(train_tokens, n, test_tokens)
print(f"Number of {n}-grams:", len(ngram_prob))

Number of 5-grams: 15686589


In [7]:
ngram_prob

defaultdict(float,
            {(82, 2357, 2221, 7138, 21137): 0.8151469083824795,
             (2357, 2221, 7138, 21137, 1754): 0.4075775298850247,
             (2221, 7138, 21137, 1754, 374): 0.6879751233170746,
             (7138, 21137, 1754, 374, 264): 0.8686721276823693,
             (21137, 1754, 374, 264, 7138): 0.8686721276823693,
             (1754, 374, 264, 7138, 21137): 0.8686721276823693,
             (374, 264, 7138, 21137, 304): 0.4648620840867923,
             (264, 7138, 21137, 304, 18671): 0.2895612366333968,
             (7138, 21137, 304, 18671, 709): 0.6879751233170746,
             (21137, 304, 18671, 709, 1725): 0.6879751233170746,
             (304, 18671, 709, 1725, 6181): 0.6879751233170746,
             (18671, 709, 1725, 6181, 14189): 0.6879751233170746,
             (709, 1725, 6181, 14189, 304): 0.09566301273153575,
             (1725, 6181, 14189, 304, 2064): 0.6879751233170746,
             (6181, 14189, 304, 2064, 14466): 0.6879751233170746,
          

In [8]:
print('These are the top 20 n-grams:')
top = sorted(ngram_prob.items(), key=lambda item: item[1], reverse=True)[:20]
[(tokenizer.decode(token), prob) for token, prob in top]

These are the top 20 n-grams:


[('\nvideo games developed in', 0.9994158697731275),
 (' of birth missing (living', 0.999265450841915),
 (' birth missing (living people', 0.999265450841915),
 ('\ntaxa named by', 0.9990520501289184),
 (' films\nfilms directed by', 0.9987855250654051),
 ('\nplants described in ', 0.9986908779724798),
 ('see also\n list of', 0.9986479499979684),
 ('\namerican black-and-white', 0.998643907433848),
 ('american black-and-white films', 0.998643907433848),
 ('\n\nreferences\n\nexternal links', 0.9985471807358023),
 ('\nafrican-american', 0.9984904489568888),
 (' singers\n21st-century', 0.9984111251437493),
 ('references\n\nbibliography', 0.9984111251437493),
 ('\ngrevillea', 0.9983653120772603),
 ('see also\nlist of', 0.9983230023548959),
 ('="background:#fbb', 0.9982849541414774),
 ('\ntaxonomy articles created by', 0.9982245305869086),
 ('taxonomy articles created by pol', 0.9982245305869086),
 (' articles created by polbot', 0.9982245305869086),
 ('| style="text-align', 0.9981962872907312

## Perplexity

In [9]:
def calculate_perplexity(test_tokens, ngram_probabilities, n):
    """Calculates the perplexity of a test corpus given n-gram probabilities."""
    log_probability_sum = 0
    ngram_count = 0
    
    for i in range(len(test_tokens)-n+1):
        ngram = tuple(test_tokens[i:i+n])
        log_probability_sum += math.log2(ngram_probabilities[ngram])
        ngram_count += 1
    
    average_log_probability = -log_probability_sum / ngram_count
    perplexity = math.pow(2, average_log_probability)
    
    return perplexity

In [10]:
calculate_perplexity(train_tokens, ngram_prob, n)

1.9444046574889686

In [11]:
calculate_perplexity(test_tokens, ngram_prob, n)

9444.029206481004

In [12]:
def greedy_sampling(context, vocab, ngram_probabilities, n, max_length = 50):
    
    sentence = []

    if len(context) < (n-1):
        print("len(context) < n")
        return sentence

    context = context[-(n-1):]
    
    for i in range(max_length):

        probs = dict()
        
        for v in vocab:

            ngram = list(context)
            ngram.append(v)
            ngram = tuple(ngram)
            probs[v] = ngram_probabilities[ngram]

        best_token = max(probs, key=probs.get) # greedy 
        
        if probs[best_token] == 0:
            print("prob = 0")
            return sentence
            
        sentence.append(best_token)
        context = list(context)[1:]
        context.append(best_token)
        context = tuple(context)
            
    return sentence  

In [13]:
context = "the city of los angeles"
encoded_context = tokenizer.encode(context)

encoded_sentence = greedy_sampling(encoded_context, vocab, ngram_prob, 5, max_length=200)
sentence = tokenizer.decode(encoded_sentence)

print(context + sentence)

the city of los angeles, california, the son of a wealthy merchant loyal to the spanish crown. lorenzo batlle's son josé batlle y ordóñez, nephew luis batlle berres and grand-nephew jorge batlle ibáñez-martín (b. madrid, 20 october 1981) is an american singer, songwriter, and record producer. he is best known for his work in the 1960s, the company was known for creating games that revolved around historic battles and conflicts, such as theatre europe, bismarck archipelago, solomon islands
 sciaphila janthina (champ.) thwaites - sri lanka
spiders of asia
spiders described in 1991, the supreme court of the united states
video games set in the 1950s, the company was known for creating games that revolved around historic battles and conflicts, such as theatre europe, bismar


In [14]:
context = "he died at the age"
encoded_context = tokenizer.encode(context)

encoded_sentence = greedy_sampling(encoded_context, vocab, ngram_prob, 5, max_length=200)
sentence = tokenizer.decode(encoded_sentence)

print(context + sentence)

he died at the age of 18 living with them, 52.6% were married couples living together, 11.3% 65 or older. the average household size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average family size was 2.50 and the average


In [15]:
context = "the first time that"
encoded_context = tokenizer.encode(context)

encoded_sentence = greedy_sampling(encoded_context, vocab, ngram_prob, 5, max_length=200)
sentence = tokenizer.decode(encoded_sentence)

print(context + sentence)

the first time that the private wealth management industry has joined together voluntarily to commit to common standards of quality, compliance and good market practice as set out in the charter. the charter's is organised around the following 3 main principles:
- integrity: in business relationships; of markets, financial products and services, derivatives reform is an element of a, b, c, d, e, f} be a collection of subsets of a set x = {1, 2, 2013, the company announced that it would be a "conceited boy", but after a few minutes, you will stop caring". 50 cent stated via twitter that he feels that empire is being marketed by fox in a way that is different from hand milking or calf suckling. continuous vacuum is applied inside the soft liner to massage milk from the teatcups are composed of a steel shell with a series of "softball" questions (stewart has acknowledged he voted for kerry in the 2000s,


## Interpolation
Mixing different sizes of n-grams

In [6]:
n = 5
ngram_prob_interpol = {}

for i in range(3, n+1):
    new_ngram = calculate_ngram_probabilities(train_tokens, i, test_tokens)
    ngram_prob_interpol = ngram_prob_interpol | new_ngram
    print(f"Number of {i}-grams:", len(new_ngram))

Number of 3-grams: 9174417
Number of 4-grams: 13545259
Number of 5-grams: 15686589


In [7]:
print('These are the top 20 n-grams (using interpolation):')
top = sorted(ngram_prob_interpol.items(), key=lambda item: item[1], reverse=True)[:20]
[(tokenizer.decode(token), prob) for token, prob in top]

These are the top 20 n-grams (using interpolation):


[(' births\nliving people', 0.9997356092826226),
 (' los angeles', 0.9997038434537718),
 (' summer olymp', 0.9994577702621318),
 ('video games developed in', 0.9994196072327112),
 ('\nvideo games developed in', 0.9994158697731275),
 (' san francisco', 0.9993964364291994),
 (' birth missing (living', 0.999265450841915),
 (' missing (living people', 0.999265450841915),
 (' of birth missing (living', 0.999265450841915),
 (' birth missing (living people', 0.999265450841915),
 ('\nfilms directed by', 0.9992383117293259),
 (' covid-19', 0.9992090902843276),
 ('grevillea', 0.9992063222214185),
 (' style="text-align', 0.9991950534628788),
 ('!align="', 0.9991592418231655),
 (' hong kong', 0.9990991224871044),
 (' in los angeles', 0.9990771254342501),
 ('a named by', 0.9990695524088581),
 ('taxa named by', 0.9990695524088581),
 ('\ntaxa named by', 0.9990520501289184)]

In [8]:
def greedy_sampling_interpol(context, vocab, ngram_probabilities, n, max_length = 50):
    
    sentence = []

    if len(context) < (n-1):
        print("len(context) < n")
        return sentence

    context = context[-(n-1):]
    
    for i in range(max_length):

        probs = dict()

        for i in range(1, n): 
            for v in vocab:
                ngram = list(context[(i-1):])
                ngram.append(v)
                
                ngram = tuple(ngram)
                if v in probs:
                    probs[v] += ngram_probabilities[ngram] / n
                else:
                    probs[v] = ngram_probabilities[ngram] / n

        best_token = max(probs, key=probs.get) # greedy
        
        if probs[best_token] == 0:
            print("prob = 0")
            return sentence
            
        sentence.append(best_token)
        context = list(context)[1:]
        context.append(best_token)
        context = tuple(context)
            
    return sentence  

In [9]:
context = "the city of los angeles"
encoded_context = tokenizer.encode(context)

encoded_sentence = greedy_sampling_interpol(encoded_context, vocab, ngram_prob_interpol, 5, max_length=200)
sentence = tokenizer.decode(encoded_sentence)

print(context + sentence)

the city of los angeles, california, united states
| 
|-
| 2001
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3
|align=center|3


In [10]:
context = "he died at the age"
encoded_context = tokenizer.encode(context)

encoded_sentence = greedy_sampling_interpol(encoded_context, vocab, ngram_prob_interpol, 5, max_length=200)
sentence = tokenizer.decode(encoded_sentence)

print(context + sentence)

he died at the age of 18, 2011, the film was released on 1 december 2007, the album has sold 1.4% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the


In [11]:
context = "the first time that"
encoded_context = tokenizer.encode(context)

encoded_sentence = greedy_sampling_interpol(encoded_context, vocab, ngram_prob_interpol, 5, max_length=200)
sentence = tokenizer.decode(encoded_sentence)

print(context + sentence)

the first time that the bjp's rise under modi in 2012, the first time in the 2010, the game was released on 1 december 2007, the album has sold 1.4% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty line, including 8.5% of the population were below the poverty
