In [2]:
from datasets import load_dataset
import Preprocessor as p

In [6]:
import pandas as pd
from collections import defaultdict
import math

# Load the copenlu/answerable_tydiqa dataset
dataset = load_dataset("copenlu/answerable_tydiqa")

# Languages kept for the analysis, and the raw text columns handed to the
# preprocessor for tokenization. Defined once so both splits stay in sync.
TARGET_LANGUAGES = ('indonesian', 'arabic', 'bengali')
COLUMNS_TO_TOKENIZE = ('document_plaintext', 'question_text')


def _prepare_split(split_name):
    """Return the language-filtered, preprocessed DataFrame for one split.

    Args:
        split_name: Key of the split inside ``dataset`` (e.g. ``'train'``).

    Returns:
        The ``.df`` attribute of the ``DataFramePreprocessor`` built from the
        rows of that split whose ``language`` is in ``TARGET_LANGUAGES``.
    """
    split_df = dataset[split_name].to_pandas()
    # Explicit copy: the boolean-mask selection is a slice of ``split_df``.
    # NOTE(review): the preprocessor presumably adds token columns to the
    # frame it receives; copying keeps those writes off a slice and avoids
    # SettingWithCopyWarning -- confirm against Preprocessor.
    in_scope = split_df[split_df['language'].isin(list(TARGET_LANGUAGES))].copy()
    return p.DataFramePreprocessor(
        in_scope,
        columns_to_tokenize=list(COLUMNS_TO_TOKENIZE),
        remove_stopwords=False,
    ).df


train_df = _prepare_split('train')
val_df = _prepare_split('validation')


def _ngram_perplexity(token_sequences, normalized_ngram_freqs, n):
    """Perplexity of token sequences under a table of n-gram probabilities.

    The result is the geometric mean of ``1 / prob`` over every n-gram of the
    sequences that is present in ``normalized_ngram_freqs``. It is computed in
    log space, so long evaluation sets do not overflow a float.

    Args:
        token_sequences: Iterable of token sequences (one per document).
        normalized_ngram_freqs: Mapping from n-gram tuple to its probability.
        n: Order of the n-grams.

    Returns:
        The perplexity as a float, or ``float('inf')`` when none of the
        n-grams in ``token_sequences`` appears in the table.
    """
    log_prob_sum = 0.0
    total_ngrams = 0

    for tokens in token_sequences:
        for end in range(n - 1, len(tokens)):
            prob = normalized_ngram_freqs.get(tuple(tokens[end - n + 1:end + 1]))
            # N-grams never seen during estimation are skipped (no smoothing).
            if prob is None:
                continue
            log_prob_sum += math.log(prob)
            total_ngrams += 1

    # No scorable n-gram at all: the perplexity is undefined.
    if total_ngrams == 0:
        return float('inf')

    # exp(-mean(log p)) == (prod 1/p) ** (1/N), without the overflowing product.
    return math.exp(-log_prob_sum / total_ngrams)


def compute_normalized_ngram_frequencies(df, column, n, compute_perplexity=True, eval_df=None):
    """Estimate n-gram conditional frequencies and optionally score a held-out set.

    Every n-gram in ``df[column]`` is counted and divided by the count of its
    (n-1)-token prefix, giving a maximum-likelihood estimate of
    P(last token | prefix).

    Caveat: when scoring, n-grams absent from the estimated table are skipped
    rather than penalised. As ``n`` grows, fewer evaluation n-grams are
    scored, and those that are tend to have probability close to 1, so the
    reported perplexity drifts towards 1.0. Values are therefore not
    comparable across different ``n``.

    Args:
        df: DataFrame holding the data the frequencies are estimated from.
        column: Name of the column containing token sequences.
        n: Order of the n-grams; must be at least 1.
        compute_perplexity: If False, return the frequency table; if True,
            return the perplexity of the evaluation data instead.
        eval_df: DataFrame to score when ``compute_perplexity`` is True. Must
            contain ``column``. Defaults to the notebook-level ``val_df``.

    Returns:
        A dict mapping n-gram tuples to normalized frequencies when
        ``compute_perplexity`` is False, otherwise the perplexity as a float
        (``float('inf')`` if no evaluation n-gram is in the table).

    Raises:
        ValueError: If ``column`` is not in ``df`` or ``n`` is less than 1.
    """
    if column not in df.columns:
        raise ValueError(f"{column} not found in the DataFrame")
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    ngram_freqs = defaultdict(int)
    prefix_counts = defaultdict(int)

    # Iterate the column directly; building a full row via iterrows() is
    # unnecessary because only one cell per row is read.
    for tokens in df[column]:
        for start in range(len(tokens) - n + 1):
            ngram = tuple(tokens[start:start + n])
            ngram_freqs[ngram] += 1
            # For unigrams the prefix is the empty tuple, so every count is
            # normalized by the total number of tokens.
            prefix_counts[ngram[:-1]] += 1

    normalized_ngram_freqs = {
        ngram: count / prefix_counts[ngram[:-1]]
        for ngram, count in ngram_freqs.items()
    }

    if not compute_perplexity:
        return normalized_ngram_freqs

    # Fall back to the notebook's validation frame so existing calls that do
    # not pass eval_df keep working.
    if eval_df is None:
        eval_df = val_df

    return _ngram_perplexity(eval_df[column], normalized_ngram_freqs, n)

In [11]:
# Validation perplexity of a 10-gram model estimated on the training documents.
# Perplexity is the function's default return value, shown as the cell output.
compute_normalized_ngram_frequencies(train_df, 'document_plaintext_tokens', n=10)

1.0037327281712645

In [15]:
# Validation perplexity of a 100-gram model estimated on the training documents.
# Perplexity is the function's default return value, shown as the cell output.
compute_normalized_ngram_frequencies(train_df, 'document_plaintext_tokens', n=100)

1.0

In [5]:
# Report validation perplexity for n-gram orders 1..7, first for the document
# tokens and then for the question tokens. Driven by data instead of one
# hand-written print per (column, n) pair so the two series cannot drift apart.
ngram_labels = {1: "unigram", 2: "bigram", 3: "trigram"}  # higher orders print as "<n>-gram"
report_columns = (
    ("Document", "document_plaintext_tokens"),
    ("Question", "question_text_tokens"),
)

for kind, column in report_columns:
    for n in range(1, 8):
        label = ngram_labels.get(n, f"{n}-gram")
        perplexity = compute_normalized_ngram_frequencies(train_df, column, n)
        print(f"{kind} perplexity for {label}: {perplexity}")




Document perplexity for unigram: 13390.45519933151
Document perplexity for bigram: 75.85927459009982
Document perplexity for trigram: 4.169447999633632
Document perplexity for 4-gram: 1.3455694340722437
Document perplexity for 5-gram: 1.0564257088361038
Document perplexity for 6-gram: 1.015377084252998
Document perplexity for 7-gram: 1.006855356028215
Question perplexity for unigram: 346.3580325222089
Question perplexity for bigram: 9.621843001657368
Question perplexity for trigram: 6.145006896638814
Question perplexity for 4-gram: 5.787894352718436
Question perplexity for 5-gram: 2.054405685721316
Question perplexity for 6-gram: 1.497735987862882
Question perplexity for 7-gram: 1.1378808708882513
