In [1]:
from datasets import load_dataset
import Preprocessor as p

In [9]:
import pandas as pd
from collections import defaultdict
from datasets import load_dataset
import math

# Define a dummy preprocessor since the original is not provided
class DataFramePreprocessor:
    def __init__(self, df, columns_to_tokenize, remove_stopwords):
        self.df = df

# Load the copenlu/answerable_tydiqa dataset
dataset = load_dataset("copenlu/answerable_tydiqa")

train_df = dataset['train'].to_pandas()
train_df = train_df[train_df['language'].isin(['indonesian', 'arabic', 'bengali'])]
train_df = DataFramePreprocessor(train_df, columns_to_tokenize=['document_plaintext', 'question_text'], remove_stopwords=False).df

val_df = dataset['validation'].to_pandas()
val_df = val_df[val_df['language'].isin(['indonesian', 'arabic', 'bengali'])]
val_df = DataFramePreprocessor(val_df, columns_to_tokenize=['document_plaintext', 'question_text'], remove_stopwords=False).df


def compute_normalized_ngram_frequencies(df, val_df, column, n, compute_perplexity=True, smoothing=0.0001):
    if column not in df.columns:
        raise ValueError(f"{column} not found in the DataFrame")

    ngram_freqs = defaultdict(int)
    prefix_counts = defaultdict(int)

    for index, row in df.iterrows():
        tokens = row[column]
        for i in range(len(tokens) - n + 1):
            ngram = tuple(tokens[i:i + n])
            prefix = tuple(tokens[i:i + n - 1])
            ngram_freqs[ngram] += 1
            prefix_counts[prefix] += 1

    vocabulary_size = len(set(word for sentence in df[column] for word in sentence))
    normalized_ngram_freqs = {}
    for ngram, count in ngram_freqs.items():
        prefix = ngram[:-1]
        normalized_ngram_freqs[ngram] = (count + smoothing) / (prefix_counts[prefix] + smoothing * vocabulary_size)

    if not compute_perplexity:
        return normalized_ngram_freqs

    def compute_perplexity(df, column, normalized_ngram_freqs, n, vocabulary_size, smoothing):
        log_product = 0.0
        total_ngrams = 0
        for tokens in df[column]:
            for i in range(n - 1, len(tokens)):
                ngram = tuple(tokens[i - n + 1:i + 1])
                prob = normalized_ngram_freqs.get(ngram, smoothing / (smoothing * vocabulary_size))
                log_product += -math.log(prob)
                total_ngrams += 1
        if total_ngrams == 0:
            return float('inf')
        return math.exp(log_product / total_ngrams)

    return compute_perplexity(val_df, column, normalized_ngram_freqs, n, vocabulary_size, smoothing)

In [10]:
# Load the copenlu/answerable_tydiqa dataset
dataset = load_dataset("copenlu/answerable_tydiqa")

train_df = dataset['train'].to_pandas()
train_df = train_df[train_df['language'].isin(['indonesian', 'arabic', 'bengali'])]
train_df = p.DataFramePreprocessor(train_df, columns_to_tokenize=['document_plaintext', 'question_text'], remove_stopwords=False).df
train_arabic = train_df[train_df['language'] == 'arabic']
train_indonesian = train_df[train_df['language'] == 'indonesian']
train_bengali = train_df[train_df['language'] == 'bengali']

val_df = dataset['validation'].to_pandas()
val_df = val_df[val_df['language'].isin(['indonesian', 'arabic', 'bengali'])]
val_df = p.DataFramePreprocessor(val_df, columns_to_tokenize=['document_plaintext', 'question_text'], remove_stopwords = False).df
val_arabic = val_df[val_df['language'] == 'arabic']
val_indonesian = val_df[val_df['language'] == 'indonesian']
val_bengali = val_df[val_df['language'] == 'bengali']

#print(f"Document perplexity for bigram arabic: {compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='document_plaintext_tokens', n=2)}")
#print(f"Document perplexity for bigram indonesian: {compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='document_plaintext_tokens', n=2)}")
#print(f"Document perplexity for bigram bengali: {compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='document_plaintext_tokens', n=2)}")

#print(f"Question perplexity for bigram arabic: {compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='question_text_tokens', n=2)}")
#print(f"Question perplexity for bigram indonesian: {compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='question_text_tokens', n=2)}")
#print(f"Question perplexity for bigram bengali: {compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='question_text_tokens', n=2)}")

In [11]:
arabic_freqs_docs = compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='document_plaintext_tokens', n=2, compute_perplexity=False)
indonesian_freqs_docs = compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='document_plaintext_tokens', n=2, compute_perplexity=False)
bengali_freqs_docs = compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='document_plaintext_tokens', n=2, compute_perplexity=False)

arabic_freqs_questions = compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='question_text_tokens', n=2, compute_perplexity=False)
indonesian_freqs_questions = compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='question_text_tokens', n=2, compute_perplexity=False)
bengali_freqs_questions = compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='question_text_tokens', n=2, compute_perplexity=False)

In [17]:
from tqdm import tqdm
import pandas as pd

def predict_next_token_accuracy(bigram_freqs, val_df, column, k=1):
    accuracies = [0] * 10  # List to store accuracies for each token index (0-9)
    total_counts = [0] * 10  # List to store total prediction attempts for each token index (0-9)

    # Function to predict next token based on previous token and bigram frequencies
    def predict_next_token(prev_token, bigram_freqs, k):
        candidates = {bigram[1]: freq for bigram, freq in bigram_freqs.items() if bigram[0] == prev_token}
        # Sort candidates by frequency in descending order and take the top k
        top_candidates = sorted(candidates, key=candidates.get, reverse=True)[:k]
        return top_candidates

    # Iterate through each row in the validation dataframe
    for _, row in val_df.iterrows():
        tokens = row[column]
        for i in range(min(10, len(tokens) - 1)):  # Only consider first 10 tokens or less if not available
            total_counts[i] += 1
            top_candidates = predict_next_token(tokens[i], bigram_freqs, k)
            if tokens[i + 1] in top_candidates:
                accuracies[i] += 1

    # Calculate accuracies
    accuracies = [acc / total if total > 0 else 0 for acc, total in zip(accuracies, total_counts)]
    
    return accuracies

print("Bengali questions")
print(predict_next_token_accuracy(bengali_freqs_questions, val_bengali, 'question_text_tokens', k=3))
print("Bengali documents")
print(predict_next_token_accuracy(bengali_freqs_docs, val_bengali, 'document_plaintext_tokens', k=3))
print("Arabic questions")
print(predict_next_token_accuracy(arabic_freqs_questions, val_arabic, 'question_text_tokens', k=3))
print("Arabic documents")
print(predict_next_token_accuracy(arabic_freqs_docs, val_arabic, 'document_plaintext_tokens', k=3))
print("Indonesian questions")
print(predict_next_token_accuracy(indonesian_freqs_questions, val_indonesian, 'question_text_tokens', k=3))
print("Indonesian documents")
print(predict_next_token_accuracy(indonesian_freqs_docs, val_indonesian, 'document_plaintext_tokens', k=3))

Bengali questions
[0.19642857142857142, 0.25892857142857145, 0.32142857142857145, 0.42857142857142855, 0.5504587155963303, 0.6116504854368932, 0.6931818181818182, 0.7076923076923077, 0.8222222222222222, 0.7241379310344828]
Bengali documents
[0.16964285714285715, 0.14285714285714285, 0.15695067264573992, 0.12612612612612611, 0.13122171945701358, 0.15454545454545454, 0.15, 0.14678899082568808, 0.14883720930232558, 0.16822429906542055]
Arabic questions
[0.6771819137749737, 0.22082018927444794, 0.1808622502628812, 0.2235817575083426, 0.3278955954323002, 0.3958333333333333, 0.4686192468619247, 0.4861111111111111, 0.5116279069767442, 0.4222222222222222]
Arabic documents
[0.1892744479495268, 0.15615141955835962, 0.15, 0.17223105458399576, 0.16622127204703366, 0.17189189189189188, 0.16111414527580556, 0.16501103752759383, 0.18050139275766017, 0.1858108108108108]
Indonesian questions
[0.4063811922753988, 0.29303106633081444, 0.23173803526448364, 0.2576419213973799, 0.3706140350877193, 0.4526678

In [18]:
print("Indonesian documents")
print(predict_next_token_accuracy(indonesian_freqs_docs, val_indonesian, 'document_plaintext_tokens', k=3))

Indonesian documents
[0.1847187237615449, 0.16554621848739495, 0.19208087615838249, 0.1921768707482993, 0.1982832618025751, 0.21872265966754156, 0.1909814323607427, 0.1906474820143885, 0.1912964641885766, 0.1780821917808219]


In [44]:
predict_next_token('dan', indonesian_freqs_questions, k=3)

['ferb', 'bartolomeo', 'ma’juj']

In [24]:
val_indonesian['question_text'].iloc[1]

'Dimanakah letak Donggala ?'

In [33]:
val_indonesian['question_text_tokens'].iloc[10]

['<Q>',
 'siapakah',
 'karakter',
 'utama',
 'serial',
 'anime',
 'dan',
 'manga',
 'eyeshield',
 '</Q>',
 '<EOS>']

In [14]:
from tqdm import tqdm
import pandas as pd

def predict_next_token_accuracy(bigram_freqs, val_df, column, k=1):
    accuracies = [0] * 10  # List to store accuracies for each token index (0-9)
    total_counts = [0] * 10  # List to store total prediction attempts for each token index (0-9)

    # Function to predict next token based on previous token and bigram frequencies
    def predict_next_token(prev_token, bigram_freqs, k):
        candidates = {bigram[1]: freq for bigram, freq in bigram_freqs.items() if bigram[0] == prev_token}
        # Sort candidates by frequency in descending order and take the top k
        top_candidates = sorted(candidates, key=candidates.get, reverse=True)[:k]
        return top_candidates

    # Iterate through each row in the validation dataframe
    for _, row in tqdm(val_df.iterrows(), total=len(val_df), desc="Processing rows"):
        tokens = row[column]
        for i in range(min(10, len(tokens) - 1)):  # Only consider first 10 tokens or less if not available
            total_counts[i] += 1
            top_candidates = predict_next_token(tokens[i], bigram_freqs, k)
            if tokens[i + 1] in top_candidates:
                accuracies[i] += 1

    # Calculate accuracies
    accuracies = [acc / total if total > 0 else 0 for acc, total in zip(accuracies, total_counts)]
    
    return accuracies

predict_next_token_accuracy(bengali_freqs_docs, val_bengali, 'document_plaintext_tokens')

Processing rows: 100%|██████████| 224/224 [00:31<00:00,  7.17it/s]


[0.044642857142857144,
 0.11160714285714286,
 0.09821428571428571,
 0.08928571428571429,
 0.08968609865470852,
 0.1036036036036036,
 0.12217194570135746,
 0.1,
 0.08181818181818182,
 0.10091743119266056]

In [5]:
print(f"Document perplexity for bigram arabic: {compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='document_plaintext_tokens', n=2)}")
print(f"Document perplexity for bigram indonesian: {compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='document_plaintext_tokens', n=2)}")
print(f"Document perplexity for bigram bengali: {compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='document_plaintext_tokens', n=2)}")

print(f"Question perplexity for bigram arabic: {compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='question_text_tokens', n=2)}")
print(f"Question perplexity for bigram indonesian: {compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='question_text_tokens', n=2)}")
print(f"Question perplexity for bigram bengali: {compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='question_text_tokens', n=2)}")

Document perplexity for bigram arabic: 5470.263650772015
Document perplexity for bigram indonesian: 2418.98273615176
Document perplexity for bigram bengali: 3974.521499262435
Question perplexity for bigram arabic: 100.24688224033824
Question perplexity for bigram indonesian: 76.26349679305203
Question perplexity for bigram bengali: 51.02582039878195


In [19]:
print(f"Document perplexity for bigram arabic: {compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='document_plaintext_tokens', n=2)}")
print(f"Document perplexity for bigram indonesian: {compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='document_plaintext_tokens', n=2)}")
print(f"Document perplexity for bigram bengali: {compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='document_plaintext_tokens', n=2)}")

print(f"Question perplexity for bigram arabic: {compute_normalized_ngram_frequencies(df=train_arabic, val_df=val_arabic, column='question_text_tokens', n=2)}")
print(f"Question perplexity for bigram indonesian: {compute_normalized_ngram_frequencies(df=train_indonesian, val_df=val_indonesian, column='question_text_tokens', n=2)}")
print(f"Question perplexity for bigram bengali: {compute_normalized_ngram_frequencies(df=train_bengali, val_df=val_bengali, column='question_text_tokens', n=2)}")

Document perplexity for bigram arabic: 5555.463768819053
Document perplexity for bigram indonesian: 2553.3478426941274
Document perplexity for bigram bengali: 3972.5372565828825
Question perplexity for bigram arabic: 100.87734520584422
Question perplexity for bigram indonesian: 77.66550552391907
Question perplexity for bigram bengali: 51.392352373865464
