In [1]:
#10s
import pandas as pd

# Loading the train and test datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Extracting 100 samples for validation
val_df = train_df.sample(n=100, random_state=13)

# Removing these 100 samples from the training set
train_df = train_df.drop(val_df.index)

# Saving updated train and validation sets
train_df.to_csv("train_cleaned.csv", index=False)
val_df.to_csv("validation.csv", index=False)

print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))

Training set size: 13779
Validation set size: 100


In [2]:
print("Test set size:", len(test_df))

Test set size: 100


Removing punctuations and non-ASCII characters

In [3]:
import re
#7s
def clean_text(text):
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase


#print("Sample cleaned text:", train_df["text"].iloc[0])


Removing Stop words

In [4]:
import nltk
#10s
from nltk.corpus import stopwords

#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])




Lemmatization

In [5]:
#150 sec
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])


# print("Sample after lemmatization:", train_df["text"].iloc[0])


In [6]:
train_df["text"] = train_df["text"].str.lower().map(clean_text).map(remove_stopwords).map(lemmatize_text).str.strip()
val_df["text"] = val_df["text"].str.lower().map(clean_text).map(remove_stopwords).map(lemmatize_text).str.strip()
test_df["text"] = test_df["text"].str.lower().map(clean_text).map(remove_stopwords).map(lemmatize_text).str.strip()


Removing Extra Space

In [7]:
train_df["text"] = train_df["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
val_df["text"] = val_df["text"].str.replace(r'\s+', ' ', regex=True).str.strip()
test_df["text"] = test_df["text"].str.replace(r'\s+', ' ', regex=True).str.strip()

Removing numbers

In [8]:
train_df["text"] = train_df["text"].str.replace(r'\d+', '', regex=True)
val_df["text"] = val_df["text"].str.replace(r'\d+', '', regex=True)
test_df["text"] = test_df["text"].str.replace(r'\d+', '', regex=True)

Tokenization

In [9]:
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
#60s
def tokenize_text(text):
    return word_tokenize(text)

train_df["tokens"] = train_df["text"].apply(tokenize_text)
val_df["tokens"] = val_df["text"].apply(tokenize_text)

# print("Tokenized Sample:", train_df["tokens"].iloc[0])

In [10]:
test_df["tokens"] = test_df["text"].apply(tokenize_text)

In [11]:
train_df.to_csv("train_preprocessed.csv", index=False)
val_df.to_csv("validation_preprocessed.csv", index=False)
#12 s

# Generating N-grams

In [12]:
from collections import Counter
from itertools import islice

# Function to generate n-grams from a list of tokens
def generate_ngrams(tokens, n):
    return list(zip(*[tokens[i:] for i in range(n)]))#slicing will get us n lists(starting from 0,1,,,n-1, and then zip 
                                                    #takes one element from each list and make into tuple

# def generate_ngrams(tokens, n):
#     ngrams = []
#     for i in range(len(tokens) - n + 1):  # Ensure we don't go out of bounds
#         ngram = tuple(tokens[i:i+n])  # Create an n-gram tuple
#         ngrams.append(ngram)  # Add to the list
#     return ngrams
# Apply n-gram generation to each article in train set
train_df["unigrams"] = train_df["tokens"].apply(lambda x: generate_ngrams(x, 1))
train_df["bigrams"] = train_df["tokens"].apply(lambda x: generate_ngrams(x, 2))
train_df["trigrams"] = train_df["tokens"].apply(lambda x: generate_ngrams(x, 3))

# Display sample n-grams
print("Sample Unigrams:", train_df["unigrams"].iloc[0][:10])
print("Sample Bigrams:", train_df["bigrams"].iloc[0][:10])
print("Sample Trigrams:", train_df["trigrams"].iloc[0][:10])

Sample Unigrams: [('port',), ('st',), ('lucie',), ('city',), ('st',), ('lucie',), ('county',), ('florida',), ('united',), ('state',)]
Sample Bigrams: [('port', 'st'), ('st', 'lucie'), ('lucie', 'city'), ('city', 'st'), ('st', 'lucie'), ('lucie', 'county'), ('county', 'florida'), ('florida', 'united'), ('united', 'state'), ('state', 'populous')]
Sample Trigrams: [('port', 'st', 'lucie'), ('st', 'lucie', 'city'), ('lucie', 'city', 'st'), ('city', 'st', 'lucie'), ('st', 'lucie', 'county'), ('lucie', 'county', 'florida'), ('county', 'florida', 'united'), ('florida', 'united', 'state'), ('united', 'state', 'populous'), ('state', 'populous', 'municipality')]


In [13]:
test_df["bigrams"] = test_df["tokens"].apply(lambda x: generate_ngrams(x, 2))
test_df["unigrams"] = test_df["tokens"].apply(lambda x: generate_ngrams(x, 1))
test_df["trigrams"] = test_df["tokens"].apply(lambda x: generate_ngrams(x, 3))

In [18]:
#48 seconds
# Minimum threshold (1% of total training articles)
min_count = int(0.01 * len(train_df))

# Function to count how many articles contain each n-gram
def count_ngrams_across_articles(ngrams_list):
    ngram_article_counts = Counter()
    for ngrams in ngrams_list:
        unique_ngrams = set(ngrams)  # unique_ngrams is a set containing all 
        #unique n-grams from a single article
        ngram_article_counts.update(unique_ngrams)  # Increment count for each unique n-gram
    return ngram_article_counts

# Count n-grams in articles
unigram_counts = count_ngrams_across_articles(train_df["unigrams"])
bigram_counts = count_ngrams_across_articles(train_df["bigrams"])
trigram_counts = count_ngrams_across_articles(train_df["trigrams"])

# Filter n-grams that appear in at least 1% of training articles
filtered_unigrams = {ngram for ngram, count in unigram_counts.items() if count >= min_count}
#.items() gives (ngram, count) pairs. filtered_unigram is a set.
filtered_bigrams = {ngram for ngram, count in bigram_counts.items() if count >= min_count}
filtered_trigrams = {ngram for ngram, count in trigram_counts.items() if count >= min_count}

# Display results
print(f"Total Unigrams: {len(unigram_counts)}, Filtered: {len(filtered_unigrams)}")
print(f"Total Bigrams: {len(bigram_counts)}, Filtered: {len(filtered_bigrams)}")
print(f"Total Trigrams: {len(trigram_counts)}, Filtered: {len(filtered_trigrams)}")

Total Unigrams: 435746, Filtered: 8674
Total Bigrams: 8007311, Filtered: 5099
Total Trigrams: 15776177, Filtered: 1008


In [19]:
print("Filtered Unigrams Sample:", list(filtered_unigrams)[:10])
print("Total Filtered Unigrams:", len(filtered_unigrams))


Filtered Unigrams Sample: [('notion',), ('obtaining',), ('patent',), ('seated',), ('citrus',), ('hanna',), ('disestablished',), ('evan',), ('newton',), ('wildlife',)]
Total Filtered Unigrams: 8674


# MLE with Laplace Smoothing

In [20]:
#40sec
# Function to compute MLE probability with Laplace smoothing
def compute_ngram_probabilities(ngram_counts, lower_ngram_counts, vocab_size, alpha=1):
    probabilities = {}
    for ngram, count in ngram_counts.items():
        prefix = ngram[:-1]  # (n-1)-gram prefix
        prefix_count = lower_ngram_counts.get(prefix, 0)  # Count of (n-1)-gram
        #get fetches a value for a given key, 0 is the default value
        probabilities[ngram] = (count + alpha) / (prefix_count + alpha * vocab_size)
    return probabilities

# Vocabulary size (total unique unigrams)
vocab_size = len(filtered_unigrams)

# Filter ngram_counts to include only those present in filtered_ngrams
filtered_unigram_counts = {ngram: count for ngram, count in unigram_counts.items() if ngram in filtered_unigrams}
filtered_bigram_counts = {ngram: count for ngram, count in bigram_counts.items() if ngram in filtered_bigrams}
filtered_trigram_counts = {ngram: count for ngram, count in trigram_counts.items() if ngram in filtered_trigrams}

# Compute probabilities using only the filtered n-grams
unigram_probs = compute_ngram_probabilities(filtered_unigram_counts, {}, vocab_size)
bigram_probs = compute_ngram_probabilities(filtered_bigram_counts, filtered_unigram_counts, vocab_size)
trigram_probs = compute_ngram_probabilities(filtered_trigram_counts, filtered_bigram_counts, vocab_size)

# Display some probabilities
print("Sample Unigram Probability:", list(unigram_probs.items())[:5])
print("Sample Bigram Probability:", list(bigram_probs.items())[:5])
print("Sample Trigram Probability:", list(trigram_probs.items())[:5])



Sample Unigram Probability: [(('david',), 0.4760202905234033), (('tropical',), 0.03389439704865114), (('infrastructure',), 0.12324187226193221), (('maintaining',), 0.04184920451925294), (('shopping',), 0.10721697025593728)]
Sample Bigram Probability: [(('en', 'route'), 0.04881641241451867), (('also', 'located'), 0.016719976737423668), (('asian', 'pacific'), 0.11167601683029453), (('including', 'age'), 0.1034668905950096), (('population', 'census'), 0.10491676345334881)]
Sample Trigram Probability: [(('living', 'together', 'female'), 0.18173302107728337), (('size', 'average', 'family'), 0.18413384716608705), (('age', 'year', 'every'), 0.16950092421441776), (('average', 'household', 'size'), 0.18792071802543006), (('census', 'bureau', 'city'), 0.10436504195342698)]


In [21]:
import numpy as np

def compute_perplexity(tokens, ngram_probs, n):
    N = len(tokens)  # Length of the test article
    log_prob_sum = 0  
    unseen_count = 0  # Count of unseen n-grams

    for i in range(len(tokens) - n + 1):  # Sliding window for n-grams
        ngram = tuple(tokens[i:i+n])
        prob = ngram_probs.get(ngram, 0)  # Get probability, 0 if unseen
        
        if prob == 0:
            unseen_count += 1  # Track how many times we hit an unseen n-gram
        else:
            log_prob_sum += np.log(prob)  # Sum of log probabilities

    if unseen_count == N:  # If all n-grams are unseen, return high perplexity
        return float('inf')

    avg_log_prob = log_prob_sum / N  # Average log probability
    return np.exp(-avg_log_prob)  # Perplexity formula


In [22]:
unigram_perplexities = [compute_perplexity(tokens, unigram_probs, n=1) for tokens in test_df["tokens"]]
bigram_perplexities = [compute_perplexity(tokens, bigram_probs, n=2) for tokens in test_df["tokens"]]
trigram_perplexities = [compute_perplexity(tokens, trigram_probs, n=3) for tokens in test_df["tokens"]]

# Compute overall average perplexity
avg_unigram_perplexity = np.mean(unigram_perplexities)
avg_bigram_perplexity = np.mean(bigram_perplexities)
avg_trigram_perplexity = np.mean(trigram_perplexities)

print(f"Unigram Model Perplexity: {avg_unigram_perplexity}")
print(f"Bigram Model Perplexity: {avg_bigram_perplexity}")
print(f"Trigram Model Perplexity: {avg_trigram_perplexity}")


Unigram Model Perplexity: 3.177360199688495
Bigram Model Perplexity: 1.8214216136747892
Trigram Model Perplexity: 1.200521810584277


In [23]:
train_bigrams = set(bigram for bigrams in train_df["bigrams"] for bigram in bigrams)


In [24]:
test_bigrams = set(bigram for bigrams in test_df["bigrams"] for bigram in bigrams)


In [25]:
unseen_bigrams = test_bigrams - train_bigrams
print(f"Number of unseen bigrams: {len(unseen_bigrams)}")


Number of unseen bigrams: 47039


In [26]:
train_unigrams = set(unigram for unigrams in train_df["unigrams"] for unigram in unigrams)

In [27]:
test_unigrams = set(unigram for unigrams in test_df["unigrams"] for unigram in unigrams)

In [28]:
unseen_unigrams = test_unigrams - train_unigrams
print(f"Number of unseen unigrams: {len(unseen_unigrams)}")

Number of unseen unigrams: 1786


In [29]:
train_trigrams = set(trigram for trigrams in train_df["trigrams"] for trigram in trigrams)


In [30]:
test_trigrams = set(trigram for trigrams in test_df["trigrams"] for trigram in trigrams)


In [31]:
unseen_trigrams = test_trigrams - train_trigrams
print(f"Number of unseen trigrams: {len(unseen_trigrams)}")


Number of unseen trigrams: 110664


In [32]:
unseen_bigram_count = sum(1 for prob in bigram_probs.values() if prob == 0)
unseen_trigram_count = sum(1 for prob in trigram_probs.values() if prob == 0)

print("Zero probability bigrams:", unseen_bigram_count)
print("Zero probability trigrams:", unseen_trigram_count)


Zero probability bigrams: 0
Zero probability trigrams: 0


In [33]:
bigram_probs_list = [bigram_probs.get(bigram, 0) for bigrams in test_df["bigrams"] for bigram in bigrams]
trigram_probs_list = [trigram_probs.get(trigram, 0) for trigrams in test_df["trigrams"] for trigram in trigrams]

print("Average Bigram Probability:", np.mean(bigram_probs_list))
print("Average Trigram Probability:", np.mean(trigram_probs_list))


Average Bigram Probability: 0.009795097429667362
Average Trigram Probability: 0.0030489062637462116


In [34]:
validation_tokens = val_df["tokens"].tolist()


In [35]:
print("Validation tokens length:", len(validation_tokens))


Validation tokens length: 100


In [36]:
validation_tokens = [token for sentence in validation_tokens for token in sentence]


In [37]:
# Perplexity function for interpolation
def perplexity_interpolation(lambdas, validation_tokens, unigram_probs, bigram_probs, trigram_probs):
    lambda1, lambda2, lambda3 = lambdas  # λ1 for trigrams, λ2 for bigrams, λ3 for unigrams
    N = len(validation_tokens)
    log_prob_sum = 0

    for i in range(len(validation_tokens) - 2):  # Iterate through trigrams
        trigram = tuple(validation_tokens[i:i+3])
        bigram = tuple(trigram[1:])
        unigram = (trigram[-1],)  # Ensure it's a tuple

        # Get probabilities, defaulting to a small value to avoid log(0)
        p_trigram = trigram_probs.get(trigram, 1e-10)
        p_bigram = bigram_probs.get(bigram, 1e-10)
        p_unigram = unigram_probs.get(unigram, 1e-10)

        # Compute interpolated probability
        interpolated_prob = lambda1 * p_trigram + lambda2 * p_bigram + lambda3 * p_unigram
        interpolated_prob = max(interpolated_prob, 1e-10)  # Ensure nonzero probability
        
        log_prob_sum += np.log(interpolated_prob)

    avg_log_prob = log_prob_sum / N
    return np.exp(-avg_log_prob)  # Perplexity


In [38]:
print(validation_tokens[:20])


['weymouth', 'city', 'norfolk', 'county', 'massachusetts', 'one', 'municipality', 'state', 'city', 'form', 'government', 'retaining', 'town', 'official', 'name', 'named', 'weymouth', 'dorset', 'coastal', 'town']


In [39]:
for i in range(len(validation_tokens) - 2):  # Iterate through trigrams
    trigram = tuple(validation_tokens[i:i+3])
    bigram = tuple(trigram[1:])
    unigram = (trigram[-1],)  # Ensure it's a tuple

    print(f"Trigram: {trigram}, Bigram: {bigram}, Unigram: {unigram}")

    p_trigram = trigram_probs.get(trigram, 1e-10)
    p_bigram = bigram_probs.get(bigram, 1e-10)
    p_unigram = unigram_probs.get(unigram, 1e-10)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Trigram: ('list', 'influential', 'sport'), Bigram: ('influential', 'sport'), Unigram: ('sport',)
Trigram: ('influential', 'sport', 'figure'), Bigram: ('sport', 'figure'), Unigram: ('figure',)
Trigram: ('sport', 'figure', 'th'), Bigram: ('figure', 'th'), Unigram: ('th',)
Trigram: ('figure', 'th', 'century'), Bigram: ('th', 'century'), Unigram: ('century',)
Trigram: ('th', 'century', 'july'), Bigram: ('century', 'july'), Unigram: ('july',)
Trigram: ('century', 'july', 'walter'), Bigram: ('july', 'walter'), Unigram: ('walter',)
Trigram: ('july', 'walter', 'omalley'), Bigram: ('walter', 'omalley'), Unigram: ('omalley',)
Trigram: ('walter', 'omalley', 'inducted'), Bigram: ('omalley', 'inducted'), Unigram: ('inducted',)
Trigram: ('omalley', 'inducted', 'irish'), Bigram: ('inducted', 'irish'), Unigram: ('irish',)
Trigram: ('inducted', 'irish', 'american'), Bigram: ('irish', 'american'), Unigram: ('american',)
Trigram: ('irish', 'american', 'baseball'), Bigram: ('american', 'baseball'), Unigra

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Trigram: ('croatia', 'goal', 'tally'), Bigram: ('goal', 'tally'), Unigram: ('tally',)
Trigram: ('goal', 'tally', 'first'), Bigram: ('tally', 'first'), Unigram: ('first',)
Trigram: ('tally', 'first', 'score'), Bigram: ('first', 'score'), Unigram: ('score',)
Trigram: ('first', 'score', 'column'), Bigram: ('score', 'column'), Unigram: ('column',)
Trigram: ('score', 'column', 'indicates'), Bigram: ('column', 'indicates'), Unigram: ('indicates',)
Trigram: ('column', 'indicates', 'score'), Bigram: ('indicates', 'score'), Unigram: ('score',)
Trigram: ('indicates', 'score', 'babi'), Bigram: ('score', 'babi'), Unigram: ('babi',)
Trigram: ('score', 'babi', 'goal'), Bigram: ('babi', 'goal'), Unigram: ('goal',)
Trigram: ('babi', 'goal', 'managerial'), Bigram: ('goal', 'managerial'), Unigram: ('managerial',)
Trigram: ('goal', 'managerial', 'statistic'), Bigram: ('managerial', 'statistic'), Unigram: ('statistic',)
Trigram: ('managerial', 'statistic', 'honour'), Bigram: ('statistic', 'honour'), Unigr

Trigram: ('rumour', 'lennon', 'also'), Bigram: ('lennon', 'also'), Unigram: ('also',)
Trigram: ('lennon', 'also', 'said'), Bigram: ('also', 'said'), Unigram: ('said',)
Trigram: ('also', 'said', 'manson'), Bigram: ('said', 'manson'), Unigram: ('manson',)
Trigram: ('said', 'manson', 'stuff'), Bigram: ('manson', 'stuff'), Unigram: ('stuff',)
Trigram: ('manson', 'stuff', 'built'), Bigram: ('stuff', 'built'), Unigram: ('built',)
Trigram: ('stuff', 'built', 'around'), Bigram: ('built', 'around'), Unigram: ('around',)
Trigram: ('built', 'around', 'george'), Bigram: ('around', 'george'), Unigram: ('george',)
Trigram: ('around', 'george', 'song'), Bigram: ('george', 'song'), Unigram: ('song',)
Trigram: ('george', 'song', 'pig'), Bigram: ('song', 'pig'), Unigram: ('pig',)
Trigram: ('song', 'pig', 'piggy'), Bigram: ('pig', 'piggy'), Unigram: ('piggy',)
Trigram: ('pig', 'piggy', 'one'), Bigram: ('piggy', 'one'), Unigram: ('one',)
Trigram: ('piggy', 'one', 'paul'), Bigram: ('one', 'paul'), Unigram:

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [40]:
print("Trigram type:", type(trigram), "Value:", trigram)
print("Bigram type:", type(bigram), "Value:", bigram)
print("Unigram type:", type(unigram), "Value:", unigram)


Trigram type: <class 'tuple'> Value: ('cup', 'final', 'player')
Bigram type: <class 'tuple'> Value: ('final', 'player')
Unigram type: <class 'tuple'> Value: ('player',)


In [41]:
from scipy.optimize import minimize
import numpy as np

# Constraint: λ1 + λ2 + λ3 = 1
constraints = {'type': 'eq', 'fun': lambda lambdas: sum(lambdas) - 1}

# Bounds: Each λ should be between 0 and 1
bounds = [(0, 1), (0, 1), (0, 1)]

# Initial values (equal weights)
initial_lambdas = [1/3, 1/3, 1/3]

# Optimize λ values to minimize perplexity
result = minimize(perplexity_interpolation, initial_lambdas, 
                  args=(validation_tokens, unigram_probs, bigram_probs, trigram_probs),
                  bounds=bounds, constraints=constraints, method='SLSQP')

# Optimized λ values
optimized_lambdas = result.x
print("Optimized Lambda Values:", optimized_lambdas)


Optimized Lambda Values: [0. 0. 1.]


In [42]:
# Extract tokens from test set
test_tokens = test_df["tokens"].explode().tolist()

# Compute perplexity on test set using optimized lambda values
test_perplexity = perplexity_interpolation(optimized_lambdas, test_tokens, unigram_probs, bigram_probs, trigram_probs)

print("Test Set Perplexity:", test_perplexity)


Test Set Perplexity: 121.05203587603539
