## NLP mini-project Group3

<br> * Library Installation
<br>pip install wikipedia-api
<br>pip install nltk
<br>nltk.download('punkt')  # For sentence tokenization
<br>nltk.download('stopwords')  # For removing stop words
<br>nltk.download('wordnet')  # For lexical semantics
<br>nltk.download('punkt') #For sentence tokenizer

1. Data Collection

In [1]:
import os
import wikipediaapi

# Set up the Wikipedia API with a valid user-agent
wiki = wikipediaapi.Wikipedia(
    language="en", 
    user_agent="YourAppName/1.0 (your-email@example.com)"
)

# Specify the topic and folder path
topic = "Khmer Empire"
output_folder = "./datasets"
output_file = os.path.join(output_folder, "wiki_data1.txt")

# Fetch the article
article = wiki.page(topic)

if article.exists():
    # Ensure the folder exists
    os.makedirs(output_folder, exist_ok=True)
    
    # Write the article text to a file
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(article.text)
    print(f"Article '{topic}' has been saved to {output_file}")
else:
    print(f"The article '{topic}' does not exist.")


Article 'Khmer Empire' has been saved to ./datasets\wiki_data1.txt


2. Data Preprocessing

In [25]:
import re

# Function to clean text
def clean_text(text):
    # Remove citations like [1], [citation needed]
    text = re.sub(r'\[[^\]]+\]', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove parenthetical content
    text = re.sub(r'\([^)]*\)', '', text)
    # Allow commas and periods while removing other punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,]', '', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    # Ensure proper sentence spacing (e.g., "word.word" -> "word. word")
    text = re.sub(r'\.([a-zA-Z])', r'. \1', text)
    return text.lower()  # Convert to lowercase

# Load the raw corpus
# Load the raw corpus
with open('./datasets/wiki_data.txt', 'r', encoding='utf-8') as file:
    raw_corpus = file.read()


# Clean the corpus
cleaned_corpus = clean_text(raw_corpus)

# Save the cleaned corpus for inspection (optional)
with open('./datasets/cleaned_wiki_data.txt', 'w') as file:
    file.write(cleaned_corpus)
    
    

3. Split the Corpus into training (70%), validation (10%) and testing (20%)

In [26]:
import os
import random

# Split the text into sentences
sentences = cleaned_corpus.split('.')
# Remove empty sentences and trim whitespace
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Shuffle the sentences to avoid bias
random.shuffle(sentences)

# Calculate split indices
train_split = int(0.7 * len(sentences))
val_split = int(0.8 * len(sentences))

# Create subsets
train_set = sentences[:train_split]
val_set = sentences[train_split:val_split]
test_set = sentences[val_split:]

# Print information about the splits
print(f"Total sentences: {len(sentences)}")
print(f"Training set: {len(train_set)}")
print(f"Validation set: {len(val_set)}")
print(f"Testing set: {len(test_set)}")

# Save the split datasets
train_file = './datasets/train_set.txt'
val_file = './datasets/val_set.txt'
test_file = './datasets/test_set.txt'

# Save the data to files
with open(train_file, 'w') as f:
    f.write('\n'.join(train_set))

with open(val_file, 'w') as f:
    f.write('\n'.join(val_set))

with open(test_file, 'w') as f:
    f.write('\n'.join(test_set))

print("Datasets saved successfully.")


Total sentences: 356
Training set: 249
Validation set: 35
Testing set: 72
Datasets saved successfully.


4 .Tokenize Sentences and Limit Vocabulary

In [27]:
from nltk.tokenize import word_tokenize
from collections import Counter

# Step 1: Tokenize each sentence in the training set
train_tokens = [word_tokenize(sentence) for sentence in train_set]

# Check if tokenization is successful
if len(train_tokens) == 0:
    print("Error: `train_tokens` is empty. Check `train_set`.")
else:
    print(f"Number of tokenized sentences in train_set: {len(train_tokens)}")

# Step 2: Define the vocabulary size
vocab_size = 20000

# Step 3: Count word frequencies in the training set
word_counts = Counter(word for sentence in train_tokens for word in sentence)

# Step 4: Select the top `vocab_size` words as the vocabulary
vocab = {word for word, _ in word_counts.most_common(vocab_size)}
print(f"Vocabulary size (top {vocab_size} words): {len(vocab)}")

# Step 5: Function to replace words not in the vocabulary with <UNK>
def replace_with_unk(sentences, vocab):
    return [
        [word if word in vocab else '<UNK>' for word in word_tokenize(sentence)]
        for sentence in sentences
    ]

# Step 6: Apply the function to train, validation, and test sets
train_tokens = replace_with_unk(train_set, vocab)
val_tokens = replace_with_unk(val_set, vocab)
test_tokens = replace_with_unk(test_set, vocab)

# Check the length of `train_tokens` after replacement
if len(train_tokens) == 0:
    print("Error: `train_tokens` is empty after replacement. Check `replace_with_unk` function.")
else:
    print(f"Number of sentences in train_tokens after replacement: {len(train_tokens)}")

# Step 7: Print the first 5 tokenized sentences to check results
print("First 5 tokenized training sentences with <UNK>:")
for i in range(min(5, len(train_tokens))):  # Handle cases where there are fewer than 5 sentences
    print(train_tokens[i])


Number of tokenized sentences in train_set: 249
Vocabulary size (top 20000 words): 1542
Number of sentences in train_tokens after replacement: 249
First 5 tokenized training sentences with <UNK>:
['i']
['ancient', 'angkor']
['his', 'fatherinlaw', ',', 'the', 'king', 'of', 'cambodia', ',', 'gave', 'him', 'a', 'khmer', 'army', 'to', 'create', 'a', 'buffer', 'state', 'in', 'what', 'is', 'now', 'laos']
['keyes', ',', 'charles', 'f']
['while', 'previously', 'three', 'rice', 'harvests', 'per', 'year', 'were', 'possible', 'a', 'substantial', 'contribution', 'to', 'the', 'prosperity', 'and', 'power', 'of', 'kambuja', 'the', 'declining', 'harvests', 'further', 'weakened', 'the', 'empire']


5. Build 4-Gram Models

In [28]:
from collections import defaultdict

# Function to build an n-gram model
def build_ngram_model(tokenized_sentences, n=4):
    ngram_counts = defaultdict(int)
    for sentence in tokenized_sentences:
        # padded_sentence = ['<s>'] * (n - 1) + sentence + ['</s>']
        padded_sentence = ['<s>'] * (n - 1) + sentence 
        for i in range(len(padded_sentence) - n + 1):
            ngram = tuple(padded_sentence[i:i + n])
            ngram_counts[ngram] += 1
    return ngram_counts

# Build the n-gram models
lm1 = build_ngram_model(train_tokens, n=4)  # Backoff model
lm2 = build_ngram_model(train_tokens, n=4)  # Interpolation model

print("4-gram model (Backoff) has", len(lm1), "entries.")
print("4-gram model (Interpolation) has", len(lm2), "entries.")


4-gram model (Backoff) has 4596 entries.
4-gram model (Interpolation) has 4596 entries.


6. Build model for backoff probability

In [29]:
def backoff_prob(model, ngram):
    """
    Calculate the probability of an n-gram using a backoff model.
    :param model: N-gram counts.
    :param ngram: The n-gram tuple.
    :return: Probability of the n-gram.
    """
    for i in range(len(ngram), 0, -1):  # Back off through lower-order n-grams
        sub_ngram = ngram[-i:]
        if sub_ngram in model:
            return model[sub_ngram] / sum(
                count for ngram_key, count in model.items() if ngram_key[:-1] == sub_ngram[:-1]
            )
    return 1e-6  # Small probability for unseen n-grams


7. Build model for interpolation probability

In [31]:
def interpolated_prob(model, ngram, lambdas, k=1):

    prob = 0.0
    for i in range(1, len(ngram) + 1):  # Iterate over all sub-ngrams
        sub_ngram = ngram[-i:]
        sub_ngram_count = model.get(sub_ngram, 0)
        context_count = sum(
            count for ngram_key, count in model.items() if ngram_key[:-1] == sub_ngram[:-1]
        )
        prob += lambdas[i - 1] * ((sub_ngram_count + k) / (context_count + k))
    return prob


8. Calulation Perplexity for each models

In [30]:
import math

def calculate_perplexity(model, tokens, prob_func, n=4, lambdas=None, k=1):
    log_prob_sum = 0
    word_count = 0

    for sentence in tokens:
        # padded_sentence = ["<s>"] * (n - 1) + sentence + ["</s>"]
        padded_sentence = ["<s>"] * (n - 1) + sentence
        for i in range(len(padded_sentence) - n + 1):
            ngram = tuple(padded_sentence[i:i + n])
            prob = prob_func(model, ngram, lambdas, k) if lambdas else prob_func(model, ngram)
            log_prob_sum += math.log2(prob)
            word_count += 1

    return 2 ** (-log_prob_sum / word_count)

# Hyperparameters for interpolation
lambdas = [0.1, 0.3, 0.4, 0.2]
k = 1

# Calculate perplexity for Backoff and Interpolation models
perplexity_lm1 = calculate_perplexity(lm1, test_tokens, prob_func=backoff_prob, n=4)
perplexity_lm2 = calculate_perplexity(lm2, test_tokens, prob_func=interpolated_prob, n=4, lambdas=lambdas, k=k)

print(f"Perplexity of LM1 (Backoff): {perplexity_lm1}")
print(f"Perplexity of LM2 (Interpolation): {perplexity_lm2}")


Perplexity of LM1 (Backoff): 456526.0963889846
Perplexity of LM2 (Interpolation): 1.02510106989401


9. Text Generation

In [None]:
import random

def generate_text(model, n=4, length=20, prob_func=None, lambdas=None, k=1):
    text = ["<s>"] * (n - 1)  # Start with padding symbols
    for _ in range(length):
        context = tuple(text[-(n - 1):])
        candidates = [ngram for ngram in model if ngram[:-1] == context]
        if candidates:
            if lambdas:
                weights = [prob_func(model, ngram, lambdas, k) for ngram in candidates]
            else:
                weights = [prob_func(model, ngram) for ngram in candidates]
            chosen = random.choices(candidates, weights=weights)[0]
            text.append(chosen[-1])


    return " ".join(text[n - 1:])  # Skip padding symbols in output

# Generate text with both models
generated_text_lm1 = generate_text(lm1, prob_func=backoff_prob, n=4, length=30)
generated_text_lm2 = generate_text(lm2, prob_func=interpolated_prob, n=4, length=30, lambdas=lambdas, k=k)

print("Generated Text by LM1 (Backoff):", generated_text_lm1)
print("Generated Text by LM2 (Interpolation):", generated_text_lm2)


Generated Text by LM1 (Backoff): benjamin walker , angkor empire a history of cambodia , an important insight into the khmer empires daily life , market scenes , military marches , and palace life reports
Generated Text by LM2 (Interpolation): then come the palace women carrying lances and shields , with the kings private guards
