1 Data Preparation


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
import string
import math
import random
from collections.abc import Iterable
from itertools import product

# Make sure the NLTK 'punkt' tokenizer data is available, downloading it if missing.
def ensure_nltk_resources():
    """Download the 'punkt' tokenizer data unless NLTK can already locate it."""
    resource_path = 'tokenizers/punkt'
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # The lookup failed, so the resource is not installed yet.
        nltk.download('punkt')

# Load the dataset from a CSV file.
def read_dataset(file_path):
    """Return the contents of the CSV file at ``file_path`` as a pandas DataFrame."""
    return pd.read_csv(file_path)

# Split data into train/validation/test sets (70:15:15 by default), keeping the genre distribution.
def split_dataset(data, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15):
    """Split ``data`` into stratified train, validation and test subsets.

    Args:
        data: DataFrame with a 'genre' column, which is used for stratification.
        train_ratio: Fraction of rows kept for training.
        val_ratio: Relative weight of the validation set within the held-out rows.
        test_ratio: Relative weight of the test set within the held-out rows.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple. Both splits use
        ``random_state=42``, so the result is reproducible.
    """
    # 1 - 0.7 evaluates to 0.30000000000000004 in binary floating point.
    # scikit-learn rounds a fractional test size *up* to a whole number of
    # rows, so that noise can move one extra row out of the training set.
    # Rounding the fraction removes the noise without changing its value.
    holdout_ratio = round(1 - train_ratio, 10)
    train_data, temp_data = train_test_split(
        data, test_size=holdout_ratio, stratify=data['genre'], random_state=42)

    # Share of the held-out rows that goes to the test set.
    test_share = test_ratio / (val_ratio + test_ratio)
    val_data, test_data = train_test_split(
        temp_data, test_size=test_share, stratify=temp_data['genre'], random_state=42)
    return train_data, val_data, test_data

# Build one corpus string per split by joining all of its articles with spaces.
def create_corpora(train_data, val_data, test_data):
    """Return ``(train_corpus, val_corpus, test_corpus)``.

    Each corpus is the split's 'article' column, cast to ``str`` and joined
    with single spaces.
    """
    def join_articles(frame):
        articles = frame['article'].astype(str).tolist()
        return ' '.join(articles)

    return tuple(join_articles(frame) for frame in (train_data, val_data, test_data))

# Tokenize a corpus into sentences, then each sentence into words.
def tokenize_corpus(corpus):
    """Return a list of sentences, each given as a list of word tokens."""
    tokenized_sentences = []
    for sentence in sent_tokenize(corpus):
        tokenized_sentences.append(word_tokenize(sentence))
    return tokenized_sentences

# Preprocess tokens: lowercase, drop punctuation tokens, and stem with Porter.
def preprocess_tokens(tokenized_sentences):
    """Lowercase and stem every token, removing punctuation-only tokens.

    Args:
        tokenized_sentences: Iterable of sentences, each an iterable of
            string tokens.

    Returns:
        A list of processed sentences (lists of stemmed, lowercased tokens).
        Sentences left with no tokens are omitted.
    """
    stemmer = PorterStemmer()
    punctuation_chars = set(string.punctuation)
    processed_sentences = []
    for sentence in tokenized_sentences:
        tokens = [
            stemmer.stem(word.lower())
            for word in sentence
            # `word in string.punctuation` is a substring test, so it let
            # multi-character tokens such as '...', '--', '``' or "''"
            # through. Check every character instead; only ASCII punctuation
            # from string.punctuation is covered.
            if word and not all(ch in punctuation_chars for ch in word)
        ]
        if tokens:
            processed_sentences.append(tokens)
    return processed_sentences

# Prepare the data: read, split, build corpora, tokenize, and preprocess.
def prepare_data(file_path):
    """Run the whole preparation pipeline on the CSV at ``file_path``.

    Returns:
        ``(train_processed, val_processed, test_processed,
        train_data, val_data, test_data)``: the preprocessed token lists for
        each split, followed by the split DataFrames themselves.
    """
    ensure_nltk_resources()
    splits = split_dataset(read_dataset(file_path))
    corpora = create_corpora(*splits)
    processed = [preprocess_tokens(tokenize_corpus(corpus)) for corpus in corpora]
    return (*processed, *splits)

 2 N-Gram Language Modeling

In [None]:
from lm import SmoothedNGramLanguageModel

# Train an n-gram model, then print its perplexities and a few sampled sentences.
def evaluate_ngram_model(n, k, threshold, train_data, val_data, test_data):
    """Train a SmoothedNGramLanguageModel and report on it.

    Args:
        n, k, threshold: Forwarded to the SmoothedNGramLanguageModel constructor.
        train_data: Data passed to ``model.train``.
        val_data, test_data: Data passed to ``model.get_perplexity``.

    Returns:
        The trained model.
    """
    model = SmoothedNGramLanguageModel(n=n, k=k, threshold=threshold)
    print(f"Training {n}-gram model...")
    model.train(train_data)
    print(f"Vocabulary size: {len(model.vocabulary)}")

    # Both perplexities are computed before either one is printed.
    val_perplexity, test_perplexity = (
        model.get_perplexity(split) for split in (val_data, test_data)
    )
    print(f"Validation perplexity: {val_perplexity:.2f}")
    print(f"Test perplexity: {test_perplexity:.2f}")

    print(f"Sample sentences from {n}-gram model:")
    # Seeds 42, 43, 44 give three distinct samples.
    for i, seed in enumerate(range(42, 45)):
        sample = model.sample(random_seed=seed)
        print(f"Sample {i+1}: {' '.join(sample)}")
    return model

# Run the bigram and trigram models on the news dataset.
file_path = "./news.csv"
# prepare_data returns the preprocessed token lists first, then the raw split DataFrames.
train_processed, val_processed, test_processed, train_data, val_data, test_data = prepare_data(file_path)

# k and threshold are forwarded to SmoothedNGramLanguageModel by evaluate_ngram_model.
# NOTE(review): presumably the smoothing constant and a rare-word count cutoff — confirm in lm.py.
bigram_k = 0.001
bigram_threshold = 13
trigram_k = .001
trigram_threshold = 13

print("    Bigram Model (n=2)    ")
bigram_model = evaluate_ngram_model(2, bigram_k, bigram_threshold, train_processed, val_processed, test_processed)

print("\n    Trigram Model (n=3)    ")
trigram_model = evaluate_ngram_model(3, trigram_k, trigram_threshold, train_processed, val_processed, test_processed)

    Bigram Model (n=2)    
Training 2-gram model...
Number of n-grams: 339473
Number of contexts: 6540
Vocabulary size: 6541
Validation perplexity: 241.77
Test perplexity: 243.72
Sample sentences from 2-gram model:
Sample 1: we are veri difficult situat is testament to a way to reach a total domest stock exchang ’ t have got in the offici limit of the first projector have
Sample 2: the icc final be restrict shipment size and latest news and noth do to a vacat will debut in india vs australia automat
Sample 3: advertis thi becom the same time in a subscript to cybersecur retain the group on the world champion australia defeat india and icon film amass four to becom a ca

    Trigram Model (n=3)    
Training 3-gram model...
Number of n-grams: 904926
Number of contexts: 334872
Vocabulary size: 6541
Validation perplexity: 498.77
Test perplexity: 503.58
Sample sentences from 3-gram model:
Sample 1: we ’ ve learn a few off the last seven month australia defeat india in an interview with pti 

3 Spelling Correction

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.metrics.distance import edit_distance
from nltk.stem.porter import PorterStemmer
import string
from collections import Counter, defaultdict
from jiwer import wer, cer, compute_measures
from jiwer.transforms import Compose, RemovePunctuation, AbstractTransform, ReduceToListOfListOfChars
from functools import lru_cache
import math
import time
import statistics
import re
from lm import SmoothedNGramLanguageModel
from spell import SimpleSpellingCorrector

# Split strings into whitespace-separated tokens for the WER calculation.
class CustomSplitWords(AbstractTransform):
    """Transform that turns each sentence into a list of its whitespace-separated words."""

    def __call__(self, sentences):
        # A bare string is treated as a batch containing one sentence.
        batch = [sentences] if isinstance(sentences, str) else sentences
        return [sentence.split() for sentence in batch]

# Normalize text: strip punctuation (hyphens are kept), lowercase, and trim.
def normalize_text(text):
    """Return the normalized form of ``text``, or ``None`` if nothing is left.

    Every character that is not a word character, whitespace or a hyphen is
    removed; the result is lowercased and stripped of surrounding whitespace.
    """
    cleaned = re.sub(r'[^\w\s-]', '', text).lower().strip()
    if not cleaned:
        return None
    return cleaned
# Module-level Porter stemmer instance; process_single_unit calls stemmer.stem() on it.
stemmer = PorterStemmer()

# Preprocess paragraphs into token lists, with optional stemming.
def preprocess_tokens(paragraphs, stem=True):
    """Run ``process_single_unit`` on every paragraph.

    Returns:
        ``(processed_paragraphs, original_paragraphs)``: two parallel lists.
        Paragraphs for which either result is empty are left out of both.
    """
    results = (process_single_unit(paragraph, stem) for paragraph in paragraphs)
    kept = [(proc, orig) for proc, orig in results if proc and orig]
    processed_paragraphs = [proc for proc, _ in kept]
    original_paragraphs = [orig for _, orig in kept]
    return processed_paragraphs, original_paragraphs

# Tokenize one paragraph, merging "a - b" into "a-b" and leaving numbers unstemmed.
def process_single_unit(text, stem):
    """Tokenize a single paragraph and build its processed form.

    Args:
        text: The paragraph as a string, or as a list of strings that is
            joined with spaces first.
        stem: If true, non-numeric tokens are lowercased and Porter-stemmed;
            otherwise they are only lowercased.

    Returns:
        ``(processed_tokens, combined_tokens)``: two lists of equal length.
        ``combined_tokens`` holds the cleaned tokens in their original case;
        ``processed_tokens`` holds the lowercased (and optionally stemmed)
        version, with numeric tokens copied through unchanged. Both are empty
        when no tokens remain.
    """
    if isinstance(text, list):
        text = ' '.join(text)

    punctuation_chars = set(string.punctuation)
    tokens = []
    for word in word_tokenize(text):
        # `word in string.punctuation` is a substring test and let tokens
        # such as '...', '--' or "''" through; check every character instead.
        # A lone hyphen is kept because it drives the merge step below.
        if word != '-' and all(ch in punctuation_chars for ch in word):
            continue
        word = word.replace("'", "")
        # Never emit an empty token after the apostrophes are stripped.
        if word:
            tokens.append(word)

    # Re-attach words separated by a standalone hyphen: ['a', '-', 'b'] -> ['a-b'].
    combined_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 2 and tokens[i + 1] == '-':
            combined_tokens.append(tokens[i] + '-' + tokens[i + 2])
            i += 3
        else:
            combined_tokens.append(tokens[i])
            i += 1
    if not combined_tokens:
        return [], []

    processed_tokens = []
    for token in combined_tokens:
        # Digits with optional ',' / '.' separators are treated as numbers
        # and are neither lowercased nor stemmed.
        if token.replace(',', '').replace('.', '').isdigit():
            processed_tokens.append(token)
        else:
            lowered = token.lower()
            processed_tokens.append(stemmer.stem(lowered) if stem else lowered)
    return processed_tokens, combined_tokens

# Prepare the dataset: split it, then tokenize and preprocess the texts.
def prepare_data(filepath, stem=True):
    """Load the CSV, split it 70:15:15 by genre, and tokenize each split.

    Args:
        filepath: Path to a CSV with 'article', 'article_erroneous' and
            'genre' columns.
        stem: Whether the training tokens are Porter-stemmed.

    Returns:
        ``(train_proc, val_err, test_err, val_orig, test_orig)``:
        ``train_proc`` is the processed training token lists; the other four
        are the unstemmed, original-case token lists of the erroneous and
        reference articles. ``val_err[i]`` always corresponds to
        ``val_orig[i]`` (likewise for test).
    """
    nltk.download('punkt', quiet=True)
    data = pd.read_csv(filepath)
    train_data, temp_data = train_test_split(
        data, test_size=0.3, stratify=data['genre'], random_state=42
    )
    val_data, test_data = train_test_split(
        temp_data, test_size=0.5, stratify=temp_data['genre'], random_state=42
    )

    train_texts = train_data['article'].astype(str).tolist()
    train_proc, _ = preprocess_tokens(train_texts, stem=stem)

    val_ref_texts = val_data['article'].astype(str).tolist()
    val_hyp_texts = val_data['article_erroneous'].astype(str).tolist()
    test_ref_texts = test_data['article'].astype(str).tolist()
    test_hyp_texts = test_data['article_erroneous'].astype(str).tolist()

    print(f"Number of validation units: {len(val_ref_texts)} (ref), {len(val_hyp_texts)} (hyp)")
    print(f"Number of test units: {len(test_ref_texts)} (ref), {len(test_hyp_texts)} (hyp)")

    val_orig, val_err = _pair_original_tokens(val_ref_texts, val_hyp_texts)
    test_orig, test_err = _pair_original_tokens(test_ref_texts, test_hyp_texts)

    print(f"Number of validation units after preprocessing: {len(val_orig)} (ref), {len(val_err)} (hyp)")
    print(f"Number of test units after preprocessing: {len(test_orig)} (ref), {len(test_err)} (hyp)")

    return train_proc, val_err, test_err, val_orig, test_orig


def _pair_original_tokens(ref_texts, hyp_texts):
    """Tokenize reference/erroneous texts pairwise, dropping a pair if either side is empty."""
    refs, hyps = [], []
    for ref_text, hyp_text in zip(ref_texts, hyp_texts):
        # Only the original-case tokens are used and they do not depend on
        # stemming, so skip the stemming work.
        _, ref_tokens = process_single_unit(ref_text, False)
        _, hyp_tokens = process_single_unit(hyp_text, False)
        # Filtering the two lists independently and zipping afterwards would
        # shift every later pair as soon as one side lost a unit.
        if ref_tokens and hyp_tokens:
            refs.append(ref_tokens)
            hyps.append(hyp_tokens)
    return refs, hyps

# Evaluate spelling correction: report WER/CER before and after correction and show samples.
def evaluate_spelling_correction(n, k, threshold, train_data, val_err, test_err, val_orig, test_orig):
    """Train a language model, correct the erroneous units, and print metrics.

    Args:
        n, k, threshold: Forwarded to the SmoothedNGramLanguageModel constructor.
        train_data: Data passed to ``lm.train``.
        val_err, test_err: Token lists of the erroneous units. Each unit is
            joined with spaces and handed to ``corrector.correct``.
        val_orig, test_orig: Token lists of the reference units, index-aligned
            with ``val_err`` / ``test_err``.

    Returns:
        None. All results are printed.
    """
    lm = SmoothedNGramLanguageModel(n=n, k=k, threshold=threshold)
    lm.train(train_data)
    print(f"Vocab size: {len(lm.vocabulary)}, N-grams: {len(lm.ngram_counts)}")
    corrector = SimpleSpellingCorrector(lm)

    print(f"Evaluating on {len(val_err)} validation units and {len(test_err)} test units")

    # Normalize every unit but keep empty results in place, so the lists stay
    # index-aligned until pairs are filtered together.
    val_ref_all = [_normalize_unit(unit) for unit in val_orig]
    val_hyp_all = [_normalize_unit(unit) for unit in val_err]
    test_ref_all = [_normalize_unit(unit) for unit in test_orig]
    test_hyp_all = [_normalize_unit(unit) for unit in test_err]

    val_ref_processed, val_hyp_processed = _aligned_pairs(val_ref_all, val_hyp_all)
    test_ref_processed, test_hyp_processed = _aligned_pairs(test_ref_all, test_hyp_all)

    print(f"Number of validation units after token alignment: {len(val_ref_processed)} (ref), {len(val_hyp_processed)} (hyp)")
    print(f"Number of test units after token alignment: {len(test_ref_processed)} (ref), {len(test_hyp_processed)} (hyp)")

    wer_transform = Compose([RemovePunctuation(), CustomSplitWords()])
    cer_transform = Compose([RemovePunctuation(), ReduceToListOfListOfChars()])

    print(f"\nBefore Correction (n={n}):")
    val_wer_before, val_cer_before = _wer_and_cer(
        _join_units(val_ref_processed), _join_units(val_hyp_processed), wer_transform, cer_transform)
    test_wer_before, test_cer_before = _wer_and_cer(
        _join_units(test_ref_processed), _join_units(test_hyp_processed), wer_transform, cer_transform)

    print(f"Validation WER: {val_wer_before:.4f}, CER: {val_cer_before:.4f}")
    print(f"Test WER: {test_wer_before:.4f}, CER: {test_cer_before:.4f}")

    val_corrected = [corrector.correct(' '.join(unit)) for unit in val_err]
    start_time = time.time()
    test_corrected = [corrector.correct(' '.join(unit)) for unit in test_err]
    print(f"Test correction time: {time.time() - start_time:.2f} seconds")

    # The corrector returns strings, so they are re-tokenized before normalizing.
    val_corrected_all = [_normalize_unit(word_tokenize(unit)) for unit in val_corrected]
    test_corrected_all = [_normalize_unit(word_tokenize(unit)) for unit in test_corrected]

    # Pair against the unfiltered references: corrected unit i belongs to
    # reference unit i, whatever was dropped in the "before" comparison.
    val_ref_processed_after, val_corrected_processed = _aligned_pairs(val_ref_all, val_corrected_all)
    test_ref_processed_after, test_corrected_processed = _aligned_pairs(test_ref_all, test_corrected_all)

    print(f"\nAfter Correction (n={n}):")
    val_ref_str_after = _join_units(val_ref_processed_after)
    val_corrected_str = _join_units(val_corrected_processed)
    test_ref_str_after = _join_units(test_ref_processed_after)
    test_corrected_str = _join_units(test_corrected_processed)

    val_wer_after, val_cer_after = _wer_and_cer(
        val_ref_str_after, val_corrected_str, wer_transform, cer_transform)
    test_wer_after, test_cer_after = _wer_and_cer(
        test_ref_str_after, test_corrected_str, wer_transform, cer_transform)

    print(f"Validation WER: {val_wer_after:.4f}, CER: {val_cer_after:.4f}")
    print(f"Test WER: {test_wer_after:.4f}, CER: {test_cer_after:.4f}")

    print("\nError Analysis (Validation):")
    # Use the references that were paired with the corrected units; the
    # pre-correction reference list can differ in length and alignment.
    measures = compute_measures(val_ref_str_after, val_corrected_str)
    print(f"Insertions: {measures['insertions']}, Deletions: {measures['deletions']}, Substitutions: {measures['substitutions']}")

    print("\nSample corrected units (Validation):")
    for err, corrected, orig in zip(val_err[:3], val_corrected[:3], val_orig[:3]):
        print(f"Original (erroneous): {' '.join(err)}")
        print(f"Corrected: {corrected}")
        print(f"Reference: {' '.join(orig)}\n")


def _normalize_unit(tokens):
    """Normalize one unit's tokens for scoring; the result may be an empty list."""
    normalized = (
        normalize_text(token)
        for token in tokens
        if token not in string.punctuation or token == '-'
    )
    # normalize_text returns None for tokens that end up empty.
    return [token for token in normalized if token]


def _aligned_pairs(references, hypotheses):
    """Return parallel lists holding only the positions where both units are non-empty."""
    pairs = [(ref, hyp) for ref, hyp in zip(references, hypotheses) if ref and hyp]
    return [ref for ref, _ in pairs], [hyp for _, hyp in pairs]


def _join_units(units):
    """Join each unit's tokens into a single space-separated string."""
    return [' '.join(tokens) for tokens in units]


def _wer_and_cer(ref_strings, hyp_strings, wer_transform, cer_transform):
    """Compute ``(WER, CER)`` for the given references and hypotheses."""
    word_error = wer(ref_strings, hyp_strings,
                     reference_transform=wer_transform, hypothesis_transform=wer_transform)
    char_error = cer(ref_strings, hyp_strings,
                     reference_transform=cer_transform, hypothesis_transform=cer_transform)
    return word_error, char_error


# Run the spelling correction evaluation for the bigram and trigram models.
if __name__ == '__main__':
    filepath = 'news.csv'
    print("    Evaluation with Paragraph Units    ")
    # The split is prepared once and reused by both evaluations.
    train_proc, val_err, test_err, val_orig, test_orig = prepare_data(filepath, stem=True)
    # Positional arguments are n, k, threshold: first a bigram (n=2), then a trigram (n=3) run.
    evaluate_spelling_correction(2, 0.05, 2, train_proc, val_err, test_err, val_orig, test_orig)
    evaluate_spelling_correction(3, 0.05, 2, train_proc, val_err, test_err, val_orig, test_orig)

    Evaluation with Paragraph Units    
Number of validation units: 1500 (ref), 1500 (hyp)
Number of test units: 1500 (ref), 1500 (hyp)
Number of validation units after preprocessing: 1500 (ref), 1500 (hyp)
Number of test units after preprocessing: 1500 (ref), 1500 (hyp)
Number of n-grams: 455053
Number of contexts: 23423
Training time: 3.52 seconds
Vocab size: 23424, N-grams: 455053
Evaluating on 1500 validation units and 1500 test units
Number of validation units after token alignment: 1500 (ref), 1500 (hyp)
Number of test units after token alignment: 1500 (ref), 1500 (hyp)

Unit lengths before correction (Validation - Reference):
First 5 units: [365, 70, 106, 65, 195]
Average length: 231.12
Unit lengths before correction (Validation - Hypothesis):
First 5 units: [366, 78, 108, 70, 198]
Average length: 234.90
Unit lengths before correction (Test - Reference):
First 5 units: [92, 121, 123, 146, 170]
Average length: 234.41
Unit lengths before correction (Test - Hypothesis):
First 5 uni