# N-Gram Text Correction using Natural Language Processing

## Overview

This notebook develops two N-gram language models, a bigram and a five-gram, to predict missing words in a text. Both models are trained and evaluated on the works of William Shakespeare.

**Tasks:**
1. Data Preparation
2. N-gram Model Training
3. Text Correction
4. Evaluation
5. Export Models

**Files used:**

WS_train.txt - All of William Shakespeare's works, used as the training corpus.

WS_test.txt - The manuscript, with the lost words marked as `<DELETED>`.

WS_validation.txt - The reference text used to validate our models' performance.

### Import Libraries

In [4]:
import string, nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams

### Load Text Function

In [6]:
def load_text(file_path):
    """Read a UTF-8 text file and return its content lower-cased.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file content as a single lower-case string.
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(file_path, encoding='utf-8') as handle:
        return handle.read().lower()

# Load the lower-cased training corpus from the working directory.
train_text = load_text('./WS_train.txt')

In [7]:
def clean_text(text):
    """Tokenize *text* and drop punctuation tokens and English stop words.

    Any token spelled 'deleted', 'DELETED', '<deleted>' or '<DELETED>' is
    normalized to the single marker '<DELETED>'. Every other surviving token
    is lower-cased.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of cleaned tokens, in their original order.
    """
    # Sets give O(1) membership tests; the stop-word list is long and would
    # otherwise be scanned linearly for every token.
    punct_chars = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    deleted_markers = {'deleted', 'DELETED', '<DELETED>', '<deleted>'}

    cleaned = []
    for word in word_tokenize(text):
        if word in deleted_markers:
            cleaned.append('<DELETED>')
            continue
        lowered = word.lower()
        # Test character by character: a substring test against
        # string.punctuation lets multi-character punctuation tokens such as
        # '--' or '...' slip through as if they were words.
        if all(ch in punct_chars for ch in lowered):
            continue
        if lowered in stop_words:
            continue
        cleaned.append(lowered)
    return cleaned

# Tokenize the training corpus and strip punctuation and stop words.
cleaned_train_text = clean_text(train_text)

In [8]:
def create_ngrams(tokens, n):
    """Return all n-grams of *tokens* after padding the sequence.

    The sequence is prefixed with ``n - 1`` start markers '<s>' and suffixed
    with a single end marker '</s>' before the n-grams are extracted.

    Args:
        tokens: Iterable of tokens; it is not modified.
        n: Size of the n-grams.

    Returns:
        A list of n-gram tuples.
    """
    # Start from the left padding, then copy the tokens in so the caller's
    # sequence is left untouched and any iterable (not only a list) works.
    padded = ['<s>'] * (n - 1)
    padded.extend(tokens)
    padded.append('</s>')
    return list(ngrams(padded, n))

# Build the padded bigram and five-gram lists from the cleaned training tokens.
train_bigrams = create_ngrams(cleaned_train_text, 2)
train_fivegrams = create_ngrams(cleaned_train_text, 5)

In [9]:
def build_vocab(tokens):
    """Return the set of distinct tokens, leaving out the '<DELETED>' marker.

    Args:
        tokens: Iterable of tokens.

    Returns:
        A set holding every token of *tokens* except '<DELETED>'.
    """
    return {token for token in tokens if token != '<DELETED>'}

# Vocabulary of the cleaned training tokens (the '<DELETED>' marker is excluded).
vocab = build_vocab(cleaned_train_text)

In [10]:
from nltk.probability import FreqDist, ConditionalFreqDist, ConditionalProbDist, MLEProbDist

def calculate_ngram_freq(ngrams_list):
    """Count the occurrences of every n-gram in *ngrams_list*.

    Args:
        ngrams_list: Iterable of n-gram tuples.

    Returns:
        An nltk ``FreqDist`` built from the n-grams.
    """
    return FreqDist(ngrams_list)

# Frequency distributions of the training bigrams and five-grams.
bigram_freq_dist = calculate_ngram_freq(train_bigrams)
fivegram_freq_dist = calculate_ngram_freq(train_fivegrams)

In [11]:
def estimate_ngram_probabilities(ngrams_list):
    """Estimate the probability of a word given the words preceding it.

    Each n-gram is split into a context (all items but the last, as a tuple)
    and a target (the last item). The (context, target) pairs are counted and
    turned into one probability distribution per context.

    Args:
        ngrams_list: Iterable of n-gram tuples.

    Returns:
        An nltk ``ConditionalProbDist`` keyed by context tuple.
    """
    context_target_pairs = [(gram[:-1], gram[-1]) for gram in ngrams_list]
    counts = ConditionalFreqDist(context_target_pairs)
    # NOTE(review): 50000 is forwarded to the MLEProbDist factory, presumably
    # as its `bins` argument, which appears to have no effect on a
    # maximum-likelihood estimate -- confirm before removing.
    return ConditionalProbDist(counts, MLEProbDist, 50000)

# Conditional probability models estimated from the training bigrams and five-grams.
bigram_prob_dist = estimate_ngram_probabilities(train_bigrams)
fivegram_prob_dist = estimate_ngram_probabilities(train_fivegrams)

In [12]:
def predict_next_word(context, cpd, top_n=1):
    """Return the most probable words that follow *context*.

    Args:
        context: Key identifying the conditioning context in *cpd*.
        cpd: Mapping from context to a distribution exposing ``samples()``
            and ``prob(sample)``.
        top_n: Maximum number of candidate words to return.

    Returns:
        Up to *top_n* words ordered from most to least probable, or
        ``['<UNK>']`` when *context* is not a key of *cpd*.
    """
    if context not in cpd:
        return ['<UNK>']

    distribution = cpd[context]
    # The sort is stable, so equally probable words keep their sample order.
    ranked = sorted(distribution.samples(), key=distribution.prob, reverse=True)
    return ranked[:top_n]

In [13]:
def correct_text_with_ngrams(text_data, cpd, n):
    """Replace every '<DELETED>' token with the model's best prediction.

    The context of a deleted word is the ``n - 1`` tokens that precede it in
    the corrected output, so earlier predictions feed later ones instead of
    leaving a '<DELETED>' marker inside the context. Positions near the start
    of the text are padded on the left with '<s>', matching the padding
    applied when the n-grams were built for training.

    Args:
        text_data: Sequence of tokens, possibly containing '<DELETED>'.
        cpd: Conditional probability model keyed by context tuples of
            length ``n - 1``.
        n: Order of the n-gram model behind *cpd*.

    Returns:
        A list as long as *text_data* in which each '<DELETED>' has been
        replaced by the predicted word ('<UNK>' when the context is unknown).
    """
    context_size = max(n - 1, 0)
    # Seed the output with the start padding so the context slice always has
    # exactly `context_size` items; the padding is stripped before returning.
    corrected_text = ['<s>'] * context_size
    for word in text_data:
        if word == '<DELETED>':
            # Slicing from an explicit start index (rather than a negative
            # one) yields an empty context when context_size is 0.
            context = tuple(corrected_text[len(corrected_text) - context_size:])
            # Only the single best candidate is used.
            word = predict_next_word(context, cpd, top_n=1)[0]
        corrected_text.append(word)
    return corrected_text[context_size:]

In [14]:
# Load and clean the test text, then let each model fill in the '<DELETED>' tokens.
test_text = load_text('./WS_test.txt')
cleaned_test_text = clean_text(test_text)
corrected_test_text_bigram = correct_text_with_ngrams(cleaned_test_text, bigram_prob_dist, 2)
corrected_test_text_fivegram = correct_text_with_ngrams(cleaned_test_text, fivegram_prob_dist, 5)

In [15]:
def calculate_accuracy(test_tokens, corrected_tokens, validation_tokens):
    """Return the fraction of '<DELETED>' positions that were predicted correctly.

    A position counts as a prediction when the test token is '<DELETED>'; it
    is correct when the corrected token equals the validation token at the
    same index.

    Args:
        test_tokens: Tokens of the test text, containing '<DELETED>' markers.
        corrected_tokens: Tokens produced by the correction step.
        validation_tokens: Reference tokens to compare against.

    Returns:
        The accuracy as a float, ``0.0`` when there is no '<DELETED>' token
        to score, or ``None`` (after printing a message) when the three
        sequences differ in length.
    """
    if not (len(test_tokens) == len(corrected_tokens) == len(validation_tokens)):
        print("Test Tokens, Validation Token and Corrected Tokens must have the same length")
        return None

    correct_predictions = 0
    all_predictions = 0
    for original, predicted, expected in zip(test_tokens, corrected_tokens, validation_tokens):
        if original != '<DELETED>':
            continue
        all_predictions += 1
        if predicted == expected:
            correct_predictions += 1

    # Without any deleted word there is nothing to score; avoid dividing by zero.
    if all_predictions == 0:
        return 0.0
    return correct_predictions / all_predictions

In [16]:
# Load and clean the validation text, then score each model's corrections against it.
validation_text = load_text('./WS_validation.txt')
cleaned_validation_text = clean_text(validation_text)
bigram_accuracy = calculate_accuracy(cleaned_test_text, corrected_test_text_bigram, cleaned_validation_text)
fivegram_accuracy = calculate_accuracy(cleaned_test_text, corrected_test_text_fivegram, cleaned_validation_text)

In [34]:
import pickle

# Serialize the five-gram conditional probability model to disk.
with open('fivegram_prob_dist.pkl', 'wb') as file:
    pickle.dump(fivegram_prob_dist , file)

# Serialize the bigram conditional probability model to disk.
with open('bigram_prob_dist.pkl', 'wb') as file:
    pickle.dump(bigram_prob_dist , file)