<a href="https://colab.research.google.com/github/JumanaWanass/Pride-Prediction-Austen-based-Text-Recognition/blob/main/Austen_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import unicodedata
import string
import random
import nltk
# Tokenizer data for nltk.word_tokenize. NLTK 3.9+ looks the models up under
# 'punkt_tab'; older releases use the pickled 'punkt' package. Fetching both
# keeps the notebook runnable on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet corpus backing nltk.stem.WordNetLemmatizer.
nltk.download('wordnet')
from nltk.probability import ConditionalFreqDist

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import re
import unicodedata
import string

def filter(text):
    """Reduce raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. NFKD-normalize and drop every character that has no ASCII form.
      2. Replace anything matching ``<...>`` (on a single line) with a space.
      3. Delete punctuation characters outright -- no space is inserted, so
         "don't" becomes "dont" and "well-known" becomes "wellknown".
      4. Replace every remaining non-letter (digits included) with a space.
      5. Lowercase and collapse all whitespace runs to one space.

    Note: this name shadows Python's builtin ``filter`` in the notebook.

    Args:
        text: The raw input string.

    Returns:
        The normalized string; empty if no letters survive.
    """
    # Decompose accented characters, then discard the non-ASCII remainder.
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    cleaned = ascii_bytes.decode('utf-8', 'ignore')

    # Blank out tag-like spans before '<' and '>' are stripped as punctuation.
    cleaned = re.sub('<.*?>', ' ', cleaned)

    # Third argument of maketrans lists the characters to delete.
    punctuation_table = str.maketrans(' ', ' ', string.punctuation)
    cleaned = cleaned.translate(punctuation_table)

    # Non-letters become spaces; the newline pass is already covered by the
    # first pattern and is kept only to mirror the original pipeline.
    for pattern in ('[^a-zA-Z]', "\n"):
        cleaned = re.sub(pattern, ' ', cleaned)

    return ' '.join(cleaned.lower().split())


In [None]:
def clean(text):
    """Tokenize ``text`` and lemmatize every token.

    Tokens come from ``nltk.word_tokenize``; each one is passed through
    ``WordNetLemmatizer.lemmatize`` with its default arguments.

    Args:
        text: The string to process.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(text)
    lemmatize = nltk.stem.WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in tokens)


In [None]:
import nltk
from nltk.util import ngrams
from collections import defaultdict

def n_gram_model(text, n):
    """Build an n-gram model mapping each context to next-word probabilities.

    The text is tokenized with ``nltk.word_tokenize`` and padded on both
    sides ('<s>' on the left, '</s>' on the right) before n-grams are taken.

    Args:
        text: The corpus as a single string.
        n: The n-gram order; each context is a tuple of ``n - 1`` tokens.

    Returns:
        A ``defaultdict`` keyed by context tuple whose values are
        ``defaultdict(float)`` mappings from next word to its relative
        frequency after that context.
    """
    padded_ngrams = ngrams(
        nltk.word_tokenize(text), n,
        pad_left=True, pad_right=True,
        left_pad_symbol='<s>', right_pad_symbol='</s>',
    )

    # First pass: count how often each word follows each context.
    counts = defaultdict(lambda: defaultdict(int))
    for gram in padded_ngrams:
        history, following = gram[:-1], gram[-1]
        counts[history][following] += 1

    # Second pass: normalize every context's counts so they sum to 1.
    probabilities = defaultdict(lambda: defaultdict(float))
    for history, followers in counts.items():
        denominator = sum(followers.values())
        distribution = probabilities[history]
        for following, seen in followers.items():
            distribution[following] = seen / denominator

    return probabilities

In [None]:
import random

def predict(model, user_input, n, num_words, end_token):
    """Extend ``user_input`` with sampled words and print the result.

    Starting from the last ``n - 1`` words of ``user_input``, repeatedly
    samples a next word from ``model`` and appends it, up to ``num_words``
    times. Context words are compared verbatim against the model's keys, so
    the input must already be in the same token form as the training text.

    Args:
        model: Mapping from context tuple (length ``n - 1``) to a mapping of
            next word -> score, e.g. the output of ``n_gram_model``.
        user_input: Seed text; split on whitespace.
        n: The n-gram order the model was built with.
        num_words: Maximum number of words to generate.
        end_token: End-of-text marker. On the final step its weight is
            boosted, and if it is drawn there it is not appended.

    Returns:
        None. Prints "Context not found in model." and stops early when the
        current context has no candidates, then prints the seed plus any
        generated words joined by single spaces.
    """
    generated = user_input.split()
    context_size = n - 1

    def current_context():
        # A bare generated[-(n - 1):] is generated[-0:] when n == 1, i.e. the
        # whole list; a unigram model needs the empty context instead.
        if context_size > 0:
            return tuple(generated[-context_size:])
        return ()

    context = current_context()

    for step in range(num_words):
        is_last_word = step == num_words - 1

        # `in` is checked first so a defaultdict model is not mutated by the
        # lookup; an empty candidate set is treated the same as a miss.
        if context not in model or not model[context]:
            print("Context not found in model.")
            break

        distribution = model[context]

        # Candidates and weights are built from the same sequence so that
        # each weight is paired with its own word when sampling.
        candidates = list(distribution)
        denominator = sum(distribution.values()) + len(distribution)
        # NOTE: when the model stores probabilities (as n_gram_model does),
        # the values sum to 1, so this add-one formula compresses all weights
        # into [1/(1+V), 2/(1+V)]. Kept as-is to preserve sampling behaviour.
        weights = [(distribution[word] + 1) / denominator for word in candidates]

        # Nudge the final step towards ending the text.
        if is_last_word and end_token in distribution:
            weights[candidates.index(end_token)] += 1 / denominator

        next_word = random.choices(candidates, weights=weights, k=1)[0]

        # Do not print the end marker when it closes the generation.
        if is_last_word and next_word == end_token:
            break

        generated.append(next_word)
        context = current_context()

    print(' '.join(generated))

In [None]:
# Read the whole corpus in one call; the context manager closes the handle
# even if reading fails (the previous readline loop never closed it).
with open('/content/JaneAustenWorks.txt', 'r') as file:
    text = file.read()

# pre-process text
print("Filtering...")
words = filter(text)
print("Cleaning...")
words = clean(words)

Filtering...
Cleaning...


In [None]:
# Build the n-gram language model from the pre-processed corpus.
# n is the n-gram order: with n = 3 each context is the two preceding words.
print("Making model...")
n = 3
model = n_gram_model(text=words, n=n)

Making model...


In [None]:
# Continue a famous Austen opening with ten words sampled from the model.
predict(
    model,
    "It is a truth universally  acknowledged that",
    n=n,
    num_words=10,
    end_token='</s>',
)

It is a truth universally acknowledged that he explained himself at what time you may now and
