In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def preprocess(text):
    """Tokenize ``text`` into lowercase alphabetic tokens without stop words.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only if it is purely alphabetic (``str.isalpha``) and is not in
    NLTK's English stop-word list.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    words = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))

    kept = []
    for word in words:
        if not word.isalpha():
            # Drops punctuation, numbers and mixed tokens.
            continue
        if word in english_stop_words:
            continue
        kept.append(word)
    return kept

# Load the dataset: read the entire text file into a single string.
# The path is relative, so the file must be in the notebook's working directory.
with open('Pride and Prejudice.txt', 'r' , encoding = 'utf8') as f:
    text = f.read()

# Preprocess the text into the token list that the later cells train on.
tokens = preprocess(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Khalil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Khalil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def train_model(tokens, n=2):
    """Build an n-gram continuation-count table from a token sequence.

    Each window of ``n`` consecutive tokens becomes a key (as a tuple), and the
    token immediately after the window is counted as one observed continuation
    of that key.

    Args:
        tokens: Sequence of tokens; must support ``len`` and slicing.
        n: Number of tokens in each context window. Defaults to 2.

    Returns:
        A dict mapping each ``n``-token tuple to a dict of
        ``{next_token: occurrence_count}``.
    """
    table = {}
    window_count = len(tokens) - n
    for start in range(window_count):
        follower_index = start + n
        context = tuple(tokens[start:follower_index])
        follower = tokens[follower_index]
        # Create the per-context counter on first sight, then bump the count.
        counts = table.setdefault(context, {})
        counts[follower] = counts.get(follower, 0) + 1
    return table

# Train the model on the full token list produced by the preprocessing cell.
# With n=2, every key in `model` is a tuple of two consecutive tokens.
model = train_model(tokens, n=2)

In [3]:
def generate_prediction(model, prefix):
    """Return the most frequently counted continuation of ``prefix``.

    Args:
        model: Mapping from a prefix to a dict of ``{next_token: count}``.
        prefix: The key to look up in ``model``.

    Returns:
        The continuation with the highest count, or ``None`` when ``prefix``
        is not a key of ``model``. On a tie, ``max`` returns the first
        maximal entry in the dict's iteration order.
    """
    if prefix not in model:
        return None

    candidates = model[prefix]
    return max(candidates, key=lambda word: candidates.get(word))

# Take input from user
input_str = input("Enter two or three words: ")

# Preprocess the input the same way as the training text (lowercased,
# non-alphabetic tokens and stop words removed) so it can match model keys.
input_tokens = preprocess(input_str)

# The model above is trained with n=2, so its keys are 2-token tuples.
# Use only the last two surviving tokens as the prefix: a longer tuple
# (e.g. from three words) could never match a key.
prefix = tuple(input_tokens[-2:])

# Generate prediction
prediction = generate_prediction(model, prefix)

# Print the prediction (generate_prediction returns None for an unseen prefix)
if prediction is not None:
    print("Next word prediction:", prediction)
else:
    print("No prediction found.")

Enter two or three words: displayed consummate
Next word prediction: command


In [4]:
import random

def train_test_split(tokens, test_size=0.2):
    """Split a token sequence positionally into a training and a test part.

    No shuffling is done: the leading portion becomes the training part and
    the remaining tail becomes the test part.

    Args:
        tokens: Sequence to split; must support ``len`` and slicing.
        test_size: Fraction of the sequence assigned to the test part.
            Defaults to 0.2.

    Returns:
        A ``(train_tokens, test_tokens)`` tuple of slices of ``tokens``.
    """
    # Truncate toward zero so the cut point is always a valid integer index.
    cut = int((1 - test_size) * len(tokens))
    return tokens[:cut], tokens[cut:]

def evaluate_model(model, test_tokens, n=2):
    """Measure next-token prediction accuracy over a held-out token sequence.

    Slides a window of ``n`` tokens over ``test_tokens``, asks
    ``generate_prediction`` for the continuation of each window, and compares
    it with the token that actually follows. A window the model has never
    seen yields ``None`` from ``generate_prediction`` and counts as a miss.

    Args:
        model: The n-gram table to evaluate (prefix tuple -> continuation
            counts).
        test_tokens: Sequence of tokens to evaluate on; must support ``len``
            and slicing.
        n: Number of tokens in each prefix window. Should match the ``n`` the
            model was trained with. Defaults to 2.

    Returns:
        The fraction of windows whose predicted continuation equals the true
        one, as a float. Returns 0.0 when ``test_tokens`` is too short to
        form a single (prefix, continuation) pair, instead of dividing by
        zero.
    """
    n_total = len(test_tokens) - n
    if n_total <= 0:
        # No complete window plus continuation exists; nothing to score.
        return 0.0

    n_correct = 0
    for i in range(n_total):
        prefix = tuple(test_tokens[i:i + n])
        true_suffix = test_tokens[i + n]
        if generate_prediction(model, prefix) == true_suffix:
            n_correct += 1
    return n_correct / n_total

# Split the dataset into training and testing sets.
# The split is positional (no shuffling): roughly the first 80% of the tokens
# are used for training and the remaining tail is held out for testing.
train_tokens, test_tokens = train_test_split(tokens, test_size=0.2)

# Train the model on the training set.
# NOTE(review): this rebinds `model`, replacing the model trained on the full
# token list in the earlier cell; re-running the interactive prediction cell
# after this one will use this training-split model instead.
model = train_model(train_tokens, n=2)

# Evaluate the accuracy on the testing set (fraction of exact next-token matches)
accuracy = evaluate_model(model, test_tokens)
print("Accuracy:", accuracy)

Accuracy: 0.00985741946840345
