In [2]:
import nltk
import pandas as pd
import string
from nltk.corpus import brown
from collections import Counter
from nltk.util import ngrams

# Step 1: Download the NLTK resources requested by this script
for resource in ('brown', 'punkt'):
    nltk.download(resource)

# Remove punctuation and lowercase the words
def preprocess_brown(words=None):
    """Return lowercased tokens with punctuation-only tokens removed.

    Args:
        words: Optional iterable of string tokens to clean. When omitted
            (the default), the tokens of the Brown corpus are used.

    Returns:
        A list of lowercased tokens. A token is dropped when it is empty or
        when every one of its characters is in ``string.punctuation``.
        Tokens that merely contain punctuation (e.g. "u.s.") are kept.
    """
    if words is None:
        words = brown.words()

    # A substring test against string.punctuation only catches tokens that
    # happen to appear contiguously in that constant, so multi-character
    # tokens such as "``", "''" or "--" would slip through. Checking every
    # character catches them; issuperset("") is True, so empty tokens are
    # dropped as well.
    punctuation = set(string.punctuation)
    return [word.lower() for word in words if not punctuation.issuperset(word)]

# Step 2: Generate n-grams and return as DataFrame with frequencies
def get_ngrams_freq(words, n):
    """Count the n-grams of ``words`` and return them as a frequency table.

    Args:
        words: Sequence of tokens to slide the n-gram window over.
        n: Number of tokens per n-gram.

    Returns:
        A DataFrame with columns 'ngram' (tuple of n tokens) and
        'frequency' (occurrence count), ordered from most to least frequent
        with a fresh 0..N-1 index. N-grams with equal counts keep the order
        in which they were first seen.
    """
    # Counter consumes the n-gram generator directly, so the full list of
    # n-grams never has to be held in memory.
    ngram_freq = Counter(ngrams(words, n))

    # most_common() sorts stably by count (descending), which makes the
    # ordering of ties reproducible.
    return pd.DataFrame(ngram_freq.most_common(), columns=['ngram', 'frequency'])

# Step 3: Predict next words using n-gram model
def predict_next_word(words, ngram_df, input_seq, n, k):
    """Return the k most frequent continuations of ``input_seq``.

    Args:
        words: Unused; kept so existing callers keep working.
        ngram_df: DataFrame with an 'ngram' column holding token tuples and a
            'frequency' column holding their counts.
        input_seq: The (n-1) context words as one whitespace-separated
            string, e.g. 'the dog'. It is lowercased before matching.
        n: Size of the n-gram model.
        k: Maximum number of predictions to return.

    Returns:
        A list of at most k next-word strings, most frequent first. Equal
        frequencies keep their row order in ``ngram_df``. The list is empty
        when k <= 0 or when no n-gram starts with the context.

    Raises:
        ValueError: If ``input_seq`` does not contain exactly n-1 words.
    """
    tokens = input_seq.lower().split()
    if len(tokens) != n - 1:
        raise ValueError(f"Expected {n-1} words for {n}-gram model, but got {len(tokens)}.")

    # A non-positive k asks for no predictions; slicing with a negative k
    # would instead return "all but the last |k|" candidates.
    if k <= 0:
        return []

    # Build the context tuple once rather than once per row.
    context = tuple(tokens)

    # Single pass over both columns: keep (next_word, count) for every
    # n-gram whose leading tokens equal the context. This also works for an
    # empty table, where it simply yields no candidates.
    candidates = [
        (gram[-1], freq)
        for gram, freq in zip(ngram_df['ngram'], ngram_df['frequency'])
        if gram[:-1] == context
    ]

    # list.sort is stable, so ties stay in table order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in candidates[:k]]

# Example usage
if __name__ == "__main__":
    # Demo settings: trigram model, five suggestions for a two-word context
    n = 3
    k = 5
    input_seq = "the united"

    # Build the n-gram frequency table from the cleaned corpus tokens
    tokens = preprocess_brown()
    ngram_df = get_ngrams_freq(tokens, n)

    # Query the model, then print the header line followed by the predictions
    predictions = predict_next_word(tokens, ngram_df, input_seq, n, k)
    print(f"Top {k} next-word predictions for '{input_seq}':", predictions, sep="\n")


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\U1024363\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\U1024363\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 5 next-word predictions for 'the united':
['states', 'nations', "states'", "nations'", 'kingdom']
