In [1]:
import re
import random
from collections import defaultdict, Counter

In [2]:
def preprocess_text(text):
    # Lowercase and remove non-alphanumeric characters except spaces
    return re.sub(r'[^a-zA-Z\s]', '', text.lower())

In [3]:
def build_ngram_model(text, n):
    words = text.split()
    model = defaultdict(Counter)

    for i in range(len(words) - n + 1):
        # Get the context (first n-1 words)
        context = tuple(words[i:i+n-1])
        # Get the next word
        next_word = words[i+n-1]
        # Add to model
        model[context][next_word] += 1
    
    return dict(model)

In [4]:
def generate_text(model, seed, length):
    # Preprocess seed and split into words
    seed_words = preprocess_text(seed).split()
    result = seed_words.copy()

    # Determine n-gram order from model
    n_order  = len(next(iter(model))) + 1 if model else 1

    # Generate text
    for _ in range(length):
        # Get current context (last n-1 words)
        if len(result) >= n_order - 1:
            context = tuple(result[-(n_order-1):])
        else:
            # If we don't have enough context, use what we have
            context = tuple(result)

        # Find possible next words
        if context in model:
            next_words = model[context]
            # Create weighted list for random selection
            words = list(next_words.keys())
            weights = list(next_words.values())
            
            # Randomly select next word based on probability
            next_word = random.choices(words, weights=weights)[0]
            result.append(next_word)
        else:
            # If context not found, try with shorter context or pick random word
            found = False
            for i in range(1, len(context)):
                shorter_context = context[i:]
                if shorter_context in model:
                    next_words = model[shorter_context]
                    words = list(next_words.keys())
                    weights = list(next_words.values())
                    next_word = random.choices(words, weights=weights)[0]
                    result.append(next_word)
                    found = True
                    break
            
            if not found:
                # Pick a random word from all possible words in the model
                all_words = []
                for counter in model.values():
                    all_words.extend(counter.keys())
                if all_words:
                    next_word = random.choice(all_words)
                    result.append(next_word)
                else:
                    break

    return ' '.join(result)

In [5]:
with open('dataset/ngram.txt', 'r') as file:
    text_data = file.read()

processed_text = preprocess_text(text_data)

In [6]:
# N = 3
model = build_ngram_model(processed_text, n=3)
seeds = ['natural', 'language', 'data']  # Starting word or phrase
length = 100  # Length of the generated text

# Generate and print text
for seed in seeds:
    print(f"Seed: '{seed}'")
    print(f"Length: {length} words")
    generated = generate_text(model, seed, length)
    print(generated)
    print("-" * 30)

Seed: 'natural'
Length: 100 words
natural kind important language depth x entity to a fencedin area not a writing instrument named entity recognition which extracts the names of people places and other entities from text machine translation and speech recognition stemming this divides words with inflection in them into root forms for example an algorithm is developed to process partofspeech tagging words are divided by white spaces sentence breaking this sentence into parts of speech they correspond to such as french without human intervention natural language processing algorithm is developed to process partofspeech tagging words are tagged based on the text remain lemmatization and stemming lemmatization
------------------------------
Seed: 'language'
Length: 100 words
language determine user taking cloud the for the same for instance in the pen the word bark as well as the text so unique words that can be used to classify text for all instances of mcdonalds as two separate entities 

In [7]:
# # N = 2
model = build_ngram_model(processed_text, n=2)
seeds = ['natural', 'language', 'data']  # Starting word or phrase
length = 100  # Length of the generated text

# Generate and print text
for seed in seeds:
    print(f"Seed: '{seed}'")
    print(f"Length: {length} words")
    generated = generate_text(model, seed, length)
    print(generated)
    print("-" * 30)

Seed: 'natural'
Length: 100 words
natural language processing tasks word sense disambiguation this derives the data is analyzing text for all instances of needing to perform tasks word removal common words are two separate entities that text and research and eyes to as well as google translate me can be part of those phrases to understand the main functions listed above are several ways this function automatically generating news article and identify relevant correlations and annual reports that frequently appear in the sentence boundaries in data theyre essentially the following customer is useful for example the following customer feedback analysis and nlp uses either rulebased approach
------------------------------
Seed: 'language'
Length: 100 words
language generation nlg nlg uses patterns in payment transactions to understand them into parts of a python library for example of natural language processing applies algorithms to texts to take realworld input is keyword extraction whi