# Unigram, Bigram, Trigram using NLTK

In [6]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams


# Ensure the 'punkt_tab' tokenizer data is installed locally (presumably the
# resource word_tokenize loads -- confirm for your NLTK version).
# nltk.download() reports an already up-to-date package instead of fetching it again.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
def generate_ngrams(text: str, n: int) -> list:
    """
    Split the text into word tokens and return every run of n consecutive tokens.

    Parameters:
    text (str): Raw text; it is tokenized with NLTK's word_tokenize.
    n (int): Window size (1 for unigrams, 2 for bigrams, 3 for trigrams).

    Returns:
    list: The n-grams in order of appearance, as produced by nltk.util.ngrams.
    """
    # Tokenize, slide the n-token window, and materialize the result as a list
    return [*ngrams(word_tokenize(text), n)]

In [8]:
# Demo: print the 1-, 2- and 3-grams of a sample sentence
if __name__ == "__main__":
    sample_text = "Natural language processing is an exciting field of study."

    # Build the three n-gram lists up front (n = 1, 2, 3)
    unigrams, bigrams, trigrams = (
        generate_ngrams(sample_text, size) for size in (1, 2, 3)
    )

    # Report them in increasing order of n
    print("Unigrams:", unigrams)
    print("Bigrams:", bigrams)
    print("Trigrams:", trigrams)

Unigrams: [('Natural',), ('language',), ('processing',), ('is',), ('an',), ('exciting',), ('field',), ('of',), ('study',), ('.',)]
Bigrams: [('Natural', 'language'), ('language', 'processing'), ('processing', 'is'), ('is', 'an'), ('an', 'exciting'), ('exciting', 'field'), ('field', 'of'), ('of', 'study'), ('study', '.')]
Trigrams: [('Natural', 'language', 'processing'), ('language', 'processing', 'is'), ('processing', 'is', 'an'), ('is', 'an', 'exciting'), ('an', 'exciting', 'field'), ('exciting', 'field', 'of'), ('field', 'of', 'study'), ('of', 'study', '.')]
