<a href="https://colab.research.google.com/github/MuskanSharma-22CSU123/LabManual_DeepLearning/blob/main/DL_Exp14.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Ensure necessary NLTK resources are downloaded.
# 'punkt' is kept for older NLTK releases; newer releases make word_tokenize
# load its models from 'punkt_tab' instead, so both are fetched to keep the
# tokenizing functions below working regardless of the installed version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Part A: Tokenization and Stemming using NLTK

def tokenize_and_stem(text):
    """
    Split the text into NLTK word tokens and reduce each token to its
    Porter stem.

    Returns the stems as a list, in the same order as the tokens
    (punctuation tokens are kept and passed through the stemmer too).
    """
    words = word_tokenize(text)
    porter = PorterStemmer()
    return list(map(porter.stem, words))

# Part B: Lemmatization and Stopwords Removal using NLTK

def lemmatize_and_remove_stopwords_nltk(text):
    """
    Tokenize the text with NLTK and return the lowercase WordNet lemmas of
    the tokens that are purely alphabetic and are not English stopwords.
    """
    words = word_tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))

    lemmas = []
    for word in words:
        # Drop punctuation, numbers and anything else that is not letters only.
        if not word.isalpha():
            continue
        lowered = word.lower()
        # Stopword matching is done on the lowercased form.
        if lowered in english_stopwords:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(lowered))
    return lemmas

# Part C: Lemmatization and Stopwords Removal using spaCy

# Load the spaCy pipeline once at module level so that every call to
# lemmatize_and_remove_stopwords_spacy reuses the same `nlp` object.
nlp = spacy.load('en_core_web_sm')

def lemmatize_and_remove_stopwords_spacy(text):
    """
    Processes the text with the module-level spaCy pipeline `nlp` and
    returns a list of token lemmas.

    Stopwords, punctuation and whitespace-only tokens are removed.
    spaCy emits separate tokens for newlines and for runs of extra spaces;
    these are neither stopwords nor punctuation, so they are filtered
    explicitly to keep entries such as '\\n' out of the result.

    Lemmas are returned as spaCy produces them (their case is not changed).
    """
    doc = nlp(text)
    return [token.lemma_ for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)]

# Part D: Bag-of-Words using CountVectorizer

def bag_of_words_count(texts, max_features=None):
    """
    Build a bag-of-words representation of `texts`.

    A CountVectorizer (with `max_features` forwarded to it) is fitted on the
    texts. Returns a tuple of (feature matrix, fitted vectorizer).
    """
    count_vectorizer = CountVectorizer(max_features=max_features)
    counts = count_vectorizer.fit_transform(texts)
    return counts, count_vectorizer

# Part E: Bag-of-n-grams using CountVectorizer

def bag_of_ngrams_count(texts, n_range=(1, 2), max_features=None):
    """
    Build a bag-of-n-grams representation of `texts`.

    A CountVectorizer is fitted with `n_range` as its ngram_range and
    `max_features` forwarded unchanged. Returns a tuple of
    (feature matrix, fitted vectorizer).
    """
    vectorizer_options = {
        'ngram_range': n_range,
        'max_features': max_features,
    }
    ngram_vectorizer = CountVectorizer(**vectorizer_options)
    ngram_counts = ngram_vectorizer.fit_transform(texts)
    return ngram_counts, ngram_vectorizer

# Part F: TF-IDF Bag-of-Words using TfidfVectorizer

def bag_of_words_tfidf(texts, max_features=None):
    """
    Build a TF-IDF weighted bag-of-words representation of `texts`.

    A TfidfVectorizer (with `max_features` forwarded to it) is fitted on the
    texts. Returns a tuple of (TF-IDF feature matrix, fitted vectorizer).
    """
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    weights = tfidf_vectorizer.fit_transform(texts)
    return weights, tfidf_vectorizer

# Example usage
def main():
    """
    Demonstrate every helper in this file on two sample sentences and print
    the results: the three preprocessing functions first, then the feature
    names learned by each of the three vectorizers.
    """
    sample_texts = [
        "Natural Language Processing (NLP) is fascinating.",
        "Tokenization, stemming, and lemmatization are crucial steps."
    ]
    first_text, second_text = sample_texts

    # Preprocessing examples: (label, function, input sentence)
    preprocessing_demos = (
        ("[NLTK] Tokenize & Stem:", tokenize_and_stem, first_text),
        ("[NLTK] Lemmatize & Remove Stopwords:",
         lemmatize_and_remove_stopwords_nltk, second_text),
        ("[spaCy] Lemmatize & Remove Stopwords:",
         lemmatize_and_remove_stopwords_spacy, second_text),
    )
    for label, preprocess, sentence in preprocessing_demos:
        print(label, preprocess(sentence))

    # Vectorization examples: (label, builder, extra keyword arguments)
    vectorizer_demos = (
        ("\nBag-of-Words feature names:", bag_of_words_count, {}),
        ("\nBag-of-n-grams (1-3) feature names:", bag_of_ngrams_count,
         {'n_range': (1, 3)}),
        ("\nTF-IDF feature names:", bag_of_words_tfidf, {}),
    )
    for label, build, extra_kwargs in vectorizer_demos:
        # Only the fitted vectorizer is needed here; the matrix is discarded.
        _, fitted_vectorizer = build(sample_texts, **extra_kwargs)
        print(label, fitted_vectorizer.get_feature_names_out())

# Run the demo only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[NLTK] Tokenize & Stem: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'fascin', '.']
[NLTK] Lemmatize & Remove Stopwords: ['tokenization', 'stemming', 'lemmatization', 'crucial', 'step']
[spaCy] Lemmatize & Remove Stopwords: ['Tokenization', 'stemming', 'lemmatization', 'crucial', 'step']

Bag-of-Words feature names: ['and' 'are' 'crucial' 'fascinating' 'is' 'language' 'lemmatization'
 'natural' 'nlp' 'processing' 'stemming' 'steps' 'tokenization']

Bag-of-n-grams (1-3) feature names: ['and' 'and lemmatization' 'and lemmatization are' 'are' 'are crucial'
 'are crucial steps' 'crucial' 'crucial steps' 'fascinating' 'is'
 'is fascinating' 'language' 'language processing'
 'language processing nlp' 'lemmatization' 'lemmatization are'
 'lemmatization are crucial' 'natural' 'natural language'
 'natural language processing' 'nlp' 'nlp is' 'nlp is fascinating'
 'processing' 'processing nlp' 'processing nlp is' 'stemming'
 'stemming and' 'stemming and lemmatization' 'steps' 'tokenizat