<a href="https://colab.research.google.com/github/MuskanSharma-22CSU123/LabManual_DeepLearning/blob/main/DL_Exp13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import spacy

# Fetch the NLTK data packages this notebook requests, in a fixed order.
# 'punkt' / 'punkt_tab' are tokenizer data, 'stopwords' backs the stopword
# list, and 'wordnet' / 'omw-1.4' back the WordNet lemmatizer.
# NOTE(review): 'averaged_perceptron_tagger' is downloaded but no code visible
# here uses a POS tagger -- confirm whether it is still needed.
_NLTK_PACKAGES = (
    'punkt',
    'stopwords',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'punkt_tab',
)
for _package in _NLTK_PACKAGES:
    nltk.download(_package)

# Part A: Tokenization and Stemming using NLTK

def tokenize_and_stem(text):
    """
    Split ``text`` into word tokens and reduce each token to its Porter stem.

    Nothing is filtered out: every token produced by ``word_tokenize``
    (punctuation included) is passed through the stemmer, and the stems are
    returned as a list in the original token order.
    """
    words = word_tokenize(text)
    porter = PorterStemmer()
    # map() applies the stemmer to each token; list() materializes the result
    return list(map(porter.stem, words))

# Part B: Lemmatization and Stopwords Removal using NLTK

def lemmatize_and_remove_stopwords_nltk(text):
    """
    Return the lowercased WordNet lemmas of the content words in ``text``.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic, or whose lowercase form is an English stopword, are dropped.
    Each remaining token is lowercased and lemmatized, and the lemmas are
    returned as a list in token order.
    """
    words = word_tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))

    lemmas = []
    for word in words:
        # Skip punctuation, numbers and mixed tokens
        if not word.isalpha():
            continue
        lowered = word.lower()
        # Skip stopwords (compared case-insensitively)
        if lowered in english_stopwords:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(lowered))
    return lemmas

# Part C: Lemmatization and Stopwords Removal using spaCy

# Load the small English spaCy pipeline once at module level; the spaCy helper
# below reads this global `nlp` object on every call instead of reloading it.
# The model must be installed beforehand, e.g. with:
#   python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

def lemmatize_and_remove_stopwords_spacy(text):
    """
    Processes the text with the module-level spaCy pipeline ``nlp`` and
    returns a list of token lemmas, in document order.

    Tokens flagged by spaCy as stopwords, punctuation, or whitespace are
    dropped. The whitespace check matters because spaCy emits standalone
    tokens for runs of extra spaces, tabs, and newlines; those are neither
    stopwords nor punctuation, so without it their "lemmas" (e.g. ' ' or
    '\\n') would appear in the result.

    Lemmas are returned exactly as spaCy produces them (no lowercasing).
    """
    doc = nlp(text)
    cleaned = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return cleaned

# Example usage
def main():
    """
    Demo driver: print one sample sentence, then the output of each
    preprocessing function applied to it, each under its own label.
    """
    sample_text = "Natural Language Processing (NLP) is a fascinating field of AI!"
    print("Original Text:", sample_text)

    # (label, function) pairs, run in the order they should be displayed
    demos = (
        ("\n[NLTK] Tokenize & Stem:", tokenize_and_stem),
        ("\n[NLTK] Lemmatize & Remove Stopwords:", lemmatize_and_remove_stopwords_nltk),
        ("\n[spaCy] Lemmatize & Remove Stopwords:", lemmatize_and_remove_stopwords_spacy),
    )
    for label, process in demos:
        print(label, process(sample_text))

# Run the demo only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Original Text: Natural Language Processing (NLP) is a fascinating field of AI!

[NLTK] Tokenize & Stem: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'a', 'fascin', 'field', 'of', 'ai', '!']

[NLTK] Lemmatize & Remove Stopwords: ['natural', 'language', 'processing', 'nlp', 'fascinating', 'field', 'ai']

[spaCy] Lemmatize & Remove Stopwords: ['Natural', 'Language', 'Processing', 'NLP', 'fascinating', 'field', 'AI']
