<a href="https://colab.research.google.com/github/Matrix-69/GenAI/blob/main/GenAI_Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

# Download required NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab') # Added to resolve LookupError
nltk.download('averaged_perceptron_tagger_eng') # Added to resolve new LookupError

# Load spaCy model (run once in terminal if not installed)
# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "Porsche, founded by Ferdinand Porsche in 1931 and headquartered in Stuttgart, is a renowned German manufacturer of high-performance luxury sports cars, SUVs, and sedans."

# -----------------------------
# 1. Tokenization
# -----------------------------
tokens = word_tokenize(text)
print("\nTokens:")
print(tokens)

# -----------------------------
# 2. Stop Word Removal
# -----------------------------
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.isalnum() and word.lower() not in stop_words]

print("\nAfter Stop Word Removal:")
print(filtered_tokens)

# -----------------------------
# 3. Stemming
# -----------------------------
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]

print("\nStemming:")
print(stemmed_words)

# -----------------------------
# 4. Lemmatization
# -----------------------------
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("\nLemmatization:")
print(lemmatized_words)

# -----------------------------
# 5. POS Tagging
# -----------------------------
pos_tags = nltk.pos_tag(tokens)

print("\nPOS Tagging:")
print(pos_tags)

# -----------------------------
# 6. Named Entity Recognition (NER)
# -----------------------------
doc = nlp(text)
print("\nNamed Entities:")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)

# -----------------------------
# 7. Word Frequency Analysis
# -----------------------------
word_freq = Counter(filtered_tokens)

print("\nWord Frequency:")
for word, freq in word_freq.items():
    print(word, ":", freq)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.



Tokens:
['Porsche', ',', 'founded', 'by', 'Ferdinand', 'Porsche', 'in', '1931', 'and', 'headquartered', 'in', 'Stuttgart', ',', 'is', 'a', 'renowned', 'German', 'manufacturer', 'of', 'high-performance', 'luxury', 'sports', 'cars', ',', 'SUVs', ',', 'and', 'sedans', '.']

After Stop Word Removal:
['Porsche', 'founded', 'Ferdinand', 'Porsche', '1931', 'headquartered', 'Stuttgart', 'renowned', 'German', 'manufacturer', 'luxury', 'sports', 'cars', 'SUVs', 'sedans']

Stemming:
['porsch', 'found', 'ferdinand', 'porsch', '1931', 'headquart', 'stuttgart', 'renown', 'german', 'manufactur', 'luxuri', 'sport', 'car', 'suv', 'sedan']

Lemmatization:
['Porsche', 'founded', 'Ferdinand', 'Porsche', '1931', 'headquartered', 'Stuttgart', 'renowned', 'German', 'manufacturer', 'luxury', 'sport', 'car', 'SUVs', 'sedan']

POS Tagging:
[('Porsche', 'NNP'), (',', ','), ('founded', 'VBN'), ('by', 'IN'), ('Ferdinand', 'NNP'), ('Porsche', 'NNP'), ('in', 'IN'), ('1931', 'CD'), ('and', 'CC'), ('headquartered', '