In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

nltk.download('punkt_tab') #apparently you need this before downloading punkt
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

text = "I love exploring the latest advancements in artificial intelligence. The rapid evolution of machine learning models fascinates me. Companies are integrating AI into everyday applications like virtual assistants and chatbots. Healthcare is benefiting from AI-driven diagnostics and predictive analytics. It excites me to see how technology can improve lives. The potential for innovation in this field is endless."

# Q1
low = text.lower()
no_punc = re.sub(r'[^\w\s]', '', low)
s = sent_tokenize(text)
w = word_tokenize(no_punc)
sw = set(stopwords.words('english'))
ns_w = [t for t in w if t not in sw]
freq = nltk.FreqDist(ns_w)
print("Sentences:", s)
print("Words:", w)
print("Words without stopwords:", ns_w)
print("Frequency Distribution:", freq.most_common())

# Q2
ps = PorterStemmer()
ls = LancasterStemmer()
le = WordNetLemmatizer()
stem_p = [ps.stem(t) for t in ns_w]
stem_l = [ls.stem(t) for t in ns_w]
lemma = [le.lemmatize(t) for t in ns_w]
print("Porter Stemming:", stem_p)
print("Lancaster Stemming:", stem_l)
print("Lemmatization:", lemma)

# Q3
gt5 = re.findall(r'\b\w{6,}\b', text)
nums = re.findall(r'\d+', text)
caps = re.findall(r'\b[A-Z][a-z]*\b', text)
alpha = re.findall(r'\b[a-zA-Z]+\b', text)
vow = re.findall(r'\b[AEIOUaeiou]\w*', text)
print("Words with >5 letters:", gt5)
print("Numbers:", nums)
print("Capitalized words:", caps)
print("Alphabet-only words:", alpha)
print("Words starting with vowel:", vow)

# Q4
def tokenize(t):
    t = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', t)
    t = re.sub(r'https?://\S+|www\.\S+', '<URL>', t)
    t = re.sub(r'\+?\d{1,3}[\s-]\d{6,10}', '<PHONE>', t)
    return re.findall(r'(?:\d+\.\d+|\w+(?:[-\']\w+)*)', t)

tokens = tokenize(text)
print("Custom tokens:", tokens)


[nltk_data] Downloading package punkt_tab to /home/dnp/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /home/dnp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dnp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dnp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sentences: ['I love exploring the latest advancements in artificial intelligence.', 'The rapid evolution of machine learning models fascinates me.', 'Companies are integrating AI into everyday applications like virtual assistants and chatbots.', 'Healthcare is benefiting from AI-driven diagnostics and predictive analytics.', 'It excites me to see how technology can improve lives.', 'The potential for innovation in this field is endless.']
Words: ['i', 'love', 'exploring', 'the', 'latest', 'advancements', 'in', 'artificial', 'intelligence', 'the', 'rapid', 'evolution', 'of', 'machine', 'learning', 'models', 'fascinates', 'me', 'companies', 'are', 'integrating', 'ai', 'into', 'everyday', 'applications', 'like', 'virtual', 'assistants', 'and', 'chatbots', 'healthcare', 'is', 'benefiting', 'from', 'aidriven', 'diagnostics', 'and', 'predictive', 'analytics', 'it', 'excites', 'me', 'to', 'see', 'how', 'technology', 'can', 'improve', 'lives', 'the', 'potential', 'for', 'innovation', 'in', 'th