In [12]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')


text = """I love learning about new technologies. Artificial Intelligence is evolving rapidly.
Python is my favorite programming language because of its simplicity and power.
Every year, we see breakthroughs in natural language processing.
Machine learning makes computers smarter and more useful.
The tech world is always full of surprises and innovations."""


text = text.lower().translate(str.maketrans('', '', string.punctuation))


tokens = word_tokenize(text)


stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]


freq_dist = Counter(filtered_tokens)
print(freq_dist)


Counter({'learning': 2, 'language': 2, 'love': 1, 'new': 1, 'technologies': 1, 'artificial': 1, 'intelligence': 1, 'evolving': 1, 'rapidly': 1, 'python': 1, 'favorite': 1, 'programming': 1, 'simplicity': 1, 'power': 1, 'every': 1, 'year': 1, 'see': 1, 'breakthroughs': 1, 'natural': 1, 'processing': 1, 'machine': 1, 'makes': 1, 'computers': 1, 'smarter': 1, 'useful': 1, 'tech': 1, 'world': 1, 'always': 1, 'full': 1, 'surprises': 1, 'innovations': 1})


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
import nltk
nltk.download('wordnet')
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()


porter_stems = [porter.stem(word) for word in filtered_tokens]
lancaster_stems = [lancaster.stem(word) for word in filtered_tokens]


lemmatized = [lemmatizer.lemmatize(word) for word in filtered_tokens]

print("Porter Stemmer:", porter_stems)
print("Lancaster Stemmer:", lancaster_stems)
print("Lemmatized:", lemmatized)


Porter Stemmer: ['love', 'learn', 'new', 'technolog', 'artifici', 'intellig', 'evolv', 'rapidli', 'python', 'favorit', 'program', 'languag', 'simplic', 'power', 'everi', 'year', 'see', 'breakthrough', 'natur', 'languag', 'process', 'machin', 'learn', 'make', 'comput', 'smarter', 'use', 'tech', 'world', 'alway', 'full', 'surpris', 'innov']
Lancaster Stemmer: ['lov', 'learn', 'new', 'technolog', 'art', 'intellig', 'evolv', 'rapid', 'python', 'favorit', 'program', 'langu', 'simpl', 'pow', 'every', 'year', 'see', 'breakthrough', 'nat', 'langu', 'process', 'machin', 'learn', 'mak', 'comput', 'smart', 'us', 'tech', 'world', 'alway', 'ful', 'surpr', 'innov']
Lemmatized: ['love', 'learning', 'new', 'technology', 'artificial', 'intelligence', 'evolving', 'rapidly', 'python', 'favorite', 'programming', 'language', 'simplicity', 'power', 'every', 'year', 'see', 'breakthrough', 'natural', 'language', 'processing', 'machine', 'learning', 'make', 'computer', 'smarter', 'useful', 'tech', 'world', 'al

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
import re




long_words = [word for word in tokens if len(word) > 5]


numbers = re.findall(r'\d+', text)


capitalized_words = re.findall(r'\b[A-Z][a-z]*\b', """Original Text Here""")


alpha_words = [word for word in tokens if word.isalpha()]


vowel_words = [word for word in tokens if word[0] in 'aeiou']

print("Long words:", long_words)
print("Numbers:", numbers)
print("Capitalized words:", capitalized_words)
print("Only alphabets:", alpha_words)
print("Words starting with vowels:", vowel_words)


Long words: ['learning', 'technologies', 'artificial', 'intelligence', 'evolving', 'rapidly', 'python', 'favorite', 'programming', 'language', 'because', 'simplicity', 'breakthroughs', 'natural', 'language', 'processing', 'machine', 'learning', 'computers', 'smarter', 'useful', 'always', 'surprises', 'innovations']
Numbers: []
Capitalized words: ['Original', 'Text', 'Here']
Only alphabets: ['i', 'love', 'learning', 'about', 'new', 'technologies', 'artificial', 'intelligence', 'is', 'evolving', 'rapidly', 'python', 'is', 'my', 'favorite', 'programming', 'language', 'because', 'of', 'its', 'simplicity', 'and', 'power', 'every', 'year', 'we', 'see', 'breakthroughs', 'in', 'natural', 'language', 'processing', 'machine', 'learning', 'makes', 'computers', 'smarter', 'and', 'more', 'useful', 'the', 'tech', 'world', 'is', 'always', 'full', 'of', 'surprises', 'and', 'innovations']
Words starting with vowels: ['i', 'about', 'artificial', 'intelligence', 'is', 'evolving', 'is', 'of', 'its', 'and'

In [15]:
import re

def custom_tokenize(text):

    text = re.sub(r"(\w)(')(\w)", r"\1\2\3", text)


    text = re.sub(r"(\w+)-(\w+)", r"\1-\2", text)


    tokens = re.findall(r"\d+\.\d+|\w+(?:'\w+)?|[\w-]+", text)
    return tokens

custom_tokens = custom_tokenize("""Email me at john@example.com or visit https://my-site.org.
My number is 123-456-7890 or +91 9876543210.""")

print("Custom Tokens:", custom_tokens)


Custom Tokens: ['Email', 'me', 'at', 'john', 'example', 'com', 'or', 'visit', 'https', 'my', '-site', 'org', 'My', 'number', 'is', '123', '-456-7890', 'or', '91', '9876543210']


In [16]:
text = """Email me at john@example.com or visit https://my-site.org.
My number is 123-456-7890 or +91 9876543210."""


text = re.sub(r'\S+@\S+', '<EMAIL>', text)


text = re.sub(r'http\S+|www.\S+', '<URL>', text)


text = re.sub(r'(\+?\d{1,3}[- ]?)?\d{3}[- ]?\d{3}[- ]?\d{4}', '<PHONE>', text)

print(text)


Email me at <EMAIL> or visit <URL> 
My number is <PHONE> or <PHONE>.
