In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter

# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models, the 'stopwords' corpus and the 'wordnet' package.
# The last call is the cell's final expression, so its return value is what
# the notebook displays as the cell result.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rmshw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rmshw\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rmshw\AppData\Roaming\nltk_data...


True

In [18]:
# Sample text used throughout the notebook (six sentences about technology).
paragraph = (
    "Technology is evolving rapidly in today's world. "
    "From smartphones to smart homes, innovation is constant. "
    "Artificial Intelligence and Machine Learning are transforming industries. "
    "I enjoy exploring new gadgets and learning about futuristic tech. "
    "The impact of technology on healthcare and education is remarkable. "
    "It's fascinating how fast things are changing."
)

# Normalise case first, then delete (not space-replace) every character that
# is neither a word character nor whitespace. Because apostrophes are deleted
# outright, "today's" collapses to "todays" and "it's" to "its".
lowercase_text = paragraph.lower()
punctuation_re = re.compile(r'[^\w\s]')
no_punct_text = punctuation_re.sub('', lowercase_text)
print(no_punct_text)

technology is evolving rapidly in todays world from smartphones to smart homes innovation is constant artificial intelligence and machine learning are transforming industries i enjoy exploring new gadgets and learning about futuristic tech the impact of technology on healthcare and education is remarkable its fascinating how fast things are changing


In [19]:
# 'punkt_tab' is fetched here, right before the tokenizers are first called.
# NOTE(review): presumably the installed NLTK version needs this package for
# sent_tokenize/word_tokenize — confirm, and consider listing it with the
# other downloads in the setup cell.
nltk.download('punkt_tab')
# Sentence splitting runs on the original paragraph, which still has its
# punctuation and capitalisation.
sentences = sent_tokenize(paragraph)
# Word tokens come from the lowercased, punctuation-free text instead.
words = word_tokenize(no_punct_text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rmshw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
# Hold the English stop-word list in a set so each membership check below is
# a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not stop words. `words` was built from the
# lowercased text, so the comparison is effectively case-insensitive.
filtered_words = [token for token in words if token not in stop_words]

In [21]:
# Tally how often each remaining (non-stop-word) token occurs.
word_freq = Counter(filtered_words)

# Report the ten most frequent tokens, one "token: count" line each.
top_words = word_freq.most_common(10)
for word, count in top_words:
    print(f"{word}: {count}")

technology: 2
learning: 2
evolving: 1
rapidly: 1
todays: 1
world: 1
smartphones: 1
smart: 1
homes: 1
innovation: 1


In [22]:
# NOTE(review): this import repeats the one in the notebook's first cell; it
# is harmless, but imports are easier to audit when kept in one place.
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
# Two stemmers, instantiated once so the comparison cell below can reuse them.
porter = PorterStemmer()
lancaster = LancasterStemmer()


In [8]:
# WordNet-based lemmatizer used next to the two stemmers in the comparison.
# NOTE(review): this cell's execution count (8) is lower than its neighbours'
# (22 and 24), i.e. it was last run before them — re-run the notebook top to
# bottom to confirm it works in the order shown.
lemmatizer = WordNetLemmatizer()

In [24]:
# Side-by-side comparison for the first ten filtered tokens. Each printed row
# is tab-separated: original word, Porter stem, Lancaster stem, WordNet lemma.
# lemmatize() is called without a part-of-speech argument.
# NOTE(review): presumably that makes WordNet treat every word as a noun,
# which would explain why "evolving" comes back unchanged — confirm against
# the NLTK documentation.
sample_words = filtered_words[:10]
for word in sample_words:
    porter_result = porter.stem(word)
    lancaster_result = lancaster.stem(word)
    lemma_result = lemmatizer.lemmatize(word)
    row = f"{word}\t{porter_result}\t{lancaster_result}\t{lemma_result}"
    print(row)


technology	technolog	technolog	technology
evolving	evolv	evolv	evolving
rapidly	rapidli	rapid	rapidly
todays	today	today	today
world	world	world	world
smartphones	smartphon	smartphon	smartphones
smart	smart	smart	smart
homes	home	hom	home
innovation	innov	innov	innovation
constant	constant	const	constant


In [26]:
# Regex extraction over the raw paragraph (original casing and punctuation).
# Words made of six or more word characters.
long_words = re.compile(r'\b\w{6,}\b').findall(paragraph)
# Standalone runs of digits.
numbers = re.compile(r'\b\d+\b').findall(paragraph)
# One uppercase ASCII letter followed by zero or more lowercase ASCII letters;
# this also picks up the single letter "I" and the "It" of "It's".
cap_words = re.compile(r'\b[A-Z][a-z]*\b').findall(paragraph)

print(long_words[:15])
print(numbers)
print(cap_words)

['Technology', 'evolving', 'rapidly', 'smartphones', 'innovation', 'constant', 'Artificial', 'Intelligence', 'Machine', 'Learning', 'transforming', 'industries', 'exploring', 'gadgets', 'learning']
[]
['Technology', 'From', 'Artificial', 'Intelligence', 'Machine', 'Learning', 'I', 'The', 'It']


In [14]:
# NOTE(review): the saved output under this cell shows text about Steve Cohen,
# not the technology `paragraph` defined earlier in the notebook — it appears
# to come from an earlier run (execution count 14 vs. 18). Re-run to refresh.

# Runs of ASCII letters only; digits, underscores and apostrophes end a run.
alpha_only = re.compile(r'\b[a-zA-Z]+\b').findall(paragraph)
# Letter runs whose first character is a vowel, in either case.
vowel_words = re.compile(r'\b[aeiouAEIOU][a-zA-Z]*\b').findall(paragraph)

print(alpha_only[:15])
print(vowel_words)

['Steve', 'Cohen', 'is', 'a', 'prominent', 'American', 'billionaire', 'and', 'hedge', 'fund', 'manager', 'best', 'known', 'as', 'the']
['is', 'a', 'American', 'and', 'as', 'and', 'of', 'Asset', 'a', 'investment', 'Advisors', 'in', 'achieving', 'and', 'earning', 'a', 'as', 'one', 'of', 'impressive', 'ultimately', 'after', 'insider', 'in', 'in', 'in', 'After', 'a', 'outside', 'emerged', 'operations', 'into', 'in', 'assets', 'and', 'is', 'also', 'extensive', 'art', 'and', 'as', 'owner', 'and', 'of', 'influence', 'in', 'and']


In [15]:
# Extend the paragraph with an email address, a URL, two phone-number formats
# and a dollar amount, so the tokenizer and masking cells below have something
# to match.
# NOTE(review): the appended sentences refer to Steve Cohen / Point72, while
# `paragraph` as currently defined is about technology — presumably left over
# from an earlier sample text; confirm which text is intended.
text_sample = paragraph + " His email is steve.cohen@example.com. Check out https://www.point72.com. Call at 123-456-7890 or +91 9876543210. The firm's value is $3.14 billion."

In [16]:
def custom_tokenize(text):
    r"""Split ``text`` into word tokens, keeping compound forms in one piece.

    A token is a run of word characters (``\w``) that may be extended by
    further runs joined with any of:

    * an apostrophe      -- contractions/possessives such as ``today's``
    * a hyphen           -- compounds such as ``multi-strategy`` or
                            ``123-456-7890`` (any number of segments)
    * a dot between two digits -- decimals such as ``3.14``

    Every other character (whitespace, ``$``, ``@``, ``/``, sentence
    punctuation, a dot that is not between digits, ...) acts as a separator
    and is dropped, so ``steve.cohen@example.com`` yields four tokens.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or
        separator-only input.
    """
    # One pattern instead of placeholder substitution: the earlier approach
    # swapped "-" and "." for the words HYPHEN/DECIMAL and back, which
    # rewrote any input that already contained those words, never actually
    # protected apostrophes, and kept at most three hyphenated segments.
    # The lookbehind/lookahead pair only accepts a dot that sits directly
    # between two digits.
    token_pattern = r"\w+(?:(?:['-]|(?<=\d)\.(?=\d))\w+)*"
    return re.findall(token_pattern, text)
# Tokenize the extended sample and preview only the first 15 tokens to keep
# the cell output short.
custom_tokens = custom_tokenize(text_sample)
print(custom_tokens[:15])

['Steve', 'Cohen', 'is', 'a', 'prominent', 'American', 'billionaire', 'and', 'hedge', 'fund', 'manager', 'best', 'known', 'as', 'the']


In [17]:
# Mask contact details in text_sample with placeholder tags. The three
# substitutions are chained: each one runs on the previous one's result.

# Email: local part, "@", domain, a dot, then an alphabetic TLD of 2+ letters.
# The TLD class is [A-Za-z]; inside a character class "|" is a literal pipe,
# not alternation, so the previous [A-Z|a-z] also accepted "|" in the TLD.
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
email_replaced = re.sub(email_pattern, '<EMAIL>', text_sample)

# URL: http/https scheme followed by host characters or percent-escapes.
# The trailing (?<!\.) makes the match give back a final ".", so a URL at the
# end of a sentence does not swallow the sentence's full stop.
url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+(?<!\.)'
url_replaced = re.sub(url_pattern, '<URL>', email_replaced)

# Phone: either "+<1-3 digit code> <10 digits>" or "ddd-ddd-dddd".
phone_pattern = r'(\+\d{1,3}\s\d{10}|\d{3}-\d{3}-\d{4})'
phone_replaced = re.sub(phone_pattern, '<PHONE>', url_replaced)

print(phone_replaced)

Steve Cohen is a prominent American billionaire and hedge fund manager, best known as the founder and CEO of Point72 Asset Management, a global multi-strategy investment firm. He launched his first major hedge fund, SAC Capital Advisors, in 1992, achieving remarkable returns and earning a reputation as one of Wall Street's most successful traders. Despite SAC Capital's impressive performance, the firm was ultimately shut down after pleading guilty to insider trading charges in 2013, resulting in $1.8 billion in fines, though Cohen himself was never personally charged. After a period during which he was barred from managing outside money, Cohen re-emerged by transforming his operations into Point72, which now manages billions in assets and serves clients worldwide. Beyond finance, Cohen is also known for his philanthropy, extensive modern art collection, and as the owner and CEO of Major League Baseball's New York Mets, reflecting his wide-ranging influence in both business and culture 