In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Q1.1

In [3]:
paragraph = "Steve Cohen is a prominent American billionaire and hedge fund manager, best known as the founder and CEO of Point72 Asset Management, a global multi-strategy investment firm. He launched his first major hedge fund, SAC Capital Advisors, in 1992, achieving remarkable returns and earning a reputation as one of Wall Street's most successful traders. Despite SAC Capital's impressive performance, the firm was ultimately shut down after pleading guilty to insider trading charges in 2013, resulting in $1.8 billion in fines, though Cohen himself was never personally charged. After a period during which he was barred from managing outside money, Cohen re-emerged by transforming his operations into Point72, which now manages billions in assets and serves clients worldwide. Beyond finance, Cohen is also known for his philanthropy, extensive modern art collection, and as the owner and CEO of Major League Baseball's New York Mets, reflecting his wide-ranging influence in both business and culture"
lowercase_text = paragraph.lower()
no_punct_text = re.sub(r'[^\w\s]', '', lowercase_text)
print(no_punct_text[:100], "...")

steve cohen is a prominent american billionaire and hedge fund manager best known as the founder and ...


Q1.2


In [7]:
nltk.download('punkt_tab')
sentences = sent_tokenize(paragraph)
words = word_tokenize(no_punct_text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Q1.3


In [8]:
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

Q1.4

In [9]:
word_freq = Counter(filtered_words)
for word, count in word_freq.most_common(10):
    print(f"{word}: {count}")

cohen: 4
hedge: 2
fund: 2
known: 2
ceo: 2
point72: 2
firm: 2
major: 2
sac: 2
steve: 1


Q2.2

In [10]:
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
porter = PorterStemmer()
lancaster = LancasterStemmer()


Q2.3


In [11]:
lemmatizer = WordNetLemmatizer()

Q2.4

In [12]:
for word in filtered_words[:10]:
    porter_result = porter.stem(word)
    lancaster_result = lancaster.stem(word)
    lemma_result = lemmatizer.lemmatize(word)
    print(f"{word}\t{porter_result}\t{lancaster_result}\t{lemma_result}")


steve	steve	stev	steve
cohen	cohen	coh	cohen
prominent	promin	promin	prominent
american	american	am	american
billionaire	billionair	billionair	billionaire
hedge	hedg	hedg	hedge
fund	fund	fund	fund
manager	manag	man	manager
best	best	best	best
known	known	known	known


Q3.2

In [13]:
long_words = re.findall(r'\b\w{6,}\b', paragraph)
print(long_words[:15])
numbers = re.findall(r'\d+\.?\d*', paragraph)
print(numbers)
cap_words = re.findall(r'\b[A-Z][a-zA-Z]*\b', paragraph)
print(cap_words)

['prominent', 'American', 'billionaire', 'manager', 'founder', 'Point72', 'Management', 'global', 'strategy', 'investment', 'launched', 'Capital', 'Advisors', 'achieving', 'remarkable']
['72', '1992', '2013', '1.8', '72']
['Steve', 'Cohen', 'American', 'CEO', 'Asset', 'Management', 'He', 'SAC', 'Capital', 'Advisors', 'Wall', 'Street', 'Despite', 'SAC', 'Capital', 'Cohen', 'After', 'Cohen', 'Beyond', 'Cohen', 'CEO', 'Major', 'League', 'Baseball', 'New', 'York', 'Mets']


Q3.3

In [14]:
alpha_only = re.findall(r'\b[a-zA-Z]+\b', paragraph)
print(alpha_only[:15])
vowel_words = re.findall(r'\b[aeiouAEIOU][a-zA-Z]*\b', paragraph)
print(vowel_words)

['Steve', 'Cohen', 'is', 'a', 'prominent', 'American', 'billionaire', 'and', 'hedge', 'fund', 'manager', 'best', 'known', 'as', 'the']
['is', 'a', 'American', 'and', 'as', 'and', 'of', 'Asset', 'a', 'investment', 'Advisors', 'in', 'achieving', 'and', 'earning', 'a', 'as', 'one', 'of', 'impressive', 'ultimately', 'after', 'insider', 'in', 'in', 'in', 'After', 'a', 'outside', 'emerged', 'operations', 'into', 'in', 'assets', 'and', 'is', 'also', 'extensive', 'art', 'and', 'as', 'owner', 'and', 'of', 'influence', 'in', 'and']


Q4.1


In [15]:
text_sample = paragraph + " His email is steve.cohen@example.com. Check out https://www.point72.com. Call at 123-456-7890 or +91 9876543210. The firm's value is $3.14 billion."

Q4.2

In [16]:
def custom_tokenize(text):
    text_temp = re.sub(r"(\w+)'(\w+)", r"\1'\2", text)
    text_temp = re.sub(r"(\w+)-(\w+)(-(\w+))?", lambda m: m.group(0).replace("-", "HYPHEN"), text_temp)
    text_temp = re.sub(r"(\d+)\.(\d+)", lambda m: m.group(0).replace(".", "DECIMAL"), text_temp)
    text_temp = re.sub(r'[^\w\s]', ' ', text_temp)
    tokens = text_temp.split()
    tokens = [token.replace("HYPHEN", "-").replace("DECIMAL", ".") for token in tokens]

    return tokens
custom_tokens = custom_tokenize(text_sample)
print(custom_tokens[:15])

['Steve', 'Cohen', 'is', 'a', 'prominent', 'American', 'billionaire', 'and', 'hedge', 'fund', 'manager', 'best', 'known', 'as', 'the']


Q4.3

In [17]:
email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
email_replaced = re.sub(email_pattern, '<EMAIL>', text_sample)
url_pattern = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
url_replaced = re.sub(url_pattern, '<URL>', email_replaced)

phone_pattern = r'(\+\d{1,3}\s\d{10}|\d{3}-\d{3}-\d{4})'
phone_replaced = re.sub(phone_pattern, '<PHONE>', url_replaced)

print(phone_replaced)

Steve Cohen is a prominent American billionaire and hedge fund manager, best known as the founder and CEO of Point72 Asset Management, a global multi-strategy investment firm. He launched his first major hedge fund, SAC Capital Advisors, in 1992, achieving remarkable returns and earning a reputation as one of Wall Street's most successful traders. Despite SAC Capital's impressive performance, the firm was ultimately shut down after pleading guilty to insider trading charges in 2013, resulting in $1.8 billion in fines, though Cohen himself was never personally charged. After a period during which he was barred from managing outside money, Cohen re-emerged by transforming his operations into Point72, which now manages billions in assets and serves clients worldwide. Beyond finance, Cohen is also known for his philanthropy, extensive modern art collection, and as the owner and CEO of Major League Baseball's New York Mets, reflecting his wide-ranging influence in both business and culture 