In [1]:
import nltk, spacy
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ngrams

In [2]:
# Load spaCy model
# The resulting `nlp` pipeline is called on `text` in the Named Entity
# Recognition cell.
# NOTE(review): presumably requires the "en_core_web_sm" model package to be
# installed in the kernel's environment beforehand — confirm.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence shared by the tokenization and NER cells of this section.
text = "Apple is looking at buying U.K. startup for $1 billion in 2025."

In [4]:
# Tokenization
# Split the sample sentence into word-level tokens with NLTK's word_tokenize.
# As the printed result shows, 'U.K.' stays a single token while '$' and '1'
# become separate tokens. `tokens` is the input to the stopword-removal cell.
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', 'in', '2025', '.']


In [16]:
# Stopword Removal
# Build the English stopword set once. `stopwords.words("english")` returns a
# list, so evaluating it inside the comprehension rebuilt that list and scanned
# it linearly for every alphabetic token; a set gives constant-time lookups.
english_stopwords = set(stopwords.words("english"))

# Keep purely alphabetic tokens that are not stopwords (case-insensitive match).
# The `isalpha()` check also discards punctuation, numbers and dotted tokens
# such as 'U.K.', so this cell does more than stopword removal alone.
filtered = [w for w in tokens if w.isalpha() and w.lower() not in english_stopwords]
print("After Stopword Removal:", filtered)

After Stopword Removal: ['Apple', 'looking', 'buying', 'startup', 'billion']


In [17]:
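# Punctuation removal (disabled). The `w.isalpha()` check in the stopword-removal
# cell already drops punctuation-only tokens, so this step is not needed.
#import string
# Remove punctuation (just in case any remain)
#punct = [w for w in filtered if w not in string.punctuation]

#print("After Punctuation Removal:",punct)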

In [18]:
# POS Tagging
# Tags the stopword-filtered tokens and prints the (token, tag) pairs.
# NOTE(review): the tagger only sees `filtered`, not the full sentence, so any
# context carried by the removed words is unavailable to it. Consider tagging
# `tokens` instead if context-dependent tags matter — confirm intent.
print("POS Tags:", pos_tag(filtered))

POS Tags: [('Apple', 'NNP'), ('looking', 'VBG'), ('buying', 'VBG'), ('startup', 'NN'), ('billion', 'CD')]


In [8]:
# Named Entity Recognition
# Run the spaCy pipeline over the raw sentence, then collect every detected
# entity as a (surface text, label) pair before printing.
doc = nlp(text)
entities = [(span.text, span.label_) for span in doc.ents]
print("NER:", entities)

NER: [('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY'), ('2025', 'DATE')]


## Demonstrating Bag of Words, TF-IDF Vectorization, and N-grams

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.util import ngrams

In [3]:
#nltk.download('punkt')

In [4]:
# Sample inputs for the vectorization demos.
# `text` - the main example sentence (also tokenized for the N-grams cell).
# `t2`   - a short second document, paired with `text` in the first TF-IDF cell.
# `t3`   - a two-document corpus used by the Bag-of-Words and second TF-IDF cells.
text = "Apple is looking at buying U.K. startup for $1 billion in 2025."
t2 = "buying new startup hi"
t3 = [text, "buying new startup"]


In [5]:
# ---- 1. Tokenization ----
# Split `text` into word-level tokens with NLTK's word_tokenize. The resulting
# `tokens` list is the input to the N-grams cell; the vectorizer cells work on
# the raw strings instead and do not use it.
tokens = word_tokenize(text)
print("Tokens:", tokens)


Tokens: ['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', 'in', '2025', '.']


In [6]:
# ---- 2. Bag of Words (BoW) ----
# Fit a CountVectorizer with default settings on the two-document corpus `t3`;
# the result has one row of term counts per document.
vectorizer = CountVectorizer()
bow = vectorizer.fit_transform(t3)
# The printed vocabulary is lowercased and contains no entry for 'U.K.', '$'
# or '1'.
# NOTE(review): presumably a consequence of CountVectorizer's default
# lowercasing and token pattern — confirm against the scikit-learn docs.
print("\nBoW Words:", vectorizer.get_feature_names_out())
# toarray() converts the fitted matrix to a dense array for display.
print("BoW Counts:", bow.toarray())


BoW Words: ['2025' 'apple' 'at' 'billion' 'buying' 'for' 'in' 'is' 'looking' 'new'
 'startup']
BoW Counts: [[1 1 1 1 1 1 1 1 1 0 1]
 [0 0 0 0 1 0 0 0 0 1 1]]


In [7]:
# ---- 3. TF-IDF (corpus: [text, t2]) ----
# Fit a TfidfVectorizer with default settings on the two documents `text` and
# `t2`. In the printed result, terms that occur in both documents ('buying',
# 'startup') get a lower weight than terms unique to one document.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform([text,t2])
print("\nTF-IDF Words:", tfidf_vectorizer.get_feature_names_out())
# toarray() converts the fitted matrix to a dense array for display.
print("TF-IDF Values:", tfidf.toarray())


TF-IDF Words: ['2025' 'apple' 'at' 'billion' 'buying' 'for' 'hi' 'in' 'is' 'looking'
 'new' 'startup']
TF-IDF Values: [[0.33310232 0.33310232 0.33310232 0.33310232 0.23700504 0.33310232
  0.         0.33310232 0.33310232 0.33310232 0.         0.23700504]
 [0.         0.         0.         0.         0.40993715 0.
  0.57615236 0.         0.         0.         0.57615236 0.40993715]]


In [8]:
# ---- 3. TF-IDF (corpus: t3) ----
# Same TF-IDF demo, fitted on the `t3` corpus, whose second document has no
# 'hi' term. This cell rebinds `tfidf_vectorizer` and `tfidf`, so any values
# those names held before it runs are replaced.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit_transform(t3)
print("\nTF-IDF Words:", tfidf_vectorizer.get_feature_names_out())
# toarray() converts the fitted matrix to a dense array for display.
print("TF-IDF Values:", tfidf.toarray())


TF-IDF Words: ['2025' 'apple' 'at' 'billion' 'buying' 'for' 'in' 'is' 'looking' 'new'
 'startup']
TF-IDF Values: [[0.33310232 0.33310232 0.33310232 0.33310232 0.23700504 0.33310232
  0.33310232 0.33310232 0.33310232 0.         0.23700504]
 [0.         0.         0.         0.         0.50154891 0.
  0.         0.         0.         0.70490949 0.50154891]]


In [10]:
# N-grams
# Slide windows of two and three consecutive tokens over `tokens`, materialise
# each generator into a list, then print both results.
bigram_list = list(ngrams(tokens, 2))
trigram_list = list(ngrams(tokens, 3))
print("Bigrams:", bigram_list)
print("Trigrams:", trigram_list)

Bigrams: [('Apple', 'is'), ('is', 'looking'), ('looking', 'at'), ('at', 'buying'), ('buying', 'U.K.'), ('U.K.', 'startup'), ('startup', 'for'), ('for', '$'), ('$', '1'), ('1', 'billion'), ('billion', 'in'), ('in', '2025'), ('2025', '.')]
Trigrams: [('Apple', 'is', 'looking'), ('is', 'looking', 'at'), ('looking', 'at', 'buying'), ('at', 'buying', 'U.K.'), ('buying', 'U.K.', 'startup'), ('U.K.', 'startup', 'for'), ('startup', 'for', '$'), ('for', '$', '1'), ('$', '1', 'billion'), ('1', 'billion', 'in'), ('billion', 'in', '2025'), ('in', '2025', '.')]
