In [12]:
import nltk
nltk.download('punkt')  # Download necessary data files (if not done already)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt to C:\Users\Anand
[nltk_data]     Vishwakarma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Anand
[nltk_data]     Vishwakarma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Anand
[nltk_data]     Vishwakarma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Anand
[nltk_data]     Vishwakarma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [2]:
from nltk.tokenize import word_tokenize
text = "word_tokenize splits the text into words and punctuation as separate tokens It uses the Punkt tokenizer, which is a pre-trained model for tokenizing text in many languages."
tokens = word_tokenize(text)
tokens

['word_tokenize',
 'splits',
 'the',
 'text',
 'into',
 'words',
 'and',
 'punctuation',
 'as',
 'separate',
 'tokens',
 'It',
 'uses',
 'the',
 'Punkt',
 'tokenizer',
 ',',
 'which',
 'is',
 'a',
 'pre-trained',
 'model',
 'for',
 'tokenizing',
 'text',
 'in',
 'many',
 'languages',
 '.']

In [3]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
clean_words = [word.lower() for word in tokens if word not in stop_words]
clean_words

['word_tokenize',
 'splits',
 'text',
 'words',
 'punctuation',
 'separate',
 'tokens',
 'it',
 'uses',
 'punkt',
 'tokenizer',
 ',',
 'pre-trained',
 'model',
 'tokenizing',
 'text',
 'many',
 'languages',
 '.']

In [None]:
## Porter stemming
from nltk.stem import PorterStemmer
stemming = PorterStemmer()
stem = [stemming.stem(w) for w in clean_words]
stem

['word_token',
 'split',
 'text',
 'word',
 'punctuat',
 'separ',
 'token',
 'it',
 'use',
 'punkt',
 'token',
 ',',
 'pre-train',
 'model',
 'token',
 'text',
 'mani',
 'languag',
 '.']

In [7]:
## Porter SnoballStemmer
from nltk.stem import SnowballStemmer
stemming = SnowballStemmer("english")
stem = [stemming.stem(w) for w in clean_words]
stem

['word_token',
 'split',
 'text',
 'word',
 'punctuat',
 'separ',
 'token',
 'it',
 'use',
 'punkt',
 'token',
 ',',
 'pre-train',
 'model',
 'token',
 'text',
 'mani',
 'languag',
 '.']

In [8]:
## Lemmatizer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Anand
[nltk_data]     Vishwakarma\AppData\Roaming\nltk_data...


True

In [10]:
lemmatizer = WordNetLemmatizer()
word_lemmatizer = [lemmatizer.lemmatize(word) for word in clean_words]
word_lemmatizer

['word_tokenize',
 'split',
 'text',
 'word',
 'punctuation',
 'separate',
 'token',
 'it',
 'us',
 'punkt',
 'tokenizer',
 ',',
 'pre-trained',
 'model',
 'tokenizing',
 'text',
 'many',
 'language',
 '.']

In [None]:
## Part Of Speech and lemmatizer
pos = nltk.pos_tag(clean_words)
pos_lemm = [lemmatizer.lemmatize(word, pos='v' if tag.startswith('V') else 'n') for word, tag in pos]
pos_lemm

['word_tokenize',
 'split',
 'text',
 'word',
 'punctuation',
 'separate',
 'token',
 'it',
 'use',
 'punkt',
 'tokenizer',
 ',',
 'pre-trained',
 'model',
 'tokenizing',
 'text',
 'many',
 'language',
 '.']

In [21]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 3.6 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip available: 22.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
import spacy
nlp = spacy.load("en_core_web_sm")

text2 = "Apple Inc. is planning to open a new store in New York City on March 10, 2025."

## Named Entity Recognition (NER)
ner_doc = nlp(text2)
ner_doc, ner_doc.ents

(Apple Inc. is planning to open a new store in New York City on March 10, 2025.,
 (Apple Inc., New York City, March 10, 2025))

In [33]:
for ent in ner_doc:
    print(ent.text)

Apple
Inc.
is
planning
to
open
a
new
store
in
New
York
City
on
March
10
,
2025
.


In [None]:
# Process the text
doc = nlp(text)

# Entity Labels:
# Here are some common entity labels you'll encounter:

# PERSON: A person’s name (e.g., "Barack Obama").
# ORG: Organization (e.g., "Apple", "United Nations").
# GPE: Geopolitical entity, typically a country or city (e.g., "New York", "Germany").
# DATE: Date or time expression (e.g., "March 10, 2025").
# LOC: Location (e.g., "Mount Everest").
# MONEY: Monetary values (e.g., "$100", "€50").
# TIME: Time expressions (e.g., "2 PM", "midnight").
# Extract named entities
print("Named Entities, Phrases, and Concepts:")

for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

Named Entities, Phrases, and Concepts:
Punkt (ORG)


In [None]:
# Sentiment Analysis