# Tokenization

In [1]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
# Define a sample text for processing
a = "Hello and welcome friends to NLP workshop. My name is shridhar mankar. I will be teaching you NLP from scratch"

In [4]:
# Tokenize the text into words
A = word_tokenize(a)
A

['Hello',
 'and',
 'welcome',
 'friends',
 'to',
 'NLP',
 'workshop',
 '.',
 'My',
 'name',
 'is',
 'shridhar',
 'mankar',
 '.',
 'I',
 'will',
 'be',
 'teaching',
 'you',
 'NLP',
 'from',
 'scratch']

In [5]:
# Tokenize the text into sentences
S = sent_tokenize(a)
S

['Hello and welcome friends to NLP workshop.',
 'My name is shridhar mankar.',
 'I will be teaching you NLP from scratch']

# Type, Length and Frequency Checking

In [9]:
type(A),len(A),type(S),len(S)

(list, 22, list, 3)

In [10]:
# Import Frequency Distribution class
from nltk.probability import FreqDist
frequency = FreqDist(A)
frequency

FreqDist({'NLP': 2, '.': 2, 'Hello': 1, 'and': 1, 'welcome': 1, 'friends': 1, 'to': 1, 'workshop': 1, 'My': 1, 'name': 1, ...})

# Stemming

In [11]:
# Import PorterStemmer
from nltk.stem import PorterStemmer
pst = PorterStemmer()

In [12]:
# Test stemming from a specific word
pst.stem('Making')

'make'

In [13]:
# Apply stemming to all words in the tokenized list
for i in A:
    print(pst.stem(i))

hello
and
welcom
friend
to
nlp
workshop
.
my
name
is
shridhar
mankar
.
i
will
be
teach
you
nlp
from
scratch


In [14]:
pst.stem('universal')

'univers'

In [15]:
pst.stem('universe')

'univers'

In [16]:
pst.stem('university')

'univers'

In [17]:
pst.stem('alumni')

'alumni'

In [18]:
pst.stem('alumnus')

'alumnu'

# Lemmatization

In [19]:
import nltk
# Download WordNet resource
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
lemmatizer = WordNetLemmatizer()

In [21]:
pst.stem('trouble')

'troubl'

In [22]:
lemmatizer.lemmatize('trouble')

'trouble'

In [23]:
# Apply lemmatization to the list
for i in A:
    print(lemmatizer.lemmatize(i))

Hello
and
welcome
friend
to
NLP
workshop
.
My
name
is
shridhar
mankar
.
I
will
be
teaching
you
NLP
from
scratch


In [24]:
lemmatizer.lemmatize('alumnus')

'alumnus'

In [25]:
lemmatizer.lemmatize('alumni')

'alumnus'

In [26]:
lemmatizer.lemmatize('universe')

'universe'

In [27]:
lemmatizer.lemmatize('university')

'university'

# pos_tag

In [28]:
# Download POS tagger resource
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [29]:
# Print POS tags for each word (Note: Tagging individually loses context)
for i in A:
 print(nltk.pos_tag([i]))

[('Hello', 'NN')]
[('and', 'CC')]
[('welcome', 'NN')]
[('friends', 'NNS')]
[('to', 'TO')]
[('NLP', 'NN')]
[('workshop', 'NN')]
[('.', '.')]
[('My', 'PRP$')]
[('name', 'NN')]
[('is', 'VBZ')]
[('shridhar', 'NN')]
[('mankar', 'NN')]
[('.', '.')]
[('I', 'PRP')]
[('will', 'MD')]
[('be', 'VB')]
[('teaching', 'VBG')]
[('you', 'PRP')]
[('NLP', 'NN')]
[('from', 'IN')]
[('scratch', 'NN')]


    •NN: Noun, singular or mass
    •CC: Coordinating conjunction
    •NNS: Noun, plural
    •TO: "to" as preposition or infinitive marker
    •PRP$: Possessive pronoun (e.g., my, his)
    •VBZ: Verb, 3rd person singular present
    •PRP: Personal pronoun (e.g., I, he, she)
    •MD: Modal (e.g., will, can)
    •VB: Verb, base form
    •VBG: Verb, gerund or present participle
    •IN: Preposition or subordinating conjunction
    •.: Punctuation (Sentence terminator)

# Named entity recognition

    Identifies real-world objects (Entities).
    Gives Labels like PERSON, GPE (Location), ORGANIZATION.
    Focus on Semantics (Meaning/Information extraction).
    Usually happens after POS tagging (it uses POS tags to help identify entities).

In [30]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [31]:
# Sample text for NER
text= '''Harry Lives in New York'''
words= word_tokenize(text)
postags=pos_tag(words)

In [32]:
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [33]:
# Perform Named Entity Chunking
tree = nltk.ne_chunk(postags)
print(tree)

(S (PERSON Harry/NNP) Lives/VBZ in/IN (GPE New/NNP York/NNP))


In [34]:
# Another example for NER
text= 'John wants a new Samsung device from Pune'
words= word_tokenize(text)
postags=pos_tag(words)

In [35]:
tree = nltk.ne_chunk(postags)
print(tree)

(S
  (PERSON John/NNP)
  wants/VBZ
  a/DT
  new/JJ
  (ORGANIZATION Samsung/NNP)
  device/NN
  from/IN
  (GPE Pune/NNP))


# Stopwords     ...which connects sentences.

In [36]:
from nltk.corpus import stopwords

In [39]:
# Set of English stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaud\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [40]:
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [55]:
msg = "My name is shridhar mankar, I love making videos and watching kdrama. My speciality is making things easy"

# Tokenize the message
words = word_tokenize(msg)

filtered_sentence = []

# Filter out stopwords
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

print(words)
print(filtered_sentence)

['My', 'name', 'is', 'shridhar', 'mankar', ',', 'I', 'love', 'making', 'videos', 'and', 'watching', 'kdrama', '.', 'My', 'speciality', 'is', 'making', 'things', 'easy']
['My', 'name', 'shridhar', 'mankar', ',', 'I', 'love', 'making', 'videos', 'watching', 'kdrama', '.', 'My', 'speciality', 'making', 'things', 'easy']


In [56]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
msg = "My name my is shridhar mankar, I love making videos and watching kdrama. My speciality is making things easy"

# Tokenize the message
words = word_tokenize(msg)

filtered_sentence = []

# Filter out stopwords
filtered_sentence = [w for w in words if w not in stop_words]

print(words)
print(filtered_sentence)

['My', 'name', 'my', 'is', 'shridhar', 'mankar', ',', 'I', 'love', 'making', 'videos', 'and', 'watching', 'kdrama', '.', 'My', 'speciality', 'is', 'making', 'things', 'easy']
['My', 'name', 'shridhar', 'mankar', ',', 'I', 'love', 'making', 'videos', 'watching', 'kdrama', '.', 'My', 'speciality', 'making', 'things', 'easy']
