In [1]:
# Sample text used by the tokenization examples that follow: two sentences,
# the second one containing a contraction.
txt = (
    "Hello Geeks. "
    "We're hoping you all are doing great."
)
txt

"Hello Geeks. We're hoping you all are doing great."

In [4]:
# Naive sentence split on the literal '.' character. As the output shows, the
# separator is dropped, the second piece keeps its leading space, and the final
# '.' produces a trailing empty string.
txt.split('.')

['Hello Geeks', " We're hoping you all are doing great", '']

In [7]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [8]:
# Word-level tokenization. As the output below shows, punctuation marks and the
# contraction suffix "'re" come out as separate tokens.
word_tokenize(txt)

['Hello',
 'Geeks',
 '.',
 'We',
 "'re",
 'hoping',
 'you',
 'all',
 'are',
 'doing',
 'great',
 '.']

In [9]:
# Sentence-level tokenization: one string per sentence, closing punctuation kept
# and no trailing empty entry (compare with the txt.split('.') result above).
sent_tokenize(txt)

['Hello Geeks.', "We're hoping you all are doing great."]

In [12]:
# Print each token on its own line, leaving out the full-stop tokens.
for token in word_tokenize(txt):
    if token == '.':
        continue
    print(token)

Hello
Geeks
We
're
hoping
you
all
are
doing
great


## 1. Lemmatization

In [2]:
# Imported in this cell so it runs on a fresh kernel: the notebook's only other
# import of these classes sits in a later cell (under "Stemming"), so running
# top to bottom would otherwise raise NameError here.
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Shared instances used by the Lemmatization and Stemming sections.
# NOTE(review): WordNetLemmatizer needs the 'wordnet' NLTK corpus to be installed;
# no download step for it is visible in this notebook — confirm it is available.
stem = PorterStemmer()
lem = WordNetLemmatizer()

In [6]:
# Lemmatize several inflections of "change". No `pos` argument is passed, so the
# lemmatizer uses its default part of speech; in the output only the plural
# 'changes' is reduced, while 'changed' and 'changer' come back unchanged.
for form in ('change', 'changed', 'changer', 'changes'):
    print(lem.lemmatize(form))

change
changed
changer
change


## 2. Stemming

In [1]:
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [8]:
# Stem several inflections of "change". Unlike lemmatization, the stem need not
# be a dictionary word (see 'chang' in the output).
for form in ('change', 'changer', 'changed', 'changes'):
    print(stem.stem(form))

chang
changer
chang
chang


## Stop-Words

In [14]:
import nltk

# Fetch the NLTK stop-word corpus into the local nltk_data directory. When the
# package is already present this only logs an "already up-to-date" message,
# as in the output below.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to C:\Users\Harsh
[nltk_data]     Kapoor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# New sample text for the stop-word demo (rebinds `txt` from the earlier cells).
txt = (
    'This is not the right time to talk. '
    'Can we do it now?'
)

In [25]:
# Build the English stop-word lookup in this cell so the loop does not depend on
# a `stopword` variable left over from an earlier kernel session (no visible
# cell defines it). A set gives constant-time membership tests.
stopword = set(stopwords.words('english'))

for word in word_tokenize(txt):
    # Compare in lower case so capitalised words such as 'This' and 'Can' are
    # matched too; the length check drops one-character tokens like '.' and '?'.
    if word.lower() not in stopword and len(word) > 1:
        print(word)

right
time
talk


## Corpus and Vocabulary

In [48]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
# Sample paragraph used as the corpus for the vocabulary examples. Adjacent
# string literals are concatenated by Python, so this is a single string.
corpus = (
    'India, officially the Republic of India, is a country in South Asia. '
    'It is the seventh-largest country by area, the second-most populous country, '
    'and the most populous democracy in the world. '
    'Bounded by the Indian Ocean on the south, the Arabian Sea in the south-west, '
    'and the Bay of Bengal in the south-east, it shares land borders with Pakistan '
    'to the west; China, Nepal and Bhutan to the north; '
    'and Bangladesh and Myanmar to the east. '
    'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; '
    'its Andaman and Nicobar Islands share a maritime border with Thailand, '
    'Myanmar and Indonesia.'
)

In [50]:
# Stop-word removal: keep the corpus tokens that are not English stop words and
# are longer than two characters (the length check also drops punctuation).

# Load the stop-word list once, as a set, instead of re-reading it from the
# corpus reader and scanning it linearly for every token.
english_stopwords = set(stopwords.words('english'))

# Tokens keep their original casing; only the stop-word lookup is lower-cased.
words = [
    word
    for word in word_tokenize(corpus)
    if word.lower() not in english_stopwords and len(word) > 2
]

In [51]:
# Number of tokens that survived the stop-word / length filter (duplicates included).
len(words)

54

In [52]:
# Vocabulary size: number of distinct surviving tokens. Tokens were stored with
# their original casing, so the de-duplication is case-sensitive.
len(set(words))

46

## Vocabulary with Keras

In [56]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenizer created with default settings; its vocabulary is built later by
# fit_on_texts().
# NOTE(review): this class appears to be marked deprecated in recent TensorFlow
# documentation (TextVectorization is the suggested replacement) — confirm
# against the TensorFlow version in use.
tok = Tokenizer()

In [57]:
# Two toy documents; fitting on them builds the tokenizer's vocabulary.
corp = [
    'coffee is hot',
    'water is cold',
]
tok.fit_on_texts(corp)

In [59]:
# Word -> integer id mapping learned by fit_on_texts(). In the output below the
# ids start at 1 and 'is', the only word that occurs in both texts, gets id 1.
tok.word_index

{'is': 1, 'coffee': 2, 'hot': 3, 'water': 4, 'cold': 5}

In [60]:
# Encode each text as the list of ids of its words, in order; the ids are the
# ones shown by tok.word_index.
tok.texts_to_sequences(corp)

[[2, 1, 3], [4, 1, 5]]