In [1]:
import nltk
 
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

True

**TOKEN**

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
doc = "Apples and oranges are similar. Boots and hippos aren't."
print(word_tokenize(doc))

['Apples', 'and', 'oranges', 'are', 'similar', '.', 'Boots', 'and', 'hippos', 'are', "n't", '.']


**SPAN**

In [3]:
from nltk.tokenize import TreebankWordTokenizer as twt
list(twt().span_tokenize('What is the airspeed of an unladen swallow ?'))

[(0, 4),
 (5, 7),
 (8, 11),
 (12, 20),
 (21, 23),
 (24, 26),
 (27, 34),
 (35, 42),
 (43, 44)]

**STOP WORDS**

In [4]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

print('Number of stop words: %d' % len(stopWords))
print('First ten stop words: %s' % list(stopWords)[:10])

Number of stop words: 179
First ten stop words: ['when', "mightn't", 'it', 'do', 'am', 'during', 'her', 'because', "don't", 'are']


In [5]:
doc = "I live in New York City, the capital of the New York State"
tokens = [token for token in word_tokenize(doc) if token not in stopWords]

print('Original Article: %s' % (doc))
print()
print(tokens)

Original Article: I live in New York City, the capital of the New York State

['I', 'live', 'New', 'York', 'City', ',', 'capital', 'New', 'York', 'State']


**LEMMATIZATION**

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 
doc = "Apples and oranges are similar. Boots and hippos aren't."
tokens = word_tokenize(doc)
print(tokens)
lemmas = [lemmatizer.lemmatize(token) for token in tokens]
print(lemmas)

['Apples', 'and', 'oranges', 'are', 'similar', '.', 'Boots', 'and', 'hippos', 'are', "n't", '.']
['Apples', 'and', 'orange', 'are', 'similar', '.', 'Boots', 'and', 'hippo', 'are', "n't", '.']


In [7]:
lemmatizer.lemmatize("are", "v") #POS

'be'

**PART-OF-SPEECH and Syntactic dependencies**

In [8]:
from nltk.tokenize import PunktSentenceTokenizer

doc = "This is a text."
print(word_tokenize(doc))
print(nltk.pos_tag(word_tokenize(doc)))

['This', 'is', 'a', 'text', '.']
[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('text', 'NN'), ('.', '.')]


**NAMED ENTITIES**

In [9]:
doc = "Larry Page founded Google"

for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(doc))):
  if hasattr(chunk, 'label'):
     print(chunk.label(), ' '.join(c[0] for c in chunk))


PERSON Larry
PERSON Page
PERSON Google


**SENTENCES**

In [10]:
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


**CHUNK and Base noun phrases**

In [11]:
doc = "I have a red car"
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(doc))):
  print(chunk)

('I', 'PRP')
('have', 'VBP')
('a', 'DT')
('red', 'JJ')
('car', 'NN')
