# NLTK

#### Install NLTK

In [None]:
%%bash
pip install nltk



#### Download models or corpora

In [None]:
%%bash
#import nltk
python -m nltk.downloader # shows a window when graphical output available

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> 

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/nltk/downloader.py", line 982, in _interactive_download
    DownloaderGUI(self).mainloop()
  File "/usr/local/lib/python3.7/dist-packages/nltk/downloader.py", line 1226, in __init__
    top = self.top = Tk()
  File "/usr/lib/python3.7/tkinter/__init__.py", line 2023, in __init__
    self.tk = _tkinter.create(screenName, baseName, className, interactive, wantobjects, useTk, sync, use)
_tkinter.TclError: no display name and no $DISPLAY environment variable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/nltk/downloader.py", line 2278, in <module>
    halt_on_error=options.halt_on_error)
  File "/usr/local/lib/python3.7/dist

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Tokenization

In [None]:
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"

In [None]:
query = 'fast'

The naive way...

In [None]:
tweet.find(query)

31

In [None]:
tweet.split()

['RT',
 '@lOR42wsOEFcv3f:',
 'I',
 'fall',
 'too',
 'fast,',
 'crash',
 'too',
 'hard,',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much...',
 ':(',
 '#amiright']

In [None]:
[query in tweet.split()]

[False]

Correct tokenization: informed splitting of the text into tokens

In [None]:
nltk.word_tokenize(tweet)

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [None]:
nltk.word_tokenize("The U.S.A. is a big country. ")

['The', 'U.S.A.', 'is', 'a', 'big', 'country', '.']

In [None]:
[query in nltk.word_tokenize(tweet)]
# query

[True]

More options...

In [None]:
nltk.word_tokenize(tweet, language='spanish')

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [None]:
from nltk.tokenize import RegexpTokenizer
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]+', discard_empty=True)

In [None]:
custom_tokenizer.tokenize(tweet)

['RT',
 'lOR42wsOEFcv3f',
 'I',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 'amiright']

In [None]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [None]:
tweet_tokenizer.tokenize(tweet)
tweet_tokenizer.tokenize("helloooooooo how are youu")

['hellooo', 'how', 'are', 'youu']

In [None]:
from nltk.tokenize import MWETokenizer
mwe = MWETokenizer()
mwe.add_mwe(('too', 'fast'))
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['RT',
 ':',
 'I',
 'fall',
 'too_fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [None]:
# add_mwe() registers ONE multi-word expression per call (a sequence of token
# strings). Passing a tuple of tuples would register a single expression whose
# elements are tuples, which can never match string tokens, so each expression
# gets its own call. Re-adding ('too', 'fast') is harmless.
mwe.add_mwe(('too', 'fast'))
mwe.add_mwe(('too', 'hard'))

In [None]:
query = 'fast'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

False

### Normalization

In [None]:
from collections import Counter
Counter(nltk.word_tokenize("I am so fast. Fast is my name.".lower()))

Counter({'.': 2,
         'am': 1,
         'fast': 2,
         'i': 1,
         'is': 1,
         'my': 1,
         'name': 1,
         'so': 1})

In [None]:
tweet.lower()


'rt @lor42wsoefcv3f: i fall too fast, crash too hard, forgive too easily and care too much... :( #amiright'

In [None]:
import re
import string

def normalize_tokens(tokenized_text):
    """Lowercase tokens and drop hashtags and punctuation tokens.

    Args:
        tokenized_text: iterable of token strings (e.g. a tokenizer's output).

    Returns:
        A new list of lowercased tokens, in their original order, without:
          * tokens starting with '#' (hashtags);
          * tokens that occur as a substring of ``string.punctuation``. This
            removes single punctuation marks (and the empty string), while
            multi-character tokens such as '...' or ':(' are kept.
    """
    tokens = []
    for raw_token in tokenized_text:
        token = raw_token.lower()
        if token.startswith('#'):
            # Hashtag: discard.
            continue
        if token in string.punctuation:
            # Substring test against the punctuation string (see docstring).
            continue
        tokens.append(token)

    # Optional, stricter steps (disabled):
    #   keep only letters    -> tokens = [t for t in tokens if re.match('^[a-z]+$', t)]
    #   normalize characters -> tokens = [re.sub('á', 'a', t) for t in tokens]

    return tokens

In [None]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalize_tokens(nltk.word_tokenize(tweet))


['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
spanish_query = 'muy rápido'
normalize_tokens(tweet_tokenizer.tokenize(spanish_query))

['muy', 'rápido']

In [None]:
!pip install unidecode
import unidecode
unidecode.unidecode(spanish_query)

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 20.3 MB/s eta 0:00:01[K     |██▉                             | 20 kB 25.3 MB/s eta 0:00:01[K     |████▏                           | 30 kB 22.4 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 17.2 MB/s eta 0:00:01[K     |███████                         | 51 kB 9.3 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 10.8 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 10.1 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 10.1 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 11.1 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 10.0 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 10.0 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 10.0 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 10.0 MB/s et

'muy rapido'

#### Uniform normalization principle

In [None]:
query = 'TOO fast TOO furious'
tokenized_query = tweet_tokenizer.tokenize(query)
normalized_query = normalize_tokens(tokenized_query)
# normalized_query = tokenized_query
normalized_query

['too', 'fast', 'too', 'furious']

In [None]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet = normalize_tokens(tweet.split())
normalized_tweet

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
common_words = set(normalized_query).intersection(normalized_tweet)
print(common_words)
print(len(common_words), "common word(s)")

{'fast', 'too'}
2 common word(s)


#### Stemming / Lemmatization


In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer


In [None]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
stemmer = nltk.LancasterStemmer() # is prone to overstemming
[stemmer.stem(t) for t in normalized_tweet]


['rt',
 'i',
 'fal',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forg',
 'too',
 'easy',
 'and',
 'car',
 'too',
 'much',
 '...',
 ':(']

In [None]:
stemmer = SnowballStemmer(language='english') # Porter2

[stemmer.stem(t) for t in normalized_tweet]

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgiv',
 'too',
 'easili',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
print(stemmer.stem("running"))

print(stemmer.stem("runs"))

print(stemmer.stem("ran"))

print(stemmer.stem("darling"))

print(stemmer.stem("are"))

print(stemmer.stem("bring"))

print(stemmer.stem("being"))

print(stemmer.stem("Charles"))


run
run
ran
darl
are
bring
be
charl


In [None]:
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[lemmatizer.lemmatize(t) for t in normalized_tweet]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [None]:
nltk.download('averaged_perceptron_tagger')


tagged_tweet = nltk.pos_tag(normalized_tweet)
print(tagged_tweet)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[('rt', 'NN'), ('i', 'NN'), ('fall', 'VBP'), ('too', 'RB'), ('fast', 'JJ'), ('crash', 'NN'), ('too', 'RB'), ('hard', 'JJ'), ('forgive', 'JJ'), ('too', 'RB'), ('easily', 'RB'), ('and', 'CC'), ('care', 'VB'), ('too', 'RB'), ('much', 'JJ'), ('...', ':'), (':(', 'NN')]


In [None]:
from nltk.corpus import wordnet as wn
# First letter of a POS tag (e.g. 'JJ', 'VBP', 'RB', 'NN') -> WordNet POS constant.
tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV, 'N': wn.NOUN}
def get_lemmas(tokenized_text):
    """Lemmatize tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``; the first character of each
    tag is looked up in ``tag_map`` (falling back to ``wn.NOUN`` when it is
    not listed) and passed as ``pos`` to the module-level ``lemmatizer``.

    Args:
        tokenized_text: list of token strings.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmas = []
    for word, pos_tag in nltk.pos_tag(tokenized_text):
        wordnet_pos = tag_map.get(pos_tag[0], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas


In [None]:
query = "the fastest!"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['the', 'fastest']


In [None]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)


['rt', 'i', 'fall', 'too', 'fast', 'crash', 'too', 'hard', 'forgive', 'too', 'easily', 'and', 'care', 'too', 'much', '...', ':(']
['the', 'fast']


In [None]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
normalized_tweet


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [None]:
[lemmatizer.lemmatize(t) for t in normalized_tweet]


['i', 'am', 'so', 'fast', 'i', 'am', 'the', 'fastest']

In [None]:
get_lemmas(normalized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']

In [None]:
print("Common words:", set(lemmatized_tweet).intersection(set(lemmatized_query)))

Common words: {'fast'}


#### Stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
from nltk.corpus import stopwords
stopwords.words('romanian')

['a',
 'abia',
 'acea',
 'aceasta',
 'această',
 'aceea',
 'aceeasi',
 'acei',
 'aceia',
 'acel',
 'acela',
 'acelasi',
 'acele',
 'acelea',
 'acest',
 'acesta',
 'aceste',
 'acestea',
 'acestei',
 'acestia',
 'acestui',
 'aceşti',
 'aceştia',
 'adica',
 'ai',
 'aia',
 'aibă',
 'aici',
 'al',
 'ala',
 'ale',
 'alea',
 'alt',
 'alta',
 'altceva',
 'altcineva',
 'alte',
 'altfel',
 'alti',
 'altii',
 'altul',
 'am',
 'anume',
 'apoi',
 'ar',
 'are',
 'as',
 'asa',
 'asta',
 'astea',
 'astfel',
 'asupra',
 'atare',
 'atat',
 'atata',
 'atatea',
 'atatia',
 'ati',
 'atit',
 'atita',
 'atitea',
 'atitia',
 'atunci',
 'au',
 'avea',
 'avem',
 'aveţi',
 'avut',
 'aş',
 'aţi',
 'ba',
 'ca',
 'cam',
 'cand',
 'care',
 'careia',
 'carora',
 'caruia',
 'cat',
 'catre',
 'ce',
 'cea',
 'ceea',
 'cei',
 'ceilalti',
 'cel',
 'cele',
 'celor',
 'ceva',
 'chiar',
 'ci',
 'cind',
 'cine',
 'cineva',
 'cit',
 'cita',
 'cite',
 'citeva',
 'citi',
 'citiva',
 'cu',
 'cui',
 'cum',
 'cumva',
 'cât',
 'câte

In [None]:
blacklist_words = stopwords.words('english') + ['rt']

In [None]:
cleaned_tweet = [t for t in normalized_tweet if t not in blacklist_words]
print(cleaned_tweet)

['fast', 'fastest']


#### Vocabulary

In [None]:
from collections import Counter

Counter(get_lemmas(normalized_tweet)).most_common(5)

[('i', 2), ('be', 2), ('fast', 2), ('so', 1), ('the', 1)]

In [None]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
lemmatized_tweet = get_lemmas(normalized_tweet)
print(lemmatized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']


In [None]:
print(Counter(normalized_tweet))
print(Counter(lemmatized_tweet))

Counter({'i': 2, 'am': 2, 'so': 1, 'fast': 1, 'the': 1, 'fastest': 1})
Counter({'i': 2, 'be': 2, 'fast': 2, 'so': 1, 'the': 1})


#### Sentence segmentation

In [None]:
query = "I am too fast. I am too furious." 

In [None]:
"I am in the U.S.A.".split(".")
# query
"I am lazy. He is not in the U.S.A."

'I am lazy. He is not in the U.S.A.'

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [None]:
sent_tokenize("I am in the U.S.A.")

['I am in the U.S.A.']

In [None]:
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
spanish_tokenizer.tokenize(spanish_query)

['Soy muy rápido!', 'Estoy muy furioso!']

In [None]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [None]:
from nltk.tokenize import PunktSentenceTokenizer
pkt = PunktSentenceTokenizer()
pkt.tokenize("hello! how are you?")

['hello!', 'how are you?']

#### Numeral conversion

In [None]:
!pip install word2number
!pip install num2word



In [None]:
!pip install num2words

Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[?25l[K     |███▎                            | 10 kB 16.8 MB/s eta 0:00:01[K     |██████▌                         | 20 kB 22.0 MB/s eta 0:00:01[K     |█████████▊                      | 30 kB 26.7 MB/s eta 0:00:01[K     |█████████████                   | 40 kB 19.6 MB/s eta 0:00:01[K     |████████████████▏               | 51 kB 9.2 MB/s eta 0:00:01[K     |███████████████████▍            | 61 kB 10.5 MB/s eta 0:00:01[K     |██████████████████████▋         | 71 kB 9.2 MB/s eta 0:00:01[K     |█████████████████████████▉      | 81 kB 10.1 MB/s eta 0:00:01[K     |█████████████████████████████   | 92 kB 11.1 MB/s eta 0:00:01[K     |████████████████████████████████| 101 kB 6.1 MB/s 
Installing collected packages: num2words
Successfully installed num2words-0.5.10


In [None]:
import word2number
from word2number import w2n
w2n.word_to_num("eleven")


ModuleNotFoundError: ignored

In [None]:
w2n.word_to_num("twenty three")

23

In [None]:
from num2words import num2words
num2words(12)


'twelve'

In [None]:
num2words(101)


'one hundred and one'

In [None]:
num2words(2020)

'two thousand and twenty'

In [None]:
w2n.word_to_num("Twelve o'clock!")

12

### Exercise

In [None]:
text = "The Environment Ministry’s draft notification to regulate the use of membrane-based water purification systems primarily concerns the manufacturers of reverse osmosis (RO) water filters but effectively bars domestic users from installing RO systems. The notification is the culmination of a legal dispute before the National Green Tribunal, which had banned RO water filter use in Delhi as the purification process wastes water. The association of water filter manufacturers challenged this order and the litigation led to this pan-India notification, where the intent is to conserve water and cut waste. In RO, the total dissolved solids (TDS) in water — which covers trace chemicals, certain viruses, bacteria and salts — can be reduced, to meet potable water standards. Home filters waste nearly 80% of the water during treatment. "

In [None]:
def normalize1(text):
    """Lowercase, word-tokenize and remove English stop words.

    Args:
        text: the raw text to normalize.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize`` on the
        lowercased text, minus every token listed in NLTK's English stop-word
        list. Punctuation tokens are kept.
    """
    text = text.lower()
    words = nltk.word_tokenize(text)
    # Use a set: the corpus reader returns a list, and scanning it for every
    # token is needlessly linear in the number of stop words.
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word not in stop_words]


def normalize2(text):
    """Tokenize on alphanumeric runs, spell out numbers and lemmatize words.

    Args:
        text: the raw text to normalize.

    Returns:
        A list with one entry per alphanumeric token of ``text`` (case is
        preserved, punctuation is discarded by the tokenizer pattern):
        digit-only tokens are converted with ``num2words``; every other token
        is passed to ``WordNetLemmatizer.lemmatize`` without a POS argument.
    """
    alnum_tokenizer = RegexpTokenizer('[a-zA-Z0-9]+', discard_empty=True)
    tokens = alnum_tokenizer.tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()

    return [
        num2words(token) if token.isdigit() else wordnet_lemmatizer.lemmatize(token)
        for token in tokens
    ]

# Run both normalization pipelines on the same article.
normalized_words1 = normalize1(text)
normalized_words2 = normalize2(text)

# Pipeline 1: number of distinct tokens ("types"), then their frequencies.
voc1 = Counter(normalized_words1)
print(len(voc1))
print(voc1)

# Pipeline 2: same report.
voc2 = Counter(normalized_words2)
print(len(voc2))
print(voc2)

69
Counter({'water': 9, ',': 6, '.': 5, 'ro': 4, 'notification': 3, 'use': 2, 'purification': 2, 'systems': 2, 'manufacturers': 2, '(': 2, ')': 2, 'filters': 2, 'filter': 2, 'waste': 2, '—': 2, 'environment': 1, 'ministry': 1, '’': 1, 'draft': 1, 'regulate': 1, 'membrane-based': 1, 'primarily': 1, 'concerns': 1, 'reverse': 1, 'osmosis': 1, 'effectively': 1, 'bars': 1, 'domestic': 1, 'users': 1, 'installing': 1, 'culmination': 1, 'legal': 1, 'dispute': 1, 'national': 1, 'green': 1, 'tribunal': 1, 'banned': 1, 'delhi': 1, 'process': 1, 'wastes': 1, 'association': 1, 'challenged': 1, 'order': 1, 'litigation': 1, 'led': 1, 'pan-india': 1, 'intent': 1, 'conserve': 1, 'cut': 1, 'total': 1, 'dissolved': 1, 'solids': 1, 'tds': 1, 'covers': 1, 'trace': 1, 'chemicals': 1, 'certain': 1, 'viruses': 1, 'bacteria': 1, 'salts': 1, 'reduced': 1, 'meet': 1, 'potable': 1, 'standards': 1, 'home': 1, 'nearly': 1, '80': 1, '%': 1, 'treatment': 1})
82
Counter({'the': 9, 'water': 9, 'of': 5, 'to': 4, 'RO': 4

Find a recent news article online.
Read it into a Python variable (paste it in manually or read it from a file).

Write a function that normalizes the text and splits it into tokens. Add flags to customize the different preprocessing choices (which stemmer/lemmatizer to use, whether to lowercase, whether to convert numbers, whether to remove stopwords, ...). 

Store the vocabulary of unique tokens found in the text.

Compare the number of unique tokens ("types") with different preprocessing settings.