Sources:
- NLTK Tutorial: https://medium.com/towards-artificial-intelligence/text-mining-in-python-steps-and-examples-78b3f8fd913b
- NLTK Part-of-Speech Tagging Definitions: https://www.guru99.com/pos-tagging-chunking-nltk.html

In [1]:
# Downloads you might need to try: 
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

# Import libraries
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Create sample text 
text = "In Brazil they drive on the right-hand side of the road. Brazil has a large coastline on the eastern side of South America"


In [2]:
# Passing the string text into word tokenize for breaking the sentences
token = word_tokenize(text)
token

['In',
 'Brazil',
 'they',
 'drive',
 'on',
 'the',
 'right-hand',
 'side',
 'of',
 'the',
 'road',
 '.',
 'Brazil',
 'has',
 'a',
 'large',
 'coastline',
 'on',
 'the',
 'eastern',
 'side',
 'of',
 'South',
 'America']

In [3]:
# Creates a dictionary, key = word, value = frequency of that word *
tokenFrequency = FreqDist(token)
tokenFrequency

FreqDist({'the': 3, 'Brazil': 2, 'on': 2, 'side': 2, 'of': 2, 'In': 1, 'they': 1, 'drive': 1, 'right-hand': 1, 'road': 1, ...})

In [4]:
# Frequency of the top ten words in a text box *
topTokenFrequency = tokenFrequency.most_common(10)
topTokenFrequency

# FIXME: would be more useful after taking out words like "the", "on", "at"

[('the', 3),
 ('Brazil', 2),
 ('on', 2),
 ('side', 2),
 ('of', 2),
 ('In', 1),
 ('they', 1),
 ('drive', 1),
 ('right-hand', 1),
 ('road', 1)]

In [5]:
# Stemmer *
# Use stemming to identify the same word under different tenses
# Might be useful, might not. Doesn't work for everything, either. 
pst = PorterStemmer()

# Ex. 1: Individual word
pst.stem("holding")
pst.stem("waiting")
pst.stem("waits")

# Ex. 2: List of Words 
stm = ["waited", "waiting", "waits"]
for word in stm :
    print(word+ " --> " + pst.stem(word))

waited --> wait
waiting --> wait
waits --> wait


In [6]:
# Lemmatizer 
lemmatizer = WordNetLemmatizer() 
print("rocks: ", lemmatizer.lemmatize("rocks")) 
print("corpora: ", lemmatizer.lemmatize("corpora"))

rocks:  rock
corpora:  corpus


In [7]:
# Stop Words (i.e. clearing the clutter) *
from nltk.corpus import stopwords
a = set(stopwords.words("english"))
text = "In Brazil they drive on the right-hand side of the road. Brazil has a large coastline on the eastern side of South America"
text1 = word_tokenize(text.lower())
print("====================================\nAll Words:")
print(text1)
print("====================================\nNo Stop Words:")
stopwords = [x for x in text1 if x not in a]
print(stopwords)
print("====================================")

All Words:
['in', 'brazil', 'they', 'drive', 'on', 'the', 'right-hand', 'side', 'of', 'the', 'road', '.', 'brazil', 'has', 'a', 'large', 'coastline', 'on', 'the', 'eastern', 'side', 'of', 'south', 'america']
No Stop Words:
['brazil', 'drive', 'right-hand', 'side', 'road', '.', 'brazil', 'large', 'coastline', 'eastern', 'side', 'south', 'america']


In [14]:
# Part-of-Speech Tagging
# Verbs, nouns, prepositions, etc.
# Definitions: https://www.guru99.com/pos-tagging-chunking-nltk.html
text = "In Brazil they drive on the right-hand side of the road. Brazil has a large coastline on the eastern side of South America"
textTokenized = word_tokenize(text)
for token in textTokenized:
    print(nltk.pos_tag([token]))

[('In', 'IN')]
[('Brazil', 'NNP')]
[('they', 'PRP')]
[('drive', 'NN')]
[('on', 'IN')]
[('the', 'DT')]
[('right-hand', 'NN')]
[('side', 'NN')]
[('of', 'IN')]
[('the', 'DT')]
[('road', 'NN')]
[('.', '.')]
[('Brazil', 'NNP')]
[('has', 'VBZ')]
[('a', 'DT')]
[('large', 'JJ')]
[('coastline', 'NN')]
[('on', 'IN')]
[('the', 'DT')]
[('eastern', 'JJ')]
[('side', 'NN')]
[('of', 'IN')]
[('South', 'NNP')]
[('America', 'NNP')]
