# Intro

In [None]:
# This is just me following a tutorial from Sentdex
# https://www.youtube.com/watch?v=imPpT2Qo2sk&list=PLQVvvaa0QuDf2JswnfiGkliBInZnIC4HL&index=5&ab_channel=sentdex

# I wanted to familiarise myself with NLTK so it will probably be pretty messy and useless.
# Keeping it around so I can refer back to it if I need it

# Setup

In [63]:
import nltk

In [64]:
# This will need to be run to setup the package
# Download all parts (will be around 3.5GB, hence not in this github repo)
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

# Tokenizing and stopwords

In [8]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords 

In [16]:
example_sentence = """
    hello computer, how are you today? 
    I would like to discuss apples and oranges.
    Perhaps we can then move on to talking about stocks.
    I like the stock.
   $GME to the moon!
"""

In [17]:
# Stopwords are just useless information to the model, things like "a, and, the"
# This is the process for stripping these words out of your sample data

stop_words = set(stopwords.words("english"))

In [18]:
words = word_tokenize(example_sentence)

In [19]:
filtered_sentence = [word for word in words if not word in stop_words]
filtered_sentence

['hello',
 'computer',
 ',',
 'today',
 '?',
 'I',
 'would',
 'like',
 'discuss',
 'apples',
 'oranges',
 '.',
 'Perhaps',
 'move',
 'talking',
 'stocks',
 '.',
 'I',
 'like',
 'stock',
 '.',
 '$',
 'GME',
 'moon',
 '!']

# Stemming

In [20]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [21]:
# Reduces words to their root
# Things like WordNet actually solve this so that we won't need to do this
# But it is always good to know where this stuff is coming from

ps = PorterStemmer()
example_words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

In [22]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [23]:
new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once"

In [24]:
words = word_tokenize(new_text)
for w in words:
    print(ps.stem(w))

it
is
veri
import
to
be
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc


# Part of speech tagging

In [26]:
import nltk

from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [30]:
# POS tag list:

# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent\'s
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-abverb	where, when

In [27]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [28]:
def process_content():
    try:
        for i in tokenized[:10]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            print(tagged)
            
    except Exception as e:
        print(e)

In [29]:
process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

# Chunking

In [None]:
# Chunking in Natural Language Processing (NLP) is the process by which we group 
# various words together by their part of speech tags. 

In [37]:
def process_content():
    try:
        for i in tokenized[:10]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            chunked.draw()
    
    except Exception as e:
        print(e)

In [38]:
process_content()

None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


KeyboardInterrupt: 

# Chinking

In [None]:
# Chinking is a part of the chunking process with natural language processing with NLTK. 
# A chink is what we wish to remove from the chunk. 
# We define a chink in a very similar fashion compared to how we defined the chunk. 

In [41]:
def process_content():
    try:
        for i in tokenized[:10]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            chunkGram = r"""Chunk: {<.*>+}
                                                        }<VB.?|IN|DT>+{"""
            
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            chunked.draw()
    
    except Exception as e:
        print(e)

In [40]:
process_content()

None
None
None
None
None
None
None
None
None
None


# Named Entity Recognition

In [None]:
# Named entity recognition is useful to quickly find out what the subjects of discussion are. 
# NLTK comes packed full of options for us. 
# We can find just about any named entity, or we can look for specific ones.

In [None]:
# NE Type and Examples

# ORGANIZATION - Georgia-Pacific Corp., WHO
# PERSON - Eddy Bonte, President Obama
# LOCATION - Murray River, Mount Everest
# DATE - June, 2008-06-29
# TIME - two fifty a m, 1:30 p.m.
# MONEY - 175 million Canadian Dollars, GBP 10.40
# PERCENT - twenty pct, 18.75 %
# FACILITY - Washington Monument, Stonehenge
# GPE - South East Asia, Midlothian

In [45]:
def process_content():
    try:
        for i in tokenized[:10]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            
            namedEnt.draw()
    
    except Exception as e:
        print(e)

In [46]:
process_content()

# Lemmatizing

In [None]:
# A very similar operation to stemming is called lemmatizing. 
# The major difference between these is, as you saw earlier, 
# stemming can often create non-existent words.

In [47]:
from nltk.stem import WordNetLemmatizer

In [48]:
lemmatizer = WordNetLemmatizer()

In [55]:
# Get root
print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))

cat
cactus
goose


In [56]:
# Adjectives - pos = part of speech
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))

good
best


In [62]:
# Verbs
print(lemmatizer.lemmatize("running", pos="v"))
print(lemmatizer.lemmatize("batting", pos="v"))

run
bat


# NLTK Corpora

In [None]:
# Bodies of texts. 
# Generally, corpora are grouped by some sort of defining

In [66]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

In [69]:
sample = gutenberg.raw("bible-kjv.txt")
tokenized = sent_tokenize(sample)

In [78]:
for sentence in tokenized[0:15]:
    print(sentence)

[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.
And the Spirit of God moved upon the face of the
waters.
1:3 And God said, Let there be light: and there was light.
1:4 And God saw the light, that it was good: and God divided the light
from the darkness.
1:5 And God called the light Day, and the darkness he called Night.
And the evening and the morning were the first day.
1:6 And God said, Let there be a firmament in the midst of the waters,
and let it divide the waters from the waters.
1:7 And God made the firmament, and divided the waters which were
under the firmament from the waters which were above the firmament:
and it was so.
1:8 And God called the firmament Heaven.
And the evening and the
morning were the second day.
1:9 And God said, Let the waters under the heaven be ga

# WordNet

In [None]:
# Part of the NLTK Corpora is WordNet. 
# I wouldn't totally classify WordNet as a Corpora, 
# if anything it is really a giant Lexicon, but, either way, it is super useful. 
# With WordNet we can do things like look up words and their meaning 
# according to their parts of speech, we can find synonyms, antonyms, 
# and even examples of the word in use. 

In [79]:
from nltk.corpus import wordnet

In [81]:
syns = wordnet.synsets("program")
for syn in syns:
    print(syn)

Synset('plan.n.01')
Synset('program.n.02')
Synset('broadcast.n.02')
Synset('platform.n.02')
Synset('program.n.05')
Synset('course_of_study.n.01')
Synset('program.n.07')
Synset('program.n.08')
Synset('program.v.01')
Synset('program.v.02')


In [82]:
# Specific synset
syns[0]

Synset('plan.n.01')

In [84]:
# All lemmas of that synset
syns[0].lemmas()

[Lemma('plan.n.01.plan'),
 Lemma('plan.n.01.program'),
 Lemma('plan.n.01.programme')]

In [85]:
# Just the word of the synset
syns[0].lemmas()[0].name()

'plan'

In [86]:
# Definition
syns[0].definition()

'a series of steps to be carried out or goals to be accomplished'

In [87]:
# Examples
syns[0].examples()

['they drew up a six-step plan', 'they discussed plans for a new bond issue']

In [94]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())

print("Synonyms:")
for word in set(synonyms):
    print(word)
    
print()
print("Antonyms:")
for word in set(antonyms):
    print(word)

Synonyms:
just
goodness
honest
honorable
practiced
adept
undecomposed
near
thoroughly
sound
estimable
dependable
well
good
beneficial
expert
in_force
salutary
right
skillful
safe
in_effect
skilful
ripe
serious
respectable
unspoilt
proficient
commodity
upright
trade_good
effective
secure
soundly
dear
unspoiled
full

Antonyms:
ill
bad
evil
evilness
badness


In [95]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")

# wup = wu and palmer (semantic similarity)
print(w1.wup_similarity(w2))

0.9090909090909091


In [96]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")

print(w1.wup_similarity(w2))

0.6956521739130435


In [97]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")

print(w1.wup_similarity(w2))

0.32


# Text classification

In [99]:
# Now that we understand some of the basics of of natural language processing 
# with the Python NLTK module, we're ready to try out text classification. 
# This is where we attempt to identify a body of text with some sort of label. 

In [100]:
import nltk
import random

from nltk.corpus import movie_reviews

In [102]:
documents = [
    (list(movie_reviews.words(fileId)), category) 
    for category in movie_reviews.categories() 
    for fileId in movie_reviews.fileids(category)
]

random.shuffle(documents)

In [105]:
all_words = []
for word in movie_reviews.words():
    all_words.append(word.lower())

all_words = nltk.FreqDist(all_words)

In [113]:
all_words.most_common(20)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595),
 (')', 11781),
 ('(', 11664),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961)]

In [107]:
all_words["stupid"]

253