The NLTK book is available here: http://www.nltk.org/book/

NLP applications: http://blog.mashape.com/list-of-25-natural-language-processing-apis/

In [1]:

import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet


In [2]:
# Sample text for the tokenization examples: a short multi-sentence product review
# from which we want to extract all the words.
sentence = '''I charged the phone completely out of the box and then turned it on. It went to the Oneplus logo screen but did nothing after that. No buttons work, no combinations of any buttons do anything to take it out of logo screen, it won't even turn off. Amazon won't allow return until technician looks at the phone??? Oneplus support said take it to a service center but couldn't tell me the closest one. Really disappointed in lack of support form both companies for a brand new product and release. I'm sure they will now try to push a replacement but I'm not really interested in the product anymore.'''

In [3]:
# Naive approach: split on single spaces. Punctuation stays glued to the
# neighbouring word (see 'on.' and 'phone???' in the output).
sentence.split(" ")

['I',
 'charged',
 'the',
 'phone',
 'completely',
 'out',
 'of',
 'the',
 'box',
 'and',
 'then',
 'turned',
 'it',
 'on.',
 'It',
 'went',
 'to',
 'the',
 'Oneplus',
 'logo',
 'screen',
 'but',
 'did',
 'nothing',
 'after',
 'that.',
 'No',
 'buttons',
 'work,',
 'no',
 'combinations',
 'of',
 'any',
 'buttons',
 'do',
 'anything',
 'to',
 'take',
 'it',
 'out',
 'of',
 'logo',
 'screen,',
 'it',
 "won't",
 'even',
 'turn',
 'off.',
 'Amazon',
 "won't",
 'allow',
 'return',
 'until',
 'technician',
 'looks',
 'at',
 'the',
 'phone???',
 'Oneplus',
 'support',
 'said',
 'take',
 'it',
 'to',
 'a',
 'service',
 'center',
 'but',
 "couldn't",
 'tell',
 'me',
 'the',
 'closest',
 'one.',
 'Really',
 'disappointed',
 'in',
 'lack',
 'of',
 'support',
 'form',
 'both',
 'companies',
 'for',
 'a',
 'brand',
 'new',
 'product',
 'and',
 'release.',
 "I'm",
 'sure',
 'they',
 'will',
 'now',
 'try',
 'to',
 'push',
 'a',
 'replacement',
 'but',
 "I'm",
 'not',
 'really',
 'interested',
 'in',

In [4]:
# We can split the sentence on a space (" ") to get all the words.
# However, the problem with this is that punctuation marks such as full stops stay attached
# to the neighbouring word, and such a simple parser cannot handle every type of sentence.

# That is why we should use the word tokenizer provided by the NLTK library.
# It emits punctuation marks as separate tokens:
word_tokenize(sentence)

['I',
 'charged',
 'the',
 'phone',
 'completely',
 'out',
 'of',
 'the',
 'box',
 'and',
 'then',
 'turned',
 'it',
 'on',
 '.',
 'It',
 'went',
 'to',
 'the',
 'Oneplus',
 'logo',
 'screen',
 'but',
 'did',
 'nothing',
 'after',
 'that',
 '.',
 'No',
 'buttons',
 'work',
 ',',
 'no',
 'combinations',
 'of',
 'any',
 'buttons',
 'do',
 'anything',
 'to',
 'take',
 'it',
 'out',
 'of',
 'logo',
 'screen',
 ',',
 'it',
 'wo',
 "n't",
 'even',
 'turn',
 'off',
 '.',
 'Amazon',
 'wo',
 "n't",
 'allow',
 'return',
 'until',
 'technician',
 'looks',
 'at',
 'the',
 'phone',
 '?',
 '?',
 '?',
 'Oneplus',
 'support',
 'said',
 'take',
 'it',
 'to',
 'a',
 'service',
 'center',
 'but',
 'could',
 "n't",
 'tell',
 'me',
 'the',
 'closest',
 'one',
 '.',
 'Really',
 'disappointed',
 'in',
 'lack',
 'of',
 'support',
 'form',
 'both',
 'companies',
 'for',
 'a',
 'brand',
 'new',
 'product',
 'and',
 'release',
 '.',
 'I',
 "'m",
 'sure',
 'they',
 'will',
 'now',
 'try',
 'to',
 'push',
 'a',
 '

In [5]:
# The English stop-word list from NLTK's stopwords corpus
# (the entries shown in the output are all lower-case).
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## POS Tagging

Remove all punctuation
Identify the parts of speech 

In [6]:
# Now, let's get a tag associated with each and every token and see which part of speech it is:
# noun, pronoun, adverb, adjective, etc.

# By doing so, we can learn more about the constituents of a statement/tweet and see what
# kind of words are present in it.
w = word_tokenize(sentence)

# Lower-case every token before tagging.
# NOTE(review): lower-casing hides capitalisation from the tagger (the recorded output tags
# 'i' as NN) - consider tagging `w` directly if proper nouns/pronouns matter.
tokensLC = [token.lower() for token in w]

nltk.pos_tag(tokensLC)

[('i', 'NN'),
 ('charged', 'VBD'),
 ('the', 'DT'),
 ('phone', 'NN'),
 ('completely', 'RB'),
 ('out', 'IN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('box', 'NN'),
 ('and', 'CC'),
 ('then', 'RB'),
 ('turned', 'VBD'),
 ('it', 'PRP'),
 ('on', 'IN'),
 ('.', '.'),
 ('it', 'PRP'),
 ('went', 'VBD'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('oneplus', 'NN'),
 ('logo', 'NN'),
 ('screen', 'NN'),
 ('but', 'CC'),
 ('did', 'VBD'),
 ('nothing', 'NN'),
 ('after', 'IN'),
 ('that', 'DT'),
 ('.', '.'),
 ('no', 'DT'),
 ('buttons', 'NNS'),
 ('work', 'VBP'),
 (',', ','),
 ('no', 'DT'),
 ('combinations', 'NNS'),
 ('of', 'IN'),
 ('any', 'DT'),
 ('buttons', 'NNS'),
 ('do', 'VBP'),
 ('anything', 'NN'),
 ('to', 'TO'),
 ('take', 'VB'),
 ('it', 'PRP'),
 ('out', 'IN'),
 ('of', 'IN'),
 ('logo', 'NN'),
 ('screen', 'NN'),
 (',', ','),
 ('it', 'PRP'),
 ('wo', 'MD'),
 ("n't", 'RB'),
 ('even', 'RB'),
 ('turn', 'VB'),
 ('off', 'RP'),
 ('.', '.'),
 ('amazon', 'NN'),
 ('wo', 'MD'),
 ("n't", 'RB'),
 ('allow', 'VB'),
 ('return', 'NN'),
 ('u

List of tags: http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

![image.png](attachment:image.png)

In [7]:
# NLTK's WordNet interface has many great features, like finding the meaning of words,
# finding examples of words, finding similar and opposite words, etc.

# You can see how useful these features would be if you were building something like a
# search engine or a text parser.

# Let's look at a few of these features.

# The first thing you can do is find the definitions of any word.
syn = wordnet.synsets("collaborate")
print(syn)

# Print the name and definition of the first two senses. Slicing (instead of indexing
# syn[0] and syn[1]) keeps the cell from failing if a word has fewer than two senses.
for sense in syn[:2]:
    print(sense.name())
    print(sense.definition())

[Synset('collaborate.v.01'), Synset('collaborate.v.02')]
collaborate.v.01
work together on a common enterprise of project
collaborate.v.02
cooperate as a traitor


In [9]:
# Look up the senses of "verify" and print the name and definition of the first two.
syn = wordnet.synsets("verify")
print(syn)

# Slicing (instead of indexing syn[0] and syn[1]) keeps the cell from failing if a word
# has fewer than two senses.
for sense in syn[:2]:
    print(sense.name())
    print(sense.definition())

[Synset('verify.v.01'), Synset('control.v.05'), Synset('verify.v.03'), Synset('affirm.v.02')]
verify.v.01
confirm the truth of
control.v.05
check or regulate (a scientific experiment) by conducting a parallel experiment or comparing with another standard


In [10]:
# Example sentences recorded in WordNet for the first sense of "feature".
syn = wordnet.synsets("feature")
syn[0].examples()

['the map showed roads and other features',
 'generosity is one of his best characteristics']

In [None]:
# Example sentences for the second and third senses of "set".
# Only the last expression of a cell is displayed, so both results are collected into one
# list; evaluating them on separate lines would silently drop the first one.
syn = wordnet.synsets("set")
[sense.examples() for sense in syn[1:3]]

In [14]:
# We can get words related to a certain word using synsets, hypernyms and hyponyms.
# E.g., we use the word "book" here and print the hypernyms and hyponyms of its first sense.

# Hypernym : a word with a broad meaning constituting a category into which words with more 
#            specific meanings fall

# Hyponym  : a word with a more specific meaning than a general term that covers it
#            (e.g. "textbook" is a hyponym of "book")

syn = wordnet.synsets("book")[0]
print(syn.name())
print(syn.hypernyms())
print("**********************************************")
print(syn.hyponyms())

book.n.01
[Synset('publication.n.01')]
**********************************************
[Synset('appointment_book.n.01'), Synset('authority.n.07'), Synset('bestiary.n.01'), Synset('booklet.n.01'), Synset('catalog.n.01'), Synset('catechism.n.02'), Synset('copybook.n.01'), Synset('curiosa.n.01'), Synset('formulary.n.01'), Synset('phrase_book.n.01'), Synset('playbook.n.02'), Synset('pop-up_book.n.01'), Synset('prayer_book.n.01'), Synset('reference_book.n.01'), Synset('review_copy.n.01'), Synset('songbook.n.01'), Synset('storybook.n.01'), Synset('textbook.n.01'), Synset('tome.n.01'), Synset('trade_book.n.01'), Synset('workbook.n.01'), Synset('yearbook.n.01')]


In [17]:
# In this example we list every lemma of every synset of "infuriating".

# A lemma in NLTK is a canonical form of a word.
syn = wordnet.synsets("infuriating")

# Flatten synsets -> lemmas lazily, then print them one per line.
all_lemmas = (lemma for synset in syn for lemma in synset.lemmas())
for lemma in all_lemmas:
    print(lemma)

Lemma('infuriate.v.01.infuriate')
Lemma('infuriate.v.01.exasperate')
Lemma('infuriate.v.01.incense')
Lemma('exasperating.s.01.exasperating')
Lemma('exasperating.s.01.infuriating')
Lemma('exasperating.s.01.maddening')
Lemma('exasperating.s.01.vexing')


In [None]:
# Lemmas can be used to find all similar words.

# This helps us to reduce/substitute a set of words to one single word.


# First print the synsets (senses) found for "book" ...
syn = wordnet.synsets("book")
print("*************************************************************************************")
print("Synonyms of book")
print("-------------------------------------------------------------------------------------")
print(syn)
print("-------------------------------------------------------------------------------------")

# ... then print the list of lemmas of each synset, one synset per line.
print("*************************************************************************************")
print("Lemmas of book - Words that are similar to the word book and unique words in NLTK")
print("-------------------------------------------------------------------------------------")
for s in syn:
    print(s.lemmas())
print("-------------------------------------------------------------------------------------")


In [None]:
# NLTK comes with built-in stop-word lists for several languages.
# Only the last expression of a cell is displayed, so the English and German lists are
# shown together (first 10 entries of each) instead of being evaluated on separate lines,
# which would silently drop the English one.


{language: stopwords.words(language)[:10] for language in ("english", "german")}


In [20]:
# https://en.wikipedia.org/wiki/Cadet_Nurse_Corps
from nltk.tokenize import word_tokenize

para = """
AnalytixLabs - leading Capability Building and Training Solutions Provider.

Our courses are crafted by experts to keep you ahead of the curve in industry best practices. 
Case study based modules ensure that participants learn practical applications along with the theoretical concepts. 

Further to this, new courses are continuously launched and old ones keep evolving as per the latest and upcoming 
industry trends.

High degree of commitment & personal attention is given through small batch size and individual counselling. 
Hands-on sessions and practice assignments on real life business datasets are included to ensure assimilated learning.

"""
# Tokenize the paragraph and show every token, stop words included.
words = word_tokenize(para)
print("*************************************************************************************")
print("Words with all stopwords")
print("-------------------------------------------------------------------------------------")
print(words)
print(len(words))
print("-------------------------------------------------------------------------------------")

# Build the stop-word set once: calling stopwords.words() inside the comprehension would
# call it again for every token, and a set gives constant-time membership tests.
english_stop_words = set(stopwords.words('english'))

# The stop-word list is lower-case, so compare case-insensitively; otherwise capitalised
# stop words such as 'Our' or 'Further' would slip through. The original casing of the
# kept tokens is preserved.
useful_words = [word for word in words if word.lower() not in english_stop_words]
print("*************************************************************************************")
print("Sentence is clean now - no stop words included")
print("-------------------------------------------------------------------------------------")
print(useful_words)
print(len(useful_words))
print("-------------------------------------------------------------------------------------")


*************************************************************************************
Words with all stopwords
-------------------------------------------------------------------------------------
['AnalytixLabs', '-', 'leading', 'Capability', 'Building', 'and', 'Training', 'Solutions', 'Provider', '.', 'Our', 'courses', 'are', 'crafted', 'by', 'experts', 'to', 'keep', 'you', 'ahead', 'of', 'the', 'curve', 'in', 'industry', 'best', 'practices', '.', 'Case', 'study', 'based', 'modules', 'ensure', 'that', 'participants', 'learn', 'practical', 'applications', 'along', 'with', 'the', 'theoretical', 'concepts', '.', 'Further', 'to', 'this', ',', 'new', 'courses', 'are', 'continuously', 'launched', 'and', 'old', 'ones', 'keep', 'evolving', 'as', 'per', 'the', 'latest', 'and', 'upcoming', 'industry', 'trends', '.', 'High', 'degree', 'of', 'commitment', '&', 'personal', 'attention', 'is', 'given', 'through', 'small', 'batch', 'size', 'and', 'individual', 'counselling', '.', 'Hands-on', 'sessio

In [21]:
# This is how the Naive Bayes classifier expects the input

def create_word_features(words, stop_words=None):
    """Turn a sequence of words into a ``{word: True}`` feature dictionary.

    Parameters
    ----------
    words : iterable of str
        The tokens to convert into features.
    stop_words : iterable of str, optional
        Words to leave out of the result. When omitted, NLTK's English
        stop-word list (``stopwords.words("english")``) is used.

    Returns
    -------
    dict
        Maps every word that is not a stop word to ``True``. Repeated words
        collapse into a single key. Matching against the stop words is exact
        (case-sensitive).
    """
    if stop_words is None:
        stop_words = stopwords.words("english")
    # Materialise the stop words once as a set, rather than re-fetching the list for
    # every word and scanning it linearly.
    excluded = set(stop_words)
    return {word: True for word in words if word not in excluded}

create_word_features(["the", "quick", "brown", "quick", "a", "fox"])

{'quick': True, 'brown': True, 'fox': True}

In [None]:
# Text normalisation: lower-case, drop punctuation/digits, tidy up spaces
def clean_text(text):
    """Normalise a piece of text for further processing.

    The text is lower-cased, the punctuation characters and digits listed in
    the character class below are removed, runs of spaces are collapsed into a
    single space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        The raw text (a single token or a longer string).

    Returns
    -------
    str
        The cleaned text; an empty string if nothing but removable
        characters and whitespace was passed in.
    """
    import re
    text = text.lower()
    # Remove punctuation and digits first ...
    text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", text)
    # ... and only then normalise spacing: deleting characters can leave double or
    # trailing spaces behind (e.g. "a - b" -> "a  b"), so this has to come last.
    text = re.sub(r' +', ' ', text)
    return text.strip()

In [None]:
# Clean every remaining token. Tokens that consist only of punctuation/digits are reduced
# to an empty string by clean_text, so those are dropped instead of being kept as ''.
cleanedWords = [cleaned for cleaned in map(clean_text, useful_words) if cleaned]
cleanedWords