In [None]:
#### NLTK Tutorial

NLTK is literally an acronym for Natural Language Toolkit.

Install NLTK with Python 3.x using:

sudo pip3 install nltk

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

#### Tokenize words
A sentence or data can be split into words using the method **word_tokenize()**:



In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


#### Tokenizing sentences
The same principle can be applied to sentences. Simply change the to **sent_tokenize()**
We have added two sentences to the variable data:

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


#### NLTK and arrays
If you wish to you can store the words and sentences in arrays:

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
 
phrases = sent_tokenize(data)
words = word_tokenize(data)
 
print(phrases)
print(words)

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


### Natural Language Processing: remove stop words
#### NLTK stop words
NLTK Natural Language Processing with Python
Natural language processing (nlp) is a research field that presents many challenges such as natural language understanding.
Text may contain stop words like ‘the’, ‘is’, ‘are’. Stop words can be filtered from the text to be processed. There is no universal list of stop words in nlp research, however the nltk module contains a list of stop words.

In this article you will learn how to remove stop words with the nltk module.

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []
 
for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
 
print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


#### Alternatively

In [42]:
import pandas as pd
import numpy as np

Table1 = pd.DataFrame(words,columns=['word'])
                  

Table2=pd.DataFrame.transpose(pd.DataFrame([stopWords]))

Table2.columns = ['word']

index=Table1.word.isin(Table2.word) 

print(Table1.word[index==False])

print(Table1.word[index==True])


0       All
1      work
4      play
5     makes
6      jack
7      dull
8       boy
9         .
10      All
11     work
14     play
15    makes
16     jack
18     dull
19      boy
20        .
Name: word, dtype: object
2     and
3      no
12    and
13     no
17      a
Name: word, dtype: object


In [12]:
import pandas as pd
import numpy as np

TableA = pd.DataFrame(np.random.rand(4, 3),
                      pd.Index(list('abcd'), name='Key'),
                      ['A', 'B', 'C']).reset_index()
TableB = pd.DataFrame(np.random.rand(4, 3),
                      pd.Index(list('aecf'), name='Key'),
                      ['A', 'B', 'C']).reset_index()

In [11]:

TableB

Unnamed: 0,Key,A,B,C
0,a,0.801643,0.60558,0.796524
1,e,0.155228,0.760834,0.66948
2,c,0.853515,0.961704,0.513742
3,f,0.501308,0.187677,0.057073


In [12]:
TableA

Unnamed: 0,Key,A,B,C
0,a,0.379867,0.953032,0.971827
1,b,0.923072,0.011723,0.927991
2,c,0.630206,0.362749,0.886373
3,d,0.106573,0.519791,0.990087


In [14]:
# Identify what values are in TableB and not in TableA
key_diff = set(TableB.Key).difference(TableA.Key)
key_diff

{'e', 'f'}

In [32]:
where_diff = TableB.Key.isin(key_diff)
where_diff

0    False
1     True
2    False
3     True
Name: Key, dtype: bool

In [47]:
index=TableB.Key.isin(TableA.Key) 

TableB.Key[index==True]


0    a
2    c
Name: Key, dtype: object

In [16]:
# Slice TableB accordingly and append to TableA
TableA.append(TableB[where_diff], ignore_index=True)

Unnamed: 0,Key,A,B,C
0,a,0.379867,0.953032,0.971827
1,b,0.923072,0.011723,0.927991
2,c,0.630206,0.362749,0.886373
3,d,0.106573,0.519791,0.990087
4,e,0.155228,0.760834,0.66948
5,f,0.501308,0.187677,0.057073


In [27]:
# Method 2
import timeit

start=timeit.default_timer()

rows = []
for i, row in TableB.iterrows():
    if row.Key not in TableA.Key.values:
        rows.append(row)

pd.concat([TableA.T] + rows, axis=1).T

end=timeit.default_timer()
end - start #elapsed time in seconds

0.003781184001127258

In [17]:
TableB.iterrows()

<generator object DataFrame.iterrows at 0x110929fc0>

In [21]:
#Method three
TableB_only = pd.merge(
    TableA, TableB,
    how='outer', on='Key', indicator=True, suffixes=('_foo','')).query(
        '_merge == "right_only"')

print('TableB_only', TableB_only, sep='\n')

Table_concatenated = pd.concat((TableA, TableB_only), join='inner')

Table_concatenated

TableB_only
  Key  A_foo  B_foo  C_foo         A         B         C      _merge
4   e    NaN    NaN    NaN  0.155228  0.760834  0.669480  right_only
5   f    NaN    NaN    NaN  0.501308  0.187677  0.057073  right_only


Unnamed: 0,Key,A,B,C
0,a,0.379867,0.953032,0.971827
1,b,0.923072,0.011723,0.927991
2,c,0.630206,0.362749,0.886373
3,d,0.106573,0.519791,0.990087
4,e,0.155228,0.760834,0.66948
5,f,0.501308,0.187677,0.057073


#### NLTK – stemming
A word stem is part of a word. It is sort of a normalization idea, but linguistic.
For example, the stem of the word waiting is wait.

In [4]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
words = ["game","gaming","gamed","games"]
ps = PorterStemmer()
 
for word in words:
    print(ps.stem(word))

game
game
game
game


You can do word stemming for sentences too:

In [5]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
ps = PorterStemmer()
 
sentence = "gaming, the gamers play games"
words = word_tokenize(sentence)
 
for word in words:
    print(word + ":" + ps.stem(word))

gaming:game
,:,
the:the
gamers:gamer
play:play
games:game


In [6]:
words

['gaming', ',', 'the', 'gamers', 'play', 'games']

#### NLTK speech tagging

The module NLTK can automatically tag speech.
Given a sentence or paragraph, it can label words such as verbs, nouns and so on.

NLTK – speech tagging example

The example below automatically tags words with a corresponding class.

In [1]:
import nltk
from nltk.tokenize import PunktSentenceTokenizer
 
document = 'Whether you\'re new to programming or an experienced developer, it\'s easy to learn and use Python.'
sentences = nltk.sent_tokenize(document)   
for sent in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(sent)))

[('Whether', 'IN'), ('you', 'PRP'), ("'re", 'VBP'), ('new', 'JJ'), ('to', 'TO'), ('programming', 'VBG'), ('or', 'CC'), ('an', 'DT'), ('experienced', 'JJ'), ('developer', 'NN'), (',', ','), ('it', 'PRP'), ("'s", 'VBZ'), ('easy', 'JJ'), ('to', 'TO'), ('learn', 'VB'), ('and', 'CC'), ('use', 'VB'), ('Python', 'NNP'), ('.', '.')]


We can filter this data based on the type of word:

In [2]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
 
document = 'Today the Netherlands celebrates King\'s Day. To honor this tradition, the Dutch embassy in San Francisco invited me to'
sentences = nltk.sent_tokenize(document)   
 
data = []
for sent in sentences:
    data = data + nltk.pos_tag(nltk.word_tokenize(sent))
for word in data: 
    if 'NNP' in word[1]: 
        print(word)

('Netherlands', 'NNP')
('King', 'NNP')
('Day', 'NNP')
('San', 'NNP')
('Francisco', 'NNP')


#### Natural Language Processing – prediction

#### NLTK
Natural Language Processing with Python
We can use natural language processing to make predictions.
Example: Given a product review, a computer can predict if its positive or negative based on the text.

In this article you will learn how to make a prediction program based on natural language processing.

nlp prediction example

Given a name, the classifier will predict if it’s a male or female.

To create our analysis program, we have several steps:

Data preparation
Feature extraction
Training
Prediction
Data preparation
The first step is to prepare data.
We use the names set included with nltk.



In [5]:
from nltk.corpus import names
 
# Load data and training 
names = ([(name, 'male') for name in names.words('male.txt')] + 
	 [(name, 'female') for name in names.words('female.txt')])


This dataset is simply a collection of tuples. To give you an idea of what the dataset looks like:

In [4]:
[(u'Aaron', 'male'), (u'Abbey', 'male'), (u'Abbie', 'male')]
[(u'Zorana', 'female'), (u'Zorina', 'female'), (u'Zorine', 'female')]

[('Zorana', 'female'), ('Zorina', 'female'), ('Zorine', 'female')]

You can define your own set of tuples if you wish, its simply a list containing many tuples.

Feature extraction
Based on the dataset, we prepare our feature. The feature we will use is the last letter of a name:
We define a featureset using:

In [7]:
featuresets = [(gender_features(n), g) for (n,g) in names]

NameError: name 'gender_features' is not defined

and the features (last letters) are extracted using:

In [8]:
def gender_features(word): 
    return {'last_letter': word[-1]}

Training and prediction
We train and predict using:

In [None]:
classifier = nltk.NaiveBayesClassifier.train(train_set) 
 
# Predict
print(classifier.classify(gender_features('Frank')))

Example
A classifier has a training and a test phrase.

In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names
 
def gender_features(word): 
    return {'last_letter': word[-1]} 
 
# Load data and training 
names = ([(name, 'male') for name in names.words('male.txt')] + 
	 [(name, 'female') for name in names.words('female.txt')])
 
featuresets = [(gender_features(n), g) for (n,g) in names] 
train_set = featuresets
classifier = nltk.NaiveBayesClassifier.train(train_set) 
 
# Predict
print(classifier.classify(gender_features('Frank')))

print(classifier.classify(gender_features('Alex')))

In [None]:
If you want to give the name during runtime, change the last line to:

In [None]:
# Predict
name = input("Name: ")
print(classifier.classify(gender_features(name)))

#### Python Sentiment Analysis

#### Sentiment Analysis

In Natural Language Processing there is a concept known as Sentiment Analysis.

Given a movie review or a tweet, it can be automatically classified in categories.
These categories can be user defined (positive, negative) or whichever classes you want.

#### Sentiment Analysis Example

Classification is done using several steps: training and prediction.

The training phase needs to have training data, this is example data in which we define examples. The classifier will use the training data to make predictions.

We start by defining 3 classes: positive, negative and neutral.
Each of these is defined by a vocabulary:

In [None]:
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]


Every word is converted into a feature using a simplified bag of words model:

In [None]:
def word_feats(words):
    return dict([(word, True) for word in words])
 
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]

Our training set is then the sum of these three feature sets:

In [None]:
train_set = negative_features + positive_features + neutral_features

We train the classifier:

In [None]:
classifier = NaiveBayesClassifier.train(train_set)

And make predictions.

##### Code example
This example classifies sentences according to the training set.

In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import names
 
def word_feats(words):
    return dict([(word, True) for word in words])
 
positive_vocab = [ 'awesome', 'outstanding', 'fantastic', 'terrific', 'good', 'nice', 'great', ':)' ]
negative_vocab = [ 'bad', 'terrible','useless', 'hate', ':(' ]
neutral_vocab = [ 'movie','the','sound','was','is','actors','did','know','words','not' ]
 
positive_features = [(word_feats(pos), 'pos') for pos in positive_vocab]
negative_features = [(word_feats(neg), 'neg') for neg in negative_vocab]
neutral_features = [(word_feats(neu), 'neu') for neu in neutral_vocab]
 
train_set = negative_features + positive_features + neutral_features
 
classifier = NaiveBayesClassifier.train(train_set) 
 
# Predict
neg = 0
pos = 0
sentence = "Awesome movie, I liked it"
sentence = sentence.lower()
words = sentence.split(' ')
for word in words:
    classResult = classifier.classify( word_feats(word))
    if classResult == 'neg':
        neg = neg + 1
    if classResult == 'pos':
        pos = pos + 1
print('Positive: ' + str(float(pos)/len(words)))
print('Negative: ' + str(float(neg)/len(words)))