In [None]:
#### NLTK Tutorial

NLTK is literally an acronym for Natural Language Toolkit.

Install NLTK with Python 3.x using:

sudo pip3 install nltk

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

#### Tokenize words
A sentence or data can be split into words using the method **word_tokenize()**:



In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


#### Tokenizing sentences
The same principle can be applied to sentences. Simply change the to **sent_tokenize()**
We have added two sentences to the variable data:

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


#### NLTK and arrays
If you wish to you can store the words and sentences in arrays:

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
 
phrases = sent_tokenize(data)
words = word_tokenize(data)
 
print(phrases)
print(words)

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']
['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', '.']


### Natural Language Processing: remove stop words
#### NLTK stop words
NLTK Natural Language Processing with Python
Natural language processing (nlp) is a research field that presents many challenges such as natural language understanding.
Text may contain stop words like ‘the’, ‘is’, ‘are’. Stop words can be filtered from the text to be processed. There is no universal list of stop words in nlp research, however the nltk module contains a list of stop words.

In this article you will learn how to remove stop words with the nltk module.

In [8]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []
 
for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
 
print(wordsFiltered)

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.', 'All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']


#### Alternatively

In [42]:
import pandas as pd
import numpy as np

Table1 = pd.DataFrame(words,columns=['word'])
                  

Table2=pd.DataFrame.transpose(pd.DataFrame([stopWords]))

Table2.columns = ['word']

index=Table1.word.isin(Table2.word) 

print(Table1.word[index==False])

print(Table1.word[index==True])


0       All
1      work
4      play
5     makes
6      jack
7      dull
8       boy
9         .
10      All
11     work
14     play
15    makes
16     jack
18     dull
19      boy
20        .
Name: word, dtype: object
2     and
3      no
12    and
13     no
17      a
Name: word, dtype: object


In [12]:
import pandas as pd
import numpy as np

TableA = pd.DataFrame(np.random.rand(4, 3),
                      pd.Index(list('abcd'), name='Key'),
                      ['A', 'B', 'C']).reset_index()
TableB = pd.DataFrame(np.random.rand(4, 3),
                      pd.Index(list('aecf'), name='Key'),
                      ['A', 'B', 'C']).reset_index()

In [11]:

TableB

Unnamed: 0,Key,A,B,C
0,a,0.801643,0.60558,0.796524
1,e,0.155228,0.760834,0.66948
2,c,0.853515,0.961704,0.513742
3,f,0.501308,0.187677,0.057073


In [12]:
TableA

Unnamed: 0,Key,A,B,C
0,a,0.379867,0.953032,0.971827
1,b,0.923072,0.011723,0.927991
2,c,0.630206,0.362749,0.886373
3,d,0.106573,0.519791,0.990087


In [14]:
# Identify what values are in TableB and not in TableA
key_diff = set(TableB.Key).difference(TableA.Key)
key_diff

{'e', 'f'}

In [32]:
where_diff = TableB.Key.isin(key_diff)
where_diff

0    False
1     True
2    False
3     True
Name: Key, dtype: bool

In [47]:
index=TableB.Key.isin(TableA.Key) 

TableB.Key[index==True]


0    a
2    c
Name: Key, dtype: object

In [16]:
# Slice TableB accordingly and append to TableA
TableA.append(TableB[where_diff], ignore_index=True)

Unnamed: 0,Key,A,B,C
0,a,0.379867,0.953032,0.971827
1,b,0.923072,0.011723,0.927991
2,c,0.630206,0.362749,0.886373
3,d,0.106573,0.519791,0.990087
4,e,0.155228,0.760834,0.66948
5,f,0.501308,0.187677,0.057073


In [27]:
# Method 2
import timeit

start=timeit.default_timer()

rows = []
for i, row in TableB.iterrows():
    if row.Key not in TableA.Key.values:
        rows.append(row)

pd.concat([TableA.T] + rows, axis=1).T

end=timeit.default_timer()
end - start #elapsed time in seconds

0.003781184001127258

In [17]:
TableB.iterrows()

<generator object DataFrame.iterrows at 0x110929fc0>

In [21]:
#Method three
TableB_only = pd.merge(
    TableA, TableB,
    how='outer', on='Key', indicator=True, suffixes=('_foo','')).query(
        '_merge == "right_only"')

print('TableB_only', TableB_only, sep='\n')

Table_concatenated = pd.concat((TableA, TableB_only), join='inner')

Table_concatenated

TableB_only
  Key  A_foo  B_foo  C_foo         A         B         C      _merge
4   e    NaN    NaN    NaN  0.155228  0.760834  0.669480  right_only
5   f    NaN    NaN    NaN  0.501308  0.187677  0.057073  right_only


Unnamed: 0,Key,A,B,C
0,a,0.379867,0.953032,0.971827
1,b,0.923072,0.011723,0.927991
2,c,0.630206,0.362749,0.886373
3,d,0.106573,0.519791,0.990087
4,e,0.155228,0.760834,0.66948
5,f,0.501308,0.187677,0.057073


#### NLTK – stemming
A word stem is part of a word. It is sort of a normalization idea, but linguistic.
For example, the stem of the word waiting is wait.

In [6]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
 
words = ["game","gaming","gamed","games"]
ps = PorterStemmer()
 
for word in words:
    print(ps.stem(word))

game
game
game
game


In [43]:
Table1 = pd.DataFrame(words)
                  

Table2=pd.DataFrame.transpose(pd.DataFrame([stopWords]))



index=Table1.isin(Table2) 

print(Table1[index==False])

print(Table1[index==True])

        0
0     All
1    work
2     and
3      no
4    play
5   makes
6    jack
7    dull
8     boy
9       .
10    All
11   work
12    and
13     no
14   play
15  makes
16   jack
17      a
18   dull
19    boy
20      .
      0
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
10  NaN
11  NaN
12  NaN
13  NaN
14  NaN
15  NaN
16  NaN
17  NaN
18  NaN
19  NaN
20  NaN
