## Natural Language Processing Example

### Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
try:
    from gensim.models import word2vec
except:
    !pip install gensim
    from gensim.models import word2vec

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pingwu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/b3/54/1d7294672110d5c0565cabc4b99ed952ced9a2dc2ca1d59ad1b34303a6de/gensim-3.8.1-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (24.7MB)
[K    100% |████████████████████████████████| 24.7MB 464kB/s ta 0:00:01   35% |███████████▎                    | 8.7MB 1.9MB/s eta 0:00:09
Collecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/0c/09/735f2786dfac9bbf39d244ce75c0313d27d4962e71e0774750dc809f2395/smart_open-1.9.0.tar.gz (70kB)
[K    100% |████████████████████████████████| 71kB 2.4MB/s ta 0:00:011
Collecting boto3 (from smart-open>=1.8.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/8e/a9/1ceaeda8aa5d3effc9098ae301820e27bf54c4000ec6f8ec79f9b265c50e/boto3-1.10.19-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 2.9MB/s ta 0:00:01
Collecting s3transf

### Import data

We have 5 examples of documents (tweets).

In [3]:
# Toy corpus: five short documents used throughout the notebook.
corpus = [
    "All bears are lovely",
    "Our tea was bad",
    "That bear drinks with bear",
    "The bear drinks tea",
    "We love bears",
]

In [4]:
# Show every raw document with a 1-based example number.
for number, document in enumerate(corpus, start=1):
    print('Example #{0:d}: "{1:s}"'.format(number, document))

Example #1: "All bears are lovely"
Example #2: "Our tea was bad"
Example #3: "That bear drinks with bear"
Example #4: "The bear drinks tea"
Example #5: "We love bears"


### Clean Data

Convert to lower case, remove stop words, stem words, etc.

In [5]:
# Clean every document: lowercase -> tokenize -> drop stop words -> stem -> re-join.
# The tokenizer, stemmer and stop-word set do not depend on the document, so they
# are built once here instead of once per document (and, for the stop-word list,
# once per token as in the original comprehension).
tokenizer = RegexpTokenizer(r'\w+')  # keeps runs of word characters, drops punctuation
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))  # set gives constant-time membership tests

list_lc = []       # lowercased documents, in corpus order
s1 = ' '           # separator used to re-join the stemmed tokens
corpus_clean = []  # cleaned documents, in corpus order
for line in corpus:
    lower_case = line.lower()
    list_lc.append(lower_case)
    b = tokenizer.tokenize(lower_case)
    words_rmStop = [word for word in b if word not in stop_words]  # remove stop words
    words_stem = [ps.stem(word) for word in words_rmStop]          # stem
    corpus_clean.append(s1.join(words_stem))

In [6]:
# Display the cleaned corpus (one stemmed, stop-word-free string per document).
corpus_clean

['bear love', 'tea bad', 'bear drink bear', 'bear drink tea', 'love bear']

In [7]:
# Print every cleaned document with a 1-based example number.
# The loop bound is taken from `corpus_clean` itself (the list being indexed)
# rather than from `corpus`, so the two can never get out of step.
for i in range(len(corpus_clean)):
    print('Cleaned example #{0:d}: "{1:s}"'.format(i+1,corpus_clean[i]))

Cleaned example #1: "bear love"
Cleaned example #2: "tea bad"
Cleaned example #3: "bear drink bear"
Cleaned example #4: "bear drink tea"
Cleaned example #5: "love bear"


### Word Frequency (WF)

The "word frequency" (WF) method records the number of times each term occurs in a document.

In [9]:
# Input to the word-frequency vectorizer: the cleaned corpus.
corpus_clean

['bear love', 'tea bad', 'bear drink bear', 'bear drink tea', 'love bear']

In [10]:
# Bag-of-words counts: learn the vocabulary from the cleaned corpus, then
# convert the resulting document-term matrix to a dense array for display.
vectorizer = CountVectorizer()
counts_matrix = vectorizer.fit_transform(corpus_clean)
array_WF = counts_matrix.toarray()
print(vectorizer.get_feature_names())
print(array_WF)

['bad', 'bear', 'drink', 'love', 'tea']
[[0 1 0 1 0]
 [1 0 0 0 1]
 [0 2 1 0 0]
 [0 1 1 0 1]
 [0 1 0 1 0]]


Print results for the "bag of words" (WF) representation.

In [15]:
# Mapping from each term to its column index in the document-term matrix.
vectorizer.vocabulary_

{'bear': 1, 'love': 3, 'tea': 4, 'bad': 0, 'drink': 2}

In [16]:
# Pretty-print the word-frequency matrix: one header row of terms, then one
# row per document followed by the cleaned text it was computed from.
sorted_voc = vectorizer.get_feature_names()
# Rows are documents and columns are terms. The original looped over the
# vocabulary size for rows and the document count for columns, which only
# worked because both happen to be 5 for this corpus.
n_docs, n_terms = array_WF.shape
print('\t\t', end = '')
for j in range(n_terms):
    print('{0:7s}'.format(sorted_voc[j]), end = '')
print('\t')
for j in range(n_docs):
    print('Example #{0:d}'.format(j+1), end = '')
    for i in range(n_terms):
        print('{0:7d}'.format(array_WF[j][i]), end = '')
    print('         "{0:s}"\t'.format(corpus_clean[j]))

		bad    bear   drink  love   tea    	
Example #1      0      1      0      1      0         "bear love"	
Example #2      1      0      0      0      1         "tea bad"	
Example #3      0      2      1      0      0         "bear drink bear"	
Example #4      0      1      1      0      1         "bear drink tea"	
Example #5      0      1      0      1      0         "love bear"	


### Term Frequency (TF)


The term frequency (TF) method normalizes the word counts in order to reduce the influence of document length.

The way to calculate it: $\frac{\rm Word ~ Frequency}{\rm total ~ number ~ of ~ words ~ in ~ the ~ document}$.

In [17]:
# Term frequency by hand: divide each document's counts by that document's
# total word count (keepdims keeps the totals as a column for broadcasting).
words_per_document = array_WF.sum(axis=1, keepdims=True)
array_TF = array_WF / words_per_document
print(vectorizer.get_feature_names())
print(array_TF)

['bad', 'bear', 'drink', 'love', 'tea']
[[0.         0.5        0.         0.5        0.        ]
 [0.5        0.         0.         0.         0.5       ]
 [0.         0.66666667 0.33333333 0.         0.        ]
 [0.         0.33333333 0.33333333 0.         0.33333333]
 [0.         0.5        0.         0.5        0.        ]]


You get the same results using `TfidfVectorizer`.

In [18]:
# Same term frequencies via scikit-learn: IDF weighting is switched off and
# every row is L1-normalised.
vectorizer2 = TfidfVectorizer(use_idf=False, norm="l1")
tf_matrix = vectorizer2.fit_transform(corpus_clean)
array_TF1 = tf_matrix.toarray()
print(vectorizer2.get_feature_names())
print(array_TF1)

['bad', 'bear', 'drink', 'love', 'tea']
[[0.         0.5        0.         0.5        0.        ]
 [0.5        0.         0.         0.         0.5       ]
 [0.         0.66666667 0.33333333 0.         0.        ]
 [0.         0.33333333 0.33333333 0.         0.33333333]
 [0.         0.5        0.         0.5        0.        ]]


Print results for the "term frequency" (TF) representation.

In [19]:
# Pretty-print the term-frequency matrix: one header row of terms, then one
# row per document followed by the cleaned text it was computed from.
sorted_voc = vectorizer2.get_feature_names()
# Rows are documents and columns are terms. The original looped over the
# vocabulary size for rows and the document count for columns, which only
# worked because both happen to be 5 for this corpus.
n_docs, n_terms = array_TF1.shape
print('\t     ', end = '')
for j in range(n_terms):
    print('{0:7s}'.format(sorted_voc[j]), end = '')
print('\t')
for j in range(n_docs):
    print('Example #{0:d}'.format(j+1), end = '')
    for i in range(n_terms):
        print('{0:7.2f}'.format(array_TF1[j][i]), end = '')
    print('         "{0:s}"\t'.format(corpus_clean[j]))

	     bad    bear   drink  love   tea    	
Example #1   0.00   0.50   0.00   0.50   0.00         "bear love"	
Example #2   0.50   0.00   0.00   0.00   0.50         "tea bad"	
Example #3   0.00   0.67   0.33   0.00   0.00         "bear drink bear"	
Example #4   0.00   0.33   0.33   0.00   0.33         "bear drink tea"	
Example #5   0.00   0.50   0.00   0.50   0.00         "love bear"	


### Term Frequency–Inverse Document Frequency (TF-IDF)

The formula that is used to compute the $\mbox{tf-idf }$ of term $t$ is

$\mbox{tf-idf}(d, t) = \mbox{tf}(t) \cdot \mbox{idf}(d, t)$

There are a number of ways to calculate $\mbox{tf}$ and $\mbox{idf}$. According to the `TfidfVectorizer` documentation:

$\mbox{tf}(t)$ here is word frequency,

if `smooth_idf=False`, 
$\mbox{idf}$ is computed as $\mbox{idf}(d, t) = \log \left[ \frac{n}{{\rm df}(d, t)} \right] + 1$,

if `smooth_idf=True`, 
$\mbox{idf}$ is computed as $\mbox{idf}(d, t) = \log \left[ \frac{ 1+n }{ 1+{\rm df}(d, t) } \right] + 1$,

where $n$ is the total number of documents and $\mbox{df}(d, t)$ is the document frequency, i.e. the number of documents that contain term $t$.

In [20]:
# TF-IDF with smoothed IDF and no row normalisation, so each entry is the
# raw count multiplied by the term's IDF weight.
vectorizer3 = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
tfidf_matrix = vectorizer3.fit_transform(corpus_clean)
array_TFIDF = tfidf_matrix.toarray()
print(vectorizer3.get_feature_names())
print(array_TFIDF)

['bad', 'bear', 'drink', 'love', 'tea']
[[0.         1.18232156 0.         1.69314718 0.        ]
 [2.09861229 0.         0.         0.         1.69314718]
 [0.         2.36464311 1.69314718 0.         0.        ]
 [0.         1.18232156 1.69314718 0.         1.69314718]
 [0.         1.18232156 0.         1.69314718 0.        ]]


Print results for the "term frequency - inverse document frequency" (TF-IDF) representation.

In [21]:
# Pretty-print the TF-IDF matrix: one header row of terms, then one row per
# document followed by the cleaned text it was computed from.
sorted_voc = vectorizer3.get_feature_names()
# Rows are documents and columns are terms. The original looped over the
# vocabulary size for rows and the document count for columns, which only
# worked because both happen to be 5 for this corpus.
n_docs, n_terms = array_TFIDF.shape
print('\t     ', end = '')
for j in range(n_terms):
    print('{0:7s}'.format(sorted_voc[j]), end = '')
print('\t')
for j in range(n_docs):
    print('Example #{0:d}'.format(j+1), end = '')
    for i in range(n_terms):
        print('{0:7.2f}'.format(array_TFIDF[j][i]), end = '')
    print('         "{0:s}"\t'.format(corpus_clean[j]))

	     bad    bear   drink  love   tea    	
Example #1   0.00   1.18   0.00   1.69   0.00         "bear love"	
Example #2   2.10   0.00   0.00   0.00   1.69         "tea bad"	
Example #3   0.00   2.36   1.69   0.00   0.00         "bear drink bear"	
Example #4   0.00   1.18   1.69   0.00   1.69         "bear drink tea"	
Example #5   0.00   1.18   0.00   1.69   0.00         "love bear"	


### Word Embedding

https://towardsdatascience.com/word-embeddings-exploration-explanation-and-exploitation-with-code-in-python-5dac99d5d795

Here we use `Word2Vec` as an example.
There are many ways to use word embeddings as features; here we average the vectors of the words in each sentence.

In [32]:
# Train a 100-dimensional Word2Vec model on the cleaned corpus.
# min_count=1 keeps every word, which is necessary for such a tiny corpus.
tokenized_sentences = [sentence.split() for sentence in corpus_clean]
# An explicit seed plus a single worker thread removes the run-to-run jitter
# caused by multi-threaded training, so the vectors printed below can be
# reproduced. NOTE(review): gensim's documentation also asks for a fixed
# PYTHONHASHSEED for reproducibility across interpreter launches -- confirm.
model = word2vec.Word2Vec(tokenized_sentences, size=100, min_count=1, seed=1, workers=1)

In [94]:
# Words closest to 'love' by cosine similarity. Word-vector queries live on
# `model.wv`; calling them on the model itself is deprecated in gensim 3.x.
model.wv.most_similar(['love'])

  """Entry point for launching an IPython kernel.



[('tea', 0.04989343136548996),
 ('bear', -0.03948143869638443),
 ('bad', -0.04141325503587723),
 ('drink', -0.06769246608018875)]

In [49]:
# The embedding of 'love' as a (1, 100) row vector. Vectors are looked up through
# `model.wv`; indexing the model directly is deprecated in gensim 3.x.
model.wv['love'].reshape((1, 100))

  """Entry point for launching an IPython kernel.


array([[-4.5096301e-03,  1.0776015e-03,  2.3104686e-03,  3.7820239e-03,
         1.5000100e-03, -6.6087674e-04,  1.3785056e-03,  1.1966325e-03,
        -1.6633301e-03,  3.0567958e-03,  8.8104384e-04, -2.5930733e-03,
        -2.1055257e-03,  1.5693693e-03,  4.0840977e-03,  3.7100532e-03,
        -4.1637695e-03, -2.4407684e-04, -3.6076580e-03, -4.0282798e-03,
         1.2396622e-03,  2.3122001e-03, -6.1124103e-04, -1.1780466e-03,
         2.1181912e-03, -4.7766850e-03, -3.7995996e-03,  2.7426230e-04,
        -3.3462713e-03, -1.8063164e-03,  3.1984197e-03,  8.4785663e-04,
        -8.8840068e-05, -2.0279523e-03, -2.7868808e-03, -2.9570975e-03,
         1.6235323e-04,  1.4239192e-03, -3.9102710e-03, -8.0130203e-06,
        -3.0255269e-03, -1.2272932e-03, -2.5831209e-03,  1.4058248e-03,
         2.7082025e-03,  4.6033827e-03,  1.8440962e-03, -2.0039440e-04,
         6.0292627e-05, -4.6431632e-03,  3.1464512e-03, -3.3449712e-03,
        -1.6436554e-03,  2.3879504e-03, -2.6112557e-03, -3.60327

In [55]:
def buildWordVector(text, size):
    """Average the word vectors of all words in ``text``.

    Word vectors are looked up in the notebook-level ``model`` through
    ``model.wv`` (indexing the model directly is deprecated in gensim 3.x).

    Parameters
    ----------
    text : str
        Whitespace-separated tokens, e.g. one entry of ``corpus_clean``.
    size : int
        Dimensionality of the word vectors; each looked-up vector is reshaped
        to ``(1, size)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)`` holding the mean of the word vectors, or
        all zeros when ``text`` contains no tokens.

    Notes
    -----
    Lookup errors raised by the model (for example for a word it does not
    know) are not caught and propagate to the caller.
    """
    vec = np.zeros((1, size))
    # split() without an argument ignores leading, trailing and repeated
    # whitespace, so an empty or padded string yields no bogus '' token
    # (split(' ') would turn '' into [''] and fail on the lookup).
    words = text.split()
    for word in words:
        vec += model.wv[word].reshape((1, size))
    if words:
        vec /= len(words)
    return vec

In [56]:
# Documents whose word vectors are averaged in the next cell.
corpus_clean

['bear love', 'tea bad', 'bear drink bear', 'bear drink tea', 'love bear']

In [57]:
# One averaged 100-dimensional vector per cleaned document, stacked row-wise
# into a single feature matrix.
document_vectors = [buildWordVector(document, 100) for document in corpus_clean]
array_wordEmbedding = np.concatenate(document_vectors)

  


In [59]:
# Show the embedding feature matrix (one row per document).
print(array_wordEmbedding)

[[-9.95494192e-04  2.14229617e-03  1.00282682e-03  2.30378966e-03
  -8.36473424e-04 -2.57339864e-03  1.59368583e-03  1.05653459e-03
  -3.01944680e-03  2.44038715e-03  2.82365625e-03 -2.24264094e-03
   1.36526139e-03  1.20678419e-03  3.56960949e-03  3.29339586e-03
   2.51698773e-04 -1.23027073e-03 -4.21406853e-03 -4.08171886e-03
   7.97631496e-04 -1.16096553e-03 -1.95844870e-03 -2.65685952e-03
  -6.01756270e-04 -1.44713558e-05 -6.34645461e-04  1.57335617e-03
  -4.47480939e-04 -3.12123448e-03  2.81179172e-03 -2.81706394e-04
  -1.59697889e-03 -1.42891498e-03 -1.91793521e-03 -4.35673050e-04
  -6.55694595e-04  2.81806762e-03 -3.79118766e-03  4.89779380e-04
  -2.86811881e-03  2.06357916e-04 -2.32828106e-03  2.30017578e-03
   3.03848495e-03  1.08424074e-03 -1.69115840e-04  8.65153634e-04
   2.33931500e-03 -3.80504271e-03  3.38211865e-03 -2.68545514e-03
   2.67913449e-04  3.49915738e-03  5.33820363e-04  5.95182646e-05
  -4.95149288e-05 -5.64630151e-04 -4.62433370e-03 -2.47629781e-03
  -2.15658