# Python NLP Library Implmention

In [129]:
sentence1 = "At eight on Thursday morning Arthur felt very good good, but not perfect."
sentence2 = "Thursday night is good!"
sentence3 = "Although Arthur is is feeling good at eight feel"
article = "At eight on Thursday morning Arthur felt very good good, but not prefect. Thursday night is good! Although Arthur is is feeling good at eight feel."

## 1. NLTK

In [4]:
import nltk

### 1.1 Sentence tokenizer

In [130]:
print (type(nltk.sent_tokenize(article)), nltk.sent_tokenize(article))

<class 'list'> ['At eight on Thursday morning Arthur felt very good good, but not prefect.', 'Thursday night is good!', 'Although Arthur is is feeling good at eight feel.']


### 1.2 Normalization

Unify all words in lower case.

In [131]:
sentence1 = sentence1.lower()
print (sentence1)

at eight on thursday morning arthur felt very good good, but not perfect.


### 1.3 Word Tokenizer

In [132]:
words = nltk.word_tokenize(sentence1)
print (type(words), words)

<class 'list'> ['at', 'eight', 'on', 'thursday', 'morning', 'arthur', 'felt', 'very', 'good', 'good', ',', 'but', 'not', 'perfect', '.']


### 1.4 Remove Punctuations

In [133]:
import string
punctuations = set(string.punctuation)

In [169]:
print (punctuations)

{'^', '|', '~', '{', ':', ';', '=', '}', '\\', '%', '>', '.', '!', '+', '#', '?', '*', ')', '(', "'", '&', '[', ']', '@', '-', '`', ',', '<', '"', '/', '$', '_'}


In [134]:
print ([x for x in words if x not in punctuations])

['at', 'eight', 'on', 'thursday', 'morning', 'arthur', 'felt', 'very', 'good', 'good', 'but', 'not', 'perfect']


### 1.5 Remove Stopwords

In [135]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('English'))

In [101]:
print (stop_words)

{'between', 'below', 'ain', 'me', 'off', 'doesn', 'while', 'out', 'my', 'where', 'and', 'our', 'how', 'some', 'after', 'such', 'nor', 'same', 'himself', 'all', 'weren', 'mustn', 'to', 'can', 'mightn', 'needn', 'hers', 'for', 'will', 'o', 'then', 'few', 'she', 'wasn', 'has', 'each', 'with', 'in', 'is', 's', 'have', 'm', 'having', 'what', 'whom', 'not', 'those', 'do', 'ourselves', 'his', 'ma', 'against', 'we', 'ours', 'your', 'it', 'its', 'these', 'at', 'by', 'into', 'too', 'theirs', 'no', 'he', 'aren', 'over', 're', 'yours', 'now', 'but', 'down', 'their', 'who', 'both', 'because', 'most', 'so', 'as', 'only', 'll', 'on', 'itself', 'didn', 'won', 'am', 'an', 'than', 'of', 'hasn', 'hadn', 'which', 'they', 'there', 'be', 'i', 'if', 'once', 'until', 'up', 'further', 'a', 'own', 'shouldn', 'were', 'yourself', 'above', 'that', 'shan', 'before', 'through', 'him', 'her', 'this', 'wouldn', 'did', 'had', 'myself', 'should', 'isn', 'just', 'does', 'yourselves', 'the', 'under', 'been', 'when', 'you'

In [136]:
print ([x for x in words if x not in stop_words])

['eight', 'thursday', 'morning', 'arthur', 'felt', 'good', 'good', ',', 'perfect', '.']


### 1.6 Stemming

Turning all verb's tenses to present tense and plural nouns to singular noun.

In [137]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [138]:
print ([ps.stem(x) for x in words])

['at', 'eight', 'on', 'thursday', 'morn', 'arthur', 'felt', 'veri', 'good', 'good', ',', 'but', 'not', 'perfect', '.']


Note here the porter stemmer is unable to detect 'felt' with stemmed 'feel', so the porter stemmer doesn't work perfectly.

### 1.7 All Together

In [140]:
print (set([ps.stem(x.lower()) for x in words if x not in punctuations if x not in stop_words]))

{'eight', 'arthur', 'thursday', 'morn', 'good', 'perfect', 'felt'}


Now we can see the word space is smaller than directly word tokenizing the sentence.

## 2. Word Feature Vector Space

In [141]:
print (article)

At eight on Thursday morning Arthur felt very good good, but not prefect. Thursday night is good! Although Arthur is is feeling good at eight feel.


###  2.1 Feature space

In [143]:
all_corpus_words = []
for sentence in nltk.sent_tokenize(article.lower()):
    print (sentence)
    words = [ps.stem(x) for x in nltk.word_tokenize(sentence) 
                 if x not in punctuations if x not in stop_words]
    all_corpus_words += words
    print (words)
    
all_corpus_words = set(all_corpus_words)

at eight on thursday morning arthur felt very good good, but not prefect.
['eight', 'thursday', 'morn', 'arthur', 'felt', 'good', 'good', 'prefect']
thursday night is good!
['thursday', 'night', 'good']
although arthur is is feeling good at eight feel.
['although', 'arthur', 'feel', 'good', 'eight', 'feel']


### 2.2 Bag-of-words model

In [144]:
print (all_corpus_words)

{'eight', 'prefect', 'night', 'arthur', 'although', 'thursday', 'feel', 'morn', 'good', 'felt'}


In [59]:
def get_features(review):
    features = {}
    review_words = set([x.lower() for x in nltk.word_tokenize(str(review)) 
                     if x not in stop_words if x not in punctuations])
    for word in all_words:
        features[word] = (word in review_words)
    return features

### 2.3 Basic representaiton (0/1) for feature vectors

In [145]:
sent =1
for sentence in nltk.sent_tokenize(article):
    print (sent, get_features(sentence))
    sent +=1

1 {'good': True, 'eight': True, 'felt': True, 'feel': False, 'night': False, 'arthur': True, 'although': False, 'thursday': True, 'prefect': True, 'morn': False}
2 {'good': True, 'eight': False, 'felt': False, 'feel': False, 'night': True, 'arthur': False, 'although': False, 'thursday': True, 'prefect': False, 'morn': False}
3 {'good': True, 'eight': True, 'felt': False, 'feel': True, 'night': False, 'arthur': True, 'although': True, 'thursday': False, 'prefect': False, 'morn': False}


## 3. Sklearn: TF-IDF representation

## $$\textrm{tf-idf(w,d)} = \textrm{tf (w,d)}\times (\textrm{idf(w,d)}+1)$$

### 3.1 TF representation

In [146]:
import numpy as np
doc = np.array([sentence1, sentence2, sentence3])

In [147]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(lowercase=False)  
bag = count.fit_transform(doc)
print (count.vocabulary_)

{'eight': 6, 'very': 18, 'feeling': 8, 'thursday': 17, 'Thursday': 2, 'not': 14, 'morning': 12, 'but': 5, 'Arthur': 1, 'Although': 0, 'on': 15, 'night': 13, 'arthur': 3, 'feel': 7, 'at': 4, 'good': 10, 'is': 11, 'perfect': 16, 'felt': 9}


In [148]:
'''The deault to use CountVectorizer is lowercase = True'''
count = CountVectorizer()
bag = count.fit_transform(doc)
print (count.vocabulary_)

{'eight': 4, 'very': 16, 'feeling': 6, 'although': 0, 'thursday': 15, 'not': 12, 'morning': 10, 'but': 3, 'on': 13, 'night': 11, 'arthur': 1, 'feel': 5, 'at': 2, 'good': 8, 'is': 9, 'perfect': 14, 'felt': 7}


### 3.2 Feature index

In [164]:
count = CountVectorizer(stop_words='english')
bag = count.fit_transform(doc)
print (count.vocabulary_)

{'feeling': 2, 'night': 6, 'arthur': 0, 'thursday': 8, 'morning': 5, 'good': 4, 'feel': 1, 'perfect': 7, 'felt': 3}


In [158]:
print (all_corpus_words)

{'eight', 'prefect', 'night', 'arthur', 'although', 'thursday', 'feel', 'morn', 'good', 'felt'}


Note here the sklearn's CountVector also remvoed words **although** and **eight**, but previously we implemented stemmer so there is no **feeling** in **all_corpus_words**.

In [125]:
print (doc)

[ 'At eight on Thursday morning Arthur felt very very good, but not perfect.'
 'Thursday night is good!' 'Although Arthur is is feeling good at eight']


In [159]:
print (bag.toarray())

[[1 0 0 1 2 1 0 1 1]
 [0 0 0 0 1 0 1 0 1]
 [1 1 1 0 1 0 0 0 0]]


### 3.3 n-gram representation (n>1)

In [76]:
twograms = CountVectorizer(ngram_range=(1,2), stop_words ='english')

In [160]:
bag = twograms.fit_transform(doc)
print (twograms.vocabulary_)

{'feeling': 4, 'thursday': 17, 'morning': 12, 'arthur feeling': 1, 'felt good': 7, 'thursday night': 19, 'thursday morning': 18, 'night good': 15, 'feeling good': 5, 'arthur felt': 2, 'night': 14, 'arthur': 0, 'good perfect': 11, 'feel': 3, 'morning arthur': 13, 'good good': 10, 'good': 8, 'perfect': 16, 'felt': 6, 'good feel': 9}


In [161]:
print (bag.toarray())

[[1 0 1 0 0 0 1 1 2 0 1 1 1 1 0 0 1 1 1 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1]
 [1 1 0 1 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0]]


### 3.4 TF-IDF representation

#### 3.4.1 TfidfTransformer( )

In [165]:
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
tfidf = TfidfTransformer()  ## this bag is normalized and removes stop words.
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray())

[[ 0.32  0.    0.    0.42  0.5   0.42  0.    0.42  0.32]
 [ 0.    0.    0.    0.    0.43  0.    0.72  0.    0.55]
 [ 0.44  0.58  0.58  0.    0.35  0.    0.    0.    0.  ]]


#### 3.4.2 Using TfidfVectorizer( ) = CountVectorizer( ) + TfidfTransformer( )

In [166]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,lowercase=True,preprocessor=None, stop_words ='english')
print(tfidf.fit_transform(doc).toarray())

[[ 0.32  0.    0.    0.42  0.5   0.42  0.    0.42  0.32]
 [ 0.    0.    0.    0.    0.43  0.    0.72  0.    0.55]
 [ 0.44  0.58  0.58  0.    0.35  0.    0.    0.    0.  ]]


In [170]:
print (count.vocabulary_)
print (bag.toarray())

{'feeling': 2, 'night': 6, 'arthur': 0, 'thursday': 8, 'morning': 5, 'good': 4, 'feel': 1, 'perfect': 7, 'felt': 3}
[[1 0 0 1 2 1 0 1 1]
 [0 0 0 0 1 0 1 0 1]
 [1 1 1 0 1 0 0 0 0]]


In [177]:
doc

array([ 'at eight on thursday morning arthur felt very good good, but not perfect.',
       'Thursday night is good!',
       'Although Arthur is is feeling good at eight feel'], 
      dtype='<U73')

In [183]:
index = [count.vocabulary_['thursday'], count.vocabulary_['good'], count.vocabulary_['perfect'],count.vocabulary_['night']]
print (index)
print (bag.toarray()[0][index])

[8, 4, 7, 6]
[1 2 1 0]


### 3.5 Overall article in terms of tfidf reprentation

In [179]:
doc_tfidf = tfidf.fit_transform([x for x in nltk.sent_tokenize(article)])

In [181]:
print(doc_tfidf.toarray())

[[ 0.32  0.    0.    0.42  0.5   0.42  0.    0.42  0.32]
 [ 0.    0.    0.    0.    0.43  0.    0.72  0.    0.55]
 [ 0.44  0.58  0.58  0.    0.35  0.    0.    0.    0.  ]]


In [182]:
print (doc_tfidf.toarray()[0][index])

[ 0.32  0.5   0.42  0.  ]
