# Python NLP Libraryies Implmention

In [39]:
sentence1 = "At eight on Thursday morning Arthur felt very very good, but not perfect."
sentence2 = "Thursday night is good"
sentence3 = "Arthur is is feeling good at eight"
paragraph = "At eight on Thursday morning Arthur felt very very good, but not prefect. Thursday night is good! Although Arthur is is feeling good at eight."

## 1. NLTK

In [4]:
import nltk

### 1.1 Sentence tokenizer

In [19]:
print (type(nltk.sent_tokenize(paragraph)), nltk.sent_tokenize(paragraph))

<class 'list'> ['At eight on Thursday morning Arthur felt very very good, but not prefect.', 'Thursday night is good!', 'Although Arthur is is feeling good at eight']


### 1.2 Normalization

In [27]:
sentence1 = sentence1.lower()
print (sentence1)

at eight on thursday morning arthur felt very very good, but not perfect.


### 1.3 Word Tokenizer

In [28]:
words = nltk.word_tokenize(sentence1)
print (type(words), words)

<class 'list'> ['at', 'eight', 'on', 'thursday', 'morning', 'arthur', 'felt', 'very', 'very', 'good', ',', 'but', 'not', 'perfect', '.']


### 1.4 Remove Punctuations

In [29]:
import string
punctuations = set(string.punctuation)

In [30]:
print ([x for x in words if x not in punctuations])

['at', 'eight', 'on', 'thursday', 'morning', 'arthur', 'felt', 'very', 'very', 'good', 'but', 'not', 'perfect']


### 1.5 Remove Stopwords

In [22]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('English'))

In [31]:
print ([x for x in words if x not in stop_words])

['eight', 'thursday', 'morning', 'arthur', 'felt', 'good', ',', 'perfect', '.']


### 1.6 Stemming

In [32]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [34]:
print ([ps.stem(x) for x in words])

['at', 'eight', 'on', 'thursday', 'morn', 'arthur', 'felt', 'veri', 'veri', 'good', ',', 'but', 'not', 'perfect', '.']


Note here the porter stemmer is unable to detect 'felt' with stemmed 'feel'.

### 1.7 All Together

In [35]:
print ([ps.stem(x.lower()) for x in words if x not in punctuations if x not in stop_words])

['eight', 'thursday', 'morn', 'arthur', 'felt', 'good', 'perfect']


Now we can see the word space is smaller than directly word tokenizing the sentence.

## 2. Word Feature Vector Space

In [40]:
print (paragraph)

At eight on Thursday morning Arthur felt very very good, but not prefect. Thursday night is good! Although Arthur is is feeling good at eight.


###  2.1 Feature space

In [54]:
all_words = []
for sentence in nltk.sent_tokenize(paragraph):
    print (sentence)
    all_words += [ps.stem(x.lower()) for x in nltk.word_tokenize(sentence) 
                 if x not in punctuations if x not in stop_words]

At eight on Thursday morning Arthur felt very very good, but not prefect.
Thursday night is good!
Although Arthur is is feeling good at eight.


### 2.2 Bag-of-words model

In [55]:
print (all_words)

['at', 'eight', 'thursday', 'morn', 'arthur', 'felt', 'good', 'prefect', 'thursday', 'night', 'good', 'although', 'arthur', 'feel', 'good', 'eight']


In [59]:
def get_features(review):
    features = {}
    review_words = set([x.lower() for x in nltk.word_tokenize(str(review)) 
                     if x not in stop_words if x not in punctuations])
    for word in all_words:
        features[word] = (word in review_words)
    return features

### 2.3 Basic representaiton (0/1) for feature vectors

In [61]:
sent =1
for sentence in nltk.sent_tokenize(paragraph):
    print (sent, get_features(sentence))
    sent +=1

1 {'good': True, 'eight': True, 'felt': True, 'prefect': True, 'night': False, 'arthur': True, 'although': False, 'thursday': True, 'at': True, 'feel': False, 'morn': False}
2 {'good': True, 'eight': False, 'felt': False, 'prefect': False, 'night': True, 'arthur': False, 'although': False, 'thursday': True, 'at': False, 'feel': False, 'morn': False}
3 {'good': True, 'eight': True, 'felt': False, 'prefect': False, 'night': False, 'arthur': True, 'although': True, 'thursday': False, 'at': False, 'feel': False, 'morn': False}


## 3. Sklearn: TF-IDF representation

### $$\textrm{tf-idf(w,d)} = \textrm{tf (w,d)}\times (\textrm{idf(w,d)}+1)$$

### 3.1 TF (term frequency)

In [63]:
import numpy as np
doc = np.array([sentence1, sentence2, sentence3])

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(lowercase=False)  
bag = count.fit_transform(doc)
print (count.vocabulary_)

{'eight': 5, 'very': 15, 'feeling': 6, 'Thursday': 2, 'not': 12, 'morning': 10, 'but': 4, 'Arthur': 0, 'on': 13, 'At': 1, 'night': 11, 'at': 3, 'good': 8, 'is': 9, 'perfect': 14, 'felt': 7}


In [66]:
'''The deault to use CountVectorizer is lowercase = True'''
count = CountVectorizer()
bag = count.fit_transform(doc)
print (count.vocabulary_)

{'eight': 3, 'very': 14, 'feeling': 4, 'night': 9, 'on': 11, 'thursday': 13, 'arthur': 0, 'not': 10, 'morning': 8, 'at': 1, 'good': 6, 'but': 2, 'is': 7, 'perfect': 12, 'felt': 5}


### 3.2 Feature index

In [80]:
count = CountVectorizer(stop_words='english')
bag = count.fit_transform(doc)
print (count.vocabulary_)

{'feeling': 1, 'night': 5, 'arthur': 0, 'thursday': 7, 'morning': 4, 'good': 3, 'perfect': 6, 'felt': 2}


In [72]:
print (doc)

[ 'At eight on Thursday morning Arthur felt very very good, but not perfect.'
 'Thursday night is good' 'Arthur is is feeling good at eight']


In [71]:
print (bag.toarray())

[[1 0 1 1 1 0 1 1]
 [0 0 0 1 0 1 0 1]
 [1 1 0 1 0 0 0 0]]


### 3.3 n-gram representation (n>1)

In [76]:
twograms = CountVectorizer(ngram_range=(1,2), stop_words ='english')

In [77]:
bag = twograms.fit_transform(doc)
print (twograms.vocabulary_)

{'feeling': 3, 'thursday': 14, 'morning': 9, 'arthur feeling': 1, 'felt good': 6, 'thursday night': 16, 'thursday morning': 15, 'night good': 12, 'feeling good': 4, 'arthur felt': 2, 'night': 11, 'arthur': 0, 'good perfect': 8, 'morning arthur': 10, 'good': 7, 'perfect': 13, 'felt': 5}


In [78]:
print (bag.toarray())

[[1 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 0]
 [0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 0 1]
 [1 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0]]


### 3.4 TF-IDF representation

#### 3.4.1 Using CountVectorizer( ) + TfidfTransformer( )

In [82]:
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
tfidf = TfidfTransformer()  ## this bag is normalized and removes stop words.
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray())

[[ 0.36  0.    0.47  0.28  0.47  0.    0.47  0.36]
 [ 0.    0.    0.    0.43  0.    0.72  0.    0.55]
 [ 0.55  0.72  0.    0.43  0.    0.    0.    0.  ]]


#### 3.4.2 Using TfidfVectorizer( )

In [84]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None,lowercase=True,preprocessor=None, stop_words ='english')
print(tfidf.fit_transform(doc).toarray())

[[ 0.36  0.    0.47  0.28  0.47  0.    0.47  0.36]
 [ 0.    0.    0.    0.43  0.    0.72  0.    0.55]
 [ 0.55  0.72  0.    0.43  0.    0.    0.    0.  ]]
