In [33]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

%matplotlib inline

# Feature Extraction
This is the most important part of any machine learning model. We extract features that are relevant and that can be understood by the computer/device. For text, **"features"** can mean many different things.

Unlike images, whose pixel values are already available as numeric data/features, text has to be analyzed to extract features. Some of the things we can do are:
- count the occurrences of each word and use the counts as features
- use one hot encoding scheme for a text (word/paragraph/document)
- use techniques like **tf-idf** (term-frequency-inverse-document-frequency) which utilizes the rareness of the text itself.

# Document-Term Matrix

This is nothing but the mapping of counts of each token/term in a document.  

**What we can do:**  
- extract all the relevant terms from each document along with the count
- create a set of all the tokens from all the documents
- now map each token from each document to the counts

**Say we have 2 documents:**  
- d1: "i am paradox. i am gru"
- d2: "i am nish"

### d1
tokens: i, am, paradox, i, am, gru

**counts**
- i: 2
- am: 2
- paradox: 1
- gru: 1

### d2
tokens: i, am, nish

**counts**
- i: 1
- am: 1
- nish: 1

### generate vocabularies
Here we create a set of all the **unique** tokens from all the documents.

**tokens**: i, am, paradox, gru, nish

### document-term matrix

| document | i | am | paradox | gru | nish |
| -------  |---|----|---------|-----|------|
| d1       | 2 | 2  | 1       |1    |0     |
| d2       | 1 | 1  | 0       |0    |1     |


So, the feature vectors for the documents are as follows:

- **d1** --> (2, 2, 1, 1, 0)
- **d2** --> (1, 1, 0, 0, 1)

Finally, these vectors can be used for our machine learning model.  

Cheers...

In [45]:
# Toy corpus for the first example, written as adjacent string literals
# (one sentence per line). The double space before the last sentence is part
# of the text and is kept as-is.
text = (
    "i am paradox. "
    "i am gru. "
    "i am nish. "
    "i am a caffeine addict.  "
    "i love caffeine"
)

In [46]:
# use simple counts as features
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
# Train the feature extractor on the simple training text.
# NOTE: fit() receives the list of NLTK word tokens, so every token is handed
# over as its own item. The default token_pattern only keeps words of two or
# more characters, which is why "i" and "a" never make it into the vocabulary.
tokens = nltk.word_tokenize(text)
count_vectorizer = CountVectorizer()
count_vectorizer.fit(tokens)  # fit() returns the same vectorizer instance
count_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [48]:
# All the features/tokens learned during fit, in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the plain-list display this cell produced before.
count_vectorizer.get_feature_names_out().tolist()

['addict', 'am', 'caffeine', 'gru', 'love', 'nish', 'paradox']

In [49]:
# Encode a new, unseen piece of text with the fitted vocabulary.
# NOTE: the string is word-tokenized first, so each token is passed to
# transform() as a separate item -- the result has one row per token
# ("caffeine", "is", "love"), not a single row for the whole sentence.
# "is" is not in the vocabulary, so its row stays empty.
test = "caffeine is love"
test_vect = count_vectorizer.transform(
    nltk.word_tokenize(test)
)
print(test_vect)

  (0, 2)	1
  (2, 4)	1


In [50]:
# Densify the sparse result to get the document-term matrix as a NumPy array
# (rows: the items passed to transform(); columns: vocabulary terms).
test_vect.toarray()

array([[0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0]])

In [51]:
# Label every column of the count matrix with its vocabulary term to see what
# the document-term matrix looks like under the hood.
# `get_feature_names_out()` replaces `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(
    test_vect.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

Unnamed: 0,addict,am,caffeine,gru,love,nish,paradox
0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0


#### Let's try on more complex (real) text

In [79]:
# Five short training documents; the vocabulary is learned from these.
train_docs = [
    "Yes, it's hard to get things done, to accept stuff despite being seemingly unworthy.But hey! Worthiness is just our own abstraction of comfort.",
    "We should be able to embrace what life throws at us diligently. Being pedantic won't do good.",
    "As we are always governed by the vastness of entropy, as such we tend to be over-dramatic towards the minor things in life.",
    "But, if we can pass that out, whether the withering wealth, health, love and shit, we can probably render ourselves joyous.",
    "I think that's the way of living of life. Live. Don't just breathe.",
]

# A single unseen document, encoded later with the learned vocabulary.
test_docs = ["we seem to be living our life. but we are not"]

# One print call with a newline separator emits the same two lines that two
# separate print calls would.
print(train_docs, test_docs, sep="\n")

["Yes, it's hard to get things done, to accept stuff despite being seemingly unworthy.But hey! Worthiness is just our own abstraction of comfort.", "We should be able to embrace what life throws at us diligently. Being pedantic won't do good.", 'As we are always governed by the vastness of entropy, as such we tend to be over-dramatic towards the minor things in life.', 'But, if we can pass that out, whether the withering wealth, health, love and shit, we can probably render ourselves joyous.', "I think that's the way of living of life. Live. Don't just breathe."]
['we seem to be living our life. but we are not']


In [80]:
# Re-fit the count vectorizer, this time on whole documents: each string in
# train_docs is one document. Note that this rebinds `count_vectorizer`.
count_vectorizer = CountVectorizer().fit(train_docs)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` returns a NumPy array, so `.tolist()` keeps the
# printed output a plain Python list as before.
print(count_vectorizer.get_feature_names_out().tolist())

['able', 'abstraction', 'accept', 'always', 'and', 'are', 'as', 'at', 'be', 'being', 'breathe', 'but', 'by', 'can', 'comfort', 'despite', 'diligently', 'do', 'don', 'done', 'dramatic', 'embrace', 'entropy', 'get', 'good', 'governed', 'hard', 'health', 'hey', 'if', 'in', 'is', 'it', 'joyous', 'just', 'life', 'live', 'living', 'love', 'minor', 'of', 'our', 'ourselves', 'out', 'over', 'own', 'pass', 'pedantic', 'probably', 'render', 'seemingly', 'shit', 'should', 'stuff', 'such', 'tend', 'that', 'the', 'things', 'think', 'throws', 'to', 'towards', 'unworthy', 'us', 'vastness', 'way', 'we', 'wealth', 'what', 'whether', 'withering', 'won', 'worthiness', 'yes']


In [81]:
# Encode the unseen test document with the vocabulary learned from train_docs.
# Words that are not in that vocabulary (here "seem" and "not") are ignored;
# "we" occurs twice in the test document, hence the single count of 2.
vect = count_vectorizer.transform(test_docs)
print(vect.toarray())

[[0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0
  0 0 0]]


In [82]:
# Show the count vector as a table with one labelled column per vocabulary term.
# `get_feature_names_out()` replaces `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(vect.toarray(), columns=count_vectorizer.get_feature_names_out())

Unnamed: 0,able,abstraction,accept,always,and,are,as,at,be,being,...,vastness,way,we,wealth,what,whether,withering,won,worthiness,yes
0,0,0,0,0,0,1,0,0,1,0,...,0,0,2,0,0,0,0,0,0,0


# TF-IDF
Tf-idf stands for term frequency - inverse document frequency, which is used in text mining and information retrieval systems to evaluate how important a word is in a document.

The importance is directly proportional to the number of times a word appears in the document but is also weighted down by the frequency of the word in the whole corpus.

### Mathematically

**term-frequency (tf)** of a term/word t is actually given by:

`tf = (number of times the term t appears in a document ) / (total number of terms in the same document)`

**inverse document frequency (idf)** measures how rare a term is across all the documents.
That is, the rarer a term is, the more weight we give it.

`idf = natural_logarithm[ (total number of documents) / (number of documents having the term t) ]`

Here, `natural_logarithm` is the logarithmic function with base **e**.


Reference: https://github.com/NISH1001/tag-generator

In [87]:
# let's create tf-idf features
from sklearn.feature_extraction.text import TfidfVectorizer

In [88]:
# Fit the tf-idf vectorizer on the training documents.
tfidf_vectorizer = TfidfVectorizer().fit(train_docs)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` returns a NumPy array, so `.tolist()` keeps the
# printed output a plain Python list as before.
print(tfidf_vectorizer.get_feature_names_out().tolist())

['able', 'abstraction', 'accept', 'always', 'and', 'are', 'as', 'at', 'be', 'being', 'breathe', 'but', 'by', 'can', 'comfort', 'despite', 'diligently', 'do', 'don', 'done', 'dramatic', 'embrace', 'entropy', 'get', 'good', 'governed', 'hard', 'health', 'hey', 'if', 'in', 'is', 'it', 'joyous', 'just', 'life', 'live', 'living', 'love', 'minor', 'of', 'our', 'ourselves', 'out', 'over', 'own', 'pass', 'pedantic', 'probably', 'render', 'seemingly', 'shit', 'should', 'stuff', 'such', 'tend', 'that', 'the', 'things', 'think', 'throws', 'to', 'towards', 'unworthy', 'us', 'vastness', 'way', 'we', 'wealth', 'what', 'whether', 'withering', 'won', 'worthiness', 'yes']


In [89]:
# Encode the test document as tf-idf weights instead of raw counts.
# NOTE: with TfidfVectorizer's default settings (smooth_idf=True, norm="l2"),
# scikit-learn uses the raw term count as tf, computes idf as
# ln((1 + n) / (1 + df)) + 1, and scales each row to unit length -- so these
# values will not match the plain tf * idf formula given in the text above.
# This rebinds `vect`, which previously held the raw-count matrix.
vect = tfidf_vectorizer.transform(test_docs)
print(vect.toarray())

[[0.         0.         0.         0.         0.         0.37815591
  0.         0.         0.30509381 0.         0.         0.30509381
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.25325542
  0.         0.37815591 0.         0.         0.         0.37815591
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.25325542 0.         0.         0.         0.
  0.         0.50651084 0.         0.         0.         0.
  0.         0.         0.        ]]


In [90]:
# Show the tf-idf vector as a table with one labelled column per vocabulary term.
# `get_feature_names_out()` replaces `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
pd.DataFrame(vect.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

Unnamed: 0,able,abstraction,accept,always,and,are,as,at,be,being,...,vastness,way,we,wealth,what,whether,withering,won,worthiness,yes
0,0.0,0.0,0.0,0.0,0.0,0.378156,0.0,0.0,0.305094,0.0,...,0.0,0.0,0.506511,0.0,0.0,0.0,0.0,0.0,0.0,0.0
