### NLP Representations Exercise

In [60]:
# import pandas
import pandas as pd

# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
 
# import TfidfTransformer 
from sklearn.feature_extraction.text import TfidfTransformer

In [61]:
docs=["the house had a tiny little mouse",
      "the cat saw the mouse",
      "the mouse ran away from the house",
      "the cat finally ate the mouse",
      "the end of the mouse story"
     ]

# BagOfWords

* instantiate [CountVectorizer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [62]:
cv = CountVectorizer(
    strip_accents='unicode',
    stop_words='english')

* use fit_transform method of CountVectorizer to 'docs' and store the result in 'word_count_vector'

In [63]:
word_count_vector  = cv.fit_transform(docs)

* print the shape 

In [64]:
vocab = dict((y, x) for x, y in cv.vocabulary_.items())

In [65]:
[value[1] for value in sorted(vocab.items())]

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'house',
 'little',
 'mouse',
 'ran',
 'saw',
 'story',
 'tiny']

* create dataframe from word_count_vector

In [66]:
pd.DataFrame(word_count_vector.toarray(), columns=[value[1] for value in sorted(vocab.items())])

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
0,0,0,0,0,0,1,1,1,0,0,0,1
1,0,0,1,0,0,0,0,1,0,1,0,0
2,0,1,0,0,0,1,0,1,1,0,0,0
3,1,0,1,0,1,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0,1,0


# TfIdf with TfidfTransformer

* instantiate [TfidfTransformer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html) with the following parameters: 
    * smooth_idf = True
    * use_idf = True

In [67]:
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)

* use fit_transform method of tfidf transformer on 'word_count_vector' created above and store the result in 'tf_idf_data'

In [68]:
tf_idf_data = tfidf.fit_transform(word_count_vector)

* create dataframe from 'tf_idf_data'

In [69]:
pd.DataFrame(tf_idf_data.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,0.0,0.0,0.0,0.0,0.475575,0.589463,0.280882,0.0,0.0,0.0,0.589463
1,0.0,0.0,0.588732,0.0,0.0,0.0,0.0,0.347715,0.0,0.729718,0.0,0.0
2,0.0,0.589463,0.0,0.0,0.0,0.475575,0.0,0.280882,0.589463,0.0,0.0,0.0
3,0.589463,0.0,0.475575,0.0,0.589463,0.0,0.0,0.280882,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.670092,0.0,0.0,0.0,0.319302,0.0,0.0,0.670092,0.0


* print the IDF for words in 'docs'

In [70]:
tfidf.idf_

array([2.09861229, 2.09861229, 1.69314718, 2.09861229, 2.09861229,
       1.69314718, 2.09861229, 1.        , 2.09861229, 2.09861229,
       2.09861229, 2.09861229])

# TfIdf with TfidfVectorizer

* instantiate [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) with the following parameters:
    - use_idf = True

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(use_idf=True)

* fit and transforn 'docs' with TfidfVectorizer and store the result in 'tfidf_vectorizer_data'

In [73]:
tfidf_vectorizer_data = tfidf_vec.fit_transform(docs)

* create dataframe from tfidf_vectorizer_data

In [85]:
tfidf_vec.vocabulary_

{'the': 14,
 'house': 7,
 'had': 6,
 'tiny': 15,
 'little': 8,
 'mouse': 9,
 'cat': 2,
 'saw': 12,
 'ran': 11,
 'away': 1,
 'from': 5,
 'finally': 4,
 'ate': 0,
 'end': 3,
 'of': 10,
 'story': 13}

In [82]:
pd.DataFrame(tfidf_vectorizer_data.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
1,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
2,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
3,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
4,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0


* print IDF for words in 'docs'

In [84]:
tfidf_vec.idf_

array([2.09861229, 2.09861229, 1.69314718, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 1.69314718, 2.09861229, 1.        ,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 1.        ,
       2.09861229])