In [27]:
import pandas as pd
import numpy as np

## CountVectorizer

In [28]:
# Toy corpus: three short sentences with overlapping words in mixed case.
sent1 = [
    (
        "The Brown Dog jumps over the Green Gate to chase Brown and white Rabbits "
        "which escaped into the white Gate"
    ),
    "white rabbits are eating and playing in Green Grass",
    "white Rabbits are chased by brown dog",
]

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
cvec=CountVectorizer()

In [31]:
cvec

CountVectorizer()

In [32]:
cvec.fit(sent1)

CountVectorizer()

In [33]:
# All unique (lower-cased) tokens learned by the fitted vectorizer, in column order.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# returns an ndarray, so .tolist() keeps the plain-list display shown below.
cvec.get_feature_names_out().tolist()

['and',
 'are',
 'brown',
 'by',
 'chase',
 'chased',
 'dog',
 'eating',
 'escaped',
 'gate',
 'grass',
 'green',
 'in',
 'into',
 'jumps',
 'over',
 'playing',
 'rabbits',
 'the',
 'to',
 'which',
 'white']

In [34]:
cvec.vocabulary_

{'the': 18,
 'brown': 2,
 'dog': 6,
 'jumps': 14,
 'over': 15,
 'green': 11,
 'gate': 9,
 'to': 19,
 'chase': 4,
 'and': 0,
 'white': 21,
 'rabbits': 17,
 'which': 20,
 'escaped': 8,
 'into': 13,
 'are': 1,
 'eating': 7,
 'playing': 16,
 'in': 12,
 'grass': 10,
 'chased': 5,
 'by': 3}

In [35]:
sent_cvec=cvec.transform(sent1)

In [36]:
sent_cvec

<3x22 sparse matrix of type '<class 'numpy.int64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [37]:
# Build the dense count matrix straight from the vectorizer instead of from
# `sent_cvec` itself: the old self-referential form failed when the cell was
# re-run, because an ndarray has no .toarray() method.
sent_cvec = cvec.transform(sent1).toarray()

In [38]:
sent1

['The Brown Dog jumps over the Green Gate to chase Brown and white Rabbits which escaped into the white Gate',
 'white rabbits are eating and playing in Green Grass',
 'white Rabbits are chased by brown dog']

In [39]:
sent_cvec

array([[1, 0, 2, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 3, 1, 1, 2],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]],
      dtype=int64)

In [40]:
# Label the count matrix: one row per sentence, one column per vocabulary token.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(sent_cvec, columns=cvec.get_feature_names_out())

Unnamed: 0,and,are,brown,by,chase,chased,dog,eating,escaped,gate,...,in,into,jumps,over,playing,rabbits,the,to,which,white
0,1,0,2,0,1,0,1,0,1,2,...,0,1,1,1,0,1,3,1,1,2
1,1,1,0,0,0,0,0,1,0,0,...,1,0,0,0,1,1,0,0,0,1
2,0,1,1,1,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [41]:
sent1

['The Brown Dog jumps over the Green Gate to chase Brown and white Rabbits which escaped into the white Gate',
 'white rabbits are eating and playing in Green Grass',
 'white Rabbits are chased by brown dog']

## TfidfTransformer

In [42]:
from sklearn.feature_extraction.text import TfidfTransformer

In [43]:
tftrans=TfidfTransformer()

In [44]:
tftrans

TfidfTransformer()

In [45]:
sent_cvec1=cvec.transform(sent1)

In [46]:
sent_cvec1

<3x22 sparse matrix of type '<class 'numpy.int64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [47]:
tftrans.fit(sent_cvec1) # fit on the CountVectorizer output (the sparse count matrix), not on the raw sentences

TfidfTransformer()

In [48]:
sent_trans=tftrans.transform(sent_cvec1)

In [49]:
sent_trans

<3x22 sparse matrix of type '<class 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [50]:
sent_trans.toarray()

array([[0.14974888, 0.        , 0.29949775, 0.        , 0.19690178,
        0.        , 0.14974888, 0.        , 0.19690178, 0.39380355,
        0.        , 0.14974888, 0.        , 0.19690178, 0.19690178,
        0.19690178, 0.        , 0.11629336, 0.59070533, 0.19690178,
        0.19690178, 0.23258672],
       [0.29985557, 0.29985557, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.39427404, 0.        , 0.        ,
        0.39427404, 0.29985557, 0.39427404, 0.        , 0.        ,
        0.        , 0.39427404, 0.2328646 , 0.        , 0.        ,
        0.        , 0.2328646 ],
       [0.        , 0.3612204 , 0.3612204 , 0.47496141, 0.        ,
        0.47496141, 0.3612204 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28051986, 0.        , 0.        ,
        0.        , 0.28051986]])

In [53]:
sent_cvec1.toarray()

array([[1, 0, 2, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 1, 1, 0, 1, 3, 1, 1, 2],
       [1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1],
       [0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]],
      dtype=int64)

In [54]:
# The two-step process above (CountVectorizer followed by TfidfTransformer) can be done in a single step with TfidfVectorizer.

## TfidfVectorizer

In [55]:
# Same three-sentence corpus as in the CountVectorizer section, redefined unchanged.
sent1 = [
    (
        "The Brown Dog jumps over the Green Gate to chase Brown and white Rabbits "
        "which escaped into the white Gate"
    ),
    "white rabbits are eating and playing in Green Grass",
    "white Rabbits are chased by brown dog",
]

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
tfvec=TfidfVectorizer()

In [58]:
tfvec

TfidfVectorizer()

In [60]:
tfvec.get_feature_names()

['and',
 'are',
 'brown',
 'by',
 'chase',
 'chased',
 'dog',
 'eating',
 'escaped',
 'gate',
 'grass',
 'green',
 'in',
 'into',
 'jumps',
 'over',
 'playing',
 'rabbits',
 'the',
 'to',
 'which',
 'white']

In [61]:
tfvec.vocabulary_

{'the': 18,
 'brown': 2,
 'dog': 6,
 'jumps': 14,
 'over': 15,
 'green': 11,
 'gate': 9,
 'to': 19,
 'chase': 4,
 'and': 0,
 'white': 21,
 'rabbits': 17,
 'which': 20,
 'escaped': 8,
 'into': 13,
 'are': 1,
 'eating': 7,
 'playing': 16,
 'in': 12,
 'grass': 10,
 'chased': 5,
 'by': 3}

In [59]:
tfvec.fit(sent1)

TfidfVectorizer()

In [62]:
sent_tfvec=tfvec.transform(sent1)

In [63]:
sent_tfvec

<3x22 sparse matrix of type '<class 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

In [64]:
sent_tfvec.toarray()

array([[0.14974888, 0.        , 0.29949775, 0.        , 0.19690178,
        0.        , 0.14974888, 0.        , 0.19690178, 0.39380355,
        0.        , 0.14974888, 0.        , 0.19690178, 0.19690178,
        0.19690178, 0.        , 0.11629336, 0.59070533, 0.19690178,
        0.19690178, 0.23258672],
       [0.29985557, 0.29985557, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.39427404, 0.        , 0.        ,
        0.39427404, 0.29985557, 0.39427404, 0.        , 0.        ,
        0.        , 0.39427404, 0.2328646 , 0.        , 0.        ,
        0.        , 0.2328646 ],
       [0.        , 0.3612204 , 0.3612204 , 0.47496141, 0.        ,
        0.47496141, 0.3612204 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.28051986, 0.        , 0.        ,
        0.        , 0.28051986]])