In [1]:
doc = ['The quick brown fox jumped over the moon',
      'The moon was full that night',
      'The fox was a girl',
      'The girl was beautiful']

# Bag of Words (BOW) Model

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [3]:
bow_vec = CountVectorizer(lowercase=True)
bow_vec.fit(doc)
word_counts = bow_vec.transform(doc)

In [4]:
df = pd.DataFrame(word_counts.toarray(), columns=sorted(bow_vec.vocabulary_))

In [5]:
df

Unnamed: 0,beautiful,brown,fox,full,girl,jumped,moon,night,over,quick,that,the,was
0,0,1,1,0,0,1,1,0,1,1,0,2,0
1,0,0,0,1,0,0,1,1,0,0,1,1,1
2,0,0,1,0,1,0,0,0,0,0,0,1,1
3,1,0,0,0,1,0,0,0,0,0,0,1,1


# HashingVectorizer

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer

In [7]:
hash_vec = HashingVectorizer(n_features=15)
hash_vec.fit(doc)
word_counts = hash_vec.transform(doc)

In [8]:
df = pd.DataFrame(word_counts.toarray())

In [9]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.316228,0.0,-0.632456,0.0,-0.316228,0.0,0.0,0.316228,0.0,0.316228,-0.316228,0.0,0.316228,0.0
1,0.0,0.0,-0.5,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,0.0,0.0,0.5,0.0
2,0.0,0.0,0.0,-0.5,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.5,-0.5,0.0,0.0
3,0.0,0.0,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,-0.5,0.0,-0.5


In [10]:
word_counts.toarray()

array([[ 0.        ,  0.31622777,  0.        , -0.63245553,  0.        ,
        -0.31622777,  0.        ,  0.        ,  0.31622777,  0.        ,
         0.31622777, -0.31622777,  0.        ,  0.31622777,  0.        ],
       [ 0.        ,  0.        , -0.5       , -0.5       ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.5       ,  0.        ,  0.        ,  0.5       ,  0.        ],
       [ 0.        ,  0.        ,  0.        , -0.5       ,  0.        ,
        -0.5       ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.5       , -0.5       ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , -0.5       ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.5       , -0.5       ,  0.        , -0.5       ]])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(doc)
counts = (tfidf_vec.transform(doc))

In [16]:
sorted(tfidf_vec.vocabulary_.keys())

['beautiful',
 'brown',
 'fox',
 'full',
 'girl',
 'jumped',
 'moon',
 'night',
 'over',
 'quick',
 'that',
 'the',
 'was']

In [18]:
df = pd.DataFrame(counts.toarray(), columns=sorted(tfidf_vec.vocabulary_.keys()))

In [19]:
df

Unnamed: 0,beautiful,brown,fox,full,girl,jumped,moon,night,over,quick,that,the,was
0,0.0,0.397387,0.313305,0.0,0.0,0.397387,0.313305,0.0,0.397387,0.397387,0.0,0.414746,0.0
1,0.0,0.0,0.0,0.482169,0.0,0.0,0.380147,0.482169,0.0,0.0,0.482169,0.251616,0.307762
2,0.0,0.0,0.568556,0.0,0.568556,0.0,0.0,0.0,0.0,0.0,0.0,0.376321,0.460295
3,0.659191,0.0,0.0,0.0,0.519714,0.0,0.0,0.0,0.0,0.0,0.0,0.343993,0.420753


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [23]:
tf_hash = Pipeline([
    ('hash vec', HashingVectorizer(n_features=15)),
    ('tfidf', TfidfTransformer())
])

In [24]:
output = tf_hash.fit_transform(doc).toarray()

In [25]:
df = pd.DataFrame(output)

In [26]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.437421,0.0,-0.456529,0.0,-0.344868,0.0,0.0,0.437421,0.0,0.344868,-0.228265,0.0,0.344868,0.0
1,0.0,0.0,-0.630504,-0.329023,0.0,0.0,0.0,0.0,0.0,0.0,-0.497096,0.0,0.0,0.497096,0.0
2,0.0,0.0,0.0,-0.39028,0.0,-0.589645,0.0,0.0,0.0,0.0,0.0,0.39028,-0.589645,0.0,0.0
3,0.0,0.0,0.0,-0.354557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354557,-0.535674,0.0,-0.679435


In [27]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.0,0.437421,0.0,-0.456529,0.0,-0.344868,0.0,0.0,0.437421,0.0,0.344868,-0.228265,0.0,0.344868,0.0
1,0.0,0.0,-0.630504,-0.329023,0.0,0.0,0.0,0.0,0.0,0.0,-0.497096,0.0,0.0,0.497096,0.0
2,0.0,0.0,0.0,-0.39028,0.0,-0.589645,0.0,0.0,0.0,0.0,0.0,0.39028,-0.589645,0.0,0.0
3,0.0,0.0,0.0,-0.354557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354557,-0.535674,0.0,-0.679435


# n grams

They take two words and process them as one

In [28]:
ngcv = CountVectorizer(ngram_range=(1,2))
ngcv.fit(doc)
counts = ngcv.transform(doc).toarray()

In [29]:
cols = [x.replace('', '_') for x in sorted(ngcv.vocabulary_.keys())]

In [30]:
cols

['_b_e_a_u_t_i_f_u_l_',
 '_b_r_o_w_n_',
 '_b_r_o_w_n_ _f_o_x_',
 '_f_o_x_',
 '_f_o_x_ _j_u_m_p_e_d_',
 '_f_o_x_ _w_a_s_',
 '_f_u_l_l_',
 '_f_u_l_l_ _t_h_a_t_',
 '_g_i_r_l_',
 '_g_i_r_l_ _w_a_s_',
 '_j_u_m_p_e_d_',
 '_j_u_m_p_e_d_ _o_v_e_r_',
 '_m_o_o_n_',
 '_m_o_o_n_ _w_a_s_',
 '_n_i_g_h_t_',
 '_o_v_e_r_',
 '_o_v_e_r_ _t_h_e_',
 '_q_u_i_c_k_',
 '_q_u_i_c_k_ _b_r_o_w_n_',
 '_t_h_a_t_',
 '_t_h_a_t_ _n_i_g_h_t_',
 '_t_h_e_',
 '_t_h_e_ _f_o_x_',
 '_t_h_e_ _g_i_r_l_',
 '_t_h_e_ _m_o_o_n_',
 '_t_h_e_ _q_u_i_c_k_',
 '_w_a_s_',
 '_w_a_s_ _b_e_a_u_t_i_f_u_l_',
 '_w_a_s_ _f_u_l_l_',
 '_w_a_s_ _g_i_r_l_']