<h1 align='center'>CountVectorizer (vs) TF-IDF (vs) Hash Vectorizer</h1>

In [34]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,HashingVectorizer

In [26]:
# Toy corpora, one sentence each.
# Train: the document the Count/TF-IDF vectorizers are fitted on.
# Test:  the document those fitted vectorizers are then asked to encode.
Train = [
    'Running the example first prints the vocabulary, then the shape of the first encoded document.',
]
Test = [
    'We can then see that the encoded vector is a sparse matrix of encoded.',
]

### CountVectorizer

In [27]:
# Bag-of-words encoder over word 1-, 2- and 3-grams.
count_vec = CountVectorizer(
    ngram_range=(1, 3),
    max_features=50,  # upper bound on the number of vocabulary terms kept
)
# Learn the vocabulary from Train only; left as the final expression so the
# cell displays the fitted estimator.
count_vec.fit(Train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=50, min_df=1,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [28]:
# Encode the Test document using the vocabulary that was fitted on Train.
count_vec_output = count_vec.transform(Test)

In [29]:
# One item per line: the learned term -> column-index mapping, then the shape
# and the dense contents of the encoded Test document.
print(
    count_vec.vocabulary_,
    count_vec_output.shape,
    count_vec_output.toarray(),
    sep='\n',
)

{'running': 17, 'the': 23, 'example': 3, 'first': 6, 'prints': 14, 'vocabulary': 35, 'then': 32, 'shape': 20, 'of': 11, 'encoded': 1, 'document': 0, 'running the': 18, 'the example': 24, 'example first': 4, 'first prints': 9, 'prints the': 15, 'the vocabulary': 30, 'vocabulary then': 36, 'then the': 33, 'the shape': 28, 'shape of': 21, 'of the': 12, 'the first': 26, 'first encoded': 7, 'encoded document': 2, 'running the example': 19, 'the example first': 25, 'example first prints': 5, 'first prints the': 10, 'prints the vocabulary': 16, 'the vocabulary then': 31, 'vocabulary then the': 37, 'then the shape': 34, 'the shape of': 29, 'shape of the': 22, 'of the first': 13, 'the first encoded': 27, 'first encoded document': 8}
(1, 38)
[[0 2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0
  0 0]]


### TF-IDF

In [30]:
# TF-IDF encoder configured like the CountVectorizer cell: word 1- to 3-grams,
# vocabulary capped at 50 terms.
tfidf_vec = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50,
)
# Learn vocabulary and idf weights from Train only; left as the final
# expression so the cell displays the fitted estimator.
tfidf_vec.fit(Train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=50,
                min_df=1, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [31]:
# Encode the Test document with the TF-IDF model that was fitted on Train.
tfidf_output = tfidf_vec.transform(Test)

In [32]:
# Learned term -> column-index mapping, followed by the per-term idf weights.
# Train holds a single document, so with the default smoothed idf every
# weight is 1.0.
print(
    tfidf_vec.vocabulary_,
    tfidf_vec.idf_,
    sep='\n',
)

{'running': 17, 'the': 23, 'example': 3, 'first': 6, 'prints': 14, 'vocabulary': 35, 'then': 32, 'shape': 20, 'of': 11, 'encoded': 1, 'document': 0, 'running the': 18, 'the example': 24, 'example first': 4, 'first prints': 9, 'prints the': 15, 'the vocabulary': 30, 'vocabulary then': 36, 'then the': 33, 'the shape': 28, 'shape of': 21, 'of the': 12, 'the first': 26, 'first encoded': 7, 'encoded document': 2, 'running the example': 19, 'the example first': 25, 'example first prints': 5, 'first prints the': 10, 'prints the vocabulary': 16, 'the vocabulary then': 31, 'vocabulary then the': 37, 'then the shape': 34, 'the shape of': 29, 'shape of the': 22, 'of the first': 13, 'the first encoded': 27, 'first encoded document': 8}
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [33]:
# Shape, then dense contents, of the TF-IDF-encoded Test document.
print(
    tfidf_output.shape,
    tfidf_output.toarray(),
    sep='\n',
)

(1, 38)
[[0.         0.75592895 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37796447
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.37796447
  0.         0.         0.         0.         0.         0.
  0.         0.         0.37796447 0.         0.         0.
  0.         0.        ]]


### HashingVectorizer

In [35]:
# Hashing encoder over word 1- to 3-grams. It is used below without any fit
# step: terms are hashed straight into a fixed number of columns.
hash_vec = HashingVectorizer(
    ngram_range=(1, 3),
    n_features=20,  # only 20 output columns, so distinct n-grams share columns
)

In [37]:
# Hash the document directly into the 20-column feature space (no prior fit).
# NOTE(review): this encodes Train, whereas the CountVectorizer and TF-IDF
# cells encode Test, so the three outputs are not like-for-like - confirm
# whether Test was intended here.
hash_output = hash_vec.transform(Train)

In [39]:
# Shape, then dense contents, of the hashed document.
print(
    hash_output.shape,
    hash_output.toarray(),
    sep='\n',
)

(1, 20)
[[-0.13867505 -0.13867505  0.          0.          0.          0.
   0.         -0.13867505  0.13867505 -0.13867505 -0.2773501   0.41602515
   0.13867505  0.          0.5547002   0.          0.          0.13867505
  -0.5547002   0.        ]]
