In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# This notebook is a tutorial on vectorizing a text corpus (counting term frequencies)

In [2]:
# Toy corpus: four short sentences, each string treated as one document.
# NOTE: the spelling 'docuemnt' in the second sentence is kept as-is; it shows
# up as its own token in the outputs recorded in this notebook.
corpus = [
    'This is the first document.',
    'This document is the second docuemnt',
    'And this is the third one.',
    'Is this the first document?',
]

# This vectorizer counts word-level unigrams (single words)

In [3]:
# CountVectorizer with default settings; the recorded output below shows
# lowercase single-word tokens with punctuation dropped.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and count each term per document.
X1 = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed form a plain Python list of strings.
print(vectorizer.get_feature_names_out().tolist())
# One row per document, one column per vocabulary entry.
print(X1.toarray())

['and', 'docuemnt', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 0 1 1 1 0 0 1 0 1]
 [0 1 1 0 1 0 1 1 0 1]
 [1 0 0 0 1 1 0 1 1 1]
 [0 0 1 1 1 0 0 1 0 1]]


# This vectorizer counts word-level bigrams (pairs of adjacent words)

In [4]:
# ngram_range=(2, 2) restricts the vocabulary to word-level bigrams only.
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))

# Learn the bigram vocabulary from the corpus and count each bigram per document.
X2 = vectorizer2.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed form a plain Python list of strings.
print(vectorizer2.get_feature_names_out().tolist())
# One row per document, one column per bigram in the vocabulary.
print(X2.toarray())

['and this', 'document is', 'first document', 'is the', 'is this', 'second docuemnt', 'the first', 'the second', 'the third', 'third one', 'this document', 'this is', 'this the']
[[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]


# This vectorizer counts both word-level unigrams and bigrams

In [5]:
# ngram_range=(1, 2) puts both word-level unigrams and bigrams in the vocabulary.
vectorizer3 = CountVectorizer(analyzer='word', ngram_range=(1, 2))

# Learn the combined vocabulary from the corpus and count each n-gram per document.
X3 = vectorizer3.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the printed form a plain Python list of strings.
print(vectorizer3.get_feature_names_out().tolist())
# One row per document, one column per unigram/bigram in the vocabulary.
print(X3.toarray())

['and', 'and this', 'docuemnt', 'document', 'document is', 'first', 'first document', 'is', 'is the', 'is this', 'one', 'second', 'second docuemnt', 'the', 'the first', 'the second', 'the third', 'third', 'third one', 'this', 'this document', 'this is', 'this the']
[[0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0]
 [0 0 1 1 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0]
 [1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0]
 [0 0 0 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1]]
