In [1]:
from sklearn.feature_extraction.text import CountVectorizer
# Tokenize a minimal corpus of text documents and count how often each token
# occurs in each document (bag-of-words representation).
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
# X is a sparse document-term matrix; printing it lists its (row, column) count entries.
X = vectorizer.fit_transform(corpus)
print(X)
# The default analyzer lower-cases the text, strips punctuation and keeps only
# tokens of at least 2 characters, so the single-letter word "a" is dropped.
analyze = vectorizer.build_analyzer()
expected_tokens = ['this', 'is', 'text', 'document', 'to', 'analyze']
analyze("This is a text document to analyze.") == expected_tokens

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	1
  (1, 5)	2
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1


True

In [2]:
# During the fit, every term found by the analyzer is assigned a unique integer
# index, which is the term's column in the resulting count matrix.
feature_names = vectorizer.get_feature_names_out()
print(feature_names)
# Look up the column index that the fit assigned to a given term.
document_column = vectorizer.vocabulary_.get('document')
print(document_column)
# Dense view of the counts: one row per document, one column per term.
X.toarray()

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
1


array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [3]:
# In the previous corpus the first and the last documents contain exactly the
# same words, so they are encoded as equal vectors. In particular, the fact
# that the last document is a question is lost.
# Extracting 2-grams of words in addition to the 1-grams (individual words)
# preserves some of this local ordering information.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)
analyze = bigram_vectorizer.build_analyzer()
# Unigrams come first, followed by the bigrams of consecutive tokens.
expected_ngrams = ['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool']
analyze('Bi-grams are cool!') == expected_ngrams

True

In [4]:
# Raw counts let very frequent terms (e.g. stop words) dominate the features.
# The tf–idf transform is very commonly used to re-weight the counts into
# floating-point values that are better suited as classifier input.
from sklearn.feature_extraction.text import TfidfTransformer
# smooth_idf=False turns off idf smoothing; see the TfidfTransformer
# documentation for the exact weighting formula.
transformer = TfidfTransformer(smooth_idf=False)
# NOTE(review): the transformer is only constructed and displayed here; it is not fitted in this cell.
transformer

In [5]:
# A bag of unigrams cannot capture phrases or multi-word expressions: it
# disregards word order entirely, and it does not account for potential
# misspellings or word derivations either.
# N-grams to the rescue. Word-level bigrams (n=2) count pairs of consecutive
# words; the 'char_wb' analyzer used here with ngram_range=(2, 2) counts
# character 2-grams instead, so the misspelled 'wprds' still shares four of
# its six 2-grams with 'words'.
misspelling_pair = ['words', 'wprds']
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
counts = ngram_vectorizer.fit_transform(misspelling_pair)
char_bigrams = ngram_vectorizer.get_feature_names_out()
print(char_bigrams)
counts.toarray().astype(int)


[' w' 'ds' 'or' 'pr' 'rd' 's ' 'wo' 'wp']


array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])

In [6]:
# The 'char_wb' analyzer creates n-grams only from characters inside word
# boundaries, padding each word with a space on both sides. The 'char'
# analyzer, by contrast, creates n-grams that may span across words.
# This cell fits the 'char_wb' variant with 5-grams on a two-word document.
two_word_docs = ['jumpy fox']
ngram_vectorizer = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(5, 5),
)
ngram_vectorizer.fit_transform(two_word_docs)


<1x4 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [7]:
# 5-grams learned by the most recently fitted `ngram_vectorizer`; the recorded
# output corresponds to the 'char_wb' analyzer, whose n-grams never cross a word boundary.
ngram_vectorizer.get_feature_names_out()

array([' fox ', ' jump', 'jumpy', 'umpy '], dtype=object)

In [8]:
# Same document and n-gram size, but with the plain 'char' analyzer: n-grams
# are taken over the whole string, so they can span the space between words.
two_word_docs = ['jumpy fox']
ngram_vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(5, 5),
)
ngram_vectorizer.fit_transform(two_word_docs)

<1x5 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [9]:
# 5-grams learned by the most recently fitted `ngram_vectorizer`; the recorded
# output corresponds to the 'char' analyzer and includes n-grams that span both words.
ngram_vectorizer.get_feature_names_out()

array(['jumpy', 'mpy f', 'py fo', 'umpy ', 'y fox'], dtype=object)