In [64]:
s = 'This is the the the first document'

In [65]:
s_list = s.split()

In [66]:
# Map each word to an index (to maintain order of the sentence)
[(index, item) for index, item in enumerate(s_list)]

[(0, 'This'),
 (1, 'is'),
 (2, 'the'),
 (3, 'the'),
 (4, 'the'),
 (5, 'first'),
 (6, 'document')]

<h3>Sklearn CountVectorizer</h3>

[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [83]:
vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform([s])

In [84]:
vectorizer.get_feature_names_out()

array(['document', 'first', 'is', 'the', 'this'], dtype=object)

In [85]:
X.toarray()

array([[1, 1, 1, 3, 1]])

- Counts the occurrences of each word in the "document" (well a single sentence in this case).
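- Before counting, the text is tokenized. By default, CountVectorizer lowercases the text and extracts tokens of two or more alphanumeric characters; punctuation is ignored and acts as a token separator, so single-character words (e.g., "a" or "I") are dropped.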
- Note that the order of the words is not preserved.

<br>

In [87]:
vectorizer2 = CountVectorizer(analyzer='char')
X2 = vectorizer2.fit_transform(['Hi I am Marlon'])

In [88]:
X2.toarray()

array([[3, 2, 1, 2, 1, 2, 1, 1, 1]])

In [89]:
vectorizer2.get_feature_names_out()

array([' ', 'a', 'h', 'i', 'l', 'm', 'n', 'o', 'r'], dtype=object)

- If `analyzer='char'`, characters are counted instead of words.

<br>

In [2]:
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [3]:
vectorizer3 = CountVectorizer(analyzer='word')
X3 = vectorizer3.fit_transform(corpus)

In [4]:
vectorizer3.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [5]:
X3.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

- Shows the occurrences of each word within each sentence ("document") in the rows.
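- Note that sklearn's CountVectorizer returns raw counts: it does not normalize them (e.g., by document length or inverse document frequency). Use `TfidfTransformer` or `TfidfVectorizer` if normalized weights are needed.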

<h3>Implement CountVectorizer manually</h3>

In [7]:
import re
import numpy as np

In [8]:
def clean(row):
    """Strip punctuation from each word of a tokenized sentence and lowercase it.

    Parameters
    ----------
    row : iterable of str
        The words of one sentence (e.g., the result of ``sentence.split()``).

    Returns
    -------
    list of str
        The cleaned, lowercased words in their original order. Words that
        consist solely of the stripped punctuation characters are omitted.
    """
    cleaned_sentence = []
    for word in row:
        # The r prefix makes this a raw string literal: Python does not process
        # backslash escape sequences in it, so the regex engine receives `\-`
        # unchanged (a literal hyphen inside the character class).
        cleaned_word = re.sub(pattern=r'[.,\-_!?]', repl='', string=word).lower()
        # A word made up only of punctuation (e.g., a free-standing "-") is now
        # an empty string; skip it so it never becomes a vocabulary entry.
        if cleaned_word:
            cleaned_sentence.append(cleaned_word)
    return cleaned_sentence

In [9]:
def count_unique(cleaned_documents):
    """Collect the distinct words of a corpus in order of first appearance.

    Despite its name, this returns the unique words themselves, not a count.

    Parameters
    ----------
    cleaned_documents : iterable of iterable of str
        One sequence of words per document. Words must be hashable.

    Returns
    -------
    list of str
        Every distinct word exactly once, ordered by first occurrence.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in a
    # single pass instead of rescanning a list for every word.
    return list(dict.fromkeys(word for row in cleaned_documents for word in row))

In [19]:
def count_occurrences(document, unique_words):
    """Count how often each vocabulary word occurs in one document.

    Parameters
    ----------
    document : iterable of str
        The words of a single document.
    unique_words : iterable of str
        The vocabulary. Words of ``document`` that are not in it are ignored.

    Returns
    -------
    dict
        Maps every vocabulary word to its number of occurrences in
        ``document``; keys are inserted in sorted order.
    """
    # The sort is not required for counting; it only makes the key order
    # (and thus the column order) match sklearn's output.
    counts = dict.fromkeys(sorted(unique_words), 0)
    for token in document:
        if token in counts:
            counts[token] += 1
    return counts

In [20]:
# Split every sentence on whitespace and clean its words.
cleaned_documents = [clean(row.split()) for row in corpus]
cleaned_documents

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'document', 'is', 'the', 'second', 'document'],
 ['and', 'this', 'is', 'the', 'third', 'one'],
 ['is', 'this', 'the', 'first', 'document']]

In [26]:
unique_words = count_unique(cleaned_documents)
sorted(unique_words)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [22]:
counter_array = np.zeros((len(cleaned_documents), len(unique_words)), dtype=int)

In [23]:
# Fill one row of the count matrix per document.
for row_index, document in enumerate(cleaned_documents):
    word_counts = count_occurrences(document, unique_words)
    counter_array[row_index, :] = list(word_counts.values())

In [24]:
counter_array

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [25]:
# Comparing to sklearn CountVectorizer
X3.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])