In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus used by every cell below: one short sentence per document.
text = [
    "i am feeling very very happy.",
    "I am not well today.",
    "I want to be happy.",
]

**Binary Scoring**

In [2]:
# Binary bag-of-words: with binary=True each entry only records whether a term
# is present in the document, not how often it occurs. The custom token_pattern
# also accepts one-character words, which is why "i" ends up in the vocabulary.
vectorizer = CountVectorizer(
    token_pattern=r"(?u)\b\w+\b",
    binary=True,
)

# Learn the vocabulary (term -> column index) from the corpus and display it.
vectorizer.fit(text)
print(vectorizer.vocabulary_)

# Encode every document against the learned vocabulary, then show the result
# first as a dense array and then in its native sparse (row, col) value form.
vectors = vectorizer.transform(text)
print(vectors.toarray(), vectors, sep="\n")

# Report the matrix dimensions (documents x vocabulary terms) and its type.
print(vectors.shape, type(vectors), sep="\n")

{'i': 4, 'am': 0, 'feeling': 2, 'very': 8, 'happy': 3, 'not': 5, 'well': 10, 'today': 7, 'want': 9, 'to': 6, 'be': 1}
[[1 0 1 1 1 0 0 0 1 0 0]
 [1 0 0 0 1 1 0 1 0 0 1]
 [0 1 0 1 1 0 1 0 0 1 0]]
  (0, 0)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 8)	1
  (1, 0)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	1
  (1, 10)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (2, 6)	1
  (2, 9)	1
(3, 11)
<class 'scipy.sparse.csr.csr_matrix'>


**Counts Scoring**

In [3]:
# Count scoring: each entry is the number of times the term occurs in the
# document, so a repeated word (e.g. "very" in the first sentence) scores 2.
vectorizer = CountVectorizer(
    token_pattern=r"(?u)\b\w+\b",
)

# Learn the vocabulary and encode the corpus in a single step.
vectors = vectorizer.fit_transform(text)

# Dense view of the sparse document-term count matrix.
print(vectors.toarray())

[[1 0 1 1 1 0 0 0 2 0 0]
 [1 0 0 0 1 1 0 1 0 0 1]
 [0 1 0 1 1 0 1 0 0 1 0]]


**Frequency Scoring**

In [6]:
import numpy as np

# Frequency scoring: term counts divided by the total number of tokens in the
# document, so every row sums to 1.
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")

# Raw counts as a dense (documents x terms) ndarray. Going dense first keeps
# the result a plain ndarray; dividing the sparse matrix by the np.matrix that
# sparse .sum(axis=1) returns would produce the legacy np.matrix type instead.
counts = vectorizer.fit_transform(text).toarray()

# keepdims=True keeps the row totals as a column so the division broadcasts
# across each document's row.
vectors = counts / counts.sum(axis=1, keepdims=True)

# Limit the precision only for this print. A global np.set_printoptions call
# would persist for the rest of the kernel session and silently change how
# every later cell prints its arrays on a fresh Restart & Run All.
with np.printoptions(precision=2):
    print(vectors)

[[0.17 0.   0.17 0.17 0.17 0.   0.   0.   0.33 0.   0.  ]
 [0.2  0.   0.   0.   0.2  0.2  0.   0.2  0.   0.   0.2 ]
 [0.   0.2  0.   0.2  0.2  0.   0.2  0.   0.   0.2  0.  ]]


**Additional Features: Removing stop words**

In [4]:
# Binary encoding again, this time with stop_words='english' so that common
# English words are dropped before the vocabulary is built; only the remaining
# content words get a column.
vectorizer = CountVectorizer(
    stop_words='english',
    token_pattern=r"(?u)\b\w+\b",
    binary=True,
)

# Learn the reduced vocabulary and display the term -> column index mapping.
vectorizer.fit(text)
print(vectorizer.vocabulary_)

# Encode the corpus against the reduced vocabulary and show it densely.
vectors = vectorizer.transform(text)
print(vectors.toarray())

{'feeling': 0, 'happy': 1, 'today': 2, 'want': 3}
[[1 1 0 0]
 [0 0 1 0]
 [0 1 0 1]]


**TF-IDF Scoring**

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
# TF-IDF scoring with the vectorizer's default settings. No token_pattern is
# passed here, so the default tokenization applies: the one-letter token "i" is
# not kept and the matrix has 10 columns rather than the 11 produced by the
# count-based cells.
# fit() returns the fitted vectorizer itself, so it can be chained directly.
vectorizer = TfidfVectorizer().fit(text)

# Encode the corpus as TF-IDF weights.
vectors = vectorizer.transform(text)

# Show the matrix dimensions followed by the dense weights.
print(vectors.shape, vectors.toarray(), sep="\n")

(3, 10)
[[0.34385143 0.         0.45212331 0.68770286 0.         0.
  0.         0.45212331 0.         0.        ]
 [0.40204024 0.         0.         0.         0.52863461 0.
  0.52863461 0.         0.         0.52863461]
 [0.         0.52863461 0.         0.40204024 0.         0.52863461
  0.         0.         0.52863461 0.        ]]
