In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: five short documents, one string per document.
# Every vectorizer below is fitted on this same list.
data = [
    "People watch dswithbappy",
    "dswithbappy watch dswithbappy",
    "people write comments",
    "comments write people",
    "dswithbappy write comments",
]

# Echo the corpus so the matrices printed later can be read against it.
print("data: {}".format(data))

# ---------- Bag of words via CountVectorizer (word-level) ----------

# Vectorizer with default settings.
vectorizer = CountVectorizer()

# Fit on the corpus (learns the vocabulary) and build the document-term
# matrix, densifying the sparse result right away so it prints as a grid.
X = vectorizer.fit_transform(data).toarray()

# Names of the learned features.
features = vectorizer.get_feature_names_out()

print("Bag of Word (Word-level):\n {}".format(X))
print("Features: {}".format(features))
# vocabulary_ maps each token to an integer index.
print("vocabulary: {}".format(vectorizer.vocabulary_))

# ---------- TF-IDF via TfidfVectorizer ----------

# Note: despite the variable name, this is a TfidfVectorizer (it consumes the
# raw strings directly), not a TfidfTransformer.
transformer = TfidfVectorizer()

# Fit on the corpus and produce the TF-IDF weighted document-term matrix.
# X is rebound here, replacing the bag-of-words matrix, and stays sparse.
X = transformer.fit_transform(data)

# Densify only for display.
print("TF-IDF:\n {}".format(X.toarray()))

data: ['People watch dswithbappy', 'dswithbappy watch dswithbappy', 'people write comments', 'comments write people', 'dswithbappy write comments']
Bag of Word (Word-level):
 [[0 1 1 1 0]
 [0 2 0 1 0]
 [1 0 1 0 1]
 [1 0 1 0 1]
 [1 1 0 0 1]]
Features: ['comments' 'dswithbappy' 'people' 'watch' 'write']
vocabulary: {'people': 2, 'watch': 3, 'dswithbappy': 1, 'write': 4, 'comments': 0}
TF-IDF:
 [[0.         0.53828256 0.53828256 0.64846263 0.        ]
 [0.         0.85660579 0.         0.51597143 0.        ]
 [0.57735027 0.         0.57735027 0.         0.57735027]
 [0.57735027 0.         0.57735027 0.         0.57735027]
 [0.57735027 0.57735027 0.         0.         0.57735027]]
