# **Bag of Words**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy corpus: four short documents, each with a binary label in 'Output'.
# NOTE(review): 'warch' in the first document looks like a typo for 'watch'.
# It is kept verbatim because it becomes its own vocabulary token and the
# recorded outputs in this notebook were produced with it.
df = pd.DataFrame(
    dict(
        text=[
            'people warch campusx',
            'campusx watch campusx',
            'people write comment',
            'campusx write comment',
        ],
        Output=[1, 1, 0, 0],
    )
)
df

Unnamed: 0,text,Output
0,people warch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; the vocabulary it learns
# below consists of single words only.
cv = CountVectorizer()

In [4]:
# Learn the vocabulary from the 'text' column and build the document-term
# count matrix in one step (one row per document, one column per token).
bow = cv.fit_transform(df['text'])

In [5]:
# Learned vocabulary: maps each token to its column index in the count matrix.
print(cv.vocabulary_)

{'people': 2, 'warch': 3, 'campusx': 0, 'watch': 4, 'write': 5, 'comment': 1}


In [6]:
# Count vector of the first document, converted to a dense array for display.
print(bow[0].toarray())

[[1 0 1 1 0 0]]


In [7]:
# Count vector of the second document; 'campusx' occurs twice in it, so its
# column (index 0) holds 2.
print(bow[1].toarray())

[[2 0 0 0 1 0]]


In [8]:
# Transform a new sentence that contains out-of-vocabulary words ('and', 'of'),
# i.e. words that were not present in the text the vectorizer was fitted on.
cv.transform(['campusx watch and write comment of campusx']).toarray()

array([[2, 1, 0, 0, 1, 1]])

The words `and` and `of` are not in the fitted vocabulary, so they are ignored and do not appear in the output array.

# **N-grams**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2, 2): only two-word sequences (bigrams) become features.
# This rebinds `cv`, replacing the single-word vectorizer created earlier.
cv = CountVectorizer(ngram_range=(2,2))

In [10]:
# Refit on the same corpus with the bigram vectorizer; `bow` is overwritten
# with the bigram count matrix.
bow = cv.fit_transform(df['text'])

In [11]:
# Bigram vocabulary: maps each two-word sequence to its column index.
print(cv.vocabulary_)

{'people warch': 2, 'warch campusx': 4, 'campusx watch': 0, 'watch campusx': 5, 'people write': 3, 'write comment': 6, 'campusx write': 1}


In [12]:
# Bigram counts of the first document ('people warch', 'warch campusx'),
# shown as a dense array.
print(bow[0].toarray())

[[0 0 1 0 1 0 0]]


# **Unigrams and Bigrams**

In [13]:
# ngram_range=(1, 2): both single words and two-word sequences become features.
cv = CountVectorizer(ngram_range=(1,2))

In [14]:
# Refit on the corpus; `bow` now holds combined unigram + bigram counts.
bow = cv.fit_transform(df['text'])

In [15]:
# Combined vocabulary: maps every unigram and bigram to its column index.
print(cv.vocabulary_)

{'people': 4, 'warch': 7, 'campusx': 0, 'people warch': 5, 'warch campusx': 8, 'watch': 9, 'campusx watch': 1, 'watch campusx': 10, 'write': 11, 'comment': 3, 'people write': 6, 'write comment': 12, 'campusx write': 2}


In [16]:
# Dense unigram + bigram count vectors of the first two documents,
# printed one after the other.
for doc_idx in range(2):
    print(bow[doc_idx].toarray())

[[1 0 0 0 1 1 0 1 1 0 0 0 0]]
[[2 1 0 0 0 0 0 0 0 1 1 0 0]]


# **Unigrams, Bigrams and trigrams**

In [17]:
# ngram_range=(1, 3): single words, two-word and three-word sequences all
# become features.
cv = CountVectorizer(ngram_range=(1,3))

In [18]:
# Refit on the corpus; `bow` now holds unigram + bigram + trigram counts.
bow = cv.fit_transform(df['text'])

In [19]:
# Combined vocabulary: maps every unigram, bigram and trigram to its column index.
print(cv.vocabulary_)

{'people': 6, 'warch': 11, 'campusx': 0, 'people warch': 7, 'warch campusx': 12, 'people warch campusx': 8, 'watch': 13, 'campusx watch': 1, 'watch campusx': 14, 'campusx watch campusx': 2, 'write': 15, 'comment': 5, 'people write': 9, 'write comment': 16, 'people write comment': 10, 'campusx write': 3, 'campusx write comment': 4}


In [20]:
# Dense 1- to 3-gram count vectors of the first two documents,
# printed one after the other.
for row in (0, 1):
    print(bow[row].toarray())

[[1 0 0 0 0 0 1 1 1 0 0 1 1 0 0 0 0]]
[[2 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0]]


# **TF-IDF**

In [21]:
# Rebuild the same four-document toy corpus so the TF-IDF section starts
# from a known DataFrame.
# NOTE(review): 'warch' looks like a typo for 'watch'; it is kept verbatim
# because the recorded TF-IDF output was produced with it.
df = pd.DataFrame(
    data={
        'text': [
            'people warch campusx',
            'campusx watch campusx',
            'people write comment',
            'campusx write comment',
        ],
        'Output': [1, 1, 0, 0],
    },
)
df

Unnamed: 0,text,Output
0,people warch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a default TF-IDF vectorizer on the corpus, then convert the result to a
# dense array so the per-document weights are shown as the cell output.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])
tfidf_matrix.toarray()

array([[0.44809973, 0.        , 0.55349232, 0.70203482, 0.        ,
        0.        ],
       [0.78722298, 0.        , 0.        , 0.        , 0.61666846,
        0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.        ,
        0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.        ,
        0.61366674]])

In [26]:
# Print the learned IDF weight of every term, then the terms themselves,
# one array per line; positions in the two arrays correspond.
print(tfidf.idf_, tfidf.get_feature_names_out(), sep='\n')

[1.22314355 1.51082562 1.51082562 1.91629073 1.91629073 1.51082562]
['campusx' 'comment' 'people' 'warch' 'watch' 'write']
