# Bag-of-Words `BoW`


In [1]:
import os
import pandas as pd

DATA_PATH = "../data/raw"


In [43]:
# Read the raw corpus, replace missing values with empty strings, and keep
# only the "tagline" column as the text to vectorize.
corpus_path = os.path.join(DATA_PATH, "corpus.csv")
corpus = pd.read_csv(corpus_path).fillna("")["tagline"]


In [33]:
# CountVectorizer converts a collection of text documents into token counts
from sklearn.feature_extraction.text import CountVectorizer

# Default settings: learn the vocabulary from `corpus` and build the
# document-term count matrix in a single fit_transform call
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)

# Rows are documents, columns are vocabulary terms
print(bow_matrix.shape)


(9098, 6614)


In [34]:
# header=None makes pandas number the columns instead of reading names from
# the first row, so the first column is selected with the integer label 0.
lem_corpus_path = os.path.join(DATA_PATH, "lem_corpus.csv")
lem_corpus = pd.read_csv(lem_corpus_path, header=None)[0]


In [35]:
# CountVectorizer converts a collection of text documents into token counts
from sklearn.feature_extraction.text import CountVectorizer

# Same default vectorizer as before, this time fitted on `lem_corpus`;
# note that this rebinds `vectorizer` to a new instance
vectorizer = CountVectorizer()
bow_lem_matrix = vectorizer.fit_transform(lem_corpus)

# Rows are documents, columns are vocabulary terms
print(bow_lem_matrix.shape)


(6959, 5223)


In [36]:
# Three short sentences used as a toy corpus for inspecting the BoW matrix.
# NOTE: this rebinds `corpus`, replacing the tagline Series assigned earlier
# in the notebook.
lion_sentences = (
    "The lion is the king of the jungle",
    "Lions have lifespans of a decade",
    "The lion is an endangered species",
)
corpus = list(lion_sentences)


In [37]:
# Fit a fresh default vectorizer on the current `corpus`
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(corpus)

# Densify the sparse count matrix and label every column with the
# vocabulary term it counts
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Show the document-term table
print(bow_df)


   an  decade  endangered  have  is  jungle  king  lifespans  lion  lions  of  \
0   0       0           0     0   1       1     1          0     1      0   1   
1   0       1           0     1   0       0     0          1     0      1   1   
2   1       0           1     0   1       0     0          0     1      0   0   

   species  the  
0        0    3  
1        0    0  
2        1    1  


## Naive-Bayes BoW classifier


In [38]:
from sklearn.model_selection import train_test_split

# Load the labelled reviews and pull out the text and label columns
reviews_path = os.path.join(DATA_PATH, "reviews.csv")
reviews = pd.read_csv(reviews_path)
review_texts = reviews["review"]
sentiment_labels = reviews["sentiment"]

# Fixed random_state keeps the split reproducible.
# NOTE(review): test_size=0.75 holds out 75% of the rows for testing and
# trains on the remaining 25% — confirm this ratio is intended.
X_train, X_test, y_train, y_test = train_test_split(
    review_texts, sentiment_labels, test_size=0.75, random_state=42
)


In [39]:
# CountVectorizer converts a collection of text documents into token counts
from sklearn.feature_extraction.text import CountVectorizer

# Lowercase the text and drop scikit-learn's built-in English stop words
vectorizer = CountVectorizer(lowercase=True, stop_words="english")

# The vocabulary is learned from the training reviews only; the test reviews
# are projected onto that same vocabulary so both matrices share columns
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Report (documents, features) for each split
for bow_split in (X_train_bow, X_test_bow):
    print(bow_split.shape)


(250, 7887)
(750, 7887)


In [40]:
from sklearn.naive_bayes import MultinomialNB


In [41]:
# Train a multinomial naive Bayes model (smoothing alpha=0.1) on the BoW counts;
# fit() returns the estimator itself, so construction and training can be chained
clf = MultinomialNB(alpha=0.1).fit(X_train_bow, y_train)

# Mean accuracy on the held-out reviews
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Classify a single unseen review: vectorize it with the fitted vocabulary,
# then take the first (only) prediction
review = "The movie was terrible. The music was underwhelming and the acting mediocre."
review_bow = vectorizer.transform([review])
prediction = clf.predict(review_bow)[0]
print("The sentiment predicted by the classifier is %i" % (prediction))


The accuracy of the classifier on the test set is 0.741
The sentiment predicted by the classifier is 0


## N-Gram Models


In [48]:
# Load the tagline corpus explicitly under its own name. By this point in the
# notebook `corpus` has been rebound to the three-sentence lion example, so
# fitting on `corpus` would make the feature counts below depend on the order
# in which cells happened to be executed.
tagline_corpus = pd.read_csv(os.path.join(DATA_PATH, "corpus.csv")).fillna("")[
    "tagline"
]

# Generate n-grams up to n=1 (unigrams only)
vectorizer_ng1 = CountVectorizer(ngram_range=(1, 1))
ng1 = vectorizer_ng1.fit_transform(tagline_corpus)

# Generate n-grams up to n=2 (unigrams and bigrams)
vectorizer_ng2 = CountVectorizer(ngram_range=(1, 2))
ng2 = vectorizer_ng2.fit_transform(tagline_corpus)

# Generate n-grams up to n=3 (unigrams, bigrams and trigrams)
vectorizer_ng3 = CountVectorizer(ngram_range=(1, 3))
ng3 = vectorizer_ng3.fit_transform(tagline_corpus)

# Print the number of features (matrix columns) for each model
print(
    "ng1, ng2 and ng3 have %i, %i and %i features respectively"
    % (ng1.shape[1], ng2.shape[1], ng3.shape[1])
)


ng1, ng2 and ng3 have 6614, 37100 and 76881 features respectively


In [49]:
# Build a dedicated unigram+bigram vectorizer for the reviews instead of
# aliasing `vectorizer_ng2`: fit_transform refits the object it is called on,
# so reusing that instance would silently replace the vocabulary that `ng2`
# was built with.
ng_vectorizer = CountVectorizer(ngram_range=(1, 2))

# Learn the n-gram vocabulary from the training reviews only, then project
# the test reviews onto the same columns
X_train_ng = ng_vectorizer.fit_transform(X_train)
X_test_ng = ng_vectorizer.transform(X_test)


In [50]:
# Train a multinomial naive Bayes model (smoothing alpha=0.1) on the n-gram
# counts; fit() returns the estimator, so construction and training are chained
clf_ng = MultinomialNB(alpha=0.1).fit(X_train_ng, y_train)

# Mean accuracy on the held-out reviews
accuracy = clf_ng.score(X_test_ng, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Classify a single unseen review: vectorize it with the fitted n-gram
# vocabulary, then take the first (only) prediction
review = (
    "The movie was not good. The plot had several holes and the acting lacked panache."
)
review_ng = ng_vectorizer.transform([review])
prediction = clf_ng.predict(review_ng)[0]
print("The sentiment predicted by the classifier is %i" % (prediction))


The accuracy of the classifier on the test set is 0.740
The sentiment predicted by the classifier is 0


In [51]:
import time

# `df` is a second name for the same `reviews` DataFrame object (no copy is made);
# the timing cells below read the "review" and "sentiment" columns through it.
df = reviews


In [53]:
# Time the whole pipeline (split, vectorize, fit, score). perf_counter() is
# the clock intended for measuring short durations, unlike the wall-clock
# time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Splitting the data into training and test sets; stratify keeps the
# sentiment class proportions the same in both halves
train_X, test_X, train_y, test_y = train_test_split(
    df["review"],
    df["sentiment"],
    test_size=0.5,
    random_state=42,
    stratify=df["sentiment"],
)

# Generating ngrams: (1, 1) is the CountVectorizer default, i.e. unigrams
# only; it is spelled out here so the contrast with the (1, 3) run is explicit.
# Note that train_X / test_X are rebound from raw text to count matrices.
vectorizer = CountVectorizer(ngram_range=(1, 1))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB(alpha=0.1)
clf.fit(train_X, train_y)

# Score before reading the clock so the reported time covers scoring too
test_accuracy = clf.score(test_X, test_y)
elapsed_seconds = time.perf_counter() - start_time

# Print accuracy, time and number of dimensions
print(
    "The program took %.3f seconds to complete. The accuracy on the test set is %.2f. The ngram representation had %i features."
    % (elapsed_seconds, test_accuracy, train_X.shape[1])
)


The program took 0.192 seconds to complete. The accuracy on the test set is 0.74. The ngram representation had 12347 features.


In [54]:
# Time the whole pipeline (split, vectorize, fit, score). perf_counter() is
# the clock intended for measuring short durations, unlike the wall-clock
# time.time(), which can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Splitting the data into training and test sets; stratify keeps the
# sentiment class proportions the same in both halves
train_X, test_X, train_y, test_y = train_test_split(
    df["review"],
    df["sentiment"],
    test_size=0.5,
    random_state=42,
    stratify=df["sentiment"],
)

# Generating ngrams: unigrams, bigrams and trigrams.
# Note that train_X / test_X are rebound from raw text to count matrices.
vectorizer = CountVectorizer(ngram_range=(1, 3))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB(alpha=0.1)
clf.fit(train_X, train_y)

# Score before reading the clock so the reported time covers scoring too
test_accuracy = clf.score(test_X, test_y)
elapsed_seconds = time.perf_counter() - start_time

# Print accuracy, time and number of dimensions
print(
    "The program took %.3f seconds to complete. The accuracy on the test set is %.2f. The ngram representation had %i features."
    % (elapsed_seconds, test_accuracy, train_X.shape[1])
)


The program took 1.135 seconds to complete. The accuracy on the test set is 0.75. The ngram representation had 178240 features.
