# Bag of words model

1. Extract word tokens
2. Compute frequency of word tokens
3. Construct a word vector out of these frequencies and the vocabulary of the corpus

## Text preprocessing
Lions, lion ===> lion

The, the ==> the

No punctuation

No stop words

In [5]:
import pandas as pd

In [1]:
# Toy corpus: three short sentences, one document per list entry
corpus = [
    "There was a big lion.",
    "After 10 days he died.",
    "After 15 days lion reborn.",
]

# Bag of words

In [4]:
# CountVectorizer turns raw text into a document-term matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer with default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus, then encode each sentence as a row of counts
vectorizer.fit(corpus)
bow_matrix = vectorizer.transform(corpus)

# Shape is (number of sentences, vocabulary size)
print(bow_matrix.shape)  # 3 sentences, 11 unique words


(3, 11)


In [8]:
# Convert the sparse bow_matrix into a dense DataFrame with one column per
# vocabulary term. get_feature_names_out() replaces get_feature_names(), which
# was deprecated in scikit-learn 1.0 and removed in 1.2.
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Display bow_df. The word 'a' is missing because the default token pattern only
# keeps tokens of two or more characters. Common words such as 'was' could also
# be dropped by passing stop_words to CountVectorizer.
bow_df

Unnamed: 0,10,15,after,big,days,died,he,lion,reborn,there,was
0,0,0,0,1,0,0,0,1,0,1,1
1,1,0,1,0,1,1,1,0,0,0,0
2,0,1,1,0,1,0,0,1,1,0,0


# Building a Naive Bayes Classifier (Bag of words)


# spam filtering

In [None]:
# Import CountVectorizer (repeated here so the cell is self-contained)
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer that keeps the original casing and drops English stop words.
# NOTE(review): with lowercase=False, capitalised stop words (e.g. "The") may not
# match the built-in lowercase stop list -- confirm this is intended.
vectorizer = CountVectorizer(stop_words='english', lowercase=False)

# Learn the vocabulary from the training texts and encode them.
# X_train / X_test are assumed to come from an earlier cell -- TODO confirm.
X_train_bow = vectorizer.fit_transform(X_train)

# Encode the test texts with the vocabulary learned from the training texts
X_test_bow = vectorizer.transform(X_test)

# Report the shape of each matrix, training first, then test
for bow in (X_train_bow, X_test_bow):
    print(bow.shape)

In [None]:
# Import MultinomialNB in this cell: the notebook's only other import of it sits
# in a later cell, so a top-to-bottom run would otherwise fail with a NameError.
from sklearn.naive_bayes import MultinomialNB

# Create a MultinomialNB object
clf = MultinomialNB()

# Fit the classifier on the bag-of-words training matrix
# (X_train_bow / y_train are assumed to come from earlier cells -- TODO confirm)
clf.fit(X_train_bow, y_train)

# Measure the accuracy on the held-out test matrix
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review; the text is encoded with the
# already-fitted vectorizer so its columns line up with the training features.
# NOTE(review): the %i format below assumes integer class labels -- confirm.
review = "The movie was terrible. The music was underwhelming and the acting mediocre."
prediction = clf.predict(vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

# N-gram model

Bag of words is a special case of the n-gram model (n = 1).

Applications:

1. Sentence completion
2. Spelling correction
3. Machine translation correction

Problems:
Curse of dimensionality (the feature dimension is already high for bag of words and grows further with n)

Higher-order n-grams are rare

Keep n small

In [10]:
# Show the toy corpus again; the n-gram vectorizers below are fitted on it
corpus

['There was a big lion.',
 'After 10 days he died.',
 'After 15 days lion reborn.']

In [20]:
# Three vectorizers that differ only in the largest n-gram they extract:
# unigrams only, unigrams + bigrams, and unigrams + bigrams + trigrams
vectorizer_ng1 = CountVectorizer(ngram_range=(1, 1))
vectorizer_ng2 = CountVectorizer(ngram_range=(1, 2))
vectorizer_ng3 = CountVectorizer(ngram_range=(1, 3))

# Fit each vectorizer on the same corpus and keep the resulting count matrices
ng1 = vectorizer_ng1.fit_transform(corpus)
ng2 = vectorizer_ng2.fit_transform(corpus)
ng3 = vectorizer_ng3.fit_transform(corpus)

# The column count of each matrix is the number of distinct n-grams found
feature_counts = (ng1.shape[1], ng2.shape[1], ng3.shape[1])
print("ng1, ng2 and ng3 have %i, %i and %i features respectively" % feature_counts)

ng1, ng2 and ng3 have 11, 22 and 30 features respectively


In [24]:
from sklearn.naive_bayes import MultinomialNB 

In [30]:
# Multinomial Naive Bayes classifier for the unigram features
clf_ng = MultinomialNB()

# Dense unigram counts of the three corpus sentences, with one hand-assigned label each
train_counts = ng1.toarray()
train_labels = [1, 1, 0]
clf_ng.fit(train_counts, train_labels)

# The score is computed on the same sentences and labels used for fitting,
# so this is training accuracy even though the message says "test set"
accuracy = clf_ng.score(train_counts, train_labels)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Classify a new sentence, encoded with the unigram vectorizer fitted on the corpus
review = "The lion was not good."
review_counts = vectorizer_ng1.transform([review])
prediction = clf_ng.predict(review_counts)[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

The accuracy of the classifier on the test set is 1.000
The sentiment predicted by the classifier is 1


# Comparison

In [None]:
# `time` and `train_test_split` are not imported anywhere else in the notebook,
# so they are imported here to keep the cell runnable on a fresh kernel.
import time

from sklearn.model_selection import train_test_split

# Baseline run: plain bag of words (CountVectorizer defaults, i.e. unigrams only).
# `df` is assumed to be a DataFrame with 'review' and 'sentiment' columns loaded
# in an earlier cell -- TODO confirm.
start_time = time.time()

# Split the data 50/50 into training and test sets, stratified on the label;
# the fixed random_state keeps the split reproducible
train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'], 
                                                    test_size    = 0.5, 
                                                    random_state = 42, 
                                                    stratify     = df['sentiment'])

# Build the vocabulary from the training texts only, then encode both splits.
# Note that train_X / test_X are rebound from raw text to count matrices here.
vectorizer = CountVectorizer()
train_X    = vectorizer.fit_transform(train_X)
test_X     = vectorizer.transform(test_X)

# Fit classifier
clf        = MultinomialNB()
clf.fit(train_X, train_y)

# Print elapsed time (split + vectorization + fit), test accuracy and number of features
print('''The program took %.3f seconds to complete. The accuracy on the test set is %.2f. 
      The ngram representation had %i features.''' % 
      (time.time() - start_time, 
       clf.score(test_X, test_y), 
       train_X.shape[1]))

In [None]:
# `time` and `train_test_split` are not imported anywhere else in the notebook,
# so they are imported here to keep the cell runnable on a fresh kernel.
import time

from sklearn.model_selection import train_test_split

# N-gram run: same pipeline as the bag-of-words baseline, but with n-grams up to n=3.
# `df` is assumed to be a DataFrame with 'review' and 'sentiment' columns loaded
# in an earlier cell -- TODO confirm.
start_time = time.time()

# Split the data 50/50 into training and test sets, stratified on the label;
# the fixed random_state keeps the split reproducible
train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'],
                                                    test_size=0.5,
                                                    random_state=42,
                                                    stratify=df['sentiment'])

# Build a vocabulary of unigrams, bigrams and trigrams from the training texts
# only, then encode both splits. train_X / test_X are rebound to count matrices.
vectorizer = CountVectorizer(ngram_range=(1,3))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB()
clf.fit(train_X, train_y)

# Print elapsed time (split + vectorization + fit), test accuracy and number of features
print("""The program took %.3f seconds to complete. The accuracy on the test set is %.2f. 
The ngram representation had %i features.""" % 
      (time.time() - start_time, 
       clf.score(test_X, test_y), 
       train_X.shape[1]))