# Simple Example of Bag Of Words


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the documents
documents = [
    "This is good, is it not?",
    "This is bad",
    "This is awesome"
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the documents into a document-term matrix
bow_matrix = vectorizer.fit_transform(documents)

# Get the feature names (vocabulary)
vocabulary = vectorizer.get_feature_names_out()

# Convert the matrix to an array
bow_array = bow_matrix.toarray()

print("Vocabulary:", vocabulary)

# Create a list of dictionaries for each document
data = []
for i, doc in enumerate(documents):
    tokens = doc.split()  # Simple tokenization (may not handle punctuation properly)
    word_vector = bow_array[i]
    data.append({"Document": doc, "Tokens": tokens, f"Word Vector {vocabulary}": word_vector})

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)

# Print the DataFrame
df


Vocabulary: ['awesome' 'bad' 'good' 'is' 'it' 'not' 'this']


Unnamed: 0,Document,Tokens,Word Vector ['awesome' 'bad' 'good' 'is' 'it' 'not' 'this']
0,"This is good, is it not?","[This, is, good,, is, it, not?]","[0, 0, 1, 2, 1, 1, 1]"
1,This is bad,"[This, is, bad]","[0, 1, 0, 1, 0, 0, 1]"
2,This is awesome,"[This, is, awesome]","[1, 0, 0, 1, 0, 0, 1]"


# Applications of Bag Of Words (BOW)

## Document Classification

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Sample documents and their corresponding labels (e.g., spam or not spam)
documents = [
    "Win a free iPhone today!",
    "Exclusive offer just for you",
    "Dear friend, I need your help",
    "Your account has been hacked",
    "Congratulations! You've won a lottery"
]
labels = ['spam', 'spam', 'not spam', 'not spam', 'spam']

# Convert the documents into a Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
y = labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Naive Bayes classifier on the training data
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the spam/ham for a new email
new_email = "win a lottery"
#new_email = "a leave application"

new_email_vectorized = vectorizer.transform([new_email])
prediction = classifier.predict(new_email_vectorized)
print("Prediction for new email:", prediction)


Prediction for new email: ['not spam']


## Document Classficiation Example 2

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Sample documents and their corresponding labels (e.g., positive or negative)
documents = [
    "I love this movie. It is fantastic!",
    "Absolutely terrible. I will never watch this again.",
    "Best film ever. I highly recommend it.",
    "Awful. Complete waste of time.",
    "Such a bad movie",
    "I enjoyed this movie. It was great!"
]
labels = ['positive', 'negative', 'positive', 'negative','negative', 'positive']

# Convert the documents into a Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
y = labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Multinomial Naive Bayes classifier on the training data
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

new_review = "a horrible movie"
#new_review = "great music"

new_review_vectorized = vectorizer.transform([new_review])
prediction = classifier.predict(new_review_vectorized)
print("Prediction for new review:", prediction)



Prediction for new review: ['positive']


## Sentiment Analysis

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Sample reviews and their corresponding sentiments (positive or negative)
reviews = [
    "This movie was fantastic! I loved it.",
    "Horrible movie. I hated it.",
    "The plot was boring, but the acting was good.",
    "I wouldn't recommend this movie.",
    "Definitely a must-watch!"
]
sentiments = ['positive', 'negative', 'neutral', 'negative', 'positive']

# Convert the reviews into a Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
y = sentiments

# Train a Logistic Regression classifier on the data
classifier = LogisticRegression()
classifier.fit(X, y)

# Predict the sentiment for a new review
new_review = "The movie was not bad, but it could have been better."
new_review_vectorized = vectorizer.transform([new_review])
prediction = classifier.predict(new_review_vectorized)
print("Prediction for new review:", prediction)


Prediction for new review: ['negative']


## Sentiment Analaysis using TextBlob

In [25]:
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

# Sample reviews
reviews = [
    "I love this product. It is amazing!",
    "Terrible service. I will never come back.",
    "The food was delicious. Highly recommend!",
    "Worst experience ever. Do not go there.",
    "Great movie. I enjoyed it a lot!"
]

# Convert the reviews into a Bag of Words model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)

# Analyze the sentiment of each review
for review, vector in zip(reviews, X.toarray()):
    blob = TextBlob(review)
    sentiment = blob.sentiment.polarity
    print(f"Review: {review}")
    print(f"Vector: {vector}")
    print(f"Sentiment: {'Positive' if sentiment > 0 else 'Negative' if sentiment < 0 else 'Neutral'}")
    print()


Review: I love this product. It is amazing!
Vector: [1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0]
Sentiment: Positive

Review: Terrible service. I will never come back.
Vector: [0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0]
Sentiment: Negative

Review: The food was delicious. Highly recommend!
Vector: [0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0]
Sentiment: Positive

Review: Worst experience ever. Do not go there.
Vector: [0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1]
Sentiment: Negative

Review: Great movie. I enjoyed it a lot!
Vector: [0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
Sentiment: Positive

