In [93]:
import pandas as pd
import random

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


1st part , generating a database using templates

In [94]:
positive_templates = [
    "This movie was {adjective1} and {adjective2}!",
    "Absolutely {adjective1}! A must-watch for everyone.",
    "I found it to be very {adjective1} and {adjective2}.",
    "What a {adjective1} film, truly {adjective2}.",
    "Highly recommended! {adjective1} performances and a {adjective2} plot.",
    "A {adjective1} cinematic experience.",
    "So {adjective1}, I couldn't stop watching. Truly {adjective2}.",
    "Loved every {adjective1} minute of it.",
    "Pure {adjective1} entertainment.",
    "Simply {adjective1} and {adjective2}."
]

negative_templates = [
    "This movie was {adjective1} and {adjective2}.",
    "Absolutely {adjective1}! Don't waste your time.",
    "I found it to be very {adjective1} and {adjective2}.",
    "What a {adjective1} film, truly {adjective2}.",
    "Not recommended. {adjective1} performances and a {adjective2} plot.",
    "A {adjective1} cinematic experience.",
    "So {adjective1}, I had to turn it off. Truly {adjective2}.",
    "Regretted every {adjective1} minute of it.",
    "Pure {adjective1} disappointment.",
    "Simply {adjective1} and {adjective2}."
]

positive_adjectives = [
    "amazing", "fantastic", "brilliant", "great", "excellent",
    "superb", "wonderful", "enjoyable", "captivating", "mesmerizing",
    "thrilling", "insightful", "compelling", "memorable", "spectacular"
]

negative_adjectives = [
    "terrible", "horrible", "bad", "awful", "disappointing",
    "boring", "dull", "uninspired", "mediocre", "confusing",
    "predictable", "pointless", "unwatchable", "sloppy", "frustrating"
]

In [95]:
def generate_review(sentiment_type):
    if sentiment_type == "positive":
        template = random.choice(positive_templates)
        adj1 = random.choice(positive_adjectives)
        adj2 = random.choice(positive_adjectives)
    else:
        template = random.choice(negative_templates)
        adj1 = random.choice(negative_adjectives)
        adj2 = random.choice(negative_adjectives)
    return template.format(adjective1=adj1, adjective2=adj2)

In [96]:
positive_reviews_data = []
for _ in range(50):
    positive_reviews_data.append({
        "review": generate_review("positive"),
        "sentiment": "positive"
    })

negative_reviews_data = []
for _ in range(50):
    negative_reviews_data.append({
        "review": generate_review("negative"),
        "sentiment": "negative"
    })

In [97]:
all_reviews_data = positive_reviews_data + negative_reviews_data
random.shuffle(all_reviews_data)

df = pd.DataFrame(all_reviews_data)

print("Generated DataFrame:")
print(df)


Generated DataFrame:
                                              review sentiment
0               What a unwatchable film, truly dull.  negative
1   Absolutely excellent! A must-watch for everyone.  positive
2             Regretted every mediocre minute of it.  negative
3       Absolutely great! A must-watch for everyone.  positive
4                 Pure disappointing disappointment.  negative
..                                               ...       ...
95                        Pure awful disappointment.  negative
96                A uninspired cinematic experience.  negative
97               Loved every memorable minute of it.  positive
98               Regretted every awful minute of it.  negative
99                      A dull cinematic experience.  negative

[100 rows x 2 columns]


2nd part:
tokenizing the reviews into 80 % training set and 20% testing set, as asked in the question

In [98]:
vectorizer = CountVectorizer(max_features=500, stop_words='english')
X = vectorizer.fit_transform(df['review'])
y = df['sentiment'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

part 3rd:
training the model based on distribited data frame and testing its accuracy

In [99]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\n--- Multinomial Naive Bayes Classifier Training Complete ---")
print(f"Accuracy on the test set: {accuracy*100:.2f}%")


--- Multinomial Naive Bayes Classifier Training Complete ---
Accuracy on the test set: 95.00%


ps: the accuracy is mostly 100 %due to synthetic database, the accuracy still drops to 95% sometimes 

4th part

In [100]:
def predict_review_sentiment(model_param, vectorizer_param, review_text):
        review_vectorized = vectorizer_param.transform([review_text])
        predicted_sentiment = model_param.predict(review_vectorized)[0]
        return predicted_sentiment

In [101]:
test_review_positive = "This was an absolutely stunning and captivating film!"
predicted_sentiment_positive = predict_review_sentiment(model, vectorizer, test_review_positive)
print(f"Review: '{test_review_positive}' -> Predicted Sentiment: {predicted_sentiment_positive}")

# Test with a negative-leaning review
test_review_negative = "What a dull and disappointing movie, I hated it."
predicted_sentiment_negative = predict_review_sentiment(model, vectorizer, test_review_negative)
print(f"Review: '{test_review_negative}' -> Predicted Sentiment: {predicted_sentiment_negative}")

# Test with a review that might be ambiguous or out of vocabulary
test_review_ambiguous = "The movie had some good parts but was largely okay."
predicted_sentiment_ambiguous = predict_review_sentiment(model, vectorizer, test_review_ambiguous)
print(f"Review: '{test_review_ambiguous}' -> Predicted Sentiment: {predicted_sentiment_ambiguous}")

test_review_new = "A truly boring and pointless film, completely unwatchable."
predicted_sentiment_new = predict_review_sentiment(model, vectorizer, test_review_new)
print(f"Review: '{test_review_new}' -> Predicted Sentiment: {predicted_sentiment_new}")

test_review_new_pos = "Highly enjoyable and a wonderful experience."
predicted_sentiment_new_pos = predict_review_sentiment(model, vectorizer, test_review_new_pos)
print(f"Review: '{test_review_new_pos}' -> Predicted Sentiment: {predicted_sentiment_new_pos}")

Review: 'This was an absolutely stunning and captivating film!' -> Predicted Sentiment: positive
Review: 'What a dull and disappointing movie, I hated it.' -> Predicted Sentiment: negative
Review: 'The movie had some good parts but was largely okay.' -> Predicted Sentiment: positive
Review: 'A truly boring and pointless film, completely unwatchable.' -> Predicted Sentiment: negative
Review: 'Highly enjoyable and a wonderful experience.' -> Predicted Sentiment: positive
