In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
import random

In [3]:
import nltk
nltk.download('movie_reviews')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [4]:
# Load the dataset from nltk
reviews = [(movie_reviews.raw(fileid), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

# Shuffle the reviews to ensure randomness
random.shuffle(reviews)

# Convert to DataFrame
df = pd.DataFrame(reviews, columns=['review', 'sentiment'])

In [5]:
# Features and Labels
X = df['review']
y = df['sentiment']

# Text Vectorization
vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.25, random_state=42)



In [6]:
# Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 82.60%


In [7]:
# Predicting a new review
new_reviews = ["The movie was awesome!", "It was a boring movie."]
new_reviews_vectorized = vectorizer.transform(new_reviews)
predictions = model.predict(new_reviews_vectorized)

for review, sentiment in zip(new_reviews, predictions):
    print(f"Review: '{review}' -> Sentiment: {sentiment}")

Review: 'The movie was awesome!' -> Sentiment: pos
Review: 'It was a boring movie.' -> Sentiment: neg
