In [39]:
# Import required libraries
import numpy as np
import pandas as pd
import nltk

# Fetch the NLTK movie-review corpus, then expose its corpus reader
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Collect one (token list, sentiment label) pair per review file,
# walking the corpus category by category
docs = []
for label in movie_reviews.categories():
    for review_id in movie_reviews.fileids(label):
        tokens = list(movie_reviews.words(review_id))
        docs.append((tokens, label))

print("Total samples:", len(docs))

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Total samples: 2000


In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Join each review's tokens back into a single whitespace-separated string
X = [" ".join(words) for words, category in docs]
y = [category for words, category in docs]

# Split the raw text BEFORE vectorizing so the held-out reviews cannot
# influence the vocabulary the model is trained on (avoids train/test leakage).
# The same test_size and random_state are used as before, applied to the same
# number of samples.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary from the training reviews only, then
# encode the held-out reviews with that fixed vocabulary
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus count matrix, kept under its original name for any later cell
# that still refers to it; it uses the training-only vocabulary
X_vec = vectorizer.transform(X)

In [41]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split
# (fit() returns the fitted estimator, so construction and training chain)
model = MultinomialNB().fit(X_train, y_train)

print("Training complete!")

Training complete!


In [42]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out split and report the fraction that match
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.8225


In [43]:
# Hand-written sentences to classify.
# NOTE(review): these do not read like movie reviews, so the predicted labels
# are presumably out-of-domain for this model and should not be over-interpreted.
sample = [
    "Iwohub Media is a reputable company in the world",
    "We deal with scholarship and education",
    "Iwohub Media has help many student in the World",
]

# Encode with the already-fitted vectorizer, then predict a label per sentence
sample_vec = vectorizer.transform(sample)
sample_pred = model.predict(sample_vec)
print(sample_pred)

['neg' 'neg' 'pos']
