In [None]:
pip install nltk scikit-learn




In [None]:
# Import Data
import nltk
from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus into the local nltk_data directory
# (a no-op when the package is already up to date). The corpus reader
# imported above is only resolved on first use, so importing it before the
# download is safe.
nltk.download('movie_reviews')


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [None]:
# Prepare and Load Data
# Collect one (token_list, category) pair per review file, walking the corpus
# category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        tokens = list(movie_reviews.words(fileid))
        documents.append((tokens, category))


In [None]:
# X: each review's tokens re-joined into a single space-separated string.
# y: the category label paired with each review, in the same order as X.
X = list(map(' '.join, (tokens for tokens, _ in documents)))
y = [pair[1] for pair in documents]


In [None]:
# Split - Data
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation.
# - stratify=y keeps the class ratio the same in the train and test splits,
#   so the balance is guaranteed rather than left to the luck of the shuffle.
# - random_state fixes the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from collections import Counter

# Sanity check: how many reviews of each label ended up in each split.
train_label_counts = Counter(y_train)
test_label_counts = Counter(y_test)
print("y_train label count:", train_label_counts)
print("y_test label count:", test_label_counts)


y_train label count: Counter({'pos': 800, 'neg': 800})
y_test label count: Counter({'neg': 200, 'pos': 200})


In [None]:
# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts. The vocabulary is learned from the training reviews
# only; the held-out reviews are encoded with that same fitted vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)


In [None]:
# Model Training - Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count matrix and
# its labels.
model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)


In [None]:
# Model Evaluation
from sklearn import metrics

# Predict labels for the held-out reviews, then report the overall accuracy
# followed by the per-class precision / recall / F1 table.
y_pred = model.predict(X_test_vec)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
test_report = metrics.classification_report(y_test, y_pred)
print(test_accuracy)
print(test_report)


0.8175
              precision    recall  f1-score   support

         neg       0.82      0.81      0.82       200
         pos       0.81      0.82      0.82       200

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [None]:
# Side-by-side look at the first ten held-out reviews: predicted vs. true label.
preview_size = 10
preview_predictions = model.predict(X_test_vec[:preview_size])
preview_labels = y_test[:preview_size]
print("Test predictions (first 10):", preview_predictions)
print("Test labels (first 10):", preview_labels)


Test predictions (first 10): ['neg' 'neg' 'pos' 'pos' 'neg' 'pos' 'pos' 'neg' 'neg' 'pos']
Test labels (first 10): ['neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos']


In [None]:
# Test with new input
# Encode a hand-written sentence with the already-fitted vectorizer and
# classify it with the count-based model.
user_input = ["I absolutely loved the movie! It was fantastic and inspiring"]
user_vec = vectorizer.transform(user_input)
user_prediction = model.predict(user_vec)
print(user_prediction)


['pos']


In [None]:
# Inspect the encoded user sentence from the previous cell.
print("User vector features:", user_vec)

# Round-trip the first training review through the vectorizer and the model,
# and compare the prediction with its true label.
first_train_review = X_train[0]
print("Training sample:", first_train_review)
sample_vec = vectorizer.transform([first_train_review])
sample_prediction = model.predict(sample_vec)[0]
print("Prediction for training sample:", sample_prediction)
print("True label for training sample:", y_train[0])


User vector features: <Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (1, 36352)>
  Coords	Values
  (0, 673)	1
  (0, 1633)	1
  (0, 11725)	1
  (0, 16663)	1
  (0, 17102)	1
  (0, 19181)	1
  (0, 21210)	1
  (0, 32338)	1
  (0, 35202)	1
Training sample: while watching loser , it occurred to me that amy heckerling ' s true genius as a film - maker is casting . in fast times at ridgemont high , she gave us sean penn ' s jeff spicoli ; in look who ' s talking , she turned bruce willis into a wise - cracking baby and provided john travolta with is first career revival ; in clueless , she found a star vehicle for the adorableness that is ( or was ) alicia silverstone . she seems to understand instinctively how to find performers the audience will like in spite of their flaws . unfortunately , she may also be starting to understand that she understands . giving appealing actors an appealing script creates likeable movies . giving appealing actors a script in 

In [None]:
# A handful of hand-written sentences to classify with the count-based model.
examples = [
    "I absolutely loved the movie! It was fantastic and inspiring.",
    "That was a terrible film, completely disappointing and boring.",
    "The plot was interesting but the acting was mediocre.",
    "I wouldn't recommend this to anyone.",
    "Everyone should watch this masterpiece!",
]

# Encode all sentences at once with the fitted vectorizer, then predict.
example_vec = vectorizer.transform(examples)
example_predictions = model.predict(example_vec)
print(example_predictions)


['pos' 'neg' 'neg' 'neg' 'pos']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same pipeline as before, but with TF-IDF weighted features instead of raw
# counts. The vectorizer is fitted on the training reviews only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

# A separate Naive Bayes model trained on the TF-IDF features.
model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)

# Classify one hand-written sentence with the TF-IDF model.
sample_sentence = ["I absolutely loved the movie! It was fantastic and inspiring."]
sample_tfidf_vec = tfidf_vectorizer.transform(sample_sentence)
tfidf_prediction = model_tfidf.predict(sample_tfidf_vec)[0]
print("TF-IDF Model Prediction:", tfidf_prediction)


TF-IDF Model Prediction: pos
