In [1]:
pip install gensim scikit-learn nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:

import nltk
from nltk.corpus import brown, movie_reviews
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

import random
import os

# Fetch the NLTK corpora / tokenizer data requested by this notebook.
for resource in ('brown', 'movie_reviews', 'punkt'):
    nltk.download(resource)

# 'punkt_tab' is treated as optional: a failure here must not stop the
# notebook, but the reason is reported instead of being silently discarded.
# `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
# propagate so the cell can still be interrupted.
try:
    nltk.download('punkt_tab')
except Exception as exc:
    print(f"Optional NLTK resource 'punkt_tab' could not be downloaded: {exc}")


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\najmulu\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\najmulu\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\najmulu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\najmulu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [4]:
print("=== 1) Training Word2Vec on Brown corpus ===")

# Lower-case every token of every Brown sentence so that differently-cased
# spellings of a word are trained as a single vocabulary entry.
sentences = [[word.lower() for word in sentence] for sentence in brown.sents()]

model_path = "word2vec_brown.model"
print("Training Word2Vec (this can take a moment)...")
# NOTE(review): per the gensim documentation, multi-threaded training
# (workers > 1) is not exactly reproducible between runs; use workers=1 if
# the exact vector values matter.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

# Save and immediately reload to check that the persisted model is usable.
model.save(model_path)
print(f"Model saved to: {os.path.abspath(model_path)}")

model = Word2Vec.load(model_path)
print("Model reloaded successfully.")

# Membership is checked first: indexing model.wv with a word that is not in
# the trained vocabulary would raise instead of returning a vector.
if 'king' in model.wv:
    vector = model.wv['king']
    print(f"Vector for 'king' (dim={len(vector)}):\n{vector}")
else:
    print("The word 'king' is not in the Brown vocabulary with current settings.")

w1, w2 = 'king', 'queen'
# Collect whichever of the two words is absent so the message names exactly
# those words (derived from w1/w2, not from hard-coded literals).
missing = [word for word in (w1, w2) if word not in model.wv]
if not missing:
    similarity = model.wv.similarity(w1, w2)
    print(f"Similarity between {w1} and {w2}: {similarity}")
else:
    print(f"Cannot compute similarity: missing {', '.join(missing)}")


=== 1) Training Word2Vec on Brown corpus ===
Training Word2Vec (this can take a moment)...
Model saved to: C:\Users\najmulu\word2vec_brown.model
Model reloaded successfully.
Vector for 'king' (dim=100):
[ 9.58590060e-02  4.98979241e-01  1.36356652e-01  1.30096570e-01
 -3.52803797e-01 -1.70505583e-01  3.90085489e-01  3.47632170e-01
 -4.12792675e-02  1.31814955e-02 -2.67216023e-02 -1.97483420e-01
 -3.10967416e-01 -3.81479189e-02 -1.21954411e-01 -1.09172881e-01
  6.79948106e-02 -3.04841191e-01 -3.16980243e-01 -5.37305892e-01
  2.94380635e-01  3.13377470e-01  4.45572764e-01 -1.80112347e-01
 -1.01373836e-01 -6.58368543e-02 -5.09006679e-01 -1.56280547e-01
  2.18618095e-01  1.96103491e-02  4.84882265e-01  1.00269258e-01
  2.18198612e-01 -6.70234025e-01  1.68403536e-01  1.82396416e-02
  1.54398456e-01  2.05432534e-01 -2.24132627e-01 -1.62212580e-01
  1.85362354e-01 -3.61309856e-01  8.63816869e-03  2.18285531e-01
  2.46094555e-01 -2.28801189e-04 -2.49191821e-01 -1.50018841e-01
  4.89422262e-01 

In [5]:
print("\n=== 2) Naive Bayes with CountVectorizer on movie reviews ===")

# Single seed for both the document shuffle and the train/test split, so the
# reported metrics are the same on every Restart & Run All.
SPLIT_SEED = 42

# One (tokens, category) pair per review file, for every corpus category.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# A private, seeded RNG makes the shuffle repeatable without touching the
# global `random` state.
random.Random(SPLIT_SEED).shuffle(documents)
reviews, labels = zip(*documents)
# The vectorizers below expect one string per document, so re-join the tokens.
reviews = [" ".join(tokens) for tokens in reviews]

# Stratify so both classes keep the same proportion in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=SPLIT_SEED, stratify=labels
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts  = vectorizer.transform(X_test)

# Train Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_counts, y_train)

y_pred = nb_classifier.predict(X_test_counts)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print("\nClassification report (CountVectorizer):")
# The labels are already strings, so the report rows are named from the
# labels themselves; no separate target_names list (whose order would have to
# match the label order) is needed.
print(classification_report(y_test, y_pred))

# Classify one unseen review using the vocabulary learned above.
new_review = ["very bad movie, waste of time."]
new_review_counts = vectorizer.transform(new_review)
prediction = nb_classifier.predict(new_review_counts)
# Print the predicted label itself rather than mapping "anything that is not
# 'pos'" to "neg".
print(f'Prediction: {prediction[0]}')



=== 2) Naive Bayes with CountVectorizer on movie reviews ===
Accuracy: 0.83

Classification report (CountVectorizer):
              precision    recall  f1-score   support

         neg       0.82      0.84      0.83       200
         pos       0.84      0.82      0.83       200

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

Prediction: neg


In [7]:
print("\n=== 3) Naive Bayes with TfidfVectorizer (tutorial task) ===")

# Same train/test split as the CountVectorizer experiment (X_train, X_test,
# y_train, y_test come from that cell); only the features change.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf  = tfidf_vectorizer.transform(X_test)

nb_tfidf = MultinomialNB()
nb_tfidf.fit(X_train_tfidf, y_train)

y_pred_tfidf = nb_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f'Accuracy (TF-IDF): {accuracy_tfidf:.2f}')
print("\nClassification report (TF-IDF):")
# The labels are already strings, so the report rows are named from the
# labels themselves; no separate target_names list (whose order would have to
# match the label order) is needed.
print(classification_report(y_test, y_pred_tfidf))

# Classify the same unseen review that the CountVectorizer cell uses.
new_review_tfidf = tfidf_vectorizer.transform(["very bad movie, waste of time."])
prediction_tfidf = nb_tfidf.predict(new_review_tfidf)
# Print the predicted label itself rather than mapping "anything that is not
# 'pos'" to "neg".
print(f'Prediction (TF-IDF): {prediction_tfidf[0]}')

# `accuracy` is the CountVectorizer accuracy computed in the previous cell.
print("\n=== Side-by-side accuracy comparison ===")
print(f"CountVectorizer accuracy: {accuracy:.2f}")
print(f"TfidfVectorizer accuracy: {accuracy_tfidf:.2f}")



=== 3) Naive Bayes with TfidfVectorizer (tutorial task) ===
Accuracy (TF-IDF): 0.83

Classification report (TF-IDF):
              precision    recall  f1-score   support

         neg       0.79      0.90      0.84       200
         pos       0.88      0.76      0.82       200

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

Prediction (TF-IDF): neg

=== Side-by-side accuracy comparison ===
CountVectorizer accuracy: 0.83
TfidfVectorizer accuracy: 0.83
