In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import os

In [2]:
import nltk
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

texts = [" ".join(words) for words, label in documents]
labels = [1 if label == 'pos' else 0 for words, label in documents]

X_train_texts, X_test_texts, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Bluepal\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [3]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_texts)

X_train_seq = tokenizer.texts_to_sequences(X_train_texts)
X_test_seq = tokenizer.texts_to_sequences(X_test_texts)

max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [4]:
import os
import zipfile
import requests
from tqdm import tqdm

glove_zip_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"
glove_txt_path = "glove.6B.100d.txt"

if not os.path.exists(glove_zip_path):
    print("Downloading GloVe embeddings...")
    response = requests.get(glove_zip_url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    block_size = 1024
    t=tqdm(total=total_size, unit='iB', unit_scale=True)
    with open(glove_zip_path, 'wb') as f:
        for data in response.iter_content(block_size):
            t.update(len(data))
            f.write(data)
    t.close()
    print("Download completed!")

if not os.path.exists(glove_txt_path):
    print("Extracting glove.6B.100d.txt...")
    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extract('glove.6B.100d.txt', path='.')
    print("Extraction completed!")

if os.path.exists(glove_txt_path):
    print(f"{glove_txt_path} is ready to use!")
else:
    print("Error: File not found.")

Downloading GloVe embeddings...


100%|██████████| 862M/862M [02:46<00:00, 5.19MiB/s]  


Download completed!
Extracting glove.6B.100d.txt...
Extraction completed!
glove.6B.100d.txt is ready to use!


In [5]:
embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        embedding_index[word] = vector

print(f"Loaded {len(embedding_index)} word vectors.")

Loaded 400000 word vectors.


In [6]:
vocab_size = min(10000, len(tokenizer.word_index) + 1)
embedding_dim = 100

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim,
              weights=[embedding_matrix], input_length=max_len, trainable=False),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.summary()



In [8]:
history = model.fit(X_train_pad, np.array(y_train),
                    validation_split=0.2,
                    epochs=10,
                    batch_size=32)

Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.4875 - loss: 0.7022 - val_accuracy: 0.5250 - val_loss: 0.6915
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5273 - loss: 0.6914 - val_accuracy: 0.5125 - val_loss: 0.6926
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5641 - loss: 0.6904 - val_accuracy: 0.5344 - val_loss: 0.6913
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5734 - loss: 0.6898 - val_accuracy: 0.5344 - val_loss: 0.6906
Epoch 5/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5523 - loss: 0.6881 - val_accuracy: 0.5719 - val_loss: 0.6888
Epoch 6/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5820 - loss: 0.6872 - val_accuracy: 0.5437 - val_loss: 0.6891
Epoch 7/10
[1m40/40[0m [32m━━━━━━━━━

In [9]:
loss, acc = model.evaluate(X_test_pad, np.array(y_test))
print(f"Test Accuracy: {acc:.4f}")

def predict_sentiment_glove(text):
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    pred = model.predict(pad)[0][0]
    sentiment = "Positive 😀" if pred >= 0.5 else "Negative 😞"
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5900 - loss: 0.6799
Test Accuracy: 0.5900


In [10]:
predict_sentiment_glove("I really loved this movie, it was amazing!")
predict_sentiment_glove("This film was boring and a waste of time.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
Text: I really loved this movie, it was amazing!
Predicted Sentiment: Negative 😞

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Text: This film was boring and a waste of time.
Predicted Sentiment: Negative 😞

