In [1]:
# Cell 1: Download the list of common words to remove
import nltk
nltk.download('stopwords')
print("Stopwords downloaded successfully!")

[nltk_data] Downloading package stopwords to /root/nltk_data...


Stopwords downloaded successfully!


[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Cell 2: Import all the toolboxes we need
import numpy as np
import re
from tensorflow.keras.datasets import imdb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords

print("All libraries imported!")

All libraries imported!


In [3]:
# Cell 3: Load the movie review data
top_words = 5000
print("Loading IMDB dataset... This might take a second.")
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Let's see what we got
print(f"Number of training reviews: {len(X_train)}")
print(f"Number of test reviews: {len(X_test)}")
print(f"First training review label: {y_train[0]} (1 = Positive, 0 = Negative)")

Loading IMDB dataset... This might take a second.
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Number of training reviews: 25000
Number of test reviews: 25000
First training review label: 1 (1 = Positive, 0 = Negative)


In [4]:
# Cell 4: Decode the numbers back into words
word_index = imdb.get_word_index()
reverse_word_index = {index: word for word, index in word_index.items()}

# Function to decode a review
def decode_review(encoded_review):
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review])

# Let's decode a few to see how it works
print("Decoded Review Example:")
print(decode_review(X_train[0]))
print(f"\nThis review's sentiment is: {'Positive' if y_train[0] == 1 else 'Negative'}")

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Decoded Review Example:
? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly ? was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little ? that played the ? of norman and paul they were just brilliant children are often left out 

In [5]:
# Cell 5: Take a smaller sample of 1000 reviews
num_reviews = 1000

# Decode the reviews into text
train_reviews = [decode_review(review) for review in X_train[:num_reviews]]
train_sentiments = y_train[:num_reviews]

test_reviews = [decode_review(review) for review in X_test[:num_reviews]]
test_sentiments = y_test[:num_reviews]

# Combine them into one big list
all_reviews = train_reviews + test_reviews
all_sentiments = np.concatenate([train_sentiments, test_sentiments])

print(f"Total reviews to analyze: {len(all_reviews)}")
print(f"Total sentiments/scores: {len(all_sentiments)}")

Total reviews to analyze: 2000
Total sentiments/scores: 2000


In [6]:
# Cell 6: Clean the text data (lowercase, remove junk, remove stopwords)
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # 1. Convert to lowercase
    text = text.lower()
    # 2. Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 3. Remove stopwords
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words and len(word) > 1]
    # 4. Join the words back into a single string
    return ' '.join(filtered_words)

# Let's test it on one review first!
sample_review = "This is a GREAT movie!!! It's full of awesome, fun, and exciting scenes."
print("Original Text:", sample_review)
print("\nPreprocessed Text:", preprocess_text(sample_review))

# Now, clean ALL our reviews (this will take a moment)
print("\nStarting to preprocess all reviews...")
processed_reviews = [preprocess_text(review) for review in all_reviews]
print("Preprocessing complete!")

Original Text: This is a GREAT movie!!! It's full of awesome, fun, and exciting scenes.

Preprocessed Text: great movie full awesome fun exciting scenes

Starting to preprocess all reviews...
Preprocessing complete!


In [7]:
# Cell 7: Convert text to numbers
vectorizer = CountVectorizer(max_features=3000)

# Fit the vectorizer to our text and transform the text into numbers
print("Converting text to numbers...")
X = vectorizer.fit_transform(processed_reviews).toarray()
y = all_sentiments

print(f"Shape of our numerical data: {X.shape}")
print("Text has been successfully converted to numbers!")

Converting text to numbers...
Shape of our numerical data: (2000, 3000)
Text has been successfully converted to numbers!


In [8]:
# Cell 8: Split the data and train the model
# Split the data: 80% for training, 20% for testing
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training samples: {X_train_final.shape[0]}")
print(f"Testing samples: {X_test_final.shape[0]}")

# Create and train the model
print("Training the Logistic Regression model...")
model = LogisticRegression(random_state=42, max_iter=200)
model.fit(X_train_final, y_train_final)
print("Model training complete!")

Training samples: 1600
Testing samples: 400
Training the Logistic Regression model...
Model training complete!


In [9]:
# Cell 9: See how good the model is
# Use the trained model to make predictions on the test set
y_pred = model.predict(X_test_final)

# Calculate how good the predictions are
accuracy = accuracy_score(y_test_final, y_pred)
precision = precision_score(y_test_final, y_pred)
recall = recall_score(y_test_final, y_pred)

print("\n--- How Did Our Model Do? ---")
print(f"Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")


--- How Did Our Model Do? ---
Accuracy:  0.8275 (82.75%)
Precision: 0.8434
Recall:    0.8146


In [10]:
# Cell 10: Test with your own words!
# Function to predict any new review
def predict_sentiment(new_review):
    processed_review = preprocess_text(new_review)
    vectorized_review = vectorizer.transform([processed_review])
    prediction = model.predict(vectorized_review)
    sentiment = "Positive" if prediction[0] == 1 else "Negative"
    print(f"Review: '{new_review}'")
    print(f"Predicted Sentiment: {sentiment}\n")

# Test with some examples
print("--- Testing With Custom Reviews ---")
predict_sentiment("This movie was absolutely wonderful. The story was heartwarming and the acting was brilliant.")
predict_sentiment("I hated every minute of this film. It was boring, long, and poorly acted. A complete waste of time.")

--- Testing With Custom Reviews ---
Review: 'This movie was absolutely wonderful. The story was heartwarming and the acting was brilliant.'
Predicted Sentiment: Positive

Review: 'I hated every minute of this film. It was boring, long, and poorly acted. A complete waste of time.'
Predicted Sentiment: Negative

