In [1]:
# Step 1: Import the pandas library, our tool for working with data tables
import pandas as pd

# Step 2: Point at the CSV holding the raw reviews (resolved against the current working directory)
file_path = 'IMDB Dataset.csv'

# Step 3: Parse the CSV into a pandas DataFrame, an in-memory labelled table
df = pd.read_csv(file_path)

# Step 4: Confirm the load, then preview the first 5 rows as the cell's output
print("Data loaded successfully!")
df.head(5)

Data loaded successfully!


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
from sklearn.model_selection import train_test_split

# Let's see how many positive vs. negative reviews we have
print("Value counts for sentiment:")
print(df['sentiment'].value_counts())
print("-" * 30) # A separator line

# Convert 'positive'/'negative' labels into 1/0
df['sentiment_numeric'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map leaves NaN for every value that is not a key of the dict (including
# missing labels). Fail fast here with a clear message instead of letting NaN
# targets flow into the split and the model.
unmapped_rows = df['sentiment_numeric'].isna()
if unmapped_rows.any():
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have a sentiment label other than "
        f"'positive'/'negative': {df.loc[unmapped_rows, 'sentiment'].unique().tolist()}"
    )

# --- Splitting the Data ---

# Define our features (X) and our target (y)
# X is the data we use to make a prediction (the reviews)
# y is what we want to predict (the 0 or 1 sentiment)
X = df['review']
y = df['sentiment_numeric']

# Split the data into training and testing sets
# We'll use 80% for training and 20% for testing.
# random_state ensures we get the same split every time we run the code.
# NOTE(review): the split is not stratified, so the class ratio in the test set
# can drift slightly from 50/50; consider stratify=y if exact balance matters.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training data size: {len(X_train)} reviews")
print(f"Testing data size: {len(X_test)} reviews")

Value counts for sentiment:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
------------------------------
Training data size: 40000 reviews
Testing data size: 10000 reviews


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer.
# - stop_words='english' drops scikit-learn's built-in list of common English words.
# - max_features=5000 caps the vocabulary at the 5000 terms with the highest
#   term frequency across the fitted corpus, which bounds the matrix width.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit on the training reviews only (vocabulary + IDF weights) and encode them
X_train_vectorized = vectorizer.fit_transform(X_train)

# Encode the test reviews with the already-fitted vocabulary; refitting here
# would let information from the test set leak into the features
X_test_vectorized = vectorizer.transform(X_test)

# Report the resulting (n_reviews, n_terms) matrix shapes
print("Text has been vectorized successfully!")
print(f"Shape of the training data matrix: {X_train_vectorized.shape}")
print(f"Shape of the testing data matrix: {X_test_vectorized.shape}")

Text has been vectorized successfully!
Shape of the training data matrix: (40000, 5000)
Shape of the testing data matrix: (10000, 5000)


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Instantiate a multinomial Naive Bayes classifier and fit it on the
# vectorized training reviews. fit() returns the estimator itself, so building
# and training are chained into one expression.
model = MultinomialNB().fit(X_train_vectorized, y_train)

print("Model training complete!")
print("-" * 30)

# Step 2: Predict labels for the held-out test reviews
y_pred = model.predict(X_test_vectorized)

# Step 3: Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\n--- Classification Report ---")
# Per-class precision/recall/F1; target_names follow sorted label order
# (0 -> 'Negative', 1 -> 'Positive').
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

Model training complete!
------------------------------
Model Accuracy: 85.09%

--- Classification Report ---
              precision    recall  f1-score   support

    Negative       0.85      0.85      0.85      4961
    Positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [5]:
def predict_sentiment(text):
    """
    Classify a single review string with the notebook's fitted vectorizer and model.

    The text is wrapped in a one-element list, encoded with the module-level
    `vectorizer`, and passed to the module-level `model`. Returns "Positive"
    when the predicted label equals 1 and "Negative" for anything else.
    """
    # Encode with the fitted vectorizer so the columns line up with what the model was trained on
    features = vectorizer.transform([text])

    # One input row goes in, so only the first predicted label is relevant
    predicted_label = model.predict(features)[0]

    return "Positive" if predicted_label == 1 else "Negative"

# --- Try the classifier on two hand-written reviews ---
my_review_1 = "This movie was absolutely fantastic, the acting was superb!"
my_review_2 = "It was a complete waste of time, the plot was boring and predictable."

# The first result is followed by a blank line; the last ends with a single newline
for sample_review, line_end in ((my_review_1, "\n\n"), (my_review_2, "\n")):
    print(f"Review: '{sample_review}'")
    print(f"Predicted Sentiment: {predict_sentiment(sample_review)}", end=line_end)

Review: 'This movie was absolutely fantastic, the acting was superb!'
Predicted Sentiment: Positive

Review: 'It was a complete waste of time, the plot was boring and predictable.'
Predicted Sentiment: Negative


In [6]:
import joblib

# Persist the fitted vectorizer and the trained model side by side; both are
# needed together to score new text later.
# NOTE: joblib files are pickle-based, so only load them back from trusted sources.
for artifact, filename in (
    (vectorizer, 'tfidf_vectorizer.pkl'),
    (model, 'sentiment_model.pkl'),
):
    joblib.dump(artifact, filename)

print("Vectorizer and model saved successfully to files!")

Vectorizer and model saved successfully to files!
