In [3]:
from __future__ import annotations
"""
This file is intended for training a logistic regression module to catch
the sentiments of comments
"""

'\nThis file is intended for training a logistic regression module to catch\nthe sentiments of comments\n'

In [6]:
## Import Modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib # Used for saving and loading the model

In [7]:
## Read Data
with open("IMDB Dataset.csv", "r", encoding="utf-8") as data:
    reviews: DataFrame = pd.read_csv(data)
    print(reviews)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [15]:
## Process Data

# Define features (X)
X: Series = reviews['review']

# One-Hot Encode the 'sentiment' column
# This creates two new columns: 'sentiment_positive' and 'sentiment_negative'
# The values is stored as bool for memory efficiency
sentiment_dummies: DataFrame = pd.get_dummies(reviews['sentiment'], prefix='sentiment')
print(sentiment_dummies, end="\n\n")

# For binary classification (like Logistic Regression), we only need one of the new columns.
# We will select 'sentiment_positive' as our target (y),
# where 1 means positive and 0 means negative (the value of the 'sentiment_negative' column).
y: Series = sentiment_dummies['sentiment_positive']

# Create a pipeline using CountVectorizer for Bag-of-Words:
model_pipeline = Pipeline([
    # Step 1: CountVectorizer implements the traditional Bag-of-Words model
    ('bow', CountVectorizer(stop_words='english')), 
    # Step 2: LogisticRegression classifier
    ('clf', LogisticRegression(solver='liblinear', random_state=42))
])

print(f"Features (X) selected: {X.name}")
print(f"Target (y) selected: {y.name} (1=Positive, 0=Negative)")

       sentiment_negative  sentiment_positive
0                   False                True
1                   False                True
2                   False                True
3                    True               False
4                   False                True
...                   ...                 ...
49995               False                True
49996                True               False
49997                True               False
49998                True               False
49999                True               False

[50000 rows x 2 columns]

Features (X) selected: review
Target (y) selected: sentiment_positive (1=Positive, 0=Negative)


In [16]:
## Split Data
# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining on {len(X_train)} samples, testing on {len(X_test)} samples.")


Training on 40000 samples, testing on 10000 samples.


In [17]:
## Train Model
print("\nTraining the Logistic Regression model using CountVectorizer...")
model_pipeline.fit(X_train, y_train)
print("Training complete.")


Training the Logistic Regression model using CountVectorizer...
Training complete.


In [18]:
## Test Model
# Predict the sentiments on the test set
y_pred = model_pipeline.predict(X_test)
print(y_pred)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")

[False False  True ... False  True  True]

Model Accuracy on Test Set: 0.8884


In [20]:
## Save Model
# Save the entire pipeline (vectorizer + classifier) to a file
model_filename = 'logistic_regression_bow_sentiment_model.joblib'
joblib.dump(model_pipeline, model_filename)
print(f"Model saved successfully as '{model_filename}'")

Model saved successfully as 'logistic_regression_bow_sentiment_model.joblib'


In [22]:
## Use Model
## Import necessary modules
import joblib
import pandas as pd # Optional, for organized output

def load_and_predict_sentiment(model_filename, new_reviews):
    """
    Loads the saved model pipeline and uses it to predict sentiments 
    for a list of new text reviews.
    """
    try:
        # Load the entire pipeline object (Vectorize + Classifier)
        loaded_model = joblib.load(model_filename)
        print(f"Successfully loaded model from '{model_filename}'")
        
    except FileNotFoundError:
        print(f"Error: Model file '{model_filename}' not found.")
        print("Please ensure the training script was run successfully.")
        return

    # Use the loaded pipeline to make predictions
    # The pipeline handles the CountVectorizer transformation automatically
    predictions = loaded_model.predict(new_reviews)
    
    # Map the numeric predictions back to text sentiment for readability
    sentiment_map = {1: 'positive', 0: 'negative'}
    predicted_sentiments = [sentiment_map[p] for p in predictions]

    print("\n--- Sentiment Prediction Results ---")
    for review, sentiment in zip(new_reviews, predicted_sentiments):
        print(f"Review: '{review}'\n-> Predicted Sentiment: **{sentiment}**\n")

# --- EXECUTION ---

# 1. Define the filename of the model you saved
MODEL_FILE = 'logistic_regression_bow_sentiment_model.joblib'

# 2. Define the new data you want to predict (a simple Python list of strings)
new_comments_to_test = [
    "The acting was phenomenal, and the plot was engaging. A true masterpiece!", 
    "I'm deeply disappointed; it was a waste of two hours of my life.",
    "The visual effects were great, but the story was weak.", # Mixed/Neutral, but will force a binary prediction
    "Simply the best cinema experience of the year.",
    "This is not a good movie" # simple negation
]

# 3. Call the function to load and predict
load_and_predict_sentiment(MODEL_FILE, new_comments_to_test)

Successfully loaded model from 'logistic_regression_bow_sentiment_model.joblib'

--- Sentiment Prediction Results ---
Review: 'The acting was phenomenal, and the plot was engaging. A true masterpiece!'
-> Predicted Sentiment: **positive**

Review: 'I'm deeply disappointed; it was a waste of two hours of my life.'
-> Predicted Sentiment: **negative**

Review: 'The visual effects were great, but the story was weak.'
-> Predicted Sentiment: **negative**

Review: 'Simply the best cinema experience of the year.'
-> Predicted Sentiment: **positive**

Review: 'This is not a good movie'
-> Predicted Sentiment: **positive**

