In [1]:
# --- Step 1: Import Libraries and Load Data ---

# 'pandas' is a powerful library for data manipulation and analysis. We use it to load and handle our dataset.
# We give it the shorter alias 'pd' by convention.
import pandas as pd

# 're' is Python's built-in library for regular expressions. We'll use it for cleaning text data.
import re

# 'nltk' (Natural Language Toolkit) is a leading platform for building Python programs to work with human language data.
import nltk

# We need to download a specific resource from NLTK: 'stopwords'.
# Stopwords are common words (like "the", "a", "in") that often don't carry significant meaning and can be removed.
nltk.download('stopwords')
from nltk.corpus import stopwords

# --- Load the Dataset ---
# read_csv parses the uploaded CSV file into a pandas DataFrame,
# i.e. an in-memory table similar to a spreadsheet.
df = pd.read_csv('IMDB Dataset.csv')

# --- Initial Exploration ---
# Peek at the first 5 rows to understand the layout. Two columns are expected:
# 'review' (the text of the movie review) and 'sentiment' (positive/negative).
print("--- First 5 rows of the dataset: ---", df.head(), sep="\n")

# Basic facts about the frame. .info() writes the number of entries, the column
# names and the data types to stdout by itself, so it is not wrapped in print().
print("\n--- Dataset Information: ---")
df.info()

# How many reviews carry each label? A balanced dataset is good for training.
print("\n--- Sentiment Distribution: ---", df['sentiment'].value_counts(), sep="\n")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


--- First 5 rows of the dataset: ---
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

--- Dataset Information: ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

--- Sentiment Distribution: ---
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [2]:
# --- Step 2: Preprocess the Text Data ---

# Collect NLTK's English stop words into a set so that the membership checks
# performed while cleaning each review are cheap.
# NOTE(review): this list presumably contains negations such as "not"; dropping
# them can change the meaning of a review -- confirm this is intended.
stop_words = {*stopwords.words('english')}

def preprocess_text(text, stopword_set=None):
    """
    Clean one raw review string so it can be vectorized.

    Cleaning steps, in the order they are applied:
    1. Replace HTML tags with a space.
    2. Replace every character that is not an ASCII letter with a space.
    3. Convert the text to lowercase.
    4. Split on whitespace and drop stop words.

    Args:
        text: The raw review text (a string).
        stopword_set: Optional collection of lowercase words to remove. When it
            is omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces. The result is an empty
        string when nothing survives the cleaning.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # 1. Strip HTML tags. '<.*?>' lazily matches any sequence in angle brackets.
    # The tag is replaced by a space, not an empty string: otherwise two words
    # separated only by a tag ("good<br />movie") would be fused into one token.
    text = re.sub(r'<.*?>', ' ', text)

    # 2-3. '[^a-zA-Z]' matches any character that is NOT a letter; each one becomes
    # a space so neighbouring words stay separated. Then lowercase everything so
    # the comparison against the (lowercase) stop words works.
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # 4. .split() with no argument splits on any run of whitespace, which also
    # collapses the extra spaces introduced above. Keep only non-stop-words and
    # join them back into a single string.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Run the cleaning function over every entry of the 'review' column.
# Series.map calls the function once per element; the results are stored in a
# new 'cleaned_review' column so the raw text stays available.
print("\n--- Cleaning the text data... (This may take a moment) ---")
df['cleaned_review'] = df['review'].map(preprocess_text)
print("--- Cleaning complete! ---")

# Show the first review (row label 0) before and after cleaning.
print("\n--- Example of Cleaning ---")
print("Original Review:", df.loc[0, 'review'])
print("\nCleaned Review:", df.loc[0, 'cleaned_review'])


--- Cleaning the text data... (This may take a moment) ---
--- Cleaning complete! ---

--- Example of Cleaning ---
Original Review: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death

In [3]:
# --- Step 3: Vectorize the Text using TF-IDF ---

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# TfidfVectorizer will convert our collection of text documents into a matrix of TF-IDF features.
# max_features=5000 means we'll only consider the top 5000 most frequent words to build our vocabulary.
# This helps save memory and can prevent our model from overfitting on rare words.
vectorizer = TfidfVectorizer(max_features=5000)

# We define our features (X) and our target (y).
# X is the numerical representation of our cleaned reviews.
# y is the sentiment label we want to predict.
#
# fit_transform returns a SciPy sparse matrix and we deliberately keep it sparse:
# almost every entry is zero, and a dense 50000 x 5000 float64 array would need
# roughly 2 GB of RAM. The later steps of this notebook (train_test_split and
# LogisticRegression) accept sparse input directly.
#
# NOTE(review): the vectorizer is fitted on ALL reviews here, before the
# train/test split in Step 4, so the vocabulary and IDF weights have seen the
# test reviews. For a strictly held-out evaluation, split the cleaned text first
# and fit the vectorizer on the training portion only.
X = vectorizer.fit_transform(df['cleaned_review'])
y = df['sentiment']

# Let's check the shape of our feature matrix X.
# It should be (50000, 5000) -> 50000 reviews, 5000 features (words) each.
print("\n--- Shape of TF-IDF feature matrix (X): ---")
print(X.shape)

# Our target 'y' is currently text ('positive', 'negative'). Models need numerical labels.
# LabelEncoder sorts the class names, so 'negative' becomes 0 and 'positive' becomes 1.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Let's see the mapping: 0 for negative, 1 for positive.
print("\n--- Sentiment Label Mapping: ---")
print(f"'{label_encoder.classes_[0]}' is encoded as 0")
print(f"'{label_encoder.classes_[1]}' is encoded as 1")


--- Shape of TF-IDF feature matrix (X): ---
(50000, 5000)

--- Sentiment Label Mapping: ---
'negative' is encoded as 0
'positive' is encoded as 1


In [4]:
# --- Step 4: Split Data and Train the Model ---

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing and keep 80% for training.
#   X_train / y_train -> what the model learns from
#   X_test  / y_test  -> what we later use to check how well it learned
# Fixing random_state makes the split identical on every run (reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Create a Logistic Regression model with default settings and train it.
# .fit() is the "learning" step: the model learns how the TF-IDF features
# (X_train) relate to the sentiment labels (y_train). It returns the fitted
# estimator itself, so construction and training are chained in one expression.
print("\n--- Training the Logistic Regression model... ---")
model = LogisticRegression().fit(X_train, y_train)
print("--- Model training complete! ---")


--- Training the Logistic Regression model... ---
--- Model training complete! ---


In [5]:
# --- Step 5: Evaluate the Model ---

from sklearn.metrics import accuracy_score, classification_report

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Accuracy = fraction of test reviews whose predicted label matches the true one.
accuracy = accuracy_score(y_test, y_pred)
print(f"\n--- Model Accuracy: {accuracy * 100:.2f}% ---")

# Per-class breakdown:
#   Precision - of all the reviews predicted as a class, how many really belong to it?
#   Recall    - of all the reviews that really belong to a class, how many did we find?
#   F1-Score  - a balanced measure of precision and recall.
# target_names maps the encoded labels back to their original class names.
print(
    "\n--- Classification Report: ---",
    classification_report(y_test, y_pred, target_names=label_encoder.classes_),
    sep="\n",
)


--- Model Accuracy: 89.22% ---

--- Classification Report: ---
              precision    recall  f1-score   support

    negative       0.90      0.88      0.89      4961
    positive       0.88      0.91      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [13]:
# --- Step 6: Use the Model for Prediction on New Data ---

# A few hand-written reviews to try the trained model on.
new_reviews = [
    "This movie is good for blind people",
    "What a waste of time. The plot was predictable and the acting was terrible. I would not recommend it.",
    "The film was okay, not great but not bad either. Some interesting scenes."
]

print("\n--- Predicting sentiment for new reviews: ---")

# 1. Push every new review through the same cleaning function used for training.
cleaned_new_reviews = list(map(preprocess_text, new_reviews))

# 2. Turn the cleaned text into TF-IDF features.
# IMPORTANT: call .transform(), NOT .fit_transform(). 'fit' would learn a new
# vocabulary; 'transform' reuses the one the model was trained on.
new_reviews_tfidf = vectorizer.transform(cleaned_new_reviews)

# 3. Predict the encoded label (0 or 1) for each review.
new_predictions = model.predict(new_reviews_tfidf)

# 4. Map the encoded labels back to their text form ('negative' or 'positive').
new_predictions_labels = label_encoder.inverse_transform(new_predictions)

# Show each review next to its predicted sentiment.
for idx, review in enumerate(new_reviews):
    sentiment = new_predictions_labels[idx]
    print(f"\nReview: '{review}'")
    print(f"Predicted Sentiment: {sentiment.upper()}")


--- Predicting sentiment for new reviews: ---

Review: 'This movie is good for blind people'
Predicted Sentiment: POSITIVE

Review: 'What a waste of time. The plot was predictable and the acting was terrible. I would not recommend it.'
Predicted Sentiment: NEGATIVE

Review: 'The film was okay, not great but not bad either. Some interesting scenes.'
Predicted Sentiment: NEGATIVE
