In [3]:
import pandas as pd


In [4]:
file_path = r'/content/drive/MyDrive/dataset/IMDB Dataset.csv'

In [5]:
df = pd.read_csv(file_path)
print(df.head())
print(df.info())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


In [6]:
print(df.isnull().sum())  # Check for missing values


review       0
sentiment    0
dtype: int64


In [7]:
print("Columns:", df.columns)
print(df.describe())  # Summary statistics


Columns: Index(['review', 'sentiment'], dtype='object')
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


✅ Removing HTML tags

✅ Converting text to lowercase

✅ Removing special characters & punctuation

✅ Removing stopwords

✅ Tokenization & Lemmatization

In [8]:
import re  # Regular expressions for text cleaning
import nltk  # Natural Language Toolkit for text processing
from bs4 import BeautifulSoup  # HTML tag removal
from nltk.corpus import stopwords  # List of common words like 'the', 'is', etc.
from nltk.tokenize import word_tokenize  # Splits text into words
from nltk.stem import WordNetLemmatizer  # Reduces words to their root form


In [9]:
nltk.download("stopwords")  # Stopwords list
nltk.download("punkt")  # Tokenizer
nltk.download("wordnet")  # Lemmatizer dictionary
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
lemmatizer = WordNetLemmatizer()  # Converts words to base form (e.g., "running" → "run")
stop_words = set(stopwords.words("english"))  # Load a list of common stopwords


In [11]:
def preprocess_text(text):
    #text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization: Split text into words
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Apply lemmatization
    return " ".join(tokens)  # Join words back into cleaned text


In [12]:
df["clean_review"] = df["review"].apply(preprocess_text)  # Apply function to all reviews
print(df.head())  # Check cleaned data

                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                        clean_review  
0  one reviewer mentioned watching oz episode you...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically there family little boy jake think t...  
4  petter matteis love time money visually stunni...  


Feature Extraction

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limiting to 5000 important words

# Convert text data into TF-IDF vectors
X_tfidf = tfidf_vectorizer.fit_transform(df["clean_review"])

# Extract labels (target variable)
y = df["sentiment"]  # Assuming 'sentiment' is the column with labels (positive/negative)

# Save the vectorizer
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

print("Vectorizer saved as tfidf_vectorizer.pkl")

Vectorizer saved as tfidf_vectorizer.pkl


Train a Machine Learning Model

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split dataset into training & testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize and train Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8846
Classification Report:
               precision    recall  f1-score   support

    negative       0.89      0.87      0.88      4961
    positive       0.88      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



Test on New Reviews

In [1]:
import joblib

In [16]:
# Save the trained model
joblib.dump(model, "sentiment_model.pkl")
print("Model saved successfully!")

Model saved successfully!


In [15]:
new_reviews = ["This movie was fantastic and full of emotions!", "It was the worst film I have ever seen."]
new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews)

# Predict sentiment
predictions = model.predict(new_reviews_tfidf)

print(predictions)  # Output: ['positive', 'negative']


['positive' 'negative']


Try Naïve Bayes (MultinomialNB)

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Train Naïve Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_nb = nb_model.predict(X_test)
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))


Naïve Bayes Accuracy: 0.8519
Classification Report:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



Try Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8506
Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.86      0.85      4961
    positive       0.86      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



Try SVM

In [None]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


In [None]:
# Save the trained model
joblib.dump(svm_model, "sentiment_model_svm.pkl")
print("Model saved successfully!")

Tune Hyperparameters for Better Performance

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'lbfgs']
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Train model with best params
best_lr = grid_search.best_estimator_
y_pred_best = best_lr.predict(X_test)
print("Tuned Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_best))
