In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import string
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import joblib

In [2]:
# Download the stopwords dataset from NLTK
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Load the dataset
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Remove HTML tags
def remove_html_tags(text):
    return BeautifulSoup(text, "lxml").text

In [5]:
df['review'] = df['review'].apply(remove_html_tags)

  return BeautifulSoup(text, "lxml").text


In [6]:
# Remove punctuation
def remove_punctuation(sentence):
    return ''.join([letters.lower() for letters in sentence if letters not in string.punctuation])


In [7]:
df['review'] = df['review'].apply(remove_punctuation)

In [8]:
# Encode sentiment labels
df['sentiment'].replace(['positive', 'negative'], [1, 0], inplace=True)

In [9]:
# Remove stopwords
stop_words = set(nltk.corpus.stopwords.words('english'))

In [10]:
def remove_stopwords(sentence):
    return ' '.join([words for words in sentence.split() if words.lower() not in stop_words])


In [13]:
df['review'] = df['review'].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1
5,probably alltime favorite movie story selfless...,1
6,sure would like see resurrection dated seahunt...,1
7,show amazing fresh innovative idea 70s first a...,0
8,encouraged positive comments film looking forw...,0
9,like original gut wrenching laughter like movi...,1


In [14]:
# Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [15]:
def stem_sentence(sentence):
    return ' '.join([ps.stem(word) for word in sentence.split()])

In [16]:
df['review'] = df['review'].apply(stem_sentence)

In [17]:
df.head()

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product film techniqu unassum old...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [18]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(df['review'])
y = df['sentiment']

In [19]:
# Train-Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [20]:
# Train the Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(x_train, y_train)

In [21]:
# Evaluate the model
y_pred = classifier.predict(x_test)
print("Model Accuracy : {}%".format((y_pred == y_test).mean() * 100))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Model Accuracy : 85.15%
Confusion Matrix:
 [[4215  746]
 [ 739 4300]]


In [22]:
# Save the model and vectorizer
joblib.dump(classifier, 'sentiment_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [23]:
# Load the model and vectorizer for prediction
loaded_classifier = joblib.load('sentiment_classifier.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [24]:
# Function to predict sentiment of a new review
def predict_sentiment(review):
    review = BeautifulSoup(review, "lxml").text
    review = remove_punctuation(review)
    review = remove_stopwords(review)
    review = stem_sentence(review)
    review_vectorized = loaded_vectorizer.transform([review])
    prediction = loaded_classifier.predict(review_vectorized)
    return 'Positive' if prediction[0] == 1 else 'Negative'

In [25]:
# Example usage
new_review_1 = "This movie was fantastic! I loved it."
print(predict_sentiment(new_review_1))

Positive


In [26]:
new_review_2 = "The movie was terrible and boring."
print(predict_sentiment(new_review_2))

Negative
