## Install Required Libraries

In [7]:
! pip install pandas numpy scikit-learn nltk





# Import Required Libraries

In [None]:
import os
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Download NLTK Stopwords and Punkt Tokenizer Data

In [4]:
# Resources needed by the preprocessing step:
#   - stopwords: word list used to filter common English words
#   - punkt / punkt_tab: tokenizer data used by word_tokenize
#     (newer NLTK releases look up punkt_tab, older ones punkt, so fetch both)
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gokulmurugan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gokulmurugan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Dataset

In [8]:
# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment
# variable to point at your own copy (local path or URL); when it is unset the
# original hard-coded location is used.
DATASET_PATH = os.environ.get(
    "IMDB_DATASET_PATH",
    "file:///Users/gokulmurugan/Downloads/IMDB%20Dataset.csv",
)
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Text Preprocessing

In [9]:
_ENGLISH_STOPWORDS = None  # filled in lazily by _english_stopwords()


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # A frozenset gives constant-time membership tests; the original code
        # re-read the word list and scanned it linearly for every token.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Normalize a raw review into a space-separated string of kept tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize with ``word_tokenize``, and drop tokens
    that appear in NLTK's English stop-word list.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (``""`` if none remain).
    """
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    stop_words = _english_stopwords()
    kept_tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(kept_tokens)

# Clean every review and keep the result next to the raw text.
cleaned_reviews = df["review"].map(preprocess_text)
df["clean_review"] = cleaned_reviews
df.head()


Unnamed: 0,review,sentiment,clean_review
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...


# Convert Sentiments to Numerical Values

In [10]:
# Fit the encoder on the sentiment labels, then overwrite the column with the
# resulting integer codes and show how many rows fall in each class.
label_encoder = LabelEncoder()
label_encoder.fit(df["sentiment"])
df["sentiment"] = label_encoder.transform(df["sentiment"])
print(df["sentiment"].value_counts())


1    25000
0    25000
Name: sentiment, dtype: int64


# Split Data into Training & Testing Sets

In [11]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_review"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)
print(f"Training samples: {len(X_train)} | Testing samples: {len(X_test)}")


Training samples: 40000 | Testing samples: 10000


# Convert Text into Numerical Vectors (TF-IDF)

In [12]:
# Cap on the number of vocabulary terms kept by the vectorizer.
MAX_TFIDF_FEATURES = 5000

# Learn the vocabulary on the training reviews only, then reuse that fitted
# vectorizer to transform the held-out reviews.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("TF-IDF conversion complete!")


TF-IDF conversion complete!


# Train and Evaluate the Machine Learning Model

In [13]:
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)
model

In [14]:
# Score the trained model on the held-out reviews.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.89


# Test with Custom Reviews

In [15]:
def predict_sentiment(clean_review):
    """Classify one already-preprocessed review.

    The review is vectorized with the notebook-level ``vectorizer`` and passed
    to the notebook-level ``model``.

    Args:
        clean_review: Review text that has already been cleaned by the caller.

    Returns:
        ``"Positive"`` when the predicted label equals 1, otherwise ``"Negative"``.
    """
    features = vectorizer.transform([clean_review])
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "Positive"
    return "Negative"


In [16]:
# The classifier expects cleaned text, so run the same preprocessing first.
raw_review = "I really loved this movie! The story was fantastic."
test_review = preprocess_text(raw_review)
predicted = predict_sentiment(test_review)
print(f"Predicted Sentiment: {predicted}")


Predicted Sentiment: Positive


In [25]:
# The classifier expects cleaned text, so run the same preprocessing first.
raw_review = "I'm not like ,it is average movie"
test_review = preprocess_text(raw_review)
predicted = predict_sentiment(test_review)
print(f"Predicted Sentiment: {predicted}")


Predicted Sentiment: Negative


In [26]:
# Clean the review, then print the predicted label.
negative_example = preprocess_text("The movie was absolutely terrible! I hated it.")
print(predict_sentiment(negative_example))



Negative


In [27]:
# Clean the review, then print the predicted label.
positive_example = preprocess_text("One of the best films I have ever watched!")
print(predict_sentiment(positive_example))

Positive
