In [1]:
import pandas as pd
df=pd.read_csv('IMDB Dataset.csv')
df['sentiment']=df['sentiment'].map(lambda x: 1 if x=='positive' else 0)

In [2]:
import re
import string

def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
    return ''.join([i for i in text if not i.isdigit()])

def remove_emojis(text):
    return ''.join([char for char in text if char.isascii()])

df['review'] = df['review'].apply(lambda x: x.lower())
df['review'] = df['review'].apply(remove_html_tags)  
df['review'] = df['review'].apply(remove_punctuation)
df['review'] = df['review'].apply(remove_emojis)
df['review'] = df['review'].apply(remove_numbers)


In [3]:
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab') 
nltk.download('stopwords')

stop_words=set(stopwords.words('english'))
def remove_stopwords(text):
    word_tokens=word_tokenize(text)
    filtered_sentence=[w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

df['review']=df['review'].apply(remove_stopwords)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Person\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Person\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Person\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review,sentiment
0,one reviewers mentioned watching oz episode yo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically theres family little boy jake thinks...,0
4,petter matteis love time money visually stunni...,1


In [4]:
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
# X_train, X_test, y_train, y_test=train_test_split(df["review"],df["sentiment"],test_size=0.2,random_state=42)

# bgw = CountVectorizer()
# X_train_cv = bgw.fit_transform(X_train)
# X_test_cv = bgw.transform(X_test)

# tfid = TfidfTransformer()
# X_train_tfid = tfid.fit_transform(X_train_cv)
# X_test_tfid = tfid.transform(X_test_cv)

# model=LogisticRegression(max_iter=1000)
# model.fit(X_train_cv,y_train)
# y_pred=model.predict(X_test_cv)
# print(f'Bag of Words Accuracy: {accuracy_score(y_test,y_pred)}')

# model.fit(X_train_tfid,y_train)
# y_pred=model.predict(X_test_tfid)
# print(f'TF-IDF Accuracy: {accuracy_score(y_test,y_pred)}')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import joblib

X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('cv', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(f"Pipeline Accuracy: {accuracy_score(y_test, y_pred)}")

joblib.dump(pipeline, "sentiment_pipeline.pkl")


Pipeline Accuracy: 0.8957104289571043


['sentiment_pipeline.pkl']

In [7]:
def predict_review(review):
    review = remove_html_tags(review)
    review = remove_punctuation(review)
    review = remove_emojis(review)
    review = remove_numbers(review)
    review = review.lower()
    review = remove_stopwords(review)
    result=pipeline.predict([review])
    return "Positive" if result[0] == 1 else "Negative"

print(predict_review("I hate this movie"))

Negative
