__Import necessary libraries__

In [75]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

__Load the dataset__


In [76]:
imdb_data = pd.read_csv('IMDB Dataset.csv').sample(10000)

__Use only 1000 samples for this task__

In [77]:
imdb_data = imdb_data.sample(10000, random_state=42).reset_index(drop=True)

__Preprocessing functions__

In [78]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

__remove unnecessary brackets__

In [79]:
def remove_between_square_brackets(text):
    return re.sub(r'\[[^]]*\]', '', text)

__Remove Special Characters__

In [80]:
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', text)
    return text

__ Convert to Base Case Here__

In [81]:
def simple_stemmer(text):
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

__Tokenizer and stopwords setup__

In [82]:
tokenizer = ToktokTokenizer()
stopword_list = stopwords.words('english')

__Remove Stopwards__

In [83]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

__Preprocess Function__

In [84]:
def preprocess_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    text = simple_stemmer(text)
    text = remove_stopwords(text)
    return text

__Preprocess the reviews in the dataset__


In [85]:
imdb_data['review'] = imdb_data['review'].apply(preprocess_text)

  soup = BeautifulSoup(text, "html.parser")


In [None]:
__Remove empty reviews__

In [86]:
imdb_data = imdb_data[imdb_data['review'].str.strip().astype(bool)]

__Split the dataset into training and testing sets__

In [87]:
train_reviews = imdb_data.review[:8000]  # 80% for training
train_sentiments = imdb_data.sentiment[:8000]
test_reviews = imdb_data.review[8000:]   # 20% for testing
test_sentiments = imdb_data.sentiment[8000:]

__Feature extraction using TF-IDF__

In [88]:
tfidf_vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, use_idf=True, ngram_range=(1,3))
tfidf_train_reviews = tfidf_vectorizer.fit_transform(train_reviews)
tfidf_test_reviews = tfidf_vectorizer.transform(test_reviews)

__Labeling the sentiment data__

In [89]:
label_binarizer = LabelBinarizer()
train_sentiments = label_binarizer.fit_transform(train_sentiments)
test_sentiments = label_binarizer.transform(test_sentiments)

__Train the Logistic Regression model__

In [90]:
lr_model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)
lr_model.fit(tfidf_train_reviews, train_sentiments)

  y = column_or_1d(y, warn=True)


__Function to classify user input__

In [91]:
def classify_review(review_text):
    # Preprocess the input text
    processed_text = preprocess_text(review_text)
    # Transform the input text using the trained TF-IDF vectorizer
    tfidf_review = tfidf_vectorizer.transform([processed_text])
    # Predict sentiment using the trained model
    prediction = lr_model.predict(tfidf_review)
    # Convert the prediction to sentiment label
    sentiment = label_binarizer.inverse_transform(prediction)[0]
    return sentiment

__Get input from the user__

In [94]:
user_review = input("Enter a movie review: ")

Enter a movie review:  Kali is worst,demotivating movie


__Classify the input review__

In [95]:
predicted_sentiment = classify_review(user_review)
print(f"The review is predicted to be: {predicted_sentiment}")

The review is predicted to be: negative
