In [1]:
!pip install nltk



In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import pickle
import nltk
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, LSTM,Embedding, Dense, Bidirectional, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

2025-08-13 07:58:51.647199: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755071931.822777      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755071931.873321      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
movie_review = pd.read_csv("/kaggle/input/imdb-movie-reviews-dataset/train_data (1).csv",encoding='ISO-8859-1') # Load the datasets with proper encoding

In [5]:
movie_review.rename(columns={'0': 'Reviews', '1': 'Sentiment'}, inplace=True) 

In [6]:
movie_review.head()

Unnamed: 0,Reviews,Sentiment
0,"This film is absolutely awful, but nevertheles...",0
1,Well since seeing part's 1 through 3 I can hon...,0
2,I got to see this film at a preview and was da...,1
3,This adaptation positively butchers a classic ...,0
4,RÃ¥zone is an awful movie! It is so simple. It...,0


In [7]:
def clean_text(text):
    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(text, "html.parser")
    text_without_html = soup.get_text()

    cleaned_text = re.sub(r'[^\w\s]', '', text_without_html)  # Remove punctuation
    cleaned_text = re.sub(r'\d+', '', cleaned_text)  # Remove numbers

    cleaned_text = cleaned_text.lower() #Convert to lower
    return cleaned_text

In [8]:
movie_review['Reviews'] = movie_review['Reviews'].apply(clean_text)

In [9]:
movie_review.head()

Unnamed: 0,Reviews,Sentiment
0,this film is absolutely awful but nevertheless...,0
1,well since seeing parts through i can honest...,0
2,i got to see this film at a preview and was da...,1
3,this adaptation positively butchers a classic ...,0
4,rãzone is an awful movie it is so simple it se...,0


In [10]:
movie_review.isnull().sum()

Reviews      0
Sentiment    0
dtype: int64

In [11]:
movie_review["Sentiment"].value_counts()

Sentiment
0    12500
1    12500
Name: count, dtype: int64

In [12]:
lemmatizer = WordNetLemmatizer()

In [13]:
def lemmatize_content(content):
    lemmatized_content = re.sub('[^a-zA-Z]', ' ', content)
    lemmatized_content = lemmatized_content.lower().split()
    lemmatized_content = [lemmatizer.lemmatize(word) for word in lemmatized_content if word not in stopwords.words('english')]
    lemmatized_content = ' '.join(lemmatized_content)
    return lemmatized_content

In [14]:
movie_review['Reviews'] = movie_review['Reviews'].apply(lemmatize_content)

In [15]:
movie_review.head()

Unnamed: 0,Reviews,Sentiment
0,film absolutely awful nevertheless hilarious t...,0
1,well since seeing part honestly say never made...,0
2,got see film preview dazzled typical romantic ...,1
3,adaptation positively butcher classic beloved ...,0
4,r zone awful movie simple seems tried make mov...,0


In [17]:
tokenizer = Tokenizer(num_words=10000)  # Adjust the num_words as needed
tokenizer.fit_on_texts(movie_review['Reviews'])
X = tokenizer.texts_to_sequences(movie_review['Reviews'])
X = pad_sequences(X, padding='post')

y = movie_review['Sentiment'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# BiLSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128, input_length=X.shape[1]))
model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
score, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1838s[0m 3s/step - accuracy: 0.7227 - loss: 0.5328 - val_accuracy: 0.8428 - val_loss: 0.3734
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1815s[0m 3s/step - accuracy: 0.8802 - loss: 0.3094 - val_accuracy: 0.8652 - val_loss: 0.3304
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1810s[0m 3s/step - accuracy: 0.9049 - loss: 0.2483 - val_accuracy: 0.8638 - val_loss: 0.3473
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1828s[0m 3s/step - accuracy: 0.9340 - loss: 0.1861 - val_accuracy: 0.8680 - val_loss: 0.3778
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1819s[0m 3s/step - accuracy: 0.9507 - loss: 0.1421 - val_accuracy: 0.8656 - val_loss: 0.4030
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 816ms/step - accuracy: 0.8621 - loss: 0.4097
Test Accuracy: 0.8655999898910522


In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score 
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary classes

# Calculate Precision, Recall, and F1 score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Evaluate the model
score, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 783ms/step
Precision: 0.8571
Recall: 0.8737
F1 Score: 0.8653
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 816ms/step - accuracy: 0.8621 - loss: 0.4097
Test Accuracy: 0.8656
