Step 1: Text Preprocessing

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk

nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv('./IMDB_dataset.csv')

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Define a function to clean the text
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    # Tokenize text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in ENGLISH_STOP_WORDS]
    # Apply stemming and lemmatization
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]
    # Join tokens back into a single string
    return ' '.join(tokens)

# Apply the cleaning function to the review column
df['review'] = df['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naras\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naras\AppData\Roaming\nltk_data...


Step 2: Tokenization and Padding

In [5]:
# Initialize tokenizer with out-of-vocabulary token
tokenizer = Tokenizer(oov_token='<OOV>')
# Fit tokenizer on the cleaned reviews
tokenizer.fit_on_texts(df['review'])

# Convert reviews to sequences
sequences = tokenizer.texts_to_sequences(df['review'])

# Pad the sequences
maxlen = 100  # Define the maximum length of sequences
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')


# Define RNN model

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)

model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=maxlen),
    LSTM(64, return_sequences=False), 
    Dropout(0.5),  
    Dense(1, activation='sigmoid') 
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))




Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x20942f8bc10>

#Step 4: Train and Evaluate the Model

In [7]:

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.47803452610969543, Test Accuracy: 0.8686000108718872


Save the Model

In [9]:
model.save('trained_model.h5')