In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [3]:
#importing the training data
imdb_data=pd.read_csv('E:/Projects/IMDB project/IMDB Dataset.csv/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


# Remove HTML tags

In [5]:
import re
from bs4 import BeautifulSoup

def remove_html_tags(text): #remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text()
    return clean_text

text = "<p>This is an <b>example</b> text with HTML tags.</p>"
clean_text = remove_html_tags(text)
print(clean_text)

This is an example text with HTML tags.


# Remove Punctuation


In [6]:
import string

def remove_punctuation(text):
    clean_text = "".join([char for char in text if char not in string.punctuation])
    return clean_text

text = "Hello! This is an example sentence."
clean_text = remove_punctuation(text)
print(clean_text)


Hello This is an example sentence


# Remove Special Characters

In [7]:
import re

def remove_special_characters(text):
    clean_text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return clean_text

text = "Hello! This is @an example #sentence."
clean_text = remove_special_characters(text)
print(clean_text)

Hello This is an example sentence


# Normalize Text


In [8]:
def normalize_text(text):
    clean_text = text.lower()
    return clean_text

text = "Hello! This is an EXAMPLE sentence."
clean_text = normalize_text(text)
print(clean_text)

hello! this is an example sentence.


# Handle Contractions

In [9]:
import contractions

def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

text = "I can't believe it!"
expanded_text = expand_contractions(text)
print(expanded_text)

I cannot believe it!


# Remove Stop Words


In [10]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    clean_tokens = [token for token in tokens if token.lower() not in stop_words]
    clean_text = " ".join(clean_tokens)
    return clean_text

text = "This is an example sentence with some stop words."
clean_text = remove_stopwords(text)
print(clean_text)


example sentence stop words.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hazem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Remove Numbers

In [11]:
def remove_numbers(text):
    clean_text = re.sub(r"\d+", "", text)
    return clean_text

text = "This is an example sentence with 123 numbers."
clean_text = remove_numbers(text)
print(clean_text)


This is an example sentence with  numbers.


# Remove Extra Whitespace

In [12]:
def remove_extra_whitespace(text):
    clean_text = " ".join(text.split())
    return clean_text

text = "   This    is   an   example   sentence.   "
clean_text = remove_extra_whitespace(text)
print(clean_text)


This is an example sentence.


In [13]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [14]:
def preprocess_text(text):

    text = remove_html_tags(text)
    text = remove_punctuation(text)
    text = remove_special_characters(text)

    return text

imdb_data['clean_text'] = imdb_data['review'].apply(preprocess_text)




In [15]:
# Display the updated DataFrame
print(imdb_data.head())

                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                          clean_text  
0  One of the other reviewers has mentioned that ...  
1  A wonderful little production The filming tech...  
2  I thought this was a wonderful way to spend ti...  
3  Basically theres a family where a little boy J...  
4  Petter Matteis Love in the Time of Money is a ...  


# Tokenize clean_text

In [16]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
def tokenize(text):
    preprocessed_text = preprocess_text(text)
    tokens = word_tokenize(preprocessed_text)
    return tokens
imdb_data['tokens'] = imdb_data['clean_text'].apply(tokenize)
print(imdb_data['tokens'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hazem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0        [One, of, the, other, reviewers, has, mentione...
1        [A, wonderful, little, production, The, filmin...
2        [I, thought, this, was, a, wonderful, way, to,...
3        [Basically, theres, a, family, where, a, littl...
4        [Petter, Matteis, Love, in, the, Time, of, Mon...
                               ...                        
49995    [I, thought, this, movie, did, a, down, right,...
49996    [Bad, plot, bad, dialogue, bad, acting, idioti...
49997    [I, am, a, Catholic, taught, in, parochial, el...
49998    [Im, going, to, have, to, disagree, with, the,...
49999    [No, one, expects, the, Star, Trek, movies, to...
Name: tokens, Length: 50000, dtype: object


# Model

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create word-to-index mapping

tokenizer = Tokenizer()
tokenizer.fit_on_texts(imdb_data['tokens'])

# Conver text to sequences
sequences = tokenizer.texts_to_sequences(imdb_data['tokens'])

#pad sequences

max_seq_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split into input and target
X = padded_sequences
labels = imdb_data['sentiment']
y = labels

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test= label_encoder.transform(y_test)
# Define the RNN model architecture
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_length = 100

In [19]:
# Step 3: Define the RNN model architecture

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(units=128))
model.add(Dense(units=1, activation='sigmoid'))

# Step 4: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [22]:
# Step 5: Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Step 6: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)
# Print validation accuracy
val_accuracy = history.history['val_accuracy']
print('Validation Accuracy:', val_accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.8511025309562683
Test Accuracy: 0.8331000208854675


NameError: name 'history' is not defined

# Predict new reviews

In [37]:
# Preprocess the new text
new_text = "I had an amazing experience at this hotel. The staff was friendly and welcoming, always ready to assist with a smile. The room was spacious, clean, and beautifully decorated. The bed was incredibly comfortable, providing a great night's sleep. The hotel amenities were top-notch, including a refreshing swimming pool and a well-equipped fitness center. The location was perfect, with easy access to popular attractions and dining options. The breakfast buffet offered a wide variety of delicious options. I highly recommend this hotel to anyone looking for a fantastic stay. I can't wait to visit again"
preprocessed_text = preprocess_text(new_text)

# Tokenize the preprocessed text
tokens = tokenizer.texts_to_sequences([preprocessed_text])
print(preprocessed_text)
# Pad the tokenized sequence
padded_sequence = pad_sequences(tokens, maxlen=max_length)

# Make predictions
predictions = model.predict(padded_sequence)

# Interpret the predictions
sentiment = "positive" if predictions[0] > 0.5 else "negative"
confidence = predictions[0] if predictions[0] > 0.5 else 1 - predictions[0]

# Print the result
print("Predicted sentiment:", sentiment)
print("Confidence:", confidence)

I had an amazing experience at this hotel The staff was friendly and welcoming always ready to assist with a smile The room was spacious clean and beautifully decorated The bed was incredibly comfortable providing a great nights sleep The hotel amenities were topnotch including a refreshing swimming pool and a wellequipped fitness center The location was perfect with easy access to popular attractions and dining options The breakfast buffet offered a wide variety of delicious options I highly recommend this hotel to anyone looking for a fantastic stay I cant wait to visit again
Predicted sentiment: positive
Confidence: [0.99972165]
