In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [5]:
#importing the training data
imdb_data=pd.read_csv('E:/Projects/IMDB project/IMDB Dataset.csv/IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [6]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


# Remove HTML tags

In [7]:
import re
from bs4 import BeautifulSoup

def remove_html_tags(text): #remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    clean_text = soup.get_text()
    return clean_text

text = "<p>This is an <b>example</b> text with HTML tags.</p>"
clean_text = remove_html_tags(text)
print(clean_text)

This is an example text with HTML tags.


# Remove Punctuation


In [8]:
import string

def remove_punctuation(text):
    clean_text = "".join([char for char in text if char not in string.punctuation])
    return clean_text

text = "Hello! This is an example sentence."
clean_text = remove_punctuation(text)
print(clean_text)


Hello This is an example sentence


# Remove Special Characters

In [9]:
import re

def remove_special_characters(text):
    clean_text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return clean_text

text = "Hello! This is @an example #sentence."
clean_text = remove_special_characters(text)
print(clean_text)

Hello This is an example sentence


# Normalize Text


In [10]:
def normalize_text(text):
    clean_text = text.lower()
    return clean_text

text = "Hello! This is an EXAMPLE sentence."
clean_text = normalize_text(text)
print(clean_text)

hello! this is an example sentence.


# Handle Contractions

In [11]:
import contractions

def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

text = "I can't believe it!"
expanded_text = expand_contractions(text)
print(expanded_text)

I cannot believe it!


# Remove Stop Words


In [12]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    clean_tokens = [token for token in tokens if token.lower() not in stop_words]
    clean_text = " ".join(clean_tokens)
    return clean_text

text = "This is an example sentence with some stop words."
clean_text = remove_stopwords(text)
print(clean_text)


example sentence stop words.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hazem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Remove Numbers

In [13]:
def remove_numbers(text):
    clean_text = re.sub(r"\d+", "", text)
    return clean_text

text = "This is an example sentence with 123 numbers."
clean_text = remove_numbers(text)
print(clean_text)


This is an example sentence with  numbers.


# Remove Extra Whitespace

In [14]:
def remove_extra_whitespace(text):
    clean_text = " ".join(text.split())
    return clean_text

text = "   This    is   an   example   sentence.   "
clean_text = remove_extra_whitespace(text)
print(clean_text)


This is an example sentence.


In [15]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [16]:
def preprocess_text(text):

    text = remove_html_tags(text)
    text = remove_punctuation(text)
    text = remove_special_characters(text)
    text = normalize_text(text)
    text = expand_contractions(text)
    text = remove_stopwords(text)
    text = remove_numbers(text)
    text = remove_extra_whitespace(text)

    return text

imdb_data['clean_text'] = imdb_data['review'].apply(preprocess_text)


  soup = BeautifulSoup(text, "html.parser")


In [17]:
# Display the updated DataFrame
print(imdb_data.head())

                                              review sentiment  \
0  One of the other reviewers has mentioned that ...  positive   
1  A wonderful little production. <br /><br />The...  positive   
2  I thought this was a wonderful way to spend ti...  positive   
3  Basically there's a family where a little boy ...  negative   
4  Petter Mattei's "Love in the Time of Money" is...  positive   

                                          clean_text  
0  one reviewers mentioned watching oz episode ho...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically family little boy jake thinks zombie...  
4  petter matteis love time money visually stunni...  


# Tokenize clean_text

In [18]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
def tokenize(text):
    preprocessed_text = preprocess_text(text)
    tokens = word_tokenize(preprocessed_text)
    return tokens
imdb_data['tokens'] = imdb_data['clean_text'].apply(tokenize)
print(imdb_data['tokens'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hazem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0        [one, reviewers, mentioned, watching, oz, epis...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, creative, o...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [going, disagree, previous, comment, side, mal...
49999    [one, expects, star, trek, movies, high, art, ...
Name: tokens, Length: 50000, dtype: object


# Model

In [28]:


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create word-to-index mapping

tokenizer = Tokenizer()
tokenizer.fit_on_texts(imdb_data['tokens'])

# Conver text to sequences
sequences = tokenizer.texts_to_sequences(imdb_data['tokens'])

#pad sequences

max_seq_length = 100
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length)

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split into input and target
X = padded_sequences
labels = imdb_data['sentiment']
y = labels

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test= label_encoder.transform(y_test)
# Define the RNN model architecture
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
max_length = 100

In [21]:
# Step 3: Define the RNN model architecture

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(units=128))
model.add(Dense(units=1, activation='sigmoid'))

# Step 4: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [22]:
# Step 5: Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Step 6: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.7721770405769348
Test Accuracy: 0.8600000143051147


# Predict new reviews

In [27]:
# Preprocess the new text
new_text = "I am extremely satisfied with this product. It has exceeded my expectations in every aspect. The build quality is superb, and it functions flawlessly. The customer service provided by the company was exceptional, with quick response times and a friendly approach. The item was delivered on time and in perfect condition. It has made my life so much easier and more enjoyable. I highly recommend this product to anyone in need of a reliable and efficient solution."
preprocessed_text = preprocess_text(new_text)

# Tokenize the preprocessed text
tokens = tokenizer.texts_to_sequences([preprocessed_text])
print(preprocessed_text)
# Pad the tokenized sequence
padded_sequence = pad_sequences(tokens, maxlen=max_length)

# Make predictions
predictions = model.predict(padded_sequence)

# Interpret the predictions
sentiment = "positive" if predictions[0] > 0.5 else "negative"
confidence = predictions[0] if predictions[0] > 0.5 else 1 - predictions[0]

# Print the result
print("Predicted sentiment:", sentiment)
print("Confidence:", confidence)

extremely satisfied product exceeded expectations every aspect build quality superb functions flawlessly customer service provided company exceptional quick response times friendly approach item delivered time perfect condition made life much easier enjoyable highly recommend product anyone need reliable efficient solution
Predicted sentiment: positive
Confidence: [0.9999602]
