<a href="https://colab.research.google.com/github/Neeshi14/Sentiment-analysis/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# import the dataset
df = pd.read_csv('/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [6]:
# replace the value
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [8]:
print(df.isnull().sum()) # check the null value

review       0
sentiment    0
dtype: int64


In [9]:
# split the dataset
from sklearn.model_selection import train_test_split

X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
import nltk
nltk.download('stopwords')
from nltk.tokenize.toktok import ToktokTokenizer
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')
stopword_list

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [11]:
from bs4 import BeautifulSoup
import re
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
df['review']=df['review'].apply(denoise_text)
df['review']=df['review'].apply(strip_html)
df['review']=df['review'].apply(remove_between_square_brackets)
df['review']

Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...
49995,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...


In [12]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
df['review']=df['review'].apply(remove_special_characters)

In [13]:
import nltk
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text
#Apply function on review column
df['review']=df['review'].apply(simple_stemmer)
df['review']

Unnamed: 0,review
0,one of the other review ha mention that after ...
1,a wonder littl product the film techniqu is ve...
2,i thought thi wa a wonder way to spend time on...
3,basic there a famili where a littl boy jake th...
4,petter mattei love in the time of money is a v...
...,...
49995,i thought thi movi did a down right good job i...
49996,bad plot bad dialogu bad act idiot direct the ...
49997,i am a cathol taught in parochi elementari sch...
49998,im go to have to disagre with the previou comm...


In [14]:
from nltk.corpus import stopwords

In [15]:
from nltk.tokenize import word_tokenize

#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

df['review'] = df['review'].apply(remove_stopwords)
df['review']

{'me', 'more', 's', 'until', "we'd", 'each', 'above', "he'll", 'itself', 'a', "it's", "they'll", 'you', 'below', 'few', 'haven', 'ourselves', 'because', 'and', 'no', "isn't", 'does', 'it', "should've", 't', 'very', "she'll", 'all', 'theirs', 'yourself', 'through', "you're", 'during', 'having', 'his', 'hers', 'out', 'hadn', "shouldn't", 'd', 'their', 'them', 'of', 'whom', "haven't", 'up', 'didn', "you've", 'from', 'own', 'can', 'was', 'wasn', 'in', 'hasn', "doesn't", 'this', 'needn', 'herself', 're', 'i', "mustn't", "aren't", 'he', "shan't", 'which', 'with', 'to', 'where', 'for', 'doesn', 'same', "won't", 'be', "i'd", 'by', "don't", 'isn', 'we', "i'm", 'o', 'both', 'that', 'yourselves', "he'd", "he's", 'once', 'wouldn', "she'd", "it'll", 'other', 'were', 'shouldn', 'themselves', 'too', 'then', 'do', "mightn't", "wouldn't", 'further', "i've", 'any', "it'd", 'between', 'weren', 'm', 'off', 'but', 'only', 'will', "you'd", "couldn't", 'most', 'my', "we'll", 'as', 'before', 'some', 'again', 

Unnamed: 0,review
0,one review ha mention watch 1 oz episod youll ...
1,wonder littl product film techniqu veri unassu...
2,thought thi wa wonder way spend time hot summe...
3,basic famili littl boy jake think zombi hi clo...
4,petter mattei love time money visual stun film...
...,...
49995,thought thi movi right good job wasnt creativ ...
49996,bad plot bad dialogu bad act idiot direct anno...
49997,cathol taught parochi elementari school nun ta...
49998,im go disagre previou comment side maltin thi ...


In [16]:
from collections import Counter

word_counts = Counter()
for text in df['review']:
    word_counts.update(text.split())

vocabulary = {word: index for index, (word, count) in enumerate(word_counts.most_common(), start=1)}
vocabulary

{'thi': 1,
 'movi': 2,
 'wa': 3,
 'film': 4,
 'hi': 5,
 'one': 6,
 'like': 7,
 'ha': 8,
 'time': 9,
 'good': 10,
 'make': 11,
 'get': 12,
 'see': 13,
 'charact': 14,
 'veri': 15,
 'watch': 16,
 'even': 17,
 'stori': 18,
 'would': 19,
 'onli': 20,
 'realli': 21,
 'scene': 22,
 'show': 23,
 'look': 24,
 'well': 25,
 'much': 26,
 'end': 27,
 'peopl': 28,
 'great': 29,
 'bad': 30,
 'go': 31,
 'love': 32,
 'becaus': 33,
 'also': 34,
 'think': 35,
 'play': 36,
 'first': 37,
 'act': 38,
 'dont': 39,
 'way': 40,
 'thing': 41,
 'made': 42,
 'could': 43,
 'ani': 44,
 'know': 45,
 'say': 46,
 'seem': 47,
 'mani': 48,
 'work': 49,
 'want': 50,
 'seen': 51,
 'plot': 52,
 'actor': 53,
 'come': 54,
 'two': 55,
 'take': 56,
 'never': 57,
 'tri': 58,
 'littl': 59,
 'best': 60,
 'year': 61,
 'life': 62,
 'ever': 63,
 'doe': 64,
 'give': 65,
 'better': 66,
 'man': 67,
 'find': 68,
 'still': 69,
 'perform': 70,
 'feel': 71,
 'whi': 72,
 'use': 73,
 'part': 74,
 'actual': 75,
 'someth': 76,
 'lot': 77,
 'b

In [17]:
# Convert X_train and X_test to numerical sequences using the vocabulary
X_train_sequences = [[vocabulary[word] for word in text.split() if word in vocabulary] for text in X_train]
X_test_sequences = [[vocabulary[word] for word in text.split() if word in vocabulary] for text in X_test]

In [18]:
from tensorflow.keras.preprocessing import sequence

# Keeping a fixed length of all reviews to max 400 words
max_length = 400

X_train = sequence.pad_sequences(X_train_sequences, maxlen=max_length)
X_test = sequence.pad_sequences(X_test_sequences, maxlen=max_length)




In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

max_length = 100

model = Sequential()
model.add(Embedding(len(vocabulary)+1, 128,input_length=max_length)) # embedding layer
model.add(LSTM(128)) # LSTM layer with 128 units
model.add(Dense(1, activation='sigmoid')) #output layer with sigmoid activation

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [20]:
model.build(input_shape=(None, max_length))
model.summary()

In [21]:
# Train model
epochs = 10
batch_size = 64
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 35ms/step - accuracy: 0.7435 - loss: 0.5044 - val_accuracy: 0.8223 - val_loss: 0.3908
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 35ms/step - accuracy: 0.8523 - loss: 0.3437 - val_accuracy: 0.8291 - val_loss: 0.3943
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 35ms/step - accuracy: 0.8791 - loss: 0.2911 - val_accuracy: 0.8161 - val_loss: 0.4076
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 36ms/step - accuracy: 0.8976 - loss: 0.2491 - val_accuracy: 0.8158 - val_loss: 0.4422
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 34ms/step - accuracy: 0.9135 - loss: 0.2162 - val_accuracy: 0.8117 - val_loss: 0.4617
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 36ms/step - accuracy: 0.9293 - loss: 0.1789 - val_accuracy: 0.8082 - val_loss: 0.5264
Epoch 7/10
[1m6

In [22]:
# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.7945 - loss: 0.7759
Test Accuracy: 0.7966


In [23]:
# Print training accuracy
train_accuracy = history.history['accuracy'][-1]
print(f"Training Accuracy: {train_accuracy:.4f}")

Training Accuracy: 0.9627


# **Simple text preprocessing task for simple dataset**

In [8]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [9]:
# Text preprocessing
max_words = 10000  # Vocabulary size
max_len = 200  # Max sequence length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)

In [11]:
# Build LSTM model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(32),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])



In [15]:
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.build(input_shape=(None, max_len))
model.summary()

In [17]:
# Train model
epochs = 10
batch_size = 64
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epochs, batch_size=batch_size)

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 18ms/step - accuracy: 0.8691 - loss: 0.3332 - val_accuracy: 0.8674 - val_loss: 0.3041
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - accuracy: 0.9017 - loss: 0.2634 - val_accuracy: 0.8780 - val_loss: 0.3008
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - accuracy: 0.9292 - loss: 0.2047 - val_accuracy: 0.8723 - val_loss: 0.3049
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9471 - loss: 0.1644 - val_accuracy: 0.8801 - val_loss: 0.3401
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.9628 - loss: 0.1258 - val_accuracy: 0.8734 - val_loss: 0.3923
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9729 - loss: 0.0974 - val_accuracy: 0.8732 - val_loss: 0.4440
Epoch 7/10
[1m6

In [18]:
# Print training accuracy
train_accuracy = history.history['accuracy'][-1]
print(f"Training Accuracy: {train_accuracy:.4f}")


Training Accuracy: 0.9874


In [19]:
# Verify model with sample reviews
def predict_sentiment(review):
    sequence = tokenizer.texts_to_sequences([review])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')
    prediction = model.predict(padded)[0,0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment, prediction

# Sample reviews for verification
sample_reviews = [
    "This movie was fantastic! The acting was great and the story was engaging.",
    "I did not like this movie. The plot was boring and the acting was terrible.",
    "An average film, not too bad but not too good either.",
    "One of the best movies I have ever seen! Highly recommend it.",
    "Waste of time. The worst movie I have watched in years."
]

# Make predictions
for review in sample_reviews:
    sentiment, confidence = predict_sentiment(review)
    print(f"Review: {review}\nPredicted Sentiment: {sentiment} (Confidence: {confidence:.4f})\n")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step
Review: This movie was fantastic! The acting was great and the story was engaging.
Predicted Sentiment: Positive (Confidence: 0.9973)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
Review: I did not like this movie. The plot was boring and the acting was terrible.
Predicted Sentiment: Negative (Confidence: 0.0022)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
Review: An average film, not too bad but not too good either.
Predicted Sentiment: Negative (Confidence: 0.0034)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Review: One of the best movies I have ever seen! Highly recommend it.
Predicted Sentiment: Positive (Confidence: 0.9967)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Review: Waste of time. The worst movie I have watched in years.
Predicted Sentiment: Negative (Confidence: 0.0021)

