### Here we will practice building an RNN on the IMDB dataset

In [17]:
# import libraries
import pandas as pd
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Embedding, Flatten, Input
from tensorflow.keras.datasets import imdb
# imdb contains the movie-reviews dataset; it is already preprocessed (reviews are encoded as integers), so no further preprocessing is needed


In [2]:
# Toy corpus: ten short sentences/phrases used below to demonstrate
# tokenization, sequence generation and padding.
doc = [
    "go pakistan",
    "pakistan pakistan",
    "hip hip hurray",
    "jeetega bhai jeetega pakistan jeetega",
    "bharat mata ki jai",
    "amir amir",
    "haris haris",
    "haji haji",
    "imran khan ki jai",
    "inqilab zindabad",
]

In [8]:
# Tokenize the text.
# Keras tokenization converts text into numerical data that can be fed into machine learning
# or deep learning models, specifically in natural language processing (NLP) tasks.
#
# Tokenization: splitting text into smaller parts (tokens), usually words or characters.
# For example, word-level tokenization of "Hello world!" yields two tokens: ["Hello", "world!"]
# (the exact tokens a given tokenizer produces depend on its lowercasing/punctuation settings).
#
# Token-to-integer mapping: after splitting, the Tokenizer assigns a unique integer to each token, e.g.
#   "Hello"  -> 1
#   "world!" -> 2
#
# Building the vocabulary: the Tokenizer builds a vocabulary from the input text, storing each
# unique word together with its integer index.
#
# Text sequences: once the vocabulary exists, sentences or documents can be transformed into
# sequences of integers, e.g. (illustrative numbers only)
#   Original text: "I love deep learning"
#   Tokenized:     [2, 5, 10, 15]

from tensorflow.keras.preprocessing.text import Tokenizer

# create the tokenizer
tokenizer=Tokenizer(oov_token='Nothing found')
# oov_token is a special token used to handle Out-of-Vocabulary (OOV) words: after the tokenizer
# has been fitted on a corpus (training data), any word in later input that was not part of that
# vocabulary is replaced by this token. It gets its own entry in the vocabulary
# (index 1 in the word_index output shown below).

# fit the tokenizer on the sample sentences to build the vocabulary
tokenizer.fit_on_texts(doc)


In [9]:
# look at the learned vocabulary: a dict mapping each word to its integer index
# (the OOV token comes first; in the output below, more frequent words have smaller indices)
tokenizer.word_index

{'Nothing found': 1,
 'pakistan': 2,
 'jeetega': 3,
 'hip': 4,
 'ki': 5,
 'jai': 6,
 'amir': 7,
 'haris': 8,
 'haji': 9,
 'go': 10,
 'hurray': 11,
 'bhai': 12,
 'bharat': 13,
 'mata': 14,
 'imran': 15,
 'khan': 16,
 'inqilab': 17,
 'zindabad': 18}

### Above is the dictionary of the created vocabulary, i.e. the unique words from the whole text doc (plus the OOV token), each mapped to its index

In [11]:
# number of occurrences of each word across the whole text doc
tokenizer.word_counts

OrderedDict([('go', 1),
             ('pakistan', 4),
             ('hip', 2),
             ('hurray', 1),
             ('jeetega', 3),
             ('bhai', 1),
             ('bharat', 1),
             ('mata', 1),
             ('ki', 2),
             ('jai', 2),
             ('amir', 2),
             ('haris', 2),
             ('haji', 2),
             ('imran', 1),
             ('khan', 1),
             ('inqilab', 1),
             ('zindabad', 1)])

In [13]:
# we can also display the number of documents the tokenizer was fitted on
# here a "document" is one row/sentence of the input list
tokenizer.document_count

10

### We have a total of 10 sentences

In [14]:
# convert every sentence into a sequence of integers using the fitted vocabulary
# (each word is replaced by its index from tokenizer.word_index)
seq=tokenizer.texts_to_sequences(doc)
seq

[[10, 2],
 [2, 2],
 [4, 4, 11],
 [3, 12, 3, 2, 3],
 [13, 14, 5, 6],
 [7, 7],
 [8, 8],
 [9, 9],
 [15, 16, 5, 6],
 [17, 18]]

### The sequences are generated by replacing each word with its index from the vocabulary; for example, the index of 'go' is 10 and that of 'pakistan' is 2, so the generated sequence is [10, 2]

In [15]:
# The sequences have different lengths, so we pad them to make them equal in size.
# 0s are appended at the end of each shorter sequence; since no maxlen is given, every
# sequence ends up as long as the longest one (5 here, see the output below).
# import the padding helper
from tensorflow.keras.utils import pad_sequences

# now pad the sequences (note: `seq` is rebound from a list of lists to a 2-D array)
seq=pad_sequences(seq, padding='post') # post will add 0s at the end of sequences
seq

array([[10,  2,  0,  0,  0],
       [ 2,  2,  0,  0,  0],
       [ 4,  4, 11,  0,  0],
       [ 3, 12,  3,  2,  3],
       [13, 14,  5,  6,  0],
       [ 7,  7,  0,  0,  0],
       [ 8,  8,  0,  0,  0],
       [ 9,  9,  0,  0,  0],
       [15, 16,  5,  6,  0],
       [17, 18,  0,  0,  0]])

### Now we have successfully made the sequences equal in size, and the input is ready

### Working with imdb dataset

In [18]:
# load the IMDB dataset, already split into train and test parts
# (the data is downloaded on first use, see the log below);
# each part is unpacked into its inputs (X...) and its labels (y...)
(Xtrain, ytrain), (Xtest, ytest)=imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 2us/step


In [19]:
# inspect the training inputs to check whether the data is already preprocessed:
# each review is stored as a list of integer word indices
# NOTE(review): this displays the whole array; a small slice would keep the output short
Xtrain

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [20]:
# Make all reviews the same length.
# maxlen=50 caps every review at 50 tokens to shrink the data: this shortens each review,
# it does NOT reduce the number of reviews (the shape stays (25000, 50), see below).
# Reviews shorter than 50 tokens are padded with 0s at the end (padding='post').
# NOTE: `truncating` is left at its default, which per the Keras docs is 'pre', so longer
# reviews keep their LAST 50 tokens; pass truncating='post' to keep the first 50 instead.
Xtrain=pad_sequences(Xtrain, padding='post', maxlen=50)
Xtest=pad_sequences(Xtest, padding='post', maxlen=50)

In [21]:
# Baseline RNN model: it consumes the integer word indices directly as numeric values
model=Sequential()

# Declare the input explicitly with an Input object instead of passing `input_shape=` to the
# first layer; Keras warns against the latter for Sequential models.
# The input consists of sequences that are 50 time steps long (each sequence has 50 elements),
# and each time step has 1 feature (here: the word index at that position).
model.add(Input(shape=(50, 1)))

# add an RNN with 32 units (neurons). Each unit is a memory cell that can maintain information
# from previous time steps.
# return_sequences=False means the RNN returns only the output of the last time step,
# rather than a sequence of outputs for every time step.
model.add(SimpleRNN(32, return_sequences=False))

# add 1 output layer with sigmoid activation function (single output for binary classification)
model.add(Dense(1, activation='sigmoid'))

# summary of the model
model.summary()

  super().__init__(**kwargs)


In [23]:
# compile the model: binary cross-entropy loss (pairs with the single sigmoid output),
# the Adam optimizer, and accuracy as the metric reported during training
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 

In [24]:
# train the model for 10 epochs on the training data;
# the test split is passed as validation data, so val_* metrics are reported after every epoch
model.fit(Xtrain, ytrain, epochs=10, validation_data=(Xtest, ytest))

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.5020 - loss: 0.6964 - val_accuracy: 0.5067 - val_loss: 0.6937
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.4990 - loss: 0.6932 - val_accuracy: 0.5074 - val_loss: 0.6938
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5081 - loss: 0.6924 - val_accuracy: 0.5045 - val_loss: 0.6937
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5102 - loss: 0.6932 - val_accuracy: 0.5066 - val_loss: 0.6940
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5081 - loss: 0.6927 - val_accuracy: 0.5018 - val_loss: 0.6950
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5068 - loss: 0.6930 - val_accuracy: 0.5032 - val_loss: 0.6946
Epoch 7/10
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x13631bcb350>

### Since we used less data (only 50 tokens per review) and fed the raw word indices directly to the RNN, the accuracy is not very high (about 50%)
### To improve its performance, we will add an embedding layer on top of the same data

In [26]:
# check the shape of the padded training inputs: (number of reviews, tokens per review)
Xtrain.shape

(25000, 50)

In [30]:
# creating a model with embeddings
model=Sequential()

# declare the input explicitly: each sample is a sequence of 50 word indices
# (as can be seen from the shape printed in the cell above).
# This replaces the `input_length=50` argument of Embedding, which is deprecated in Keras 3.
model.add(Input(shape=(50,)))

# add the embedding layer
# input_dim=100000 is the vocabulary size the layer can index (word indices must be below it);
# output_dim=2 means every word index is mapped to a dense vector with 2 values.
# Each sample passes 50 words through the layer, giving a (50, 2) sequence for the RNN.
model.add(Embedding(input_dim=100000, output_dim=2))
# add the RNN with no input shape because it is inferred from the embedding output
model.add(SimpleRNN(32, return_sequences=False))
# add the dense (output) layer
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the data (the test split is used as validation data)
model.fit(Xtrain, ytrain, epochs=10, validation_data=(Xtest, ytest))

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.5775 - loss: 0.6624 - val_accuracy: 0.8009 - val_loss: 0.4350
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.8510 - loss: 0.3521 - val_accuracy: 0.8132 - val_loss: 0.4172
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9111 - loss: 0.2383 - val_accuracy: 0.8032 - val_loss: 0.4696
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9421 - loss: 0.1705 - val_accuracy: 0.7780 - val_loss: 0.6367
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9558 - loss: 0.1293 - val_accuracy: 0.7898 - val_loss: 0.5399
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9686 - loss: 0.1004 - val_accuracy: 0.7798 - val_loss: 0.7123
Epoch 7/10
[1m782/782[0m 

<keras.src.callbacks.history.History at 0x1363c240860>

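### As can be seen, after adding the embedding layer the training accuracy reaches about 98%. However, the validation accuracy stays at roughly 78–81% while the validation loss tends to rise from epoch to epoch, which means the model is overfitting the training data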