<a href="https://colab.research.google.com/github/HazemmoAlsady/Sentimental_Analysis/blob/main/RNN_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [112]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [113]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")   # اسم الملف حسب عندك
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# **PreProcessing**

In [114]:
# Clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)          # remove HTML tags
    text = re.sub(r"[^a-zA-Z]", " ", text)      # remove special chars
    text = re.sub(r"\s+", " ", text).strip()    # remove double spaces
    return text

df["review"] = df["review"].apply(clean_text)

# Encode sentiment
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

# **Split data to Train/Test**

In [115]:
from sklearn.model_selection import train_test_split

X = df["review"].values
y = df["sentiment"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# **Tokenization**

In [120]:
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
vocab_size = 50000
max_len = 150   # keeps RNN efficient

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding="post")


In [123]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-12-12 19:21:43--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-12-12 19:21:43--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-12-12 19:21:43--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’

gl

In [125]:
embedding_dim = 100

embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        vec = embedding_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec



# **RNN Model**

In [126]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
              input_shape=(max_len,), trainable=False),

    SimpleRNN(256, dropout=0.2, recurrent_dropout=0.2),

    Dense(128, activation="relu"),
    Dropout(0.4),

    Dense(64, activation="relu"),
    Dropout(0.3),

    Dense(1, activation="sigmoid")
])


model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

In [110]:
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_padded, y_train,
    validation_split=0.2,
    epochs=8,
    batch_size=64,
    callbacks=[es],
    verbose=2
)


Epoch 1/10
500/500 - 103s - 206ms/step - accuracy: 0.5051 - loss: 0.7012 - val_accuracy: 0.5023 - val_loss: 0.6931
Epoch 2/10


KeyboardInterrupt: 

# **Evaluation**

In [100]:
loss, acc = model.evaluate(X_test_padded, y_test, verbose=0)
print(f"Test Accuracy = {acc * 100:.2f}%")


Test Accuracy = 63.35%


In [102]:
y_pred = (model.predict(X_test_padded) > 0.5).astype(int)

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step


NameError: name 'classification_report' is not defined

# **GRU **

In [127]:
from tensorflow.keras.layers import GRU, Input

model = Sequential([
    Input(shape=(max_len,)),

    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        trainable=False
    ),

    Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2)),

    Dropout(0.4),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

model.summary()


In [128]:
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train_padded, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=64,
    callbacks=[es],
    verbose=2
)


Epoch 1/10


KeyboardInterrupt: 

In [None]:
loss, acc = model.evaluate(X_test_padded, y_test, verbose=0)
print(f"Test Accuracy = {acc * 100:.2f}%")
