In [1]:
import pandas as pd
import numpy as np

import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, RNN, SimpleRNN, LSTM, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [41]:
df = pd.read_csv(r"/content/Test.csv")

df.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [42]:
df = df[["text", "label"]]
df.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [43]:
df["sentiment"] = df["label"].map({
    0: "Negative",
    1: "Positive"
})

In [44]:
df.dropna(inplace=True)
df = df[df["sentiment"].isin(["Positive", "Negative"])]
df.reset_index(drop=True, inplace=True)

df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Positive,2505
Negative,2495


In [46]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["clean_text"] = df["text"].apply(clean_text)
df.head()


Unnamed: 0,text,label,sentiment,clean_text
0,I always wrote this series off as being a comp...,0,Negative,i always wrote this series off as being a comp...
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0,Negative,st watched out of dirsteve purcell typical mar...
2,This movie was so poorly written and directed ...,0,Negative,this movie was so poorly written and directed ...
3,The most interesting thing about Miryang (Secr...,1,Positive,the most interesting thing about miryang secre...
4,"when i first read about ""berlin am meer"" i did...",0,Negative,when i first read about berlin am meer i didnt...


In [47]:
le = LabelEncoder()
df["label"] = le.fit_transform(df["sentiment"])

df[["sentiment", "label"]].drop_duplicates()

Unnamed: 0,sentiment,label
0,Negative,0
3,Positive,1


In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

In [49]:
MAX_WORDS = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding="post")

In [50]:
def build_model(model_type):
    model = Sequential()
    model.add(Embedding(MAX_WORDS, 128, input_length=MAX_LEN))

    if model_type == "RNN":
        model.add(SimpleRNN(64))
    elif model_type == "LSTM":
        model.add(LSTM(64))
    elif model_type == "GRU":
        model.add(GRU(64))

    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )
    return model

In [51]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

results = {}
models = {}

for arch in ["RNN", "LSTM", "GRU"]:
    print(f"\nTraining {arch}...\n")

    model = build_model(arch)
    history = model.fit(
        X_train_pad,
        y_train,
        epochs=10,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=1
    )

    val_acc = max(history.history["val_accuracy"])
    results[arch] = val_acc
    models[arch] = model

    print(f"{arch} Validation Accuracy: {val_acc:.4f}")


Training RNN...

Epoch 1/10




[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.5030 - loss: 0.7069 - val_accuracy: 0.5400 - val_loss: 0.6892
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7507 - loss: 0.5901 - val_accuracy: 0.6162 - val_loss: 0.6522
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8755 - loss: 0.3675 - val_accuracy: 0.6488 - val_loss: 0.6537
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9601 - loss: 0.1497 - val_accuracy: 0.5850 - val_loss: 0.7773
RNN Validation Accuracy: 0.6488

Training LSTM...

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5343 - loss: 0.6916 - val_accuracy: 0.6050 - val_loss: 0.6785
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7227 - loss: 0.5885 - val_accuracy: 0.7387 - val_loss: 0.

In [64]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

def build_model(model_type):
    model = Sequential()
    model.add(Embedding(MAX_WORDS, 128))  # input_length deprecated

    if model_type == "BiRNN":
        model.add(Bidirectional(SimpleRNN(64)))
    elif model_type == "BiLSTM":
        model.add(Bidirectional(LSTM(64)))
    elif model_type == "BiGRU":
        model.add(Bidirectional(GRU(64)))

    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )
    return model

In [65]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

results = {}
models = {}

for arch in ["BiRNN", "BiLSTM", "BiGRU"]:
    print(f"\nTraining {arch}...\n")

    model = build_model(arch)
    history = model.fit(
        X_train_pad,
        y_train,
        epochs=10,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=1
    )

    val_acc = max(history.history["val_accuracy"])
    results[arch] = val_acc
    models[arch] = model

    print(f"{arch} Validation Accuracy: {val_acc:.4f}")


Training BiRNN...

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step - accuracy: 0.5086 - loss: 0.7064 - val_accuracy: 0.6137 - val_loss: 0.6678
Epoch 2/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.7454 - loss: 0.5625 - val_accuracy: 0.6812 - val_loss: 0.5970
Epoch 3/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9240 - loss: 0.2695 - val_accuracy: 0.7287 - val_loss: 0.5846
Epoch 4/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9784 - loss: 0.0957 - val_accuracy: 0.7487 - val_loss: 0.5996
Epoch 5/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9981 - loss: 0.0278 - val_accuracy: 0.7188 - val_loss: 0.6442
BiRNN Validation Accuracy: 0.7487

Training BiLSTM...

Epoch 1/10
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.5085 - loss: 0.6920 -

In [66]:
results

{'BiRNN': 0.7487499713897705,
 'BiLSTM': 0.8062499761581421,
 'BiGRU': 0.7462499737739563}

In [67]:
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

print("Best Architecture:", best_model_name)


Best Architecture: BiLSTM


In [68]:
test_loss, test_acc = best_model.evaluate(X_test_pad, y_test)
print("Test Accuracy:", test_acc)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7774 - loss: 0.4811
Test Accuracy: 0.7730000019073486


In [69]:
best_model.save("best_sentiment_model.h5")

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("Model and tokenizer saved successfully.")




Model and tokenizer saved successfully.


In [79]:
sample_text = ["""IT IS A PIECE OF CRAP! not funny at all. during the whole movie nothing ever happens. i almost fell asleep, which in my case happens only if a movie is rally bad. (that is why it didn't get 1 (awful) out of 10 but 2).don't be fooled, like i was, by first review. a waste of money and your time! spend it on other stuff. at this point i'm finished with my review but i have to fill in at least ten lines of text so i will go on.... (ctrl+c, ctrl+v) :))) IT IS A PIECE OF CRAP! not funny at all. during the whole movie nothing ever happens. i almost fell asleep, which in my case happens only if a movie is rally bad. (that is why it didn't get 1 (awful) out of 10 but 2).don't be fooled, like i was, by first review. a waste of money and your time! spend it on other stuff. IT IS A PIECE OF CRAP! not funny at all. during the whole movie nothing ever happens. i almost fell asleep, which in my case happens only if a movie is rally bad. (that is why it didn't get 1 (awful) out of 10 but 2).don't be fooled, like i was, by first review. a waste of money and your time! spend it on other stuff."""]

sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=MAX_LEN, padding="post")

prediction = best_model.predict(sample_pad)
print("Prediction:", "Positive" if prediction[0][0] > 0.5 else "Negative")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Prediction: Negative
