In [None]:
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import set_random_seed

# Fetch every NLTK resource requested by this notebook
for nltk_resource in ('twitter_samples', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the raw tweet strings for each sentiment class
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Build one corpus (positives first, then negatives) with a parallel label list:
# 1 marks a positive tweet, 0 marks a negative tweet
texts = [*positive_tweets, *negative_tweets]
labels = [1 for _ in positive_tweets] + [0 for _ in negative_tweets]

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Tokenization and padding
max_words = 5000  # Maximum number of words in the vocabulary
max_len = 50      # Maximum length of sequences

labels = np.array(labels)

# Train-test split on the RAW texts, done before the tokenizer is fitted so the
# vocabulary is learned from training tweets only (no test-set leakage).
# stratify keeps the positive/negative ratio the same in both partitions.
train_texts, test_texts, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")  # Handle out-of-vocabulary words
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Encode each partition with the train-fitted vocabulary; words unseen during
# fitting map to the "<OOV>" token.
x_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

# Full-corpus encodings, kept under their original names for any later cell
# that still refers to them. Do not use them for training or evaluation.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_len)

In [None]:
# RNN Model
embedding_dim = 50  # Size of the word embeddings

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling repeat on a fresh "Restart & Run All".
# NOTE(review): some GPU kernels may still be non-deterministic.
set_random_seed(42)

# Define the model
model = Sequential([
    # `input_length` is deprecated in Keras 3 and has been dropped; the sequence
    # length is inferred from the data on the first call.
    # mask_zero=True makes the recurrent layers skip index 0, which is the value
    # pad_sequences uses for padding, so padding is not treated as a real token.
    Embedding(input_dim=max_words, output_dim=embedding_dim, mask_zero=True),

    # First RNN layer (emits the full sequence for the next recurrent layer)
    SimpleRNN(32, return_sequences=True),

    # Second RNN layer
    SimpleRNN(32, return_sequences=True),

    # Third RNN layer (emits only the final state)
    SimpleRNN(64, return_sequences=False),

    Dropout(0.3),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # Fully connected layer
    Dense(1, activation='sigmoid')  # Single probability for the positive class
])

# Compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Early stopping: watch the validation loss, tolerate 2 epochs without
# improvement, then restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train, holding out 20% of the training data for validation
fit_options = dict(
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)
history = model.fit(x_train, y_train, **fit_options)

# Evaluate on the test partition
test_metrics = model.evaluate(x_test, y_test)
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 25ms/step - accuracy: 0.5320 - loss: 1.1444 - val_accuracy: 0.6925 - val_loss: 0.7151
Epoch 2/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8027 - loss: 0.5540 - val_accuracy: 0.7631 - val_loss: 0.5530
Epoch 3/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.9305 - loss: 0.2367 - val_accuracy: 0.7588 - val_loss: 0.6353
Epoch 4/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.9707 - loss: 0.1190 - val_accuracy: 0.7569 - val_loss: 0.7227
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7544 - loss: 0.5762
Test Accuracy: 0.7735000252723694
