In [1]:
import pandas as pd
import numpy as np
import nltk
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
# `Sequential` is exported from `tensorflow.keras.models`.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
# The Keras padding helper is named `pad_sequences`; it is aliased here to
# `pad_encoded_tweets`, the name the preprocessing cell calls.
from tensorflow.keras.preprocessing.sequence import pad_sequences as pad_encoded_tweets
# `train_test_split` is exported from `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from nltk.corpus import twitter_samples

# Download the NLTK resources this notebook requests.
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('stopwords')

# Load the raw tweet strings for each sentiment class.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Concatenate both classes and build a label list aligned with `tweets`.
tweets = positive_tweets + negative_tweets
sentiments = [1] * len(positive_tweets) + [0] * len(negative_tweets)  # 1 for positive, 0 for negative

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
vocab_size = 5000  # Maximum number of words in the vocabulary
sequence_length = 50      # Length every encoded tweet is padded/truncated to

# Fit the tokenizer on the corpus, then map each tweet to a list of word ids.
# Words outside the `vocab_size` limit are replaced by the "<OOV>" token.
text_processor = Tokenizer(num_words=vocab_size, oov_token="<OOV>")  # Handle out-of-vocabulary words
text_processor.fit_on_texts(tweets)
encoded_tweets = text_processor.texts_to_sequences(tweets)
vocab_index = text_processor.word_index  # word -> integer id mapping learned above

# Bring every encoded tweet to the same length so they stack into one array.
padded_tweets = pad_encoded_tweets(encoded_tweets, maxlen=sequence_length)
sentiments = np.array(sentiments)

# Hold out 20% of the samples for the final evaluation; the fixed seed keeps
# the split reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(padded_tweets, sentiments, test_size=0.2, random_state=42)

In [3]:
embed_size = 50  # Size of the word embeddings

# Stacked SimpleRNN binary classifier over the padded word-id sequences.
rnn_model = Sequential([
    # Map each word id to a trainable `embed_size`-dimensional vector.
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=sequence_length),

    # The first two recurrent layers emit their output at every time step
    # (`return_sequences=True`) so the next recurrent layer receives a sequence.
    SimpleRNN(32, return_sequences=True),

    SimpleRNN(32, return_sequences=True),

    # The last recurrent layer returns only its final output, giving one
    # vector per tweet for the dense head.
    SimpleRNN(64, return_sequences=False),

    Dropout(0.3),  # Dropout layer for regularization

    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),  # Fully connected layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Sigmoid output + binary cross-entropy for the 0/1 sentiment labels.
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [None]:
# Stop once validation loss has not improved for 2 consecutive epochs and
# roll the model back to the best-scoring weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# NOTE(review): epochs and batch_size are set to the values that match the
# training log saved with this cell ("Epoch n/20", 200 steps per epoch).
# The previous values (30 / 64) could not have produced that log; re-run the
# cell if different hyperparameters are intended.
train_history = rnn_model.fit(train_features, train_labels,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.2,  # carve validation data out of the training split
                    callbacks=[early_stop])

# Final score on the held-out test split, which is never seen during fitting.
test_loss, test_accuracy = rnn_model.evaluate(test_features, test_labels)
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 25ms/step - accuracy: 0.5320 - loss: 1.1444 - val_accuracy: 0.6925 - val_loss: 0.7151
Epoch 2/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 17ms/step - accuracy: 0.8027 - loss: 0.5540 - val_accuracy: 0.7631 - val_loss: 0.5530
Epoch 3/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 19ms/step - accuracy: 0.9305 - loss: 0.2367 - val_accuracy: 0.7588 - val_loss: 0.6353
Epoch 4/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.9707 - loss: 0.1190 - val_accuracy: 0.7569 - val_loss: 0.7227
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7544 - loss: 0.5762
Test Accuracy: 0.7735000252723694
