In [3]:
import nltk
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Download NLTK resources.
# `punkt_tab` is fetched in addition to `punkt`: recent NLTK releases load the
# tab-separated Punkt data inside `word_tokenize`, while older releases still
# read the original `punkt` package. Requesting both keeps the notebook
# runnable on a fresh environment regardless of the installed NLTK version.
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\engma\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\engma\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
# Read the raw tweet texts for each polarity from the twitter_samples corpus.
positive_tweets, negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [5]:
# Stack both corpora (positives first) and build a parallel list of string labels.
tweets = [*positive_tweets, *negative_tweets]
sentiments = (
    ['positive' for _ in positive_tweets]
    + ['negative' for _ in negative_tweets]
)

# Preprocess text
def preprocess_text(text):
    """Lowercase ``text``, tokenize it with ``word_tokenize``, and re-join the tokens with single spaces."""
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return ' '.join(tokens)

# Normalize every tweet; `tweets` is rebound to the cleaned strings.
tweets = list(map(preprocess_text, tweets))

In [6]:
# Encode labels as integers. LabelEncoder numbers the sorted class names,
# so 'negative' maps to 0 and 'positive' maps to 1.
label_encoder = LabelEncoder().fit(sentiments)
labels = label_encoder.transform(sentiments)

# Split data: hold out 20% for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    labels,
    test_size=0.2,
    random_state=42,
)

In [102]:
# Fixed length that every input sequence is padded or truncated to.
max_len = 100

# Tokenize text: the vocabulary is fitted on the training texts only, and
# num_words caps how many of the most frequent words are kept when encoding.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer sequences with the fitted tokenizer.
X_train_sequences, X_test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad sequences so both splits share the same width.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train_sequences, X_test_sequences)
)

In [103]:
# Number of distinct tokens seen while fitting. word_index is not limited by
# num_words, which is why this count can be larger than the 5000-word cap.
len(tokenizer.word_index)

17963

## RNN

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [88]:
# Define model: embedding -> SimpleRNN -> dropout -> single sigmoid output.
# The embedding table is sized from the tokenizer's word cap instead of a
# second hard-coded 5000, so the two cannot drift apart if the vocabulary
# size is changed.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    SimpleRNN(5, return_sequences=False),  # only the final hidden state is used
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [89]:
# Compile model: Adam at a 5e-4 learning rate, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [90]:
# Train model for two epochs in mini-batches of 32.
# NOTE(review): the test split is also passed as validation data, so the
# val_* metrics are computed on the same data that is evaluated afterwards.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=2,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/2


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5965 - loss: 0.6613 - val_accuracy: 0.7855 - val_loss: 0.5396
Epoch 2/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7590 - loss: 0.5303 - val_accuracy: 0.7900 - val_loss: 0.4835


In [91]:
# Score the trained model on the held-out split; evaluate() yields the loss
# followed by the accuracy metric requested at compile time.
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7905 - loss: 0.4819
Test Accuracy: 79.00%


In [92]:
# Sample prediction
def predict_sentiment(text):
    """Classify ``text`` as ``'positive'`` or ``'negative'``.

    The text goes through the same pipeline as the training data
    (``preprocess_text`` -> ``tokenizer`` -> ``pad_sequences`` with
    ``max_len``) and is scored by whichever network the module-level
    ``model`` currently refers to.

    Args:
        text: Raw input string.

    Returns:
        ``'positive'`` when the predicted probability is at least 0.5,
        otherwise ``'negative'``.
    """
    processed = preprocess_text(text)
    sequence = tokenizer.texts_to_sequences([processed])
    padded = pad_sequences(sequence, maxlen=max_len)
    # predict() returns one row per input with a single sigmoid output; pull
    # out that scalar explicitly instead of relying on the truthiness of a
    # (1, 1) array in the comparison below.
    probability = float(model.predict(padded)[0][0])
    return 'positive' if probability >= 0.5 else 'negative'

In [95]:
# Spot-check the classifier on one hand-written sentence.
sample_text = "I love shirts!"
print(f'Sentiment: {predict_sentiment(sample_text)}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Sentiment: positive


## LSTM

In [50]:
from tensorflow.keras.layers import LSTM

In [52]:
# Define LSTM model: embedding -> LSTM -> dropout -> single sigmoid output.
# The embedding table is sized from the tokenizer's word cap instead of a
# second hard-coded 5000, so the two cannot drift apart if the vocabulary
# size is changed. Note that this rebinds `model`.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    LSTM(64, return_sequences=False),  # only the final hidden state is used
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [53]:
# Compile model: Adam at a 5e-4 learning rate, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [54]:
# Train model for two epochs in mini-batches of 32.
# NOTE(review): the test split doubles as the validation set here.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=2,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.6103 - loss: 0.6579 - val_accuracy: 0.7285 - val_loss: 0.5093
Epoch 2/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7996 - loss: 0.4378 - val_accuracy: 0.8065 - val_loss: 0.4258


In [59]:
# predict_sentiment scores with whatever `model` currently points to, which
# is the LSTM when the cells above have been run in order.
sample_text = "I hate shirts!"
print(f'Sentiment: {predict_sentiment(sample_text)}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Sentiment: negative


## GRU

In [70]:
from tensorflow.keras.layers import GRU

In [81]:
# Define GRU model: embedding -> GRU -> dropout -> single sigmoid output.
# The embedding table is sized from the tokenizer's word cap instead of a
# second hard-coded 5000, so the two cannot drift apart if the vocabulary
# size is changed. Note that this rebinds `model`.
model = Sequential([
    Embedding(input_dim=tokenizer.num_words, output_dim=64),
    GRU(64, return_sequences=False),  # only the final hidden state is used
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [82]:
# Compile model: Adam at a 1e-3 learning rate (the RNN and LSTM sections use
# 5e-4), binary cross-entropy loss, accuracy reported as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [83]:
# Train model for two epochs in mini-batches of 32.
# NOTE(review): the test split doubles as the validation set here.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=2,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
)

Epoch 1/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.6575 - loss: 0.6192 - val_accuracy: 0.8005 - val_loss: 0.4366
Epoch 2/2
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8431 - loss: 0.3591 - val_accuracy: 0.8035 - val_loss: 0.4318


In [86]:
# predict_sentiment scores with whatever `model` currently points to, which
# is the GRU when the cells above have been run in order.
sample_text = "I hate shirts!"
print(f'Sentiment: {predict_sentiment(sample_text)}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Sentiment: negative
