In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, SpatialDropout1D, Embedding
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import pickle

In [94]:
# Read the tweets CSV into a DataFrame and preview the first rows.
# A comma is pandas' default delimiter, so it is not passed explicitly.
df = pd.read_csv('Tweets.csv')
print(df.head())

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [95]:
## Shape of the Dataset
# df.shape is a (rows, columns) tuple; report each dimension on its own line.
print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")

Number of Rows: 14640
Number of Columns: 15


In [96]:
# ## Select Required Columns Only

# Keep just the tweet text and its sentiment label.
tweet_df = df.loc[:, ['text', 'airline_sentiment']]

In [97]:
# ## Filter Out Neutral Sentiments

# Drop every row labelled 'neutral' so only the two remaining
# sentiment classes are left for binary classification.
tweet_df = tweet_df.loc[tweet_df['airline_sentiment'].ne('neutral')]

In [98]:
# ## Prepare Features and Labels

# X holds the raw tweet text, y the matching sentiment label.
X, y = tweet_df['text'], tweet_df['airline_sentiment']

In [99]:
# ## Train-Test Split

# Hold out 20% of the tweets for testing; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [100]:
# ## Check Train and Test Set Sizes

# Sanity check: X and y must have the same length within each split.
print(f"Training Set X Items: {len(X_train)}")
print(f"Training Set y Items: {len(y_train)}")
print(f"Test Set X Items: {len(X_test)}")
print(f"Test Set y Items: {len(y_test)}")

Training Set X Items: 9232
Training Set y Items: 9232
Test Set X Items: 2309
Test Set y Items: 2309


In [101]:
# ## Encoding Labels

# Encode the string labels as integer codes. factorize() returns a
# (codes, uniques) tuple: element 0 holds the integer codes used as training
# targets, element 1 holds the class names in code order.
# sort=True makes the mapping alphabetical (negative -> 0, positive -> 1)
# instead of depending on which label happens to come first in the shuffled
# training split, so the meaning of the model's output is stable across runs.
review_labels_train = y_train.factorize(sort=True)

In [102]:
# Initialize the Tokenizer
# num_words caps the indices emitted when texts are converted to sequences;
# anything outside the cap (or unseen at fit time) maps to the 'OOV' token.
# The tokenizer is fitted on the training tweets only, so the test set does
# not influence the vocabulary.
tokenizer = Tokenizer(
    num_words=8000,
    oov_token='OOV',
)
tokenizer.fit_on_texts(X_train)

In [103]:
# Get vocabulary size
# word_index lists every distinct token seen while fitting (it is not limited
# by num_words); one extra slot accounts for index 0, which is used as padding.
vocab_size = 1 + len(tokenizer.word_index)

In [104]:
# Print vocabulary size
print("Vocabulary Size: {}".format(vocab_size))

Vocabulary Size: 11648


In [105]:
# Convert texts to sequences
# Each training tweet becomes a list of word indices ...
tweet_seqs = tokenizer.texts_to_sequences(X_train)
# ... which is then padded/truncated to a fixed length of 200 so the whole
# set forms one rectangular array.
padded_sequence_train = pad_sequences(
    tweet_seqs,
    maxlen=200,
)

# Print the padded sequence of the first tweet
print(f"Padded Sequence for First Tweet: {padded_sequence_train[0]}")

Padded Sequence for First Tweet: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0   13  143 1591  550   57  122  661  221 2106
   59  753   57  428   20    3  257  856    

In [106]:
# ## Build the Model

# Model parameters
embedding_vector_length = 32

# The tokenizer never emits an index >= tokenizer.num_words (rarer words are
# replaced by the OOV index), so the embedding table only needs that many
# rows. Sizing it by the full vocabulary would allocate rows that can never
# be looked up or trained. Fall back to vocab_size when no cap is configured
# or when the vocabulary is smaller than the cap.
embedding_input_dim = min(tokenizer.num_words or vocab_size, vocab_size)

# Create the model: embedding -> dropout over embedding channels -> LSTM ->
# dropout -> single sigmoid unit producing the probability of class 1.
model = Sequential()
model.add(Embedding(embedding_input_dim, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()



None


In [107]:
# ## Train the Model with Early Stopping

# Stop once the validation loss has failed to improve for 2 consecutive
# epochs, and roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

In [108]:
# Train the model
# review_labels_train[0] is the integer-coded label array. 20% of the
# training data is held out as a validation set. The early_stopping callback
# must be passed here to take effect: without it the monitor is never
# consulted and the weights of the last epoch are kept even if an earlier
# epoch had a lower validation loss.
trained = model.fit(padded_sequence_train, review_labels_train[0],
                    validation_split=0.2, epochs=5, batch_size=32,
                    callbacks=[early_stopping])

Epoch 1/5
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 96ms/step - accuracy: 0.7903 - loss: 0.5375 - val_accuracy: 0.8376 - val_loss: 0.3702
Epoch 2/5
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 91ms/step - accuracy: 0.8667 - loss: 0.3212 - val_accuracy: 0.8863 - val_loss: 0.2995
Epoch 3/5
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 88ms/step - accuracy: 0.9181 - loss: 0.2254 - val_accuracy: 0.9020 - val_loss: 0.2631
Epoch 4/5
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 102ms/step - accuracy: 0.9471 - loss: 0.1527 - val_accuracy: 0.9080 - val_loss: 0.2606
Epoch 5/5
[1m231/231[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 95ms/step - accuracy: 0.9585 - loss: 0.1205 - val_accuracy: 0.9112 - val_loss: 0.2783


In [109]:
# ## Evaluate the Model

# Prepare the test data
# Use the tokenizer fitted on the training tweets and the same fixed length
# (200) so the test inputs have the shape the model was trained on.
encoded_docs = tokenizer.texts_to_sequences(X_test)
padded_sequence_test = pad_sequences(
    encoded_docs,
    maxlen=200,
)

In [110]:
# Encode test labels
# Reuse the class -> code mapping learned from the training labels instead of
# calling factorize() on the test labels. factorize() numbers classes by order
# of first appearance, so an independent call could assign the opposite codes
# in the test split and silently invert every metric.
# The (codes, uniques) tuple layout is kept so existing `[0]` / `[1]` lookups
# keep working. A label absent from the training classes would be coded -1.
sentiment_label_test = (
    review_labels_train[1].get_indexer(y_test),
    review_labels_train[1],
)

In [111]:
# Evaluate the model
# evaluate() returns [loss, accuracy] here because the model was compiled
# with metrics=['accuracy']; index 1 is therefore the accuracy.
score = model.evaluate(
    padded_sequence_test,
    sentiment_label_test[0],
    verbose=0,
)
print(f"Test Accuracy: {score[1] * 100:.2f}%")

Test Accuracy: 91.86%


In [112]:
# Get predictions
# predict() yields one sigmoid probability per tweet; flatten to 1-D and
# threshold at 0.5 to obtain hard 0/1 class predictions.
y_pred = model.predict(padded_sequence_test)
y_pred_classes = (y_pred.flatten() > 0.5).astype(int)

# Generate classification report
# target_names come from the training label encoding, so row order follows
# the integer codes.
report = classification_report(
    sentiment_label_test[0],
    y_pred_classes,
    target_names=review_labels_train[1],
)
print(report)

[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step
              precision    recall  f1-score   support

    negative       0.93      0.97      0.95      1862
    positive       0.85      0.70      0.77       447

    accuracy                           0.92      2309
   macro avg       0.89      0.84      0.86      2309
weighted avg       0.92      0.92      0.92      2309



In [113]:
# ## Save the Model and Tokenizer

# Persist the trained model to disk so it can be reloaded later without
# retraining.
model.save(filepath="sentiment_analysis1.h5")
print("Model Saved.")



Model Saved.


In [114]:
# Save the tokenizer
# The fitted tokenizer must be stored alongside the model: new text has to be
# converted with the exact same word -> index mapping used during training.
with open('tokenizer1.pickle', mode='wb') as handle:
    handle.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))
print("Tokenizer Saved.")

Tokenizer Saved.
