In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('cleaned_tweets.csv')
df = df.dropna(subset=['clean_text'])
df = df[df['clean_text'].str.strip() != '']

# Convert labels to numbers
# negative=0, neutral=1, positive=2
le = LabelEncoder()
df['label'] = le.fit_transform(df['airline_sentiment'])

print("Classes:", le.classes_)
print("Shape:", df.shape)

Classes: ['negative' 'neutral' 'positive']
Shape: (14617, 4)


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 10000
MAX_LEN = 50

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_text'])

sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded = pad_sequences(sequences, maxlen=MAX_LEN, truncating='post')

print("Shape of padded sequences:", padded.shape)

Shape of padded sequences: (14617, 50)


In [4]:
X = padded
y = df['label'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", len(X_train))
print("Test size: ", len(X_test))

Train size: 11693
Test size:  2924


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(MAX_WORDS, 64),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(32),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

In [8]:
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)

Epoch 1/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.6781 - loss: 0.7586 - val_accuracy: 0.7650 - val_loss: 0.6114
Epoch 2/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step - accuracy: 0.8094 - loss: 0.4775 - val_accuracy: 0.7752 - val_loss: 0.5981
Epoch 3/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.8770 - loss: 0.3380 - val_accuracy: 0.7684 - val_loss: 0.6660
Epoch 4/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.9129 - loss: 0.2484 - val_accuracy: 0.7547 - val_loss: 0.6872
Epoch 5/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 21ms/step - accuracy: 0.9362 - loss: 0.1909 - val_accuracy: 0.7368 - val_loss: 0.8215
