In [None]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

data = pd.read_csv(f"{path}/twitter_training.csv")

In [None]:
data.columns = ['Tweet ID', 'entity', 'sentiment', 'Tweet content']

data['Tweet content'] = data['Tweet content'].str.lower().str.replace('[^\w\s]', '')

data['Tweet content'] = data['Tweet content'].astype(str)

label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(data['Tweet content'])
sequences = tokenizer.texts_to_sequences(data['Tweet content'])

X = pad_sequences(sequences, maxlen=100)
y = data['sentiment'].values

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
print(X_train.shape)
print(X_test.shape)

(59744, 100)
(14937, 100)


In [5]:
print(y_train.shape)
print(y_test.shape)

(59744,)
(14937,)


In [6]:
print(np.unique(y_train))
print(np.unique(y_test))

[0 1 2 3]
[0 1 2 3]


In [7]:
print(np.any(np.isnan(X_train)))
print(np.any(np.isinf(X_train)))
print(np.any(np.isnan(y_train)))
print(np.any(np.isinf(y_train)))

False
False
False
False


In [8]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128))
model.add(LSTM(64))
model.add(Dense(4, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [9]:
model.fit(X_train, y_train, batch_size=16, epochs=5, validation_data=(X_test, y_test))

Epoch 1/5
[1m3734/3734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 37ms/step - accuracy: 0.5651 - loss: 1.0309 - val_accuracy: 0.7548 - val_loss: 0.6376
Epoch 2/5
[1m3734/3734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 37ms/step - accuracy: 0.8211 - loss: 0.4743 - val_accuracy: 0.8165 - val_loss: 0.4902
Epoch 3/5
[1m3734/3734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 37ms/step - accuracy: 0.8842 - loss: 0.3033 - val_accuracy: 0.8354 - val_loss: 0.4565
Epoch 4/5
[1m3734/3734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 37ms/step - accuracy: 0.9157 - loss: 0.2184 - val_accuracy: 0.8515 - val_loss: 0.4411
Epoch 5/5
[1m3734/3734[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 37ms/step - accuracy: 0.9343 - loss: 0.1657 - val_accuracy: 0.8567 - val_loss: 0.4472


<keras.src.callbacks.history.History at 0x7d0a4de49630>

In [10]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.8478 - loss: 0.4721
Test Accuracy: 0.8566646575927734


In [11]:
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 18ms/step
