In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D

In [2]:
# Load your dataset
data = pd.read_csv('tweet_emotions.csv')

# Define your hyperparameters
MAX_NB_WORDS = 5000  # Maximum number of words to tokenize
MAX_SEQUENCE_LENGTH = 100  # Maximum length of each sequence
EMBEDDING_DIM = 100  # Dimension of the word embeddings
VALIDATION_SPLIT = 0.2  # Percentage of data for validation

# Tokenize the text data
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 48998 unique tokens.


In [3]:
# Convert text to sequences
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

# Convert sentiment labels to one-hot encoding
Y = pd.get_dummies(data['sentiment']).values
print('Shape of label tensor:', Y.shape)

# Split data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=VALIDATION_SPLIT, random_state=42)

Shape of data tensor: (40000, 100)
Shape of label tensor: (40000, 13)


In [4]:
# Define the LSTM model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM))  # Remove input_length argument
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [5]:
# Train the model
epochs = 5
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, Y_val), verbose=2)

# Evaluate the model
loss, accuracy = model.evaluate(X_val, Y_val, verbose=0)
print('Validation Accuracy:', accuracy)

Epoch 1/5
500/500 - 28s - 57ms/step - accuracy: 0.2797 - loss: 2.0722 - val_accuracy: 0.3375 - val_loss: 1.9491
Epoch 2/5
500/500 - 25s - 51ms/step - accuracy: 0.3625 - loss: 1.8802 - val_accuracy: 0.3479 - val_loss: 1.9207
Epoch 3/5
500/500 - 25s - 49ms/step - accuracy: 0.4021 - loss: 1.7871 - val_accuracy: 0.3549 - val_loss: 1.9034
Epoch 4/5
500/500 - 24s - 48ms/step - accuracy: 0.4307 - loss: 1.7140 - val_accuracy: 0.3456 - val_loss: 1.9207
Epoch 5/5
500/500 - 24s - 48ms/step - accuracy: 0.4504 - loss: 1.6481 - val_accuracy: 0.3406 - val_loss: 1.9507
Validation Accuracy: 0.34062498807907104


In [8]:
# Extract unique types of sentiments
unique_sentiments = data['sentiment'].unique()

# Create index_to_label dictionary
index_to_label = {index: sentiment for index, sentiment in enumerate(unique_sentiments)}

# Define a function to preprocess and classify a sample sentence
def classify_sentiment(sample_sentence):
    sequence = tokenizer.texts_to_sequences([sample_sentence])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
    predicted_probabilities = model.predict(padded_sequence)
    predicted_label_index = np.argmax(predicted_probabilities)
    predicted_sentiment = index_to_label[predicted_label_index]
    return predicted_sentiment

In [16]:
# Test the model with a sample sentence
sample_sentence = "That country sucks!"

predicted_sentiment = classify_sentiment(sample_sentence)
print("Predicted Sentiment:", predicted_sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Predicted Sentiment: love
