In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [3]:
# Loading the dataset
data = pd.read_csv('twitter_training.csv', names=['ID', 'Context', 'Sentiment', 'Tweet'])

In [4]:
# Prepaing the dataset (Dropping unnecessary columns and renaming)
data = data[['Tweet', 'Sentiment']]

In [5]:
#Encoding sentiment labels 
label_encoder = LabelEncoder()
data['Sentiment'] = label_encoder.fit_transform(data['Sentiment'])

In [6]:
# Ensuring that all values in the 'Tweet' column are strings
data['Tweet'] = data['Tweet'].astype(str)

# Handling missing values by replacing them with an empty string
data['Tweet'].fillna('', inplace=True)

# fitting the tokenizer again after cleaning the data
max_words = 5000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data['Tweet'])

# Converting the tweets to sequences and pad them
X = tokenizer.texts_to_sequences(data['Tweet'])
X = pad_sequences(X, maxlen=max_len)

In [7]:
# Defining the target variable (Sentiment)
y = data['Sentiment'].values

In [8]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Building the LSTM model for sentiment classification
model = Sequential()

# No need for `input_length` in the Embedding layer
model.add(Embedding(input_dim=max_words, output_dim=64))  
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(4, activation='softmax'))  

In [10]:
# Compilling the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
# Training the model
history = model.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test), batch_size=32)

[1m1868/1868[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 203ms/step - accuracy: 0.4986 - loss: 1.1466 - val_accuracy: 0.6673 - val_loss: 0.8332


In [12]:
# Evaluating the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc}') 

[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 76ms/step - accuracy: 0.6671 - loss: 0.8325
Test Accuracy: 0.6673361659049988


In [13]:
# Predicting sentiment for new texts
new_tweets = ["I love this!", "This is terrible.", "It's okay."]
sequences = tokenizer.texts_to_sequences(new_tweets)
padded = pad_sequences(sequences, maxlen=max_len)
predictions = model.predict(padded)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 813ms/step


In [14]:
# Display prediction
for i, tweet in enumerate(new_tweets):
    print(f'Tweet: {tweet} -> Sentiment: {label_encoder.inverse_transform([np.argmax(predictions[i])])[0]}')

Tweet: I love this! -> Sentiment: Positive
Tweet: This is terrible. -> Sentiment: Negative
Tweet: It's okay. -> Sentiment: Negative
