In [4]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/tweet_emotions_classification/tweet_emotions.csv'
df = pd.read_csv(file_path)

df.head()


Mounted at /content/drive


Unnamed: 0,Id,Tweet,Label
0,145353048817012000,Thinks that @melbahughes had a great 50th birt...,surprise
1,144279638024257000,"Como una expresiÃ³n tan simple, una sola oraci...",sadness
2,140499585285111000,the moment when you get another follower and y...,joy
3,145207578270507000,Be the greatest dancer of your life! practice ...,joy
4,139502146390470000,eww.. my moms starting to make her annual rum ...,disgust


In [5]:
# Count the number of instances for each class label
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
joy,8240
surprise,3849
sadness,3830
fear,2816
anger,1555
disgust,761


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare the tokenizer
max_words = 100000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Tweet'])

sequences = tokenizer.texts_to_sequences(df['Tweet'])

X = pad_sequences(sequences, maxlen=max_len)
y = pd.get_dummies(df['Label']).values


In [7]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from sklearn.model_selection import train_test_split

# Split data into training (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Build the model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
model.add(Flatten())
model.add(Dense(64, activation='relu'))  # second hidden layer
model.add(Dense(64, activation='relu'))  # second hidden layer
model.add(Dense(32, activation='relu'))  # third hidden layer
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer with softmax for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)




Epoch 1/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.4046 - loss: 1.5308 - val_accuracy: 0.5113 - val_loss: 1.2782
Epoch 2/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.6469 - loss: 0.9891 - val_accuracy: 0.5395 - val_loss: 1.2857
Epoch 3/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8454 - loss: 0.4343 - val_accuracy: 0.5285 - val_loss: 1.6364
Epoch 4/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.9516 - loss: 0.1427 - val_accuracy: 0.5279 - val_loss: 2.0623
Epoch 5/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9920 - loss: 0.0346 - val_accuracy: 0.5143 - val_loss: 2.3984
Epoch 6/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9981 - loss: 0.0082 - val_accuracy: 0.5229 - val_loss: 2.6166
Epoch 7/10
[1m421/421[0m 

In [8]:
# Evaluate the model on test data
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


Training Accuracy: 90.46%
Test Accuracy: 52.39%
