In [24]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Machine Learning/Tweet-Emotions-Classification/tweet_emotions_classification/tweet_emotions.csv'
df = pd.read_csv(file_path)

df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Id,Tweet,Label
0,145353048817012000,Thinks that @melbahughes had a great 50th birt...,surprise
1,144279638024257000,"Como una expresiÃ³n tan simple, una sola oraci...",sadness
2,140499585285111000,the moment when you get another follower and y...,joy
3,145207578270507000,Be the greatest dancer of your life! practice ...,joy
4,139502146390470000,eww.. my moms starting to make her annual rum ...,disgust


In [25]:
# Count the number of instances for each class label
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
joy,8240
surprise,3849
sadness,3830
fear,2816
anger,1555
disgust,761


In [39]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare the tokenizer
max_words = 30000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Tweet'])

sequences = tokenizer.texts_to_sequences(df['Tweet'])


X = pad_sequences(sequences, maxlen=max_len)
y = pd.get_dummies(df['Label']).values


In [38]:
len(tokenizer.word_index)

32939

In [40]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from sklearn.model_selection import train_test_split

# Split data into training (80%) and test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Build the model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
model.add(Flatten())
model.add(Dense(128, activation='relu'))  # second hidden layer
model.add(Dense(64, activation='relu'))  # second hidden layer
model.add(Dense(32, activation='relu'))  # third hidden layer
model.add(Dense(y.shape[1], activation='softmax'))  # Output layer with softmax for multi-class classification

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=8, batch_size=32, validation_split=0.2)


Epoch 1/8




[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.4112 - loss: 1.5339 - val_accuracy: 0.5178 - val_loss: 1.2993
Epoch 2/8
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6491 - loss: 0.9714 - val_accuracy: 0.5243 - val_loss: 1.4197
Epoch 3/8
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8838 - loss: 0.3496 - val_accuracy: 0.5321 - val_loss: 1.6834
Epoch 4/8
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9672 - loss: 0.1059 - val_accuracy: 0.5270 - val_loss: 2.2012
Epoch 5/8
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9960 - loss: 0.0207 - val_accuracy: 0.5202 - val_loss: 2.4924
Epoch 6/8
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9991 - loss: 0.0062 - val_accuracy: 0.5098 - val_loss: 2.8992
Epoch 7/8
[1m421/421[0m [32m━━━━━━━━━━━━━

In [41]:
# Evaluate the model on test data
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')


Training Accuracy: 90.42%
Test Accuracy: 52.20%


In [42]:
from sklearn.metrics import classification_report
import numpy as np

# Get model predictions
y_pred = model.predict(X_test)

# Convert predictions from one-hot encoded format to class labels
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Display classification report
# This will show precision, recall, and F1-score for each class
report = classification_report(y_test_classes, y_pred_classes, target_names=df['Label'].unique())

print(report)


[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
              precision    recall  f1-score   support

    surprise       0.32      0.25      0.28       311
     sadness       0.18      0.07      0.10       152
         joy       0.60      0.44      0.51       563
     disgust       0.66      0.70      0.68      1649
        fear       0.37      0.43      0.40       766
       anger       0.45      0.50      0.47       770

    accuracy                           0.52      4211
   macro avg       0.43      0.40      0.40      4211
weighted avg       0.52      0.52      0.52      4211

