In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

2023-05-12 15:51:52.539410: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import chardet

with open("tweet_emotions2.csv", "rb") as f:
    encoding = chardet.detect(f.read())["encoding"]

df = pd.read_csv("tweet_emotions2.csv", encoding=encoding)


In [4]:
df

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
74787,surprise,@MichelGW have you gift! Hope you like it! It'...
74788,joy,The world didnt give it to me..so the world MO...
74789,anger,A man robbed me today .
74790,fear,"Youu call it JEALOUSY, I call it of #Losing YO..."


In [5]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["content"], df["sentiment"], test_size=0.2)

In [6]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

In [7]:
maxlen = 50
train_padded = pad_sequences(train_sequences, maxlen=maxlen, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=maxlen, padding='post', truncating='post')

In [8]:
df['sentiment'].unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger',
       'joy', 'fear', 'shame', 'disgust'], dtype=object)

In [10]:
label_dict = {"empty": 0, "sadness": 1, "enthusiasm": 2, "neutral": 3, "worry": 4,"surprise":5,"love":6,"fun":7,"hate":8,"happiness":9,"boredom":10,"relief":11,"anger":12,"joy":13,"fear":14,"shame":15,"disgust":16}
num_classes = len(label_dict)
train_labels = train_labels.map(label_dict)
test_labels = test_labels.map(label_dict)
train_labels = tf.keras.utils.to_categorical(train_labels, num_classes)
test_labels = tf.keras.utils.to_categorical(test_labels, num_classes)

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=50, input_length=maxlen),
    tf.keras.layers.GRU(units=64, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(units=num_classes, activation='softmax')
])

In [12]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(train_padded, train_labels, epochs=10, batch_size=32, validation_data=(test_padded, test_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1316dde90>

In [13]:
test_loss, test_accuracy = model.evaluate(test_padded, test_labels)
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

from sklearn.metrics import classification_report
test_predictions = model.predict(test_padded)
test_pred_labels = np.argmax(test_predictions, axis=1)
test_true_labels = np.argmax(test_labels, axis=1)
label_names = list(label_dict.keys())
print(classification_report(test_true_labels, test_pred_labels, target_names=label_names))

Test loss: 1.8932322263717651
Test accuracy: 0.41172537207603455
              precision    recall  f1-score   support

       empty       0.00      0.00      0.00       170
     sadness       0.45      0.43      0.44      2399
  enthusiasm       0.00      0.00      0.00       165
     neutral       0.39      0.48      0.43      2196
       worry       0.25      0.34      0.29      1662
    surprise       0.36      0.30      0.33      1212
        love       0.39      0.39      0.39       750
         fun       0.19      0.02      0.03       400
        hate       0.26      0.10      0.14       261
   happiness       0.27      0.31      0.29      1062
     boredom       0.00      0.00      0.00        42
      relief       0.00      0.00      0.00       304
       anger       0.53      0.53      0.53       849
         joy       0.53      0.56      0.55      2218
        fear       0.59      0.68      0.63      1090
       shame       0.79      0.85      0.82        27
     disgust    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
