In [1]:
import numpy as np
import pandas as pd

In [2]:
tweets_data = pd.read_csv("tweet_emotions.csv")

In [3]:
tweets_data

Unnamed: 0,Id,Tweet,Label
0,145353048817012000,Thinks that @melbahughes had a great 50th birt...,surprise
1,144279638024257000,"Como una expresiÃ³n tan simple, una sola oraci...",sadness
2,140499585285111000,the moment when you get another follower and y...,joy
3,145207578270507000,Be the greatest dancer of your life! practice ...,joy
4,139502146390470000,eww.. my moms starting to make her annual rum ...,disgust
...,...,...,...
21046,146254676558495000,@TheBodyShopUK Knowing my dissertation will be...,joy
21047,141661025014521000,"hospital tomorrow morning, strapped with wires...",joy
21048,143061444525686000,"Work is soooo slow, ready to have a great satu...",joy
21049,143068383678697000,You realize that by choosing joy every single ...,joy


In [4]:
# show the shape of the the dataset
tweets_data.shape

(21051, 3)

In [5]:
# this dataset has 6 class and have a look at values of each class means it is balance or not
tweets_data['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
joy,8240
surprise,3849
sadness,3830
fear,2816
anger,1555
disgust,761


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# give the tokens to the words Keep only the top 5000 most frequent words.
tokenizer = Tokenizer(num_words=5000)
#Assigns each word a number.
tokenizer.fit_on_texts(tweets_data["Tweet"])

In [8]:
#Now conversion happens.
sequence = tokenizer.texts_to_sequences(tweets_data["Tweet"])

In [9]:
sequence

[[1668, 14, 93, 4, 128, 195, 247],
 [2571, 1247, 2263, 1248, 1247, 4907, 2572],
 [1, 166, 28, 11, 40, 187, 1828, 6, 11, 956],
 [19, 1, 804, 7, 29, 72, 470, 1093, 1427, 4200, 305, 1331, 4200],
 [3323,
  5,
  1249,
  700,
  3,
  86,
  121,
  3324,
  980,
  10,
  1,
  348,
  4908,
  139,
  1179,
  1179,
  1179,
  1],
 [62, 254, 220, 1122, 30, 1, 38, 10, 1428, 260, 200, 383, 124, 3044, 1],
 [2, 84, 1064, 6, 59, 157, 107, 1669, 249, 39, 82, 3, 3325, 4909],
 [23, 10, 1218, 939, 13, 4910, 357, 8, 51, 1, 1123],
 [284, 1065, 6, 1065, 42, 908, 24, 140, 2573],
 [1829, 805, 15, 40, 79, 143, 13, 2145, 238],
 [17,
  525,
  661,
  33,
  1,
  4911,
  11,
  383,
  2426,
  51,
  29,
  3045,
  358,
  4912,
  187,
  4911,
  17,
  44,
  2,
  2782],
 [126,
  4913,
  1543,
  17,
  8,
  78,
  17,
  76,
  3683,
  8,
  3683,
  1670,
  3683,
  8,
  3683,
  1670],
 [58, 68, 61, 60, 1478, 11, 153, 6, 61, 257, 197, 4, 539],
 [5, 220, 6, 940, 1671, 9, 643, 13, 6, 2, 71, 109, 85, 170],
 [2783, 6, 2146],
 [1429,
  62,

In [10]:
#This makes all sequences length = 50. add padding at the last
padded_sequence = pad_sequences(sequence,maxlen=50, padding="post")

In [11]:
padded_sequence

array([[1668,   14,   93, ...,    0,    0,    0],
       [2571, 1247, 2263, ...,    0,    0,    0],
       [   1,  166,   28, ...,    0,    0,    0],
       ...,
       [  52,    9, 1829, ...,    0,    0,    0],
       [  11,  691,   14, ...,    0,    0,    0],
       [  17,  576,    3, ...,    0,    0,    0]], dtype=int32)

But neural networks only understand numbers.

So LabelEncoder converts them into integers.

Example:

Label	Encoded
anger	0
fear	1
joy	2
love	3
sadness	4

In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(tweets_data["Label"])

In [13]:
label_encoded

array([5, 4, 3, ..., 3, 3, 0])

In [14]:
from tensorflow.keras.utils import to_categorical

In [15]:
#Now we convert integers into one-hot vectors.
categorical_label = to_categorical(label_encoded)

loss=categorical_crossentropy

which requires one-hotlabels.

In [16]:
categorical_label

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

stratify = categorical_label

This keeps class
distribution balanced in both train and test sets.

If 20% of data is "joy",
then 20% of train and test will also be "joy"

In [17]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequence,
    categorical_label,
    test_size=0.2,
    stratify = categorical_label,
    random_state=42
    )

In [18]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((16840, 50), (4211, 50), (16840, 6), (4211, 6))

In [19]:
from tensorflow import keras
from tensorflow.keras import layers

In [20]:
model = keras.Sequential([
    layers.Dense(64, activation="relu", input_shape=(50,)),
    layers.Dense(32, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(6, activation="softmax")
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Why categorical_crossentropy?

Because:

Multi-class classification

One-hot encoded labels

In [21]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [22]:
history = model.fit(x_train,
          y_train,
          epochs=5,
          batch_size=64)

Epoch 1/5
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2365 - loss: 84.0492
Epoch 2/5
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2599 - loss: 5.4347
Epoch 3/5
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2970 - loss: 2.3123
Epoch 4/5
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3520 - loss: 1.7994
Epoch 5/5
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3641 - loss: 1.7093


In [23]:
y_pred = model.predict(x_test)

[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [24]:
y_pred

array([[0.08927758, 0.0491211 , 0.08824252, 0.5369987 , 0.15896738,
        0.07739284],
       [0.21388167, 0.02795355, 0.13413893, 0.28657624, 0.23380172,
        0.10364788],
       [0.09150088, 0.07803018, 0.12337281, 0.41411608, 0.17508902,
        0.11789105],
       ...,
       [0.05982823, 0.00732587, 0.02168696, 0.83354074, 0.06184488,
        0.01577328],
       [0.07167576, 0.03472909, 0.06541365, 0.65046656, 0.12039939,
        0.05731551],
       [0.10393578, 0.06533201, 0.11577464, 0.44013137, 0.16635348,
        0.1084728 ]], dtype=float32)

Convert Probabilities to Class Labels

In [25]:
y_preds = np.argmax(y_pred, axis=1)

In [26]:
y_preds

array([3, 3, 3, ..., 3, 3, 3])

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(np.argmax(y_test, axis=1), y_preds)

0.3873189266207552