In [1]:
# !unzip /content/go_emotion.zip

Archive:  /content/go_emotion.zip
replace GoEmotionsFormat.PNG? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam
from keras.regularizers import l2


import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd

df1 = pd.read_csv('data/full_dataset/goemotions_1.csv')
df2 = pd.read_csv('data/full_dataset/goemotions_2.csv')
df3 = pd.read_csv('data/full_dataset/goemotions_3.csv')
df = pd.concat([df1, df2, df3], ignore_index=True)

emotion_map = {
    'anger': 'marah',
    'sadness': 'sedih',
    'joy': 'senang',
    'neutral': 'netral'
}
selected_emotions = ['anger', 'sadness', 'joy', 'neutral']
df_filtered = df[df[selected_emotions].any(axis=1)]

def get_emotion(row):
    for emotion in selected_emotions:
        if row[emotion] == 1:
            return emotion_map[emotion]
    return 'netral'  # Default jika netral

df_filtered['emotion'] = df_filtered.apply(get_emotion, axis=1)
df_filtered = df_filtered[['text', 'emotion']]




In [4]:
print("Jumlah data dan kolom:", df_filtered.shape)

Jumlah data dan kolom: (77890, 2)


In [5]:
df_filtered.duplicated().sum()
df_filtered.drop_duplicates(inplace=True)

In [6]:
print("Jumlah data dan kolom:", df_filtered.shape)

Jumlah data dan kolom: (47308, 2)


In [7]:
print("\nJumlah data yang hilang di setiap kolom:")
print(df_filtered.isnull().sum())


Jumlah data yang hilang di setiap kolom:
text       0
emotion    0
dtype: int64


In [8]:
print("\nDistribusi emosi dalam dataset:")
print(df_filtered['emotion'].value_counts())


Distribusi emosi dalam dataset:
emotion
netral    31446
marah      5644
senang     5634
sedih      4584
Name: count, dtype: int64


In [9]:
df_netral = df_filtered[df_filtered['emotion'] == 'netral']
df_marah = df_filtered[df_filtered['emotion'] == 'marah']
df_senang = df_filtered[df_filtered['emotion'] == 'senang']
df_sedih = df_filtered[df_filtered['emotion'] == 'sedih']

df_netral_reduced = df_netral.sample(n=4846, random_state=42)

df_balanced = pd.concat([df_netral_reduced, df_marah, df_senang, df_sedih])


df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_balanced['emotion'].value_counts())


emotion
marah     5644
senang    5634
netral    4846
sedih     4584
Name: count, dtype: int64


In [10]:
print(df_balanced.head(5))

                                                text emotion
0  She eats it until she throws up the first snow...  senang
1  I mean, who doesnt? But youre right, techgore ...  netral
2  I have decided to put myself first and see wha...  senang
3                                    Sorry. Bad day.   sedih
4            welcome to the community, my good dude!  senang


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
df_balanced['emotion_label'] = le.fit_transform(df_balanced['emotion'])

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'], df_balanced['emotion_label'], test_size=0.2, random_state=42
)


In [12]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_sequence_length = max([len(x) for x in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')

print(f"X_train_pad shape: {X_train_pad.shape}")
print(f"X_test_pad shape: {X_test_pad.shape}")


X_train_pad shape: (16566, 31)
X_test_pad shape: (4142, 31)


In [13]:
max_sequence_length = max([len(x) for x in X_train_seq])
print(f"Panjang maksimum dari teks dalam dataset: {max_sequence_length}")

Panjang maksimum dari teks dalam dataset: 31


In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1
print(f"Jumlah kata unik: {vocab_size}")


Jumlah kata unik: 16247


In [15]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

16247


In [28]:
vocab_size = len(tokenizer.word_index) + 1
model = Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(4, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(),
              metrics=['accuracy'])


In [29]:
model.summary()

In [30]:
history = model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_pad, y_test)
)


Epoch 1/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.2799 - loss: 1.3765 - val_accuracy: 0.4283 - val_loss: 1.2125
Epoch 2/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.5034 - loss: 1.1233 - val_accuracy: 0.5765 - val_loss: 1.0160
Epoch 3/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6365 - loss: 0.9098 - val_accuracy: 0.5963 - val_loss: 0.9924
Epoch 4/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6892 - loss: 0.8310 - val_accuracy: 0.6074 - val_loss: 1.0080
Epoch 5/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.7212 - loss: 0.7624 - val_accuracy: 0.5876 - val_loss: 1.0221


In [31]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5946 - loss: 1.0217
Test Loss: 1.022050380706787
Test Accuracy: 0.587638795375824


In [32]:
from sklearn.metrics import classification_report

y_pred_probs = model.predict(X_test_pad)
y_pred_classes = y_pred_probs.argmax(axis=1)

print(classification_report(y_test, y_pred_classes, target_names=le.classes_))

[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
              precision    recall  f1-score   support

       marah       0.64      0.53      0.58      1170
      netral       0.43      0.50      0.46       980
       sedih       0.57      0.60      0.58       868
      senang       0.72      0.71      0.71      1124

    accuracy                           0.59      4142
   macro avg       0.59      0.59      0.59      4142
weighted avg       0.60      0.59      0.59      4142



In [33]:
new_text = ["I'm sad again"]

new_text_seq = tokenizer.texts_to_sequences(new_text)
new_text_pad = pad_sequences(new_text_seq, maxlen=max_sequence_length, padding='post')

new_pred = model.predict(new_text_pad)

new_pred_class = np.argmax(new_pred, axis=1)

print(f"Predicted Emotion for '{new_text[0]}': {le.inverse_transform(new_pred_class)[0]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Predicted Emotion for 'I'm sad again': marah


###(BERT-base memiliki 110 juta parameter)