# Final project——Text classification on goemotions | TensorFlow Dataset
Haozhou GU

In this project, I use the TensorFlow dataset which is given.
Link of dataset: https://www.tensorflow.org/datasets/catalog/goemotions

## Package install and setup

In [1]:
# !pip install tensorflow
import json

import pandas as pd 
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds

from sklearn import preprocessing
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Hyperparameters

In [2]:
# vocab_size = 5000
# embedding_dim = 32
# max_length = 100
# trunc_type='post'
# oov_tok = "<OOV>"

## Load data and preprocess

### Load data

In [3]:
# download dataset
train = !wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv
test = !wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv
validation = !wget -P data/full_dataset/ https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv

In [4]:
# load dataset
train = pd.read_csv("https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_1.csv")
test = pd.read_csv("https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_2.csv")
validation = pd.read_csv("https://storage.googleapis.com/gresearch/goemotions/data/full_dataset/goemotions_3.csv")

In [5]:
# preview
train.head(5)

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,0,1,0,0
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,True,0,...,0,0,0,0,0,0,0,0,0,0
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,1
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,1,0,0,0,0,0,0,0,0,0
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,0,1


### Preprocess

In [6]:
# drop the unnecessary columns

train = train.drop(['id','author','subreddit', 'link_id', 'parent_id', 'created_utc','rater_id','example_very_unclear'], axis = 1)
test = test.drop(['id','author','subreddit', 'link_id', 'parent_id', 'created_utc','rater_id','example_very_unclear'], axis = 1)
validation = validation.drop(['id','author','subreddit', 'link_id', 'parent_id', 'created_utc','rater_id','example_very_unclear'], axis = 1)


In [7]:
# define new datsets with the train,test,validation which 'text' column is dropped 

train_labels = np.array(train.drop(['text'], axis = 1))
# train_labels = np.array(train_labels)

validation_labels = np.array(validation.drop(['text'], axis = 1))
# validation_labels = np.array(validation_labels)

test_labels = np.array(test.drop(['text'], axis = 1))
# test_labels = np.array(test_labels)

## Tokenization

In [8]:
# Initialize the Tokenizer
vocab_size = 10000
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# Fit the Tokenizer on the training data
tokenizer.fit_on_texts(train["text"])

# Tokenize and pad the training data
max_length = 100
trunc_type='post'

train_seq = tokenizer.texts_to_sequences(train["text"])
train_pad = pad_sequences(train_seq, maxlen=max_length, truncating=trunc_type)

# Tokenize and pad the validation data
validation_seq = tokenizer.texts_to_sequences(validation["text"])
validation_pad = pad_sequences(validation_seq, maxlen=max_length, truncating=trunc_type)

# Tokenize and pad the test data
test_seq = tokenizer.texts_to_sequences(test["text"])
test_pad = pad_sequences(test_seq, maxlen=max_length, truncating=trunc_type)

train_pad = np.array(train_pad)
validation_pad = np.array(validation_pad)
test_pad = np.array(test_pad)


# Model Training

In [9]:
embedding_dim = 16
max_length = 100
baseline_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(28, activation='softmax')
])
baseline_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])

In [10]:
baseline_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 24)                408       
                                                                 
 dense_1 (Dense)             (None, 28)                700       
                                                                 
Total params: 161,108
Trainable params: 161,108
Non-trainable params: 0
_________________________________________________________________


In [11]:
epochs = 10
baseline_model_history = baseline_model.fit(train_pad, train_labels, epochs=epochs, validation_data=(validation_pad, validation_labels))

Epoch 1/10


  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
print(baseline_model_history.history['accuracy'])
print(baseline_model_history.history['val_accuracy'])


[0.12549999356269836, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895]
[0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084]


In [13]:
print("model evalutaion on test data")
baseline_results = baseline_model.evaluate(test_pad, test_labels, batch_size=64)
print("loss, acc:", baseline_results)



model evalutaion on test data
loss, acc: [0.15678055584430695, 0.26084285974502563]


Now we are trying to add a LSTM layer to improve the model

# Add a LSTM layer

In [14]:
LSTM_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.LSTM(20),
    tf.keras.layers.Dense(28, activation='softmax')
])
LSTM_model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),optimizer=tf.keras.optimizers.Adam(1e-4),metrics=['accuracy'])


In [15]:
LSTM_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 16)           160000    
                                                                 
 dense_2 (Dense)             (None, 100, 24)           408       
                                                                 
 lstm (LSTM)                 (None, 20)                3600      
                                                                 
 dense_3 (Dense)             (None, 28)                588       
                                                                 
Total params: 164,596
Trainable params: 164,596
Non-trainable params: 0
_________________________________________________________________


In [16]:
epochs = 10
LSTM_model_history = LSTM_model.fit(train_pad, train_labels, epochs=epochs, validation_data=(validation_pad, validation_labels))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
print(LSTM_model_history.history['accuracy'])
print(LSTM_model_history.history['val_accuracy'])


[0.15155714750289917, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.2631857097148895, 0.263700008392334, 0.273328572511673, 0.2873428463935852, 0.29494285583496094]
[0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26136890053749084, 0.26135486364364624, 0.26319411396980286, 0.27622324228286743, 0.28777816891670227, 0.2918919026851654]


In [18]:
print("Model evaluation on test data")
LSTM_results = LSTM_model.evaluate(test_pad, test_labels, batch_size=64)
print("loss, acc:", LSTM_results)

Model evaluation on test data
loss, acc: [0.14933916926383972, 0.2921000123023987]
