<a href="https://colab.research.google.com/github/HAAIL/PRADO-GoEmotions-Notebook/blob/main/sequential_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/models/blob/master/research/seq_flow_lite/demo/colab/emotion_colab.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/models/blob/master/research/seq_flow_lite/demo/colab/emotion_colab.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

### Install Tensorflow 2.10.0

The seq_flow_lite library was written with the assumption that TensorFlow 2.10.0 will be used. It may be necessary to restart the runtime after installing the correct version of TensorFlow.

In [None]:
# Install the exact TensorFlow release this notebook targets (restart the runtime afterwards if prompted).
!pip install tensorflow==2.10.0

Update cuDNN. The version preinstalled on the Colab machines does not work well with TensorFlow 2.10.0.

In [None]:
# Install the specific libcudnn8 build (8.1.0.77 for CUDA 11.2) named below;
# --allow-change-held-packages lets apt modify the package even if it is marked as held.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2

### Import TensorFlow and TensorFlow Datasets

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Keep Keras on the release that matches the pinned TensorFlow 2.10.0; an
# unpinned install can resolve to a newer, incompatible Keras.
!pip install keras==2.10.0

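Load the data. Note that this notebook reads the GoEmotions CSV files (`goemotions_1.csv`, `goemotions_2.csv`, `goemotions_3.csv`) from the working directory rather than from TFDS, so they must be present before the data-loading cell below is run.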

### Train and Evaluate

In [None]:
# scikit-learn is published on PyPI as "scikit-learn"; the "sklearn" name is a
# deprecated placeholder package and should not be installed.
!pip install scikit-learn

In [None]:
print("Data loading...")

import pandas as pd
import numpy as np
import re
import string as strinG

# Read the three CSV shards from the working directory, stack them into a
# single frame, and convert the result to a NumPy array.
raw_data = pd.concat(
    [pd.read_csv(csv_path)
     for csv_path in ['goemotions_1.csv', 'goemotions_2.csv','goemotions_3.csv']]
).to_numpy()

# Emotion names, indexed by model output position when predictions are printed.
# NOTE(review): presumably in the same order as the label columns of the CSV
# files -- confirm against the file header.
keys = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]
print(raw_data[0])

# Drop columns 1 through 8, keeping column 0 and every column from 9 onwards.
# Later cells treat column 0 as the text and the remaining columns as labels.
data = np.delete(raw_data, list(range(1, 9)), axis=1)


print(data[0])

# Compiled once at definition time. Raw strings keep `\S` / `\s` from being
# parsed as (invalid) string escape sequences.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_RE = re.compile(r'\S*@\S*\s?')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', strinG.punctuation)


def preprocess(string):
    """Normalise one piece of text before it is tokenized.

    Steps, applied in this order:
      1. remove URLs (``http://``, ``https://`` or ``www.`` prefixed tokens);
      2. remove any whitespace-delimited token containing ``@`` (e-mail
         addresses), together with one trailing whitespace character;
      3. collapse every run of whitespace (newlines and tabs included) into a
         single space;
      4. delete all ASCII punctuation characters;
      5. lower-case the result.

    Punctuation is removed after whitespace is collapsed, so punctuation that
    stood alone between two spaces leaves a double space behind.

    Args:
        string: The text to clean; must be a ``str``.

    Returns:
        The cleaned, lower-cased text.
    """
    string = _URL_RE.sub('', string)               # Strip URLs
    string = _EMAIL_RE.sub('', string)             # Strip e-mails
    string = _WHITESPACE_RE.sub(' ', string)       # Collapse whitespace / newlines
    string = string.translate(_PUNCTUATION_TABLE)  # Strip punctuation
    return string.lower()


# Keep only the rows whose last label column is not 1.
kept_rows = [row for row in data if row[-1] != 1]  # don't include "neutral" to avoid overfitting

# Column 0 is cleaned with preprocess() to form the inputs; the remaining
# columns become the float label vectors.
X = np.array([preprocess(row[0]) for row in kept_rows]).astype(str)
y = np.array([row[1:] for row in kept_rows]).astype(float)

text_dataset = tf.data.Dataset.from_tensor_slices(X) # for adapting


print("Data loaded.")


In [None]:
# A notebook cell only echoes its final bare expression, so each shape is
# printed explicitly instead of being silently discarded.
print("data:", data.shape)
print("X:", X.shape)
print("y:", y.shape)
print(y[0])

In [None]:
from sklearn.model_selection import train_test_split 

# Hold out 20% of the examples for validation. The NumPy arrays are split
# directly (rather than via `list(text_dataset)`, which materialises one eager
# tensor per example), and `random_state` pins the shuffle so that a fresh
# "Restart & Run All" reproduces the same split.
x_train, x_test, Y_train, Y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Per-split datasets of texts and of label vectors. X_train is the one used
# to adapt the TextVectorization vocabulary.
X_train = tf.data.Dataset.from_tensor_slices(x_train)
y_test = tf.data.Dataset.from_tensor_slices(Y_test)
y_train = tf.data.Dataset.from_tensor_slices(Y_train)
X_test = tf.data.Dataset.from_tensor_slices(x_test)

# Unbatched (text, label-vector) pairs for training (whole1) and validation (whole2).
whole1 = tf.data.Dataset.from_tensor_slices((x_train, Y_train))
whole2 = tf.data.Dataset.from_tensor_slices((x_test, Y_test))


In [None]:
#@title Default title text
print("Data loaded.")
import keras

# Create Model

max_words = 20000  # number of words to tokenize
max_len = 300  # We allow up to 300 words per string. The largest in our dataset (after preprocessing) is 703 words

# TextVectorization maps each input string to a fixed-length sequence of
# integer token ids (padded / truncated to max_len).
tokenizer = keras.layers.TextVectorization(
    max_tokens=max_words,
    output_mode='int',
    output_sequence_length=max_len)

print("token")

# Build the vocabulary from the training texts only.
tokenizer.adapt(X_train.batch(64))

model = keras.models.Sequential()

model.add(tf.keras.Input(shape=(1,), dtype=tf.string))  # Takes a single string as an input
model.add(tokenizer)  # The tokenizer. String -> Vector
model.add(keras.layers.Embedding(max_words, 300))  # 300-dimensional embedding per token id
model.add(keras.layers.Conv1D(260, 8, activation="relu"))  # Hidden sliding window: 260 filters of width 8
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dropout(0.5))  # Dropout to reduce overfitting
model.add(keras.layers.Dense(100, activation="tanh"))  # Hidden Layer
model.add(keras.layers.Dropout(0.2))
# NOTE(review): softmax + categorical_crossentropy assumes one label per
# example. If a row can carry several emotions at once, a sigmoid output with
# binary_crossentropy would be the matching choice -- confirm against the data.
model.add(keras.layers.Dense(len(keys), activation="softmax"))  # Output Layer

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001), loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

epochs = 4
batch_size = 128

# Fit the model weights.
# `whole1` / `whole2` are unbatched tf.data datasets, and Keras does not apply
# `batch_size` to dataset inputs (the dataset is expected to yield batches
# itself). Batch them explicitly so training really uses `batch_size` examples
# per step.
model.fit(whole1.batch(batch_size),
          epochs=epochs,
          verbose=1,
          validation_data=whole2.batch(batch_size))


    

In [None]:
def guessEmotion(string, top_k=6):
    """Print the `top_k` highest-scoring emotions the model predicts for a text.

    The text is cleaned with `preprocess`, passed to `model.predict` as a
    one-element batch, and the flattened scores are ranked from highest to
    lowest. One line is printed per emotion with its rank, its name (looked up
    in `keys`) and its score as a percentage.

    Args:
        string: The raw text to classify.
        top_k: How many of the best predictions to print. Defaults to 6, the
            number this function has always printed.

    Returns:
        None. The results are printed.
    """
    prediction = model.predict([preprocess(string)]).flatten()
    # A stable sort on the negated scores gives descending order with ties
    # resolved toward the lower index, i.e. the same order as repeatedly
    # taking argmax, without overwriting the scores.
    ranked = np.argsort(-prediction, kind="stable")[:top_k]
    for rank, best in enumerate(ranked, start=1):
        print("Emotion ", rank, ": ", keys[best], " (", prediction[best]*100, "%)")

In [None]:
# Demo: print the model's top predictions for a sample sentence.
guessEmotion(
    "I love my puppy so much. She is adorable and goofy. "
    "I hope I have more pets with her personaility in the future. "
)



**TODO**

*   Try XGBoost.
*   Maybe change the labels.
*   Stratify the sample.
*   Get the metrics to work.
*   Simplify the network (keep dropout).







