In [None]:
import tensorflow as tf

In [None]:
# Prerequisite: install the `nlp` package if it is missing (e.g. %pip install nlp).

In [None]:
%matplotlib inline

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random


def show_history(h):
    """Plot training/validation accuracy and loss curves from a fit history.

    Parameters
    ----------
    h : object with a ``history`` dict
        ``h.history['loss']`` must exist; it determines the number of epochs
        on the x axis. ``'accuracy'``, ``'val_accuracy'`` and ``'val_loss'``
        are optional: a series that is absent is skipped instead of handing
        ``None`` to ``plt.plot``.

    The figure has two side-by-side panels (accuracy on the left with the
    y axis fixed to [0, 1], loss on the right) and is shown immediately.
    """
    history = h.history
    epochs = range(0, len(history['loss']))
    plt.figure(figsize=(16, 6))

    # (training key, validation key, y-axis label, optional y limits)
    panels = (
        ('accuracy', 'val_accuracy', 'Accuracy', [0., 1.]),
        ('loss', 'val_loss', 'Loss', None),
    )
    for position, (train_key, val_key, ylabel, ylim) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plotted = False
        for key, label in ((train_key, 'Training'), (val_key, 'Validation')):
            values = history.get(key)
            if values is None:
                # Metric was not recorded (e.g. no validation data was given).
                continue
            plt.plot(epochs, values, label=label)
            plotted = True
        if ylim is not None:
            plt.ylim(ylim)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        if plotted:
            # Only draw a legend when at least one labelled curve exists.
            plt.legend()
    plt.show()

    
def show_confusion_matrix(y_true, y_pred, classes):
    """Display a confusion matrix normalized over the true labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed unchanged to
        ``sklearn.metrics.confusion_matrix``.
    classes : sequence
        Tick labels for both axes, one per class. The number of tick
        positions follows ``len(classes)`` rather than a fixed count of 6.
    """
    from sklearn.metrics import confusion_matrix

    cm = confusion_matrix(y_true, y_pred, normalize='true')

    class_names = list(classes)
    tick_positions = list(range(len(class_names)))

    plt.figure(figsize=(8, 8))
    sp = plt.subplot(1, 1, 1)
    ctx = sp.matshow(cm)
    plt.xticks(tick_positions, labels=class_names)
    plt.yticks(tick_positions, labels=class_names)
    plt.colorbar(ctx)
    plt.show()

    
# Report the TensorFlow version in use so the results can be tied to it.
print('Using TensorFlow version', tf.__version__)

In [None]:
# Load the 'emotion' dataset through the `nlp` package.
dataset= nlp.load_dataset('emotion')

In [None]:
# Display the loaded dataset object (the notebook renders a bare last expression).
dataset

In [None]:
# Pull the three splits out of the loaded dataset by name.
train, test, validation = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

In [None]:
def get_tweet(data):
    """Return the texts and labels of a dataset split as a ``(texts, labels)`` tuple.

    ``data`` is indexed with the keys ``'text'`` and ``'label'``.
    """
    return data['text'], data['label']

In [None]:
# Texts and labels of the training split.
texts,labels =get_tweet(train)


In [None]:
# Tokenizer: maps each word to an integer id
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# The vocabulary is capped at the most frequent words via num_words=10000; every word
# outside of it is encoded with the out-of-vocabulary token '<UNK>'.
tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>') 
# Build the word index from the training texts only.
tokenizer.fit_on_texts(texts)

In [None]:
# Peek at the fitted tokenizer. Only the last expression of a cell is displayed,
# so the word counts are printed explicitly, and both views are limited to the
# first ten entries instead of dumping the whole vocabulary.
print(list(tokenizer.word_counts.items())[:10])
list(tokenizer.index_word.items())[:10]  # index -> word mapping used for the sequences

In [None]:
# Show the first training text next to its token-id sequence.
print(texts[0])
tokenizer.texts_to_sequences([texts[0]])

In [None]:
# Padding and truncating: start by looking at how long the texts are.
# Word count of every text, splitting on single spaces.
lengths = list(map(lambda text: len(text.split(' ')), texts))
plt.hist(lengths)
plt.show()
# Observation from the original run: most lengths fall between 8 and 20 words.

In [None]:
# Every input sequence must have the same length before it is fed to the model.
maxlen = 50 # most of our texts are at most 50 words long
# Sequences shorter than maxlen are padded and longer ones are truncated (see get_sequence).
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def get_sequence(tokenizer,texts):
    """Turn texts into fixed-length token-id sequences.

    The texts are encoded with ``tokenizer.texts_to_sequences`` and then
    padded/truncated to the module-level ``maxlen``. Both operations use
    'post', so they are applied at the end of each sequence.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=maxlen,
        padding='post',
        truncating='post',
    )

In [None]:
# Fixed-length token-id sequences for the training texts.
padded_train = get_sequence(tokenizer,texts)

In [None]:
# Example of a text shorter than 50 words: compare the raw text, its token ids,
# and the padded sequence produced for it.
print(texts[0])
print(tokenizer.texts_to_sequences([texts[0]]))
padded_train[0]

In [None]:
# Example of a text longer than maxlen words, to show the effect of truncation.
# `i` is the index of the first such text, or None when there is none; the
# original loop silently fell through to the last text in that case.
i = next(
    (idx for idx, text in enumerate(texts) if len(text.split(' ')) > maxlen),
    None,
)

if i is None:
    print('No text longer than', maxlen, 'words was found.')
else:
    # Tokenize once and reuse the result for both the ids and their count.
    long_sequence = tokenizer.texts_to_sequences([texts[i]])
    print(long_sequence
          ,'\n Length = ',len(long_sequence[0]))
    print(padded_train[i])

In [None]:
# Encoding labels: collect the distinct label values.
# Sorted so that the class order (and therefore the class -> id mapping built
# from it) is identical on every run; plain set iteration order for strings
# changes between interpreter sessions because of hash randomization.
classes = sorted(set(labels))
classes

In [None]:
# Label distribution of the training set.
plt.hist(labels,bins=11)
plt.show()  # the original referenced plt.show without calling it
# Observation from the original run: the classes are imbalanced.


In [None]:
# Forward mapping (class -> integer id) from the enumeration order of `classes`,
# and its inverse (integer id -> class).
class_to_index = {name: idx for idx, name in enumerate(classes)}
index_to_class = {idx: name for name, idx in class_to_index.items()}

In [None]:
# Display the class -> id mapping.
class_to_index

In [None]:
def names_to_ids(labels):
    """Convert an iterable of class labels to a NumPy array of integer ids.

    Each label is looked up in the module-level ``class_to_index`` mapping.

    Raises
    ------
    KeyError
        If a label is not present in ``class_to_index``. The original
        ``.get`` lookup turned unknown labels into ``None``, which silently
        produced an object array.
    """
    return np.array([class_to_index[x] for x in labels])

In [None]:
# Integer ids for the training labels; show the first raw label next to its id.
train_labels = names_to_ids(labels)
print(labels[0])
train_labels[0]


In [None]:
#Modeling 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Bidirectional,LSTM,Dense

In [None]:
# Bidirectional-LSTM classifier over the padded token-id sequences.
model = Sequential([
    # Input dimension 10000 matches the tokenizer's num_words; each id becomes a 16-d vector.
    Embedding(10000,16,input_length=maxlen),
    # return_sequences=True keeps the per-step outputs so a second LSTM can be stacked on top.
    Bidirectional(LSTM(20,return_sequences=True)),
    Bidirectional(LSTM(20)),
    # Softmax over 6 output units, one per label.
    Dense(6,activation='softmax')
])

# Sparse categorical cross-entropy: the targets are integer class ids (see names_to_ids).
model.compile(loss='sparse_categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [None]:
model.summary() # 10000 is the tokenizer vocabulary size; the final Dense has 6 units because there are 6 labels

In [None]:
# Training the model: first prepare the validation split with the same
# tokenizer and label mapping that were used for the training data.
val_texts,val_labels = get_tweet(validation)
val_seq = get_sequence(tokenizer,val_texts)
val_labels = names_to_ids(val_labels)  # rebinds val_labels from raw labels to integer ids

In [None]:
# Sanity check: the first raw validation record, then its text and encoded label.
print(validation[0])
print(val_texts[0],val_labels[0])

In [None]:
# Train for at most 20 epochs, stopping early once validation accuracy has not
# improved for 2 consecutive epochs. restore_best_weights=True puts the model
# back on the weights of its best validation epoch; without it the model keeps
# the weights of the last (already worse) epoch for every later evaluation.
# `trained_model` is the History object returned by fit, not the model itself.
trained_model = model.fit(
    padded_train, train_labels,
    validation_data=(val_seq,val_labels),
    epochs=20,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=2,
            restore_best_weights=True,
        )
    ]
)

In [None]:
# Model evaluation: plot the training/validation accuracy and loss curves.
show_history(trained_model)

In [None]:
# Prepare the test split with the same tokenizer and label mapping as the training data.
test_texts,test_labels = get_tweet(test)
test_seq = get_sequence(tokenizer,test_texts)
test_labels = names_to_ids(test_labels)  # rebinds test_labels from raw labels to integer ids


In [None]:
# Evaluate on the test set; the return value is assigned to `_` so it is not echoed as cell output.
_ = model.evaluate(test_seq,test_labels)

In [None]:
# Classify the first test sentence and compare the prediction with its true emotion.
print('Sentence : ',test_texts[0])
print('Emotion : ',index_to_class[test_labels[0]])
# A one-row slice keeps the batch dimension that predict expects.
p = model.predict(test_seq[:1])[0]
pred_class = index_to_class[int(np.argmax(p))]
print('Predicted :', pred_class)

In [None]:
# Predict the whole test set and count the correct predictions by hand.
p = model.predict(test_seq)
print('Sentence : ',test_texts[0])
print('Emotion : ',index_to_class[test_labels[0]])
# The original evaluated this lookup mid-cell and discarded it; print it instead.
print('Predicted :', index_to_class[int(np.argmax(p[0]))])
# Comparing class ids is equivalent to comparing class names because
# index_to_class maps every id to a distinct class.
c = int(np.sum(np.argmax(p, axis=-1) == test_labels))
print(c,len(test_seq), c/len(test_seq))


In [None]:
# Confusion matrix on the test set. `p` now holds predicted class ids.
p = np.argmax(model.predict(test_seq),axis=-1)
# Build the tick labels in id order from index_to_class so that label k always
# names class id k, instead of relying on the iteration order of `classes`.
class_names = [index_to_class[idx] for idx in range(len(index_to_class))]
show_confusion_matrix(test_labels,p,class_names)

# Observation from the original run: the model has some trouble with surprise/fear.