In [1]:
import pandas as pd
import os
os.environ['PYTHONHASHSEED']=str(2)

import numpy as np
from sklearn.utils import class_weight
from keras_preprocessing import image as im
import tensorflow as tf
import random

In [2]:
def reset_random_seeds(seed=2):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to TensorFlow, NumPy and the stdlib
            ``random`` module. Defaults to 2, the value used throughout
            this notebook, so existing ``reset_random_seeds()`` calls are
            unaffected.
    """
    # NOTE: PYTHONHASHSEED is read by the interpreter at start-up, so setting it here
    # is inherited by child processes but does not change hashing in the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

# Loading and preparing data

In [3]:
df = pd.read_csv('../../../data/emo-at-cap/emo-at-cap.csv')

In [4]:
df.columns

Index(['image_name', 'annotation', 'human_sentiment', 'sentiment'], dtype='object')

In [5]:
# Pull the two columns used for training out of the frame as NumPy arrays:
# the human-assigned sentiment labels and the lower-cased captions.
sentiment = df['human_sentiment'].values
lowercased_captions = df['annotation'].str.lower()
annotations = lowercased_captions.values

In [6]:
sentiment

array(['Neutral', 'Positive', 'Positive', ..., 'Positive', 'Negative',
       'Negative'], dtype=object)

In [7]:
mapping_sen = {'Negative' : 0, 'Neutral' : 1, 'Positive' : 2}

In [8]:
from nltk.tokenize import WordPunctTokenizer

In [9]:
tokenizer = WordPunctTokenizer()

In [10]:
# Split every caption into its list of tokens
indexed = list(map(tokenizer.tokenize, annotations))

In [11]:
unique_tokens = np.unique(np.hstack(indexed))

In [12]:
# id -> token; ids start at 1 so that 0 stays free for the padding entry
mapping_tokens = {token_id: token for token_id, token in enumerate(unique_tokens, start=1)}

In [13]:
mapping_tokens[0] = 'PAD'

In [14]:
# token -> id, the reverse lookup of mapping_tokens
inverse_mapping = {token: token_id for token_id, token in mapping_tokens.items()}

In [15]:
# Replace every token of every caption with its integer id
mapped = [[inverse_mapping[token] for token in caption_tokens] for caption_tokens in indexed]

In [16]:
np.mean(list(map(len,mapped)))

8.873177083333333

In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
mapped[0]

[1937, 1188, 1049, 1623, 796, 1784]

In [19]:
mapped = pad_sequences(mapped, maxlen=9, truncating='post', padding='post')

In [20]:
mapped[0]

array([1937, 1188, 1049, 1623,  796, 1784,    0,    0,    0], dtype=int32)

In [21]:
# Integer class id for every sentiment label (a KeyError here means an unexpected label)
y = list(map(mapping_sen.__getitem__, sentiment))

In [22]:
np.unique(sentiment, return_counts=True)

(array(['Negative', 'Neutral', 'Positive'], dtype=object),
 array([1530,  273, 2037]))

In [23]:
from sklearn.utils.class_weight import compute_class_weight

In [24]:
# Keyword arguments are required by current scikit-learn releases; the positional form
# was deprecated and then removed. `np.unique` returns the labels sorted
# (Negative, Neutral, Positive), the same order as the 0/1/2 ids in mapping_sen.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(sentiment), y=sentiment)



# Model 

In [25]:
import gensim.downloader as api



In [None]:
ft_embeddings = api.load('fasttext-wiki-news-subwords-300')

In [None]:
def build_embeddings(vocab, word_vectors, embed_dim):
    """Build an embedding matrix whose rows are aligned with the token ids in ``vocab``.

    Args:
        vocab: Mapping from token string to integer row index.
        word_vectors: Object exposing ``word_vec(word)`` that returns a vector of
            length ``embed_dim`` and raises ``KeyError`` for unknown words
            (presumably a gensim ``KeyedVectors`` -- confirm against the caller).
        embed_dim: Width of each embedding vector.

    Returns:
        ``float32`` array of shape ``(len(vocab) + 1, embed_dim)``. Rows of tokens
        missing from ``word_vectors`` are drawn uniformly from ``[-0.1, 0.1)``;
        rows not referenced by ``vocab`` stay zero.
    """
    reset_random_seeds()  # keeps the random rows for unknown words reproducible
    emb_matrix = np.zeros(shape=(len(vocab) + 1, embed_dim), dtype='float32')
    count = 0
    for word, i in vocab.items():
        try:
            emb_matrix[i, :] = word_vectors.word_vec(word)
        except KeyError:
            # Only a genuinely missing word falls back to a random vector. Any other
            # failure (wrong object passed in, shape mismatch, Ctrl-C) must surface
            # instead of silently producing an all-random matrix.
            count += 1
            emb_matrix[i, :] = np.random.uniform(-0.1, 0.1, embed_dim)
    print('{} words are out of vocabulary：'.format(count))
    return emb_matrix


In [None]:
# This cell rebinds `ft_embeddings` from the loaded word vectors to the dense matrix.
# Guard it so that re-running the cell does not feed the matrix back in as if it were
# the word-vector object (which would no longer look up the real vectors).
if not isinstance(ft_embeddings, np.ndarray):
    ft_embeddings = build_embeddings(inverse_mapping, ft_embeddings, 300)

In [None]:
class LSTM_SA(tf.keras.Model):
    """LSTM sentiment classifier: embedding -> spatial dropout -> LSTM -> dense -> softmax.

    Args:
        max_tokens: Vocabulary size; the embedding table gets ``max_tokens + 1`` rows.
        units: Width of the LSTM state; the hidden dense layer uses ``units // 2``.
        embeddings: Optional initial embedding matrix, expected to have
            ``max_tokens + 1`` rows and ``dim`` columns. When ``None`` the
            embedding layer keeps its default initialisation.
        dim: Width of each embedding vector.
        num_classes: Width of the softmax head. Defaults to 3, matching the
            Negative/Neutral/Positive labels used in this notebook.
    """

    def __init__(self, max_tokens, units=512, embeddings=None, dim=128, num_classes=3):
        super(LSTM_SA, self).__init__()
        tf.keras.backend.clear_session()
        reset_random_seeds()  # re-seed all RNGs before the layers are created
        self.units = units
        # Forward initial weights only when they were supplied; the default
        # `embeddings=None` would otherwise hand the layer a list containing None.
        embedding_kwargs = {} if embeddings is None else {'weights': [embeddings]}
        # mask_zero=True: token id 0 is treated as padding and masked downstream
        self.embeddings = tf.keras.layers.Embedding(input_dim=max_tokens + 1, output_dim=dim, name='embeddings',
                                                    mask_zero=True, **embedding_kwargs)
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.lstm = tf.keras.layers.LSTM(units=self.units, return_state=False, return_sequences=False,
                                         name='lstm_decoder', recurrent_dropout=0.1,
                                         recurrent_initializer='glorot_uniform')
        self.dense = tf.keras.layers.Dense(units=self.units // 2, activation='relu')
        # The head predicts sentiment classes, so its width is the number of classes,
        # not the vocabulary size; otherwise argmax can yield ids with no label.
        self.output_dense = tf.keras.layers.Dense(units=num_classes, activation='softmax')

    def call(self, input):
        """Return class probabilities for a batch of token-id sequences."""
        embedded = self.dropout(self.embeddings(input))
        result_lstm = self.lstm(embedded)
        result_dense = self.dense(result_lstm)
        probs = self.output_dense(result_dense)
        return probs

In [None]:
lstm_sa = LSTM_SA(len(inverse_mapping), units=256, embeddings=ft_embeddings, dim=300)

# Training 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80 % train, then the remaining 20 % is halved into validation and test (10 % each).
# Both splits use random_state=0 so the partition is reproducible.
labels = np.array(y)
X_train, X_holdout, y_train, y_holdout = train_test_split(mapped, labels, test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=0)

In [None]:
epochs = 25
batch_size = 128
weighted = True

In [None]:
lstm_sa.compile(loss='sparse_categorical_crossentropy', metrics=['acc'], optimizer='adam')

In [None]:
# Class id -> weight, in the order the weights were computed; None disables weighting
fit_class_weight = dict(enumerate(class_weights)) if weighted else None
hist = lstm_sa.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    class_weight=fit_class_weight,
)

# Evaluation

In [None]:
from sklearn.metrics import f1_score, accuracy_score

In [None]:
inverse_sen_mapping = dict([(v,k) for k,v in mapping_sen.items()])

In [None]:
# One batched forward pass over the whole test set instead of a separate predict()
# call (with its per-call overhead) for every sample; argmax along the class axis
# yields the same per-sample class ids as before.
predicted = np.argmax(lstm_sa.predict(X_test), axis=1).tolist()

In [None]:
f1_score(y_true=y_test, y_pred=predicted, average='macro')

In [None]:
f1_score(y_true=y_test, y_pred=predicted, average='weighted')

In [None]:
accuracy_score(y_true=y_test, y_pred=predicted)

# Validating results

In [None]:
mapping_sen

In [None]:
inverse_sen_mapping = dict([(v,k) for k,v in mapping_sen.items()])

In [None]:
def predict(model, inverse_mapping, sen_mapping, sentence, maxlen=9):
    """Classify the sentiment of a single raw sentence.

    Args:
        model: Trained model whose ``predict`` returns class probabilities.
        inverse_mapping: Mapping from token string to integer token id.
        sen_mapping: Mapping from class index to sentiment label.
        sentence: Raw text to classify.
        maxlen: Length the id sequence is padded/truncated to. Defaults to 9,
            the length used for the training sequences in this notebook.

    Returns:
        The entry of ``sen_mapping`` for the most probable class.
    """
    # The vocabulary was built from lower-cased captions, so lower-case the input
    # too; otherwise capitalised words would miss the vocabulary.
    tokens = tokenizer.tokenize(sentence.lower())  # `tokenizer` is the notebook-level tokenizer
    # Tokens missing from the vocabulary fall back to id 0, the padding id
    token_ids = [inverse_mapping.get(token, 0) for token in tokens]
    padded = pad_sequences([token_ids], maxlen=maxlen, truncating='post', padding='post')
    return sen_mapping[int(np.argmax(model.predict(padded)))]

In [None]:
predict(lstm_sa, inverse_mapping,inverse_sen_mapping, 'man is trying to shoot someone')

# Saving results

In [None]:
# NOTE(review): this rebinds `df`, which held the dataset loaded from emo-at-cap.csv
# earlier in the notebook; from here on `df` is the (initially empty) results table.
df = pd.DataFrame()

In [None]:
# One-row summary of the test-set metrics for this model, written next to the notebook
df = pd.DataFrame({
    'name': ['sa_lstm_ft'],
    'f1_macro': [f1_score(y_true=y_test, y_pred=predicted, average='macro')],
    'f1_weighted': [f1_score(y_true=y_test, y_pred=predicted, average='weighted')],
    'acc': [accuracy_score(y_true=y_test, y_pred=predicted)],
})
df.to_csv('sa_lstm_ft_logs.csv', index=False)