In [31]:
import numpy as np
import pandas as pd

import keras
import getopt
import logging
#import nltk
import os
import re
import sys
import tweepy

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import MaxAbsScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, SimpleRNN
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

#from RNNs import *

In [2]:
train_proc = pd.read_csv('train_set_processed.csv')
val_proc = pd.read_csv('val_set_processed.csv')
test_proc = pd.read_csv('test_set_processed.csv')

In [3]:
def count_emojis(data, n = 10):
    """
    Function that counts the number of emojis in the data set.
    Display the n most frequent emojis.
    """
    emoji_counts = {}
    for index, row in data.iterrows():
        emoji = row[1]
        if emoji not in emoji_counts:
            # compute simultaneous counting
            emoji_counts[emoji] = data[data.label == emoji].count()[1]
            
    # sort emojis by freq in descending order (list of tuples will be returned)
    sorted_emoji_counts = sorted(emoji_counts.items(), key= lambda kv: kv[1], reverse=True)
        
    return [emoji[0] for emoji in sorted_emoji_counts[:n]]

In [4]:
top_10_test = count_emojis(test_proc)
print(top_10_test)

['😍', '😂', '❤️', '💕', '😊', '😘', '😭', '💖', '😎', '✨']


In [5]:
### emoji map for top 10 ###

In [6]:
emoji_map = {emoji: i for i, emoji in enumerate(top_10_test)}
idx_emoji = {i: emoji for i, emoji in enumerate(top_10_test)}

In [None]:
distinct_8 = [emoji for emoji, i in emoji_map.items() if i is not 3 and i is not 7]
print(distinct_8)

In [None]:
### emoji map for distinct 8 ###

In [None]:
emoji_map = {emoji: i for i, emoji in enumerate(distinct_8)}
idx_emoji = {i: emoji for i, emoji in enumerate(distinct_8)}

In [7]:
def emoji_to_int(labels: list):
    return [emoji_map[emoji] for emoji in labels]

In [None]:
def distinct_eight(data, distinct_8: list): 
    """
    Function that checks, whether Tweet consists of one of the distinct eight emojis.
    The distinct eight emojis are a subset of the ten most frequent emojis in the dataset,
    where emojis with similar meaning were filtered.
    If, and only if, Tweet consists one of those emojis, 
    Tweet will be used for further analysis.
    Else: Line will be dropped.
    """
    idx_drop = []
    for index, row in data.iterrows():
        if row[1] not in distinct_8:
            idx_drop.append(index)
    return data.drop(data.index[idx_drop])

In [9]:
def keep_top_10(data, top_10: list): 
    """
    Function that checks, whether Tweet consists of one of the top ten emojis.
    If, and only if, Tweet consists one of the most frequent emojis, 
    Tweet will be used for further analysis.
    Else: Line will be dropped.
    """
    idx_drop = []
    for index, row in data.iterrows():
        if row[1] not in top_10:
            idx_drop.append(index)
    return data.drop(data.index[idx_drop])

In [None]:
### distinct 8 ###

In [None]:
train_data = distinct_eight(train_proc, distinct_8)
print("Number of Tweets in the train data set: {}".format(len(train_data)))

In [None]:
test_data = distinct_eight(test_proc, distinct_8)
print("Number of Tweets in the test data set: {}".format(len(test_data)))

In [None]:
val_data = distinct_eight(val_proc, distinct_8)
print("Number of Tweets in the validation data set: {}".format(len(val_data)))

In [None]:
### top 10 ### (don't execute for now, maybe useful for comparison with distinct eight)

In [10]:
train_data = keep_top_10(train_proc, top_10_test)
print("Number of Tweets in the train data set: {}".format(len(train_data)))

Number of Tweets in the train data set: 81236


In [11]:
test_data = keep_top_10(test_proc, top_10_test)
print("Number of Tweets in the test data set: {}".format(len(test_data)))

Number of Tweets in the test data set: 7646


In [12]:
val_data = keep_top_10(val_proc, top_10_test)
print("Number of Tweets in the validation data set: {}".format(len(val_data)))

Number of Tweets in the validation data set: 7613


In [13]:
# create list of stopwords
stop_words = list(set(stopwords.words('english')))

In [14]:
def tweets_cleaning(tweets, labels, stopwords: list, train = False, use_bigrams = False, 
                    lowercase = True, stemming = False, min_df = 2, embedding = False):
    """
    Text cleaning function that performs all necessary text preprocessing steps.
    Function only keeps characters, that are alphanumerical (non-alphanumerical values are discarded).
    Digits are treated by regular expressions.
    Lower-casing is performed to reduce noise and normalize the text (convert it into a uniform representation).
    Stemming is performed to only keep the stem of each word token but not any other deviated form. 
    Stop words (i.e., words that occur more frequently than other words in a given corpus) are removed.
    """
    
     # initialize Lancaster stemmer
    if stemming:
        st = LancasterStemmer()
    
    cleaned_data = []
    cleaned_labels = []
    
    all_bigrams = [] # serves as place-holder
    bigrams_dict = dict()
    vocab = dict()
    
    for tweet, label in zip(tweets, labels):
        tweet = re.sub(r'&amp\S+','', tweet)
        tweet = re.sub(r' & ', ' and ', tweet)
        tweet = re.sub(r'!+', ' ! ', tweet)
        tweet = re.sub(r'[?]+', ' ? ', tweet)
        tweet = re.sub('@.+', '@user', tweet)
        tweet = re.sub('#', '# ', tweet)

        # Create spaces instead of some punctuation marks, but not if it's part of an emoticon
        tweet = ' '.join([word if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word)
            else re.sub('[,.;\-_:/\n\t]+', ' ', word) for word in tweet.split()])
        
        tweet = tweet.split(" ")
        
        cleaned_tweet = []
        for word in tweet:
            
            #if emoticon is in word, keep the emoticon
            if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word):
                cleaned_word = word
            else:
                # keep special characters which might carry important information
                # perform lower-casing to normalize the text and reduce noise
                cleaned_word = ''.join([char for char in word if re.search('[<>$#€£!?@=]', char) or
                                        char.isalnum()])
            if lowercase:
                cleaned_word = cleaned_word.lower()
                
            if "<3" not in cleaned_word:
                cleaned_word = re.sub('[0-9]', '0', cleaned_word)
  
            # removes each \n (i.e., new line) or \t (i.e., tab) -> pipe char denotes a disjunction
            cleaned_word = re.sub(r'( \n| \t)+', '', cleaned_word)
            
            if stemming:
                cleaned_word = st.stem(cleaned_word)
            
            if len(cleaned_word) > 0 and cleaned_word not in stopwords:
                cleaned_tweet.append(cleaned_word)
                
                if train:
                    if cleaned_word in vocab:
                        vocab[cleaned_word] += 1
                    else:
                        vocab[cleaned_word] = 1
            
        # only append tweets with more than 1 word per tweet
        if len(cleaned_tweet) > 1:
            
            if train and use_bigrams:
                
                bigrams = [' '.join([cleaned_tweet[i-1], cleaned_tweet[i]]) 
                           for i, _ in enumerate(cleaned_tweet) if i > 0]
                
                for bigram in bigrams:
                    
                    if bigram in bigrams_dict:
                        bigrams_dict[bigram] += 1
                    else:
                        bigrams_dict[bigram] = 1 

            cleaned_tweet = ' '.join(cleaned_tweet)
            cleaned_data.append(cleaned_tweet)
            cleaned_labels.append(label)
            
    if train and embedding and not use_bigrams:
        
        word2index = dict()
        i = 1
        for word in vocab.keys():
            word2index[word] = i
            i += 1
            
        word2index.update({'UNK': len(word2idx) + 1})
        
        assert len(cleaned_data) == len(cleaned_labels)

        return cleaned_data, cleaned_labels, word2index
                
    if train:
        vocab = [word for word, freq in vocab.items() if freq >= min_df]
            
        
        if use_bigrams:
            all_bigrams = [bigram for bigram, freq in bigrams_dict.items() if freq >= min_df]
            vocab.extend(all_bigrams)
        
        #cleaned_data = [tweet.split() for tweet in cleaned_data]
        
        #for i, tweet in enumerate(cleaned_data):
        #    for j, word in enumerate(tweet):
        #        
        #        if word not in vocab:
        #            cleaned_data[i].pop(j)
        #            
        #    if len(cleaned_data) < 2:
        #        cleaned_data.pop(i)
        #        cleaned_labels.pop(i)
        #        
        #cleaned_data = [' '.join(tweet) for tweet in cleaned_data]
        
    assert len(cleaned_data) == len(cleaned_labels)
    
    return cleaned_data, cleaned_labels, sorted(vocab), sorted(all_bigrams)

In [15]:
cleaned_train_data, train_labels, vocab, bigrams = tweets_cleaning(train_data.text, 
                                                                   train_data.label, 
                                                                   stop_words, 
                                                                   train = True, 
                                                                   use_bigrams = True, 
                                                                   lowercase = True,
                                                                   min_df = 4)

cleaned_test_data, test_labels, _, _ = tweets_cleaning(test_data.text, 
                                                       test_data.label, 
                                                       stop_words, 
                                                       lowercase = True)

cleaned_val_data, val_labels, _, _ = tweets_cleaning(val_data.text, 
                                                     val_data.label, 
                                                     stop_words, 
                                                     lowercase = True)

In [16]:
print("Number of Tweets per data set after text cleaning was computed:")
print()
print("Train: {}".format(len(cleaned_train_data)))
print()
print("Test: {}".format(len(cleaned_test_data)))
print()
print("Validation: {}".format(len(cleaned_val_data)))

Number of Tweets per data set after text cleaning was computed:

Train: 64540

Test: 6142

Validation: 6114


In [17]:
print("Number of unique tokens in the vocabulary: {}".format(len(vocab)))

Number of unique tokens in the vocabulary: 14699


In [18]:
y_train = emoji_to_int(train_labels)
y_test = emoji_to_int(test_labels)
y_val = emoji_to_int(val_labels)

### Functions for the Bag of Words approach

In [19]:
def bag_of_words(train: list, test: list, val: list, ngram: tuple, vocab = None, 
                 n_best_factor = 0.7):
    """
    Create a weighted bag-of-words unigram or bigram representation of provided tweets.
    Ngram is set to unigram by default. If bigram bag-of-words should be created, pass tuple (2, 2).
    
    Vocabulary argument is set to None by default. 
    You can pass a vocabulary to this function, which may then be used for TfidfVectorizer. 
    If you do not pass a vocabulary to this function, TfidfVectorizer will create a vocabulary itself.
    """ 
    
    vectorizer = TfidfVectorizer(encoding = 'utf-8', ngram_range = ngram, analyzer = 'word', 
                                 vocabulary = vocab, min_df = 0.1, max_df = 0.9, norm = 'l2',
                                 smooth_idf = True, sublinear_tf = True)
    
    train_BoW = vectorizer.fit_transform(train).toarray()
    test_BoW = vectorizer.transform(test).toarray()
    val_BoW = vectorizer.transform(val).toarray()
    
    #n_best = int(len(vectorizer.idf_) * n_best_factor)
    #idx = np.argsort(vectorizer.idf_)[:n_best]

    #train_BoW = train_BoW[:, idx]
    #test_BoW = test_BoW[:, idx]
    #val_BoW = val_BoW[:, idx]


    return train_BoW, test_BoW, val_BoW

In [20]:
def to_cat_matrix(y):

    """ 
    Binary one-hot encoding using an indicator matrix.
    This function converts labels to a categorical matrix which is of size N x K.
    Each row is a row vector with k-1 zeros and a single 1.
    """
    N = len(y)
    K = len(set(y))
    ind_matrix = np.zeros((N,K), dtype = int)
    
    for i, cat in enumerate(y):
        ind_matrix[i, int(cat)] = 1
        
    return ind_matrix

In [21]:
X_train, X_test, X_val = bag_of_words(cleaned_train_data, cleaned_test_data, cleaned_val_data, ngram = (1, 2), vocab = vocab)

In [None]:
#np.savetxt('X_train_BoW.txt', X_train)
#np.savetxt('X_test_BoW.txt', X_test)
#np.savetxt('X_val_BoW.txt', X_val)

#np.savetxt('y_train_BoW.txt', y_train)
#np.savetxt('y_test_BoW.txt', y_test)
#np.savetxt('y_val_BoW.txt', y_val)

### Neural Network (Multilayer Perceptron)

In [22]:
def get_model(hidden_units: int, input_dims: int, n_labels: int):
    model = Sequential()
    model.add(Dense(hidden_units, input_dim = input_dims, activation = 'relu'))
    model.add(Dropout(0.5)) # dropout is important to prevent model from overfitting
    model.add(Dense(n_labels, activation = 'softmax'))
    adam = keras.optimizers.Adam(lr=0.001, beta_1 = 0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, amsgrad=False)
    model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

In [23]:
def preds_to_labels(ypred):
    """
    Firstly, extract the predicted label from a vector of probability distributions.
    Secondly, retrieve index of highest value (i.e., highest probability).
    """
    num_labels = [np.argmax(pred) for pred in ypred]
    return np.array(num_labels)

In [24]:
# set number of hidden units, epochs and batch size
n_units = 50
n_epochs = 6
n_batches = 32

In [33]:
#max_abs_scaler = MaxAbsScaler()
#X_train = max_abs_scaler.fit_transform(X_train)
#X_val = max_abs_scaler.transform(X_val)
#X_test = max_abs_scaler.transform(X_test)

MemoryError: 

In [None]:
# shuffle data before fitting the neural network with it
X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)

In [25]:
# get indicator matrix with one-hot-encoded vectors per label (of all labels)
y_train = to_cat_matrix(y_train)
y_val = to_cat_matrix(y_val)

In [26]:
model = get_model(n_units, X_train.shape[1], y_train.shape[1])

In [27]:
es = EarlyStopping(monitor='val_acc', mode='max', verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [28]:
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = n_epochs, 
          batch_size = n_batches, callbacks = [es, mc])

Train on 64540 samples, validate on 6114 samples
Epoch 1/6

Epoch 00001: val_acc improved from -inf to 0.37422, saving model to best_model.h5
Epoch 2/6

Epoch 00002: val_acc improved from 0.37422 to 0.38894, saving model to best_model.h5
Epoch 3/6

Epoch 00003: val_acc improved from 0.38894 to 0.39549, saving model to best_model.h5
Epoch 4/6

Epoch 00004: val_acc improved from 0.39549 to 0.40334, saving model to best_model.h5
Epoch 5/6

Epoch 00005: val_acc improved from 0.40334 to 0.40415, saving model to best_model.h5
Epoch 6/6

KeyboardInterrupt: 

In [None]:
# load best model
saved_model = load_model('best_model.h5')

In [None]:
# get predictions
y_pred_test = saved_model.predict(X_test)

In [None]:
# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [None]:
def accuracy_top_n(y_true, y_preds, top_n = 3):
    """
    If the correct label / emoji is among the top n (e.g., two, three) predictions,
    we consider the prediction as correctly labeled.
    """
    n_correct = 0
    n_total = 0
    
    for i, pred in enumerate(y_preds):
        top_3 = np.argsort(pred)[-top_n:]
        if y_true[i] in top_3:
            n_correct += 1
        n_total += 1
        
    ratio = n_correct / n_total
    return round(ratio, 4)

In [None]:
# if true label is among the top 3 predictions, prediction is deemed correctly labeled
accuracy_top_n(y_test, y_pred_test, top_n = 3)

In [None]:
# if true label is among the top 2 predictions, prediction is deemed correctly labeled
accuracy_top_n(y_test, y_pred_test, top_n = 2)

In [None]:
accuracy_score(y_test, y_pred_labels)

In [None]:
f1_score(y_test, y_pred_labels, average = 'weighted')

In [None]:
f1_score(y_test, y_pred_labels, average = 'micro')

In [None]:
print(classification_report(y_test, y_pred_labels, target_names=top_10_test))

In [None]:
i = 0
for tweet, pred, true in zip(cleaned_test_data, y_pred_labels, test_labels):
    print(tweet)
    print("prediction:", idx_emoji[pred])
    print("true label:", true)
    print()
    if i == 30:
        break
    i += 1

In [None]:
freq = {}
for pred in y_pred_labels:
    if idx_emoji[pred] in freq:
        freq[idx_emoji[pred]] += 1
    else:
        freq[idx_emoji[pred]] = 1 
print(freq)

In [None]:
freq = {}
for y_true in y_test:
    if idx_emoji[y_true] in freq:
        freq[idx_emoji[y_true]] += 1
    else:
        freq[idx_emoji[y_true]] = 1 
print(freq)

### NEXT PART ONLY FOR RESEARCH PAPER BUT NOT FOR COGSCI II PROJECT !!!

### Word Embeddings approach

In [None]:
lower = True

cleaned_train_data, train_labels, word2idx = tweets_cleaning(train_data.text, 
                                                                   train_data.label, 
                                                                   stop_words, 
                                                                   train = True, 
                                                                   use_bigrams = False, 
                                                                   lowercase = lower,
                                                                   min_df = 2,
                                                                   embedding = True)

cleaned_test_data, test_labels, _, _ = tweets_cleaning(test_data.text, 
                                                       test_data.label, 
                                                       stop_words, 
                                                       lowercase = lower)

cleaned_val_data, val_labels, _, _ = tweets_cleaning(val_data.text, 
                                                     val_data.label, 
                                                     stop_words, 
                                                     lowercase = lower)

In [None]:
# only convert y_train and y_val to categorical matrix
y_train = to_cat_matrix(emoji_to_int(train_labels))
y_val = to_cat_matrix(emoji_to_int(val_labels))

y_test = emoji_to_int(test_labels)

In [None]:
def sent2idx(word2idx: dict, documents: list):
    
    idx_docs = list()
    max_length = max([len(document) for document in documents])
    
    for document in documents: 
        idx_doc = [word2idx[word] if word in word2idx else word2idx['UNK'] 
                   for word in document.split()]
        
        if len(idx_doc) < max_length:
            idx_doc.extend([0 for _ in range(max_length - len(idx_doc))])
            
        idx_docs.append(idx_doc)
        
    return np.array(idx_docs)

In [None]:
X_train = sent2idx(word2idx, cleaned_train_data)
X_val = sent2idx(word2idx, cleaned_val_data)
X_test = sent2idx(word2idx, cleaned_test_data)

In [None]:
# shuffle data before fitting the neural network with it
X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)

In [None]:
def get_embeddings(text_file, dim):

    """ 
    Read GloVe txt.-file, load pre-trained word embeddings into memory
    and create a word_to_embedding dictionary, where keys are the discrete word strings
    and values are the corresponding continuous word embeddings, retrieved from the GloVe txt.-file.
    For unkown words, the representation is an empty vector (i.e., zeros matrix).
    """
    embeddings_dict = {}

    with open(text_file, encoding="utf8") as file:

        for line in file:
            values = line.split()
            word = values[0]
            wordvec = np.array(values[1:], dtype = 'float32')
            embeddings_dict[word] = list(wordvec)
    
    embeddings_dict.update({'UNK': [0 for _ in range(dim)]})

    return embeddings_dict

In [None]:
emoji_embeddings = get_embeddings("emoji2vec.txt")

In [None]:
def get_emojivecs(emoji_embeddings: dict, corpus: list, dims: int):

    N = len(corpus)
    M = dims
    
    emojivecs = []
    
    # document = tweet; corpus = all tweets
    for emoji in corpus:
        emoji_sequence = []

        try:
            emojivec = emoji_embeddings[emoji]
            assert len(emojivec) == M
            emoji_sequence.append(emojivec)
        except KeyError:
            emoji_sequence.append([0 for _ in range(M)])
            print("This {} does not exist in the pre-trained emoji embeddings.".format(emoji))

        emojivecs.append(emoji_sequence)

    assert len(emojivecs) == N
    return np.array(emojivecs)

In [None]:
def get_wordvecs(word_embeddings: dict, corpus: list, dims: int, zeros_padding = False):

    """ 
    Return a concatenated word vector representation of each tweet.
    The concatenated word vectors serve as the input data for the LSTM RNN.
    Each word (embedding) denotes a time step. (Number of timesteps is equal to the length of the input sentence.)
    
    Check whether length of word vector is equal to the number of dimensions we pass to this function.
    For unknown words (i.e., if key does not exist), the representation is an empty vector / zeros matrix of len dims.

    Sequences can have variable length (i.e., number of time steps per batch).
    However, in some cases you might want to zero pad the batch if a sequence < max length of sequences in the corpus.
    By default this argument is set to False as Keras and Tensorflow except input sequences of variable length.
    If set to True, zero padding is computed.
    """

    N = len(corpus)
    M = dims
    global max_length
    max_length = max([len(sequence) for sequence in corpus])
    wordvecs_corpus = []
    
    # document = tweet; corpus = all tweets
    for document in corpus:
        wordvec_sequence = []
        for word in document:
            
            try:
                wordvec = word_embeddings[word]
                assert len(wordvec) == M
                wordvec_sequence.append(wordvec)
            except KeyError:
                wordvec_sequence.append([0 for _ in range(M)])
                
        # needs to be resolved (!)
        if zeros_padding == True: 
            if len(document) < max_length:

                for _ in range(len(document), max_length):
                    wordvec_sequence.append([0 for _ in range(M)])

                assert len(wordvec_sequence) == max_length
        wordvecs_corpus.append(wordvec_sequence)

    assert len(wordvecs_corpus) == N
    return np.array(wordvecs_corpus)

In [None]:
def embedding_matrix(word2idx: dict, embeddings_dict: dict, dim: int):
    
    embedding_mat = np.zeros((len(word2idx) + 2, dim))
    
    for word, idx in word2idx.items():
        vec = embeddings_dict.get(word)
        # if word is not found in embeddings dictionary, vector will be all zeros
        if vec is not None:
            embedding_mat[idx] = vec
            
    return embedding_mat

In [None]:
word_embeddings = get_embeddings("glove.6B.50d.txt", 50)

In [None]:
embedding_mat = embedding_matrix(word2idx, word_embeddings, 50)

In [None]:
vocab_size = len(word2idx)
hidden_units = 50
n_features = 50
n_labels = 10
optimizer = keras.optimizers.Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08, 
                                  decay = 0.0, amsgrad = False)
dropout = 0.2

In [None]:
class GRU_NET():

    #embedding_dim = 300
    
    def __init__(self, vocab_size: int, hidden_units: int, n_features: int, embedding_matrix, 
                 n_labels: int, optimizer, dropout = 0.1):
        
        self.vocab_size = vocab_size
        self.hidden_units = hidden_units
        self.n_features = n_features
        self.embedding_matrix = embedding_matrix
        self.n_labels = n_labels
        # if we want to predict emoji vecs instead of emoji labels, use cosine proximity
        self.loss = "categorical_crossentropy" 
        self.optimizer = optimizer
        self.dropout = dropout
        
        print('Build model...')
        self.model = Sequential()
        
        self.model.add(Embedding(vocab_size + 2, n_features, weights = [embedding_matrix], 
                                 trainable = False, mask_zero = True))
        
        self.model.add(GRU(hidden_units, activation='relu', recurrent_activation='hard_sigmoid', 
                           return_sequences = True))    
        
        self.model.add(Dropout(dropout))
        
        self.model.add(GRU(hidden_units, activation='relu', recurrent_activation='hard_sigmoid', 
                           return_sequences = False))
        
        self.model.add(Dropout(dropout))
        
        #self.model.add(TimeDistributed(Dense(self.n_labels, activation = 'softmax')))
        self.model.add(Dense(self.n_labels, activation = 'softmax'))
        self.model.compile(loss = self.loss, optimizer = self.optimizer, metrics = ['accuracy'])
                       
    def fit(self, X_train, y_train, X_val, y_val,  n_epochs, n_batches):
        return self.model.fit(X_train, y_train, validation_data = (X_val, y_val), 
                              epochs = n_epochs, batch_size = n_batches)
    
    def predict(self, X_test):
        return self.model.predict(X_test)

In [None]:
class LSTM_NET():

    #embedding_dim = 300

    def __init__(self, vocab_size: int, hidden_units: int, n_features: int, embedding_matrix, n_labels: int, 
    optimizer, dropout = 0.1):
        self.vocab_size = vocab_size
        self.hidden_units = hidden_units
        self.n_features = n_features
        self.embedding_matrix = embedding_matrix
        self.n_labels = n_labels
        # if we want to predict emoji vecs instead of emoji labels, use cosine proximity
        self.loss = "categorical_crossentropy" 
        self.optimizer = optimizer
        self.dropout = dropout

        print('Build model...')
        self.model = Sequential()
                
        self.model.add(Embedding(vocab_size + 2, n_features, weights = [embedding_matrix], 
                                 trainable = False, mask_zero = True))


        #self.model.add(LSTM(hidden_units, activation = 'relu', recurrent_activation = 'hard_sigmoid',
                            #return_sequences = True))

        #self.model.add(Dropout(self.dropout))

        self.model.add(LSTM(hidden_units, activation = 'relu', 
                            recurrent_activation = 'hard_sigmoid', return_sequences = False))

        self.model.add(Dropout(self.dropout))

        #self.model.add(TimeDistributed(Dense(self.n_labels, activation = 'softmax')))
        self.model.add(Dense(self.n_labels, activation = 'softmax'))
        self.model.compile(loss = self.loss, optimizer = self.optimizer, metrics = ['accuracy'])

    def fit(self, X_train, y_train, X_val, y_val, n_epochs, n_batches):
        return self.model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = n_epochs, batch_size = n_batches)

    def predict(self, X_test):
        return self.model.predict(X_test)

In [None]:
n_epochs = 5
n_batches = 32

In [None]:
LSTM_NN = LSTM_NET(vocab_size, hidden_units, n_features, embedding_mat, n_labels, optimizer, dropout)

In [None]:
LSTM_NN.fit(X_train, y_train, X_val, y_val, n_epochs, n_batches)

In [None]:
# get predictions
y_pred_test = LSTM_NN.predict(X_test)

In [None]:
i = 0
for tweet, pred in zip(cleaned_test_data, y_pred_labels)
    print(tweet)
    print()
    print(idx_emoji[pred])
    if i == 10:
        break
    i += 1

In [None]:
accuracy_score(y_test, y_pred_test)

In [None]:
# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [None]:
freq = {}
for pred in y_pred_labels:
    if idx_emoji[pred] in freq:
        freq[idx_emoji[pred]] += 1
    else:
        freq[idx_emoji[pred]] = 1 
print(freq)

In [None]:
i = 0
for tweet, pred, true in zip(cleaned_test_data, y_pred_labels, test_labels):
    print(tweet)
    print("prediction:", idx_emoji[pred])
    print("true label:", true)
    print()
    if i == 30:
        break
    i += 1

In [None]:
accuracy_score(y_test, y_pred_labels)

In [None]:
f1_score(y_test, y_pred_labels, average = 'micro')