In [1]:
import numpy as np
import pandas as pd

import keras
import getopt
import logging
#import nltk
import os
import re
import sys

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import MaxAbsScaler

from keras.models import Sequential
from keras.layers import Dense, Dropout, SimpleRNN
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


In [2]:
train_proc = pd.read_csv('train_set_processed.csv')
val_proc = pd.read_csv('val_set_processed.csv')
test_proc = pd.read_csv('test_set_processed.csv')

In [3]:
def count_emojis(data, n = 10):
    """
    Function that counts the number of emojis in the data set.
    Display the n most frequent emojis.
    """
    emoji_counts = {}
    for index, row in data.iterrows():
        emoji = row[1]
        if emoji not in emoji_counts:
            # compute simultaneous counting
            emoji_counts[emoji] = data[data.label == emoji].count()[1]
            
    # sort emojis by freq in descending order (list of tuples will be returned)
    sorted_emoji_counts = sorted(emoji_counts.items(), key= lambda kv: kv[1], reverse=True)
        
    return [emoji[0] for emoji in sorted_emoji_counts[:n]]


def emoji_to_int(labels: list):
    return [emoji_map[emoji] for emoji in labels]


def keep_top_10(data, top_10: list): 
    """
    Function that checks, whether Tweet consists of one of the top ten emojis.
    If, and only if, Tweet consists one of the most frequent emojis, 
    Tweet will be used for further analysis.
    Else: Line will be dropped.
    """
    idx_drop = []
    for index, row in data.iterrows():
        if row[1] not in top_10:
            idx_drop.append(index)
    return data.drop(data.index[idx_drop])

In [4]:
top_10_test = count_emojis(test_proc)
print(top_10_test)

['😍', '😂', '❤️', '💕', '😊', '😘', '😭', '💖', '😎', '✨']


In [5]:
emoji_map = {emoji: i for i, emoji in enumerate(top_10_test)}
idx_emoji = {i: emoji for i, emoji in enumerate(top_10_test)}

In [6]:
train_data = keep_top_10(train_proc, top_10_test)
print("Number of Tweets in the train data set: {}".format(len(train_data)))

test_data = keep_top_10(test_proc, top_10_test)
print("Number of Tweets in the test data set: {}".format(len(test_data)))

val_data = keep_top_10(val_proc, top_10_test)
print("Number of Tweets in the validation data set: {}".format(len(val_data)))

Number of Tweets in the train data set: 81236
Number of Tweets in the test data set: 7646
Number of Tweets in the validation data set: 7613


In [8]:
def tweets_cleaning(tweets, labels, use_stopwords = False, train = False, use_bigrams = False, 
                    lowercase = True, stemming = False, min_df = 2, embedding = False):
    """
    Text cleaning function that performs all necessary text preprocessing steps.
    Function only keeps characters, that are alphanumerical (non-alphanumerical values are discarded).
    Digits are treated by regular expressions.
    Lower-casing is performed to reduce noise and normalize the text (convert it into a uniform representation).
    Stemming is performed to only keep the stem of each word token but not any other deviated form. 
    Stop words (i.e., words that occur more frequently than other words in a given corpus) are removed.
    """
    if stemming:
        # initialize Lancaster stemmer
        st = LancasterStemmer()
    if use_stopwords:
        # create list of stopwords
        stopwords = list(set(stopwords.words('english')))
    cleaned_data = []
    cleaned_labels = []
    
    all_bigrams = [] # serves as place-holder
    bigrams_dict = dict()
    vocab = dict()
    
    for tweet, label in zip(tweets, labels):
        tweet = re.sub(r'&amp\S+','', tweet)
        tweet = re.sub(r' & ', ' and ', tweet)
        tweet = re.sub(r'!+', ' ! ', tweet)
        tweet = re.sub(r'[?]+', ' ? ', tweet)
        tweet = re.sub('@.+', '@user', tweet)
        tweet = re.sub('#', '# ', tweet)

        # Create spaces instead of some punctuation marks, but not if it's part of an emoticon
        tweet = ' '.join([word if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word)
            else re.sub('[,.;\-_:/\n\t]+', ' ', word) for word in tweet.split()])
        
        tweet = tweet.split(" ")
        
        cleaned_tweet = []
        for word in tweet:
            
            #if emoticon is in word, keep the emoticon
            if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word):
                cleaned_word = word
            else:
                # keep special characters which might carry important information
                # perform lower-casing to normalize the text and reduce noise
                cleaned_word = ''.join([char for char in word if re.search('[<>$#€£!?@=]', char) or
                                        char.isalnum()])
            if lowercase:
                cleaned_word = cleaned_word.lower()
                
            if "<3" not in cleaned_word:
                cleaned_word = re.sub('[0-9]', '0', cleaned_word)
  
            # removes each \n (i.e., new line) or \t (i.e., tab) -> pipe char denotes a disjunction
            cleaned_word = re.sub(r'( \n| \t)+', '', cleaned_word)
            
            if stemming:
                cleaned_word = st.stem(cleaned_word)
            
            if len(cleaned_word) > 0:
                if not use_stopwords:
                    cleaned_tweet.append(cleaned_word)
                elif(cleaned_word not in stopwords):
                    cleaned_tweet.append(cleaned_word)

                if train:
                    if cleaned_word in vocab:
                        vocab[cleaned_word] += 1
                    else:
                        vocab[cleaned_word] = 1
            
        # only append tweets with more than 1 word per tweet
        if len(cleaned_tweet) > 1:
            
            if train and use_bigrams:
                
                bigrams = [' '.join([cleaned_tweet[i-1], cleaned_tweet[i]]) 
                           for i, _ in enumerate(cleaned_tweet) if i > 0]
                
                for bigram in bigrams:
                    
                    if bigram in bigrams_dict:
                        bigrams_dict[bigram] += 1
                    else:
                        bigrams_dict[bigram] = 1 

            cleaned_tweet = ' '.join(cleaned_tweet)
            cleaned_data.append(cleaned_tweet)
            cleaned_labels.append(label)
            
    if train and embedding and not use_bigrams:
        
        word2index = dict()
        i = 1
        for word in vocab.keys():
            word2index[word] = i
            i += 1
            
        word2index.update({'UNK': len(word2idx) + 1})
        
        assert len(cleaned_data) == len(cleaned_labels)

        return cleaned_data, cleaned_labels, word2index
                
    if train:
        vocab = [word for word, freq in vocab.items() if freq >= min_df]  
        if use_bigrams:
            all_bigrams = [bigram for bigram, freq in bigrams_dict.items() if freq >= min_df]
            vocab.extend(all_bigrams)
        
    assert len(cleaned_data) == len(cleaned_labels)
    
    return cleaned_data, cleaned_labels, sorted(vocab), sorted(all_bigrams)

In [15]:
cleaned_train_data, train_labels, vocab, bigrams = tweets_cleaning(train_data.text, 
                                                                   train_data.label, 
                                                                   use_stopwords = False, 
                                                                   train = True, 
                                                                   use_bigrams = True, 
                                                                   lowercase = True,
                                                                   min_df = 2)

cleaned_test_data, test_labels, _, _ = tweets_cleaning(test_data.text, 
                                                       test_data.label, 
                                                       use_stopwords = False, 
                                                       lowercase = True)

cleaned_val_data, val_labels, _, _ = tweets_cleaning(val_data.text, 
                                                     val_data.label, 
                                                     use_stopwords = False, 
                                                     lowercase = True)

In [16]:
print("Number of unique tokens in the vocabulary: {}".format(len(vocab)))
print()
print("Number of Tweets per data set after text cleaning was computed:")
print("Train: {}".format(len(cleaned_train_data)))
print()
print("Test: {}".format(len(cleaned_test_data)))
print()
print("Validation: {}".format(len(cleaned_val_data)))

Number of unique tokens in the vocabulary: 55855

Number of Tweets per data set after text cleaning was computed:
Train: 68750

Test: 6539

Validation: 6505


In [7]:
y_train = emoji_to_int(train_labels)
y_test = emoji_to_int(test_labels)
y_val = emoji_to_int(val_labels)

NameError: name 'train_labels' is not defined

### Functions for the Bag of Words approach

In [10]:
def bag_of_words(train: list, test: list, val: list, ngram: tuple, vocab = None, 
                 n_best_factor = 0.7):
    """
    Create a weighted bag-of-words unigram or bigram representation of provided tweets.
    Ngram is set to unigram by default. If bigram bag-of-words should be created, pass tuple (2, 2).
    
    Vocabulary argument is set to None by default. 
    You can pass a vocabulary to this function, which may then be used for TfidfVectorizer. 
    If you do not pass a vocabulary to this function, TfidfVectorizer will create a vocabulary itself.
    """ 
    
    vectorizer = CountVectorizer(encoding = 'utf-8', ngram_range = ngram, analyzer = 'word', 
                                 vocabulary = vocab, max_df = 0.9)
    
    train_BoW = vectorizer.fit_transform(train) #.toarray()
    test_BoW = vectorizer.transform(test) #.toarray()
    val_BoW = vectorizer.transform(val) #.toarray()
    

    return train_BoW, test_BoW, val_BoW

def to_cat_matrix(y):

    """ 
    Binary one-hot encoding using an indicator matrix.
    This function converts labels to a categorical matrix which is of size N x K.
    Each row is a row vector with k-1 zeros and a single 1.
    """
    N = len(y)
    K = len(set(y))
    ind_matrix = np.zeros((N,K), dtype = int)
    
    for i, cat in enumerate(y):
        ind_matrix[i, int(cat)] = 1
        
    return ind_matrix

In [20]:
X_train, X_test, X_val = bag_of_words(cleaned_train_data, cleaned_test_data, cleaned_val_data, ngram = (1, 2), vocab = vocab)

In [32]:
y_train = emoji_to_int(train_data.iloc[:68750,:].label)
y_test = emoji_to_int(test_data.iloc[:6539,:].label)
y_val = emoji_to_int(val_data.iloc[:6505,:].label)

In [24]:
X_train, X_test, X_val = bag_of_words(train_data.iloc[:68750,:].text, test_data.iloc[:6539,:].text, val_data.iloc[:6505,:].text, ngram = (1, 2), vocab= None)

In [26]:
X_train.shape

(68750, 268399)

### Neural Network (Multilayer Perceptron)

In [27]:
def get_model(hidden_units: int, input_dims: int, n_labels: int):
    model = Sequential()
    model.add(Dense(hidden_units, input_dim = input_dims, activation = 'relu'))
    model.add(Dropout(0.5)) # dropout is important to prevent model from overfitting
    model.add(Dense(n_labels, activation = 'softmax'))
    adam = keras.optimizers.Adam(lr=0.001, beta_1 = 0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, amsgrad=False)
    model.compile(loss = 'categorical_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

def preds_to_labels(ypred):
    """
    Firstly, extract the predicted label from a vector of probability distributions.
    Secondly, retrieve index of highest value (i.e., highest probability).
    """
    num_labels = [np.argmax(pred) for pred in ypred]
    return np.array(num_labels)

def accuracy_top_n(y_true, y_preds, top_n = 3):
    """
    If the correct label / emoji is among the top n (e.g., two, three) predictions,
    we consider the prediction as correctly labeled.
    """
    n_correct = 0
    n_total = 0
    
    for i, pred in enumerate(y_preds):
        top_3 = np.argsort(pred)[-top_n:]
        if y_true[i] in top_3:
            n_correct += 1
        n_total += 1
        
    ratio = n_correct / n_total
    return round(ratio, 4)

In [33]:
# get indicator matrix with one-hot-encoded vectors per label (of all labels)
y_train = to_cat_matrix(y_train)
y_val = to_cat_matrix(y_val)

In [34]:
# set number of hidden units, epochs and batch size
n_units = 60
n_epochs = 6
n_batches = 32

model = get_model(n_units, X_train.shape[1], y_train.shape[1])

es = EarlyStopping(monitor='val_acc', mode='max', verbose=1)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

In [None]:
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = n_epochs, 
          batch_size = n_batches, callbacks = [es, mc])

Train on 68750 samples, validate on 6505 samples
Epoch 1/6

Epoch 00001: val_acc improved from -inf to 0.40753, saving model to best_model.h5
Epoch 2/6

Epoch 00002: val_acc improved from 0.40753 to 0.45611, saving model to best_model.h5
Epoch 3/6

Epoch 00003: val_acc improved from 0.45611 to 0.46195, saving model to best_model.h5
Epoch 4/6

Epoch 00004: val_acc improved from 0.46195 to 0.46211, saving model to best_model.h5
Epoch 5/6

In [None]:
# load best model
saved_model = load_model('best_model.h5')

# get predictions
y_pred_test = saved_model.predict(X_test)

# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [None]:
# if true label is among the top 3 predictions, prediction is deemed correctly labeled
print(accuracy_top_n(y_test, y_pred_test, top_n = 3))
# if true label is among the top 2 predictions, prediction is deemed correctly labeled
print(accuracy_top_n(y_test, y_pred_test, top_n = 2))

# if true label is among the top 2 predictions, prediction is deemed correctly labeled
print(accuracy_top_n(y_test, y_pred_test, top_n = 1))

In [None]:
f1_score(y_test, y_pred_labels, average = 'weighted')

f1_score(y_test, y_pred_labels, average = 'micro')

In [None]:
print(classification_report(y_test, y_pred_labels, target_names=top_10_test))

#### Baseline

length of train set: 68750   \\  X_train.shape[0]




X_train, X_test

In [99]:
y_train = emoji_to_int(train_labels)
y_test = emoji_to_int(test_labels)

In [100]:
import random
# Weighted Random
def count_all_emojis(data, n = 10):
    """
    Function that counts the number of emojis in the data set.
    Display the n most frequent emojis.
    """
    emoji_counts = {}
    for label in data:
        if label not in emoji_counts:
            # compute simultaneous counting
            emoji_counts[label] = data.count(label)
            
    # sort emojis by freq in descending order (list of tuples will be returned)
    sorted_emoji_counts = sorted(emoji_counts.items(), key= lambda kv: kv[1], reverse=True)
        
    return sorted_emoji_counts[:n]

counts = count_all_emojis(y_train)
print(counts)
probabilities = []
for label, count in counts:
    probabilities.append((label,count/68750))
print(probabilities)

[(0, 14539), (1, 12574), (2, 12316), (3, 6750), (4, 5774), (6, 3701), (5, 3390), (9, 3367), (8, 3308), (7, 3031)]


In [112]:
def weighted_random_prediction(testset, truelabels, probabilities):
    random.seed = 42
    predictions = []
    top3_predictions = []
    for tweet in testset:
        guess = np.random.choice([0,1,2,3,4,5,6,7,8,9], 1, p = probabilities)
        predictions.append(guess)
        
        guess3 = np.random.choice([0,1,2,3,4,5,6,7,8,9], 3, p = probabilities)
        top3_predictions.append(guess3)
        
    
    print(f1_score(truelabels, predictions,average= 'micro'))
    print(f1_score(truelabels, predictions,average= 'macro'))
    print(f1_score(truelabels, predictions,average= 'weighted'))
    
    
    print(accuracy_top_n(truelabels, top3_predictions, 3))
    
    print(accuracy_top_n(truelabels, top3_predictions, 2))
    
weighted_random_prediction(X_test, y_test, [el[1] for el in probabilities])

0.1348830096345007
0.09738817193887997
0.13395840643816426
0.5741
0.3744


In [118]:
# Majority Vote
print('acc: ', 14539/68750)
print('micro f1: ', f1_score(y_test, np.zeros(len(y_test), dtype=int), average='micro'))
print('macro f1: ', f1_score(y_test, np.zeros(len(y_test), dtype=int), average='macro'))
print('weighted f1: ', f1_score(y_test, np.zeros(len(y_test), dtype=int), average='weighted'))

print('top2: ', (14539 +12574) / 68750)
print('top3: ', (14539 + 12316+12574) / 68750)

acc:  0.21147636363636363
micro f1:  0.20905337207524086
macro f1:  0.034581330634960786
weighted f1:  0.07229343780087383
top2:  0.3943709090909091
top3:  0.5735127272727273


  'precision', 'predicted', average, warn_for)


#### Example Tweets 

In [104]:
# Print out example tweets
start = 0
finish = 1000
for tweet, preds, true in zip(cleaned_test_data[start:finish], y_pred_test[start:finish], test_labels[start:finish]):
    print(tweet)
    pred = np.argsort(preds)
    print("true label:", true)
    print("prediction:", idx_emoji[pred[-1]])
    print("prediction:", idx_emoji[pred[-2]])
    print("prediction:", idx_emoji[pred[-3]])    
    print()

thank you for 00k
true label: ✨
prediction: 💖
prediction: ❤️
prediction: 😘

you asked me for chocolate last night and i didnt have any so i got you this today blessed
true label: 😊
prediction: ❤️
prediction: 😂
prediction: 😍

# ewepvtltd now in # dubai ! all the besttt @user
true label: ❤️
prediction: ❤️
prediction: 😍
prediction: 💖

this weekend norman is hosting the 00rd annual jazz in june music festival showcasing the best jazz around !
true label: 😎
prediction: 😍
prediction: 💕
prediction: ❤️

picture full of trash i can only expect her to like him
true label: 😭
prediction: 😂
prediction: 😭
prediction: 😍

pray for kelso
true label: 😂
prediction: ❤️
prediction: 😂
prediction: 😭

get yourself a girl who can do both
true label: 😍
prediction: 😍
prediction: 😂
prediction: ❤️

happy birthday to the sweetest girl that im blessed to call my best friend ilysm
true label: ❤️
prediction: ❤️
prediction: 💕
prediction: 😘

im so in love
true label: 😍
prediction: 😍
prediction: 💕
prediction: ❤️

happy f

true label: 💕
prediction: 💕
prediction: ❤️
prediction: 😍

when ?
true label: 😭
prediction: 😂
prediction: 😍
prediction: 😭

screenshot pa more
true label: 😊
prediction: 😂
prediction: 😍
prediction: ❤️

have you seen my latest tweet over on @user
true label: 😂
prediction: ❤️
prediction: 😘
prediction: 😊

free drink
true label: 😍
prediction: 😂
prediction: 😍
prediction: 😊

pretty much been a horror life for us so yeah
true label: 😂
prediction: 😂
prediction: 😍
prediction: ❤️

my little bunny
true label: 😍
prediction: 😍
prediction: ❤️
prediction: 😂

my favourite
true label: ❤️
prediction: 😍
prediction: 💕
prediction: ❤️

gigi missing you
true label: ❤️
prediction: ❤️
prediction: 💕
prediction: 😍

moreno guys gt gt gt
true label: 😍
prediction: 😍
prediction: 😂
prediction: ❤️

i guess its nice here
true label: 😍
prediction: 😍
prediction: 😎
prediction: 😂

boss 000
true label: 😍
prediction: 😎
prediction: 😂
prediction: 💕

whats the point in saying sorry now ? impressions tonight
true label: 😎
predictio

shout out to babeee for being my best friend
true label: ❤️
prediction: ❤️
prediction: 😍
prediction: 😂

just chillin in the pool
true label: 😎
prediction: 😎
prediction: 😂
prediction: 😍

happy weekend friends
true label: ✨
prediction: 😊
prediction: 😎
prediction: ❤️

they look like kids in the amusement park
true label: 😂
prediction: 😂
prediction: 😍
prediction: 😭

my iftar date
true label: ✨
prediction: 😍
prediction: 💕
prediction: ❤️

today i am glad i have # netflix and # grumpybird
true label: ❤️
prediction: ❤️
prediction: 💖
prediction: 💕

soon to be tita cath
true label: 😍
prediction: 😍
prediction: ❤️
prediction: 😭

hsshdjsj omfg bf eliza
true label: 😍
prediction: 😍
prediction: 😭
prediction: 💕

selena gomez is too beautiful
true label: 😍
prediction: 😍
prediction: ❤️
prediction: 💕

when you you are pretending to have a headache in front of your crush and act hot at the same time # monsta x
true label: 😂
prediction: 😂
prediction: 😍
prediction: 😎

true love never gets old
true label: 😍
p

prediction: 😘
prediction: ✨

baby girl is finally gettin her weight up after being sick little chubby baby is a whole 00 pounds
true label: 😍
prediction: 😭
prediction: 😂
prediction: 😍

they are sooooooooooooooo lucky @user
true label: 💕
prediction: ❤️
prediction: 😍
prediction: 😘

time to save the galaxy !
true label: 😎
prediction: 😊
prediction: 😍
prediction: ❤️

happy valentines day
true label: ❤️
prediction: ❤️
prediction: 💕
prediction: 💖

my love for him is endless i love him so so so much
true label: 💖
prediction: ❤️
prediction: 💕
prediction: 💖

new reads
true label: 😍
prediction: 😍
prediction: 😊
prediction: 😎

the j0 sending our prayers to orlando # weareorlando
true label: 💖
prediction: ❤️
prediction: 💖
prediction: 💕

hot damn
true label: 😍
prediction: 😍
prediction: 😎
prediction: ✨

the official debut of my sea witch hair
true label: ✨
prediction: 😍
prediction: ✨
prediction: 💕

# thingsiwillalwaysbe respecting and wanting only the best for the things i love @user
true label: ❤️
pr

forget the summer this is how my whole life looks
true label: 😂
prediction: 😂
prediction: 😍
prediction: 😭

my dad and stepmom got me these bookmarks
true label: 😊
prediction: 😂
prediction: 😭
prediction: 💕

my brother is so sweet
true label: 💕
prediction: 💖
prediction: ❤️
prediction: 💕

orlando comfort dogs sent out to help shooting survivors
true label: 😍
prediction: ❤️
prediction: 😍
prediction: 😭

caption this
true label: 😂
prediction: 😂
prediction: 😍
prediction: ❤️

strawberry blonde
true label: 😍
prediction: 😍
prediction: ❤️
prediction: 💕

i still believe that its # blackoutday
true label: ✨
prediction: 😂
prediction: 😍
prediction: 😊

such a savage
true label: 😂
prediction: 😂
prediction: 😭
prediction: 😊

celebrating the end of # vidcon with @user
true label: 😍
prediction: 😍
prediction: 😊
prediction: 💕

snapchat fun
true label: 💕
prediction: 😊
prediction: 😂
prediction: 😘

watching the reader w my girl
true label: 😊
prediction: 😍
prediction: ❤️
prediction: 😂

# wewillwaitforyunjae # yu

prediction: ❤️
prediction: 💕
prediction: 😊

my grandma yai found a dead baby deer in front of her drive way dragged it to her back yard and buried it
true label: 😭
prediction: 😭
prediction: 😍
prediction: 😂

new instagram picture of dakota posted by truthspeaker
true label: 😍
prediction: 😍
prediction: ❤️
prediction: 😊

omg so excited yay !
true label: 😍
prediction: 😍
prediction: 😂
prediction: 😊

just arrived ! shop hazel tassel hem top gt gt
true label: 😍
prediction: 😍
prediction: ❤️
prediction: 😂

cue all the tears allllll of them
true label: 😭
prediction: 😭
prediction: ❤️
prediction: 💕

hosted a wee morning gathering @user
true label: 💕
prediction: 😊
prediction: ❤️
prediction: 😘

good morning world ! have a beautiful day !
true label: ❤️
prediction: 😘
prediction: 💕
prediction: ❤️

the war has just started
true label: 😂
prediction: 😍
prediction: 💕
prediction: ❤️

the fact that the author of tokyo ghoul did this for christina makes me happy
true label: 💕
prediction: 😂
prediction: ❤️
pre

In [None]:
# Count occurances of Emojis in the predictions
freq = {}
for pred in y_pred_labels:
    if idx_emoji[pred] in freq:
        freq[idx_emoji[pred]] += 1
    else:
        freq[idx_emoji[pred]] = 1 
print(freq)

In [None]:
# Count occurances of Emojis in the test set
freq = {}
for y_true in y_test:
    if idx_emoji[y_true] in freq:
        freq[idx_emoji[y_true]] += 1
    else:
        freq[idx_emoji[y_true]] = 1 
print(freq)

#### Print Weights

In [90]:
# load best model
best_model = load_model('best_model.h5')


# get predictions
y_pred_test = best_model.predict(X_test)

# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [99]:
bow_size = len(vocab)
weight_dict = {}

for i, word in enumerate(vocab):
    a = np.zeros(bow_size)
    a[i] = 1
    
    b = np.array([a])
    probs = best_model.predict(b)
    highest_prob = max(probs[0])
    pred = preds_to_labels(probs)[0]
    
    
    if pred in weight_dict:
        weight_dict[pred].append((word, highest_prob))
    else:
        weight_dict[pred] = [(word, highest_prob)]
        

In [103]:
def takeSecond(elem):
    return elem[1]

for i in range(10):
    print(top_10_test[i])
    weight_dict[i].sort(key=takeSecond, reverse = True)
    print(weight_dict[i][:10])
    print()

😍
[('want these', 0.9983735), ('those eyes', 0.9981306), ('shes perfect', 0.9945299), ('damn beautiful', 0.9943896), ('slaying', 0.992997), ('is gorgeous', 0.992004), ('retrocode', 0.9905235), ('rihanna is', 0.9899038), ('that ass', 0.9872978), ('at those', 0.9865487)]

😂
[('capslockbot', 0.9998379), ('draymond', 0.9945152), ('nutshell', 0.9911703), ('nice try', 0.9887912), ('stop this', 0.98324454), ('so accurate', 0.97788996), ('response', 0.97481537), ('this aint', 0.9720667), ('messing', 0.97184485), ('jimins', 0.97169715)]

❤️
[('photo credit', 0.99470407), ('throwback time', 0.9716474), ('mya', 0.94878), ('solidarity', 0.94397146), ('love em', 0.9392214), ('with orlando', 0.9380964), ('abondthatcantbebroken', 0.93129426), ('special note', 0.92522085), ('stop thinking', 0.9237042), ('mom love', 0.92194134)]

💕
[('our dream', 0.9997149), ('cupcakehour', 0.9862292), ('my luv', 0.9273965), ('poppys cupcakes', 0.8832415), ('day celebration', 0.8709491), ('mums and', 0.8446297), ('yuju

In [None]:
print(probs)
print(pred)

In [79]:
len(vocab)

55855

In [31]:
weights = best_model.get_weights()

There are 4 layers:

first (0) layer: connections from all input nodes to all hidden units  <br>
 55000 x 60
<br> <br>
second (1) layer: weights of the hidden nodes <br>
60
<br> <br>
third (2) layer: connections from all hidden nodes to the 10 outputs <br>
60 x 10
<br> <br>
fourth (3) layer: weights of the softmax layer <br>
10

In [60]:
layer = 2

print(weights[layer].shape)
#print(weights[layer])

print(np.argmax(weights[layer], axis=0).shape)
print(np.argmax(weights[layer], axis=0))



(60, 10)
(10,)
[32 21 39 21 32 31 10 12 24 32]


### NEXT PART ONLY FOR RESEARCH PAPER BUT NOT FOR COGSCI II PROJECT !!!

### Word Embeddings approach

In [None]:
lower = True

cleaned_train_data, train_labels, word2idx = tweets_cleaning(train_data.text, 
                                                                   train_data.label, 
                                                                   stop_words, 
                                                                   train = True, 
                                                                   use_bigrams = False, 
                                                                   lowercase = lower,
                                                                   min_df = 2,
                                                                   embedding = True)

cleaned_test_data, test_labels, _, _ = tweets_cleaning(test_data.text, 
                                                       test_data.label, 
                                                       stop_words, 
                                                       lowercase = lower)

cleaned_val_data, val_labels, _, _ = tweets_cleaning(val_data.text, 
                                                     val_data.label, 
                                                     stop_words, 
                                                     lowercase = lower)

In [None]:
# only convert y_train and y_val to categorical matrix
y_train = to_cat_matrix(emoji_to_int(train_labels))
y_val = to_cat_matrix(emoji_to_int(val_labels))
y_test = emoji_to_int(test_labels)

In [None]:
def sent2idx(word2idx: dict, documents: list):
    
    idx_docs = list()
    max_length = max([len(document) for document in documents])
    
    for document in documents: 
        idx_doc = [word2idx[word] if word in word2idx else word2idx['UNK'] 
                   for word in document.split()]
        
        if len(idx_doc) < max_length:
            idx_doc.extend([0 for _ in range(max_length - len(idx_doc))])
            
        idx_docs.append(idx_doc)
        
    return np.array(idx_docs)

In [None]:
X_train = sent2idx(word2idx, cleaned_train_data)
X_val = sent2idx(word2idx, cleaned_val_data)
X_test = sent2idx(word2idx, cleaned_test_data)

In [None]:
# shuffle data before fitting the neural network with it
X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)

In [None]:
def get_embeddings(text_file, dim):

    """ 
    Read GloVe txt.-file, load pre-trained word embeddings into memory
    and create a word_to_embedding dictionary, where keys are the discrete word strings
    and values are the corresponding continuous word embeddings, retrieved from the GloVe txt.-file.
    For unkown words, the representation is an empty vector (i.e., zeros matrix).
    """
    embeddings_dict = {}

    with open(text_file, encoding="utf8") as file:

        for line in file:
            values = line.split()
            word = values[0]
            wordvec = np.array(values[1:], dtype = 'float32')
            embeddings_dict[word] = list(wordvec)
    
    embeddings_dict.update({'UNK': [0 for _ in range(dim)]})

    return embeddings_dict

In [None]:
emoji_embeddings = get_embeddings("emoji2vec.txt")

In [None]:
def get_emojivecs(emoji_embeddings: dict, corpus: list, dims: int):

    N = len(corpus)
    M = dims
    
    emojivecs = []
    
    # document = tweet; corpus = all tweets
    for emoji in corpus:
        emoji_sequence = []

        try:
            emojivec = emoji_embeddings[emoji]
            assert len(emojivec) == M
            emoji_sequence.append(emojivec)
        except KeyError:
            emoji_sequence.append([0 for _ in range(M)])
            print("This {} does not exist in the pre-trained emoji embeddings.".format(emoji))

        emojivecs.append(emoji_sequence)

    assert len(emojivecs) == N
    return np.array(emojivecs)

In [None]:
def get_wordvecs(word_embeddings: dict, corpus: list, dims: int, zeros_padding = False):

    """ 
    Return a concatenated word vector representation of each tweet.
    The concatenated word vectors serve as the input data for the LSTM RNN.
    Each word (embedding) denotes a time step. (Number of timesteps is equal to the length of the input sentence.)
    
    Check whether length of word vector is equal to the number of dimensions we pass to this function.
    For unknown words (i.e., if key does not exist), the representation is an empty vector / zeros matrix of len dims.

    Sequences can have variable length (i.e., number of time steps per batch).
    However, in some cases you might want to zero pad the batch if a sequence < max length of sequences in the corpus.
    By default this argument is set to False as Keras and Tensorflow except input sequences of variable length.
    If set to True, zero padding is computed.
    """

    N = len(corpus)
    M = dims
    global max_length
    max_length = max([len(sequence) for sequence in corpus])
    wordvecs_corpus = []
    
    # document = tweet; corpus = all tweets
    for document in corpus:
        wordvec_sequence = []
        for word in document:
            
            try:
                wordvec = word_embeddings[word]
                assert len(wordvec) == M
                wordvec_sequence.append(wordvec)
            except KeyError:
                wordvec_sequence.append([0 for _ in range(M)])
                
        # needs to be resolved (!)
        if zeros_padding == True: 
            if len(document) < max_length:

                for _ in range(len(document), max_length):
                    wordvec_sequence.append([0 for _ in range(M)])

                assert len(wordvec_sequence) == max_length
        wordvecs_corpus.append(wordvec_sequence)

    assert len(wordvecs_corpus) == N
    return np.array(wordvecs_corpus)

In [None]:
def embedding_matrix(word2idx: dict, embeddings_dict: dict, dim: int):
    
    embedding_mat = np.zeros((len(word2idx) + 2, dim))
    
    for word, idx in word2idx.items():
        vec = embeddings_dict.get(word)
        # if word is not found in embeddings dictionary, vector will be all zeros
        if vec is not None:
            embedding_mat[idx] = vec
            
    return embedding_mat

In [None]:
word_embeddings = get_embeddings("glove.6B.50d.txt", 50)

In [None]:
embedding_mat = embedding_matrix(word2idx, word_embeddings, 50)

In [None]:
vocab_size = len(word2idx)
hidden_units = 50
n_features = 50
n_labels = 10
optimizer = keras.optimizers.Adam(lr = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08, 
                                  decay = 0.0, amsgrad = False)
dropout = 0.2

In [None]:
class GRU_NET():

    #embedding_dim = 300
    
    def __init__(self, vocab_size: int, hidden_units: int, n_features: int, embedding_matrix, 
                 n_labels: int, optimizer, dropout = 0.1):
        
        self.vocab_size = vocab_size
        self.hidden_units = hidden_units
        self.n_features = n_features
        self.embedding_matrix = embedding_matrix
        self.n_labels = n_labels
        # if we want to predict emoji vecs instead of emoji labels, use cosine proximity
        self.loss = "categorical_crossentropy" 
        self.optimizer = optimizer
        self.dropout = dropout
        
        print('Build model...')
        self.model = Sequential()
        
        self.model.add(Embedding(vocab_size + 2, n_features, weights = [embedding_matrix], 
                                 trainable = False, mask_zero = True))
        
        self.model.add(GRU(hidden_units, activation='relu', recurrent_activation='hard_sigmoid', 
                           return_sequences = True))    
        
        self.model.add(Dropout(dropout))
        
        self.model.add(GRU(hidden_units, activation='relu', recurrent_activation='hard_sigmoid', 
                           return_sequences = False))
        
        self.model.add(Dropout(dropout))
        
        #self.model.add(TimeDistributed(Dense(self.n_labels, activation = 'softmax')))
        self.model.add(Dense(self.n_labels, activation = 'softmax'))
        self.model.compile(loss = self.loss, optimizer = self.optimizer, metrics = ['accuracy'])
                       
    def fit(self, X_train, y_train, X_val, y_val,  n_epochs, n_batches):
        return self.model.fit(X_train, y_train, validation_data = (X_val, y_val), 
                              epochs = n_epochs, batch_size = n_batches)
    
    def predict(self, X_test):
        return self.model.predict(X_test)

In [None]:
class LSTM_NET():

    #embedding_dim = 300

    def __init__(self, vocab_size: int, hidden_units: int, n_features: int, embedding_matrix, n_labels: int, 
    optimizer, dropout = 0.1):
        self.vocab_size = vocab_size
        self.hidden_units = hidden_units
        self.n_features = n_features
        self.embedding_matrix = embedding_matrix
        self.n_labels = n_labels
        # if we want to predict emoji vecs instead of emoji labels, use cosine proximity
        self.loss = "categorical_crossentropy" 
        self.optimizer = optimizer
        self.dropout = dropout

        print('Build model...')
        self.model = Sequential()
                
        self.model.add(Embedding(vocab_size + 2, n_features, weights = [embedding_matrix], 
                                 trainable = False, mask_zero = True))


        #self.model.add(LSTM(hidden_units, activation = 'relu', recurrent_activation = 'hard_sigmoid',
                            #return_sequences = True))

        #self.model.add(Dropout(self.dropout))

        self.model.add(LSTM(hidden_units, activation = 'relu', 
                            recurrent_activation = 'hard_sigmoid', return_sequences = False))

        self.model.add(Dropout(self.dropout))

        #self.model.add(TimeDistributed(Dense(self.n_labels, activation = 'softmax')))
        self.model.add(Dense(self.n_labels, activation = 'softmax'))
        self.model.compile(loss = self.loss, optimizer = self.optimizer, metrics = ['accuracy'])

    def fit(self, X_train, y_train, X_val, y_val, n_epochs, n_batches):
        return self.model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = n_epochs, batch_size = n_batches)

    def predict(self, X_test):
        return self.model.predict(X_test)

In [None]:
n_epochs = 5
n_batches = 32

In [None]:
LSTM_NN = LSTM_NET(vocab_size, hidden_units, n_features, embedding_mat, n_labels, optimizer, dropout)

In [None]:
LSTM_NN.fit(X_train, y_train, X_val, y_val, n_epochs, n_batches)

In [None]:
# get predictions
y_pred_test = LSTM_NN.predict(X_test)

In [None]:
i = 0
for tweet, pred in zip(cleaned_test_data, y_pred_labels)
    print(tweet)
    print()
    print(idx_emoji[pred])
    if i == 10:
        break
    i += 1

In [None]:
accuracy_score(y_test, y_pred_test)

In [None]:
# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [None]:
freq = {}
for pred in y_pred_labels:
    if idx_emoji[pred] in freq:
        freq[idx_emoji[pred]] += 1
    else:
        freq[idx_emoji[pred]] = 1 
print(freq)

In [None]:
i = 0
for tweet, pred, true in zip(cleaned_test_data, y_pred_labels, test_labels):
    print(tweet)
    print("prediction:", idx_emoji[pred])
    print("true label:", true)
    print()
    if i == 30:
        break
    i += 1

In [None]:
accuracy_score(y_test, y_pred_labels)

In [None]:
f1_score(y_test, y_pred_labels, average = 'micro')