In [1]:
import numpy as np
import pandas as pd

import getopt
import logging
import nltk
import os
import re
import sys

from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import MaxAbsScaler


import keras
from keras import layers
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

Using TensorFlow backend.


## Helpers for reading Embeddings from files

In [2]:
def get_embeddings(text_file):

    """ 
    Read GloVe txt.-file, load pre-trained word embeddings into memory
    and create a word_to_embedding dictionary, where keys are the discrete word strings
    and values are the corresponding continuous word embeddings, retrieved from the GloVe txt.-file.
    For unkown words, the representation is an empty vector (i.e., zeros matrix).
    """
    embeddings_dict = {}

    with open(text_file, encoding="utf8") as file:

        for line in file:
            values = line.split()
            word = values[0]
            wordvec = np.array(values[1:], dtype = 'float32')
            embeddings_dict[word] = list(wordvec)

    return embeddings_dict

def sent2idx(word2idx, documents):
    idx_docs = list()    
    for document in documents: 
        idx_doc = [word2idx[word] if word in word2idx else word2idx['UNK'] 
                   for word in document.split()]            
        idx_docs.append(idx_doc)
    return np.array(idx_docs)

def embedding_matrix(word2idx, embeddings_dict, dim):
    
    embedding_mat = np.zeros((len(word2idx.keys()), dim))
    
    for word, idx in word2idx.items():
        vec = embeddings_dict.get(word)
        # if word is not found in embeddings dictionary, vector will be all zeros
        if vec is not None:
            embedding_mat[idx] = vec            
    return embedding_mat

def get_emojivecs(emoji_embeddings, corpus, dims):

    N = len(corpus)
    M = dims
    
    emojivecs = []
    
    # document = tweet; corpus = all tweets
    for emoji in corpus:
        emoji_sequence = []

        try:
            emojivec = emoji_embeddings[emoji]
            assert len(emojivec) == M
            emoji_sequence.append(emojivec)
        except KeyError:
            emoji_sequence.append([0 for _ in range(M)])
            print("This {} does not exist in the pre-trained emoji embeddings.".format(emoji))

        emojivecs.append(emoji_sequence)

    assert len(emojivecs) == N
    return np.array(emojivecs)

def get_wordvecs(word_embeddings, corpus, dims, zeros_padding = False):

    """ 
    Return a concatenated word vector representation of each tweet.
    The concatenated word vectors serve as the input data for the LSTM RNN.
    Each word (embedding) denotes a time step. (Number of timesteps is equal to the length of the input sentence.)
    
    Check whether length of word vector is equal to the number of dimensions we pass to this function.
    For unknown words (i.e., if key does not exist), the representation is an empty vector / zeros matrix of len dims.

    Sequences can have variable length (i.e., number of time steps per batch).
    However, in some cases you might want to zero pad the batch if a sequence < max length of sequences in the corpus.
    By default this argument is set to False as Keras and Tensorflow except input sequences of variable length.
    If set to True, zero padding is computed.
    """

    N = len(corpus)
    M = dims
    global max_length
    max_length = max([len(sequence) for sequence in corpus])
    wordvecs_corpus = []
    
    # document = tweet; corpus = all tweets
    for document in corpus:
        wordvec_sequence = []
        for word in document:
            
            try:
                wordvec = word_embeddings[word]
                assert len(wordvec) == M
                wordvec_sequence.append(wordvec)
            except KeyError:
                wordvec_sequence.append([0 for _ in range(M)])
                
        # needs to be resolved (!)
        if zeros_padding == True: 
            if len(document) < max_length:

                for _ in range(len(document), max_length):
                    wordvec_sequence.append([0 for _ in range(M)])

                assert len(wordvec_sequence) == max_length
        wordvecs_corpus.append(wordvec_sequence)

    assert len(wordvecs_corpus) == N
    return np.array(wordvecs_corpus)

## Helpers for top10 emojis

In [3]:
def count_emojis(data, n = 10):
    """
    Function that counts the number of emojis in the data set.
    Display the n most frequent emojis.
    """
    emoji_counts = {}
    for index, row in data.iterrows():
        emoji = row[1]
        if emoji not in emoji_counts:
            # compute simultaneous counting
            emoji_counts[emoji] = data[data.label == emoji].count()[1]
            
    # sort emojis by freq in descending order (list of tuples will be returned)
    sorted_emoji_counts = sorted(emoji_counts.items(), key= lambda kv: kv[1], reverse=True)
        
    return [emoji[0] for emoji in sorted_emoji_counts[:n]]


def emoji_to_int(labels):
    return [emoji_map[emoji] for emoji in labels]

def to_cat_matrix(y):
    """ 
    Binary one-hot encoding using an indicator matrix.
    This function converts labels to a categorical matrix which is of size N x K.
    Each row is a row vector with k-1 zeros and a single 1.
    """
    N = len(y)
    K = len(set(y))
    ind_matrix = np.zeros((N,K), dtype = int)
    
    for i, cat in enumerate(y):
        ind_matrix[i, int(cat)] = 1
        
    return ind_matrix

def preds_to_labels(ypred):
    """
    Firstly, extract the predicted label from a vector of probability distributions.
    Secondly, retrieve index of highest value (i.e., highest probability).
    """
    num_labels = [np.argmax(pred) for pred in ypred]
    return np.array(num_labels)

def keep_top_10(data, top_10): 
    """
    Function that checks, whether Tweet consists of one of the top ten emojis.
    If, and only if, Tweet consists one of the most frequent emojis, 
    Tweet will be used for further analysis.
    Else: Line will be dropped.
    """
    idx_drop = []
    for index, row in data.iterrows():
        if row[1] not in top_10:
            idx_drop.append(index)
    return data.drop(data.index[idx_drop])

## Read Data and reduce to top10

In [4]:
train_file = 'Datasets/train_set_processed.csv'
test_file = 'Datasets/test_set_processed.csv'
val_file = 'Datasets/val_set_processed.csv'


train_data = pd.read_csv(train_file, sep=',', encoding = 'utf8', engine='c', header = 0)
test_data = pd.read_csv(test_file, sep=',', encoding = 'utf8', engine='c', header = 0)
val_data = pd.read_csv(val_file, sep=',', encoding = 'utf8', engine='c', header = 0)

train_data.head()

Unnamed: 0,text,label
0,Kicking it at @arteryrcdings,✌️
1,Go England,💪
2,I'm single,😒
3,My boy Buddy all grown up now,💔
4,"The one and only, @TheBigPygmy appearing on @e...",🙏


In [5]:
top_10_test = count_emojis(test_data)
print(top_10_test)

emoji_map = {emoji: i for i, emoji in enumerate(top_10_test)}
idx_emoji = {i: emoji for i, emoji in enumerate(top_10_test)}

['😍', '😂', '❤️', '💕', '😊', '😘', '😭', '💖', '😎', '✨']


In [6]:
train_data = keep_top_10(train_data, top_10_test)
print("Number of Tweets in the train data set: {}".format(len(train_data)))

test_data = keep_top_10(test_data, top_10_test)
print("Number of Tweets in the test data set: {}".format(len(test_data)))

val_data = keep_top_10(val_data, top_10_test)
print("Number of Tweets in the validation data set: {}".format(len(val_data)))

Number of Tweets in the train data set: 81236
Number of Tweets in the test data set: 7646
Number of Tweets in the validation data set: 7613


## Clean tweets and provide word2ind dictionaries

In [7]:
def tweets_cleaning(tweets, labels, use_stopwords = False, train = False, use_bigrams = False, 
                    lowercase = True, stemming = False, min_df = 2, embedding = True):
    """
    Text cleaning function that performs all necessary text preprocessing steps.
    Function only keeps characters, that are alphanumerical (non-alphanumerical values are discarded).
    Digits are treated by regular expressions.
    Lower-casing is performed to reduce noise and normalize the text (convert it into a uniform representation).
    Stemming is performed to only keep the stem of each word token but not any other deviated form. 
    Stop words (i.e., words that occur more frequently than other words in a given corpus) are removed.
    """
    if stemming:
        # initialize Lancaster stemmer
        st = LancasterStemmer()
    if use_stopwords:
        # create list of stopwords
        stopwords = list(set(stopwords.words('english')))
    cleaned_data = []
    cleaned_labels = []
    
    all_bigrams = [] # serves as place-holder
    bigrams_dict = dict()
    vocab = dict()
    
    for tweet, label in zip(tweets, labels):
        tweet = re.sub(r'&amp\S+','', tweet)
        tweet = re.sub(r' & ', ' and ', tweet)
        tweet = re.sub(r'!+', ' ! ', tweet)
        tweet = re.sub(r'[?]+', ' ? ', tweet)
        tweet = re.sub('@.+', '@user', tweet)
        tweet = re.sub('#', '# ', tweet)

        # Create spaces instead of some punctuation marks, but not if it's part of an emoticon
        tweet = ' '.join([word if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word)
            else re.sub('[,.;\-_:/\n\t]+', ' ', word) for word in tweet.split()])
        tweet = tweet.split(" ")
        
        cleaned_tweet = []
        for word in tweet:
            
            #if emoticon is in word, keep the emoticon
            if re.search(r'(?:X|:|;|=)(?:-)?(?:\)|\(|O|D|P|S)+', word):
                cleaned_word = word
            # keep special characters which might carry important information
            else:
                cleaned_word = ''.join([char for char in word if re.search('[<>$#€£!?@=]', char) or
                                        char.isalnum()])
            
            # perform lower-casing to normalize the text and reduce noise
            if lowercase:
                cleaned_word = cleaned_word.lower()
                
            if "<3" not in cleaned_word:
                cleaned_word = re.sub('[0-9]', '0', cleaned_word)
  
            # removes each \n (i.e., new line) or \t (i.e., tab) -> pipe char denotes a disjunction
            cleaned_word = re.sub(r'( \n| \t)+', '', cleaned_word)
            
            # perform stemming
            if stemming:
                cleaned_word = st.stem(cleaned_word)
            
            # Check if the word is a stopword, if so dont append it
            # Also creates the dictionary{word:freq} of all words
            if len(cleaned_word) > 0:
                if not use_stopwords:
                    cleaned_tweet.append(cleaned_word)
                elif(cleaned_word not in stopwords):
                    cleaned_tweet.append(cleaned_word)

                if train:
                    if cleaned_word in vocab:
                        vocab[cleaned_word] += 1
                    else:
                        vocab[cleaned_word] = 1
            
        # only append tweets with more than 1 word per tweet
        if len(cleaned_tweet) > 1:
            
            # Create bigrams
            if train and use_bigrams:
                
                bigrams = [' '.join([cleaned_tweet[i-1], cleaned_tweet[i]]) 
                           for i, _ in enumerate(cleaned_tweet) if i > 0]
                
                for bigram in bigrams:
                    
                    if bigram in bigrams_dict:
                        bigrams_dict[bigram] += 1
                    else:
                        bigrams_dict[bigram] = 1 

            cleaned_tweet = ' '.join(cleaned_tweet)
            cleaned_data.append(cleaned_tweet)
            cleaned_labels.append(label)
            
    # Creates and returns a dict {word: index}
    if train and embedding and not use_bigrams:
        word2index = dict()
        word2index["<PAD>"] = 0
        word2index["UNK"] = 1
        i = 2
        for word in vocab.keys():
            word2index[word] = i
            i += 1        
        assert len(cleaned_data) == len(cleaned_labels)
        return cleaned_data, cleaned_labels, word2index
                
    if train:
        vocab = [word for word, freq in vocab.items() if freq >= min_df]  
        if use_bigrams:
            all_bigrams = [bigram for bigram, freq in bigrams_dict.items() if freq >= min_df]
            vocab.extend(all_bigrams)
    
    assert len(cleaned_data) == len(cleaned_labels)    
    return cleaned_data, cleaned_labels, sorted(vocab), sorted(all_bigrams)

In [9]:
cleaned_train_data, cleaned_train_labels, word2idx = tweets_cleaning(train_data.text, train_data.label, train = True)
cleaned_test_data, cleaned_test_labels,_,_  = tweets_cleaning(test_data.text, test_data.label, train = False)
cleaned_val_data, cleaned_val_labels,_,_  = tweets_cleaning(val_data.text, val_data.label, train = False)

print("Number of unique tokens in the vocabulary: {} \n".format(len(word2idx.keys())))
print("Number of Tweets per data set after text cleaning was computed:")
print("Train: {}".format(len(cleaned_train_data)))
print("Test: {}".format(len(cleaned_test_data)))
print("Validation: {}".format(len(cleaned_val_data)))

Number of unique tokens in the vocabulary: 33332 

Number of Tweets per data set after text cleaning was computed:
Train: 68750
Test: 6539
Validation: 6505


In [10]:
y_train = emoji_to_int(cleaned_train_labels)
y_test = emoji_to_int(cleaned_test_labels)
y_val = emoji_to_int(cleaned_val_labels)

y_train_cat = to_cat_matrix(y_train)
y_test_cat = to_cat_matrix(y_test)
y_val_cat = to_cat_matrix(y_val)

In [11]:
X_train = sent2idx(word2idx, cleaned_train_data)
X_val = sent2idx(word2idx, cleaned_val_data)
X_test = sent2idx(word2idx, cleaned_test_data)
print(X_train[0])

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 9, 12, 13, 14, 15, 3]


## Load pre-trained embeddings and create look-up matrix

In [28]:
word_embeddings = get_embeddings('glove.6B.200d.txt')

In [29]:
embedding_dim = 200
maxlen = 124
vocab_size = len(word2idx)

In [30]:
embedding_mat  = embedding_matrix(word2idx, word_embeddings, embedding_dim)

In [31]:
X_train = keras.preprocessing.sequence.pad_sequences(X_train, value=word2idx["<PAD>"], padding='post', maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, value=word2idx["<PAD>"], padding='post', maxlen=maxlen)
X_val = keras.preprocessing.sequence.pad_sequences(X_val, value=word2idx["<PAD>"], padding='post', maxlen=maxlen)

## Keras model

In [32]:
model = keras.Sequential([
    #layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    layers.Embedding(vocab_size, embedding_dim, input_length=maxlen, weights = [embedding_mat], trainable = False, mask_zero = True),
    layers.Bidirectional(layers.LSTM(128, return_sequences = True)),
    layers.Bidirectional(layers.LSTM(128, return_sequences = False)),
    layers.Dense(32, activation='relu'),
    layers.Dense(10, activation='softmax')
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 124, 200)          6666400   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 124, 256)          336896    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_7 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_8 (Dense)              (None, 10)                330       
Total params: 7,406,090
Trainable params: 739,690
Non-trainable params: 6,666,400
_________________________________________________________________


In [33]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
es = keras.callbacks.EarlyStopping(monitor = 'val_loss', mode='min', verbose=1)
mc = keras.callbacks.ModelCheckpoint('best.h5', monitor='val_loss',mode='min', verbose=1, save_best_only= True)

In [35]:
history = model.fit(
    X_train,
    y_train_cat,
    epochs=10,
    batch_size=512,
    validation_data=(X_val, y_val_cat))

Train on 68750 samples, validate on 6505 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# get predictions
y_pred_test = model.predict(X_test)

# convert predictions to labels
y_pred_labels = preds_to_labels(y_pred_test)

In [None]:
accuracy_score(y_test, y_pred_labels)

from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model.save_word2vec_format('word2vec.txt', binary=False)

word_embeddings = get_embeddings("word2vec.txt")