In [None]:
def get_embeddings(text_file):
    """
    Load pre-trained GloVe word embeddings from a text file into memory.

    Every non-blank line is split on whitespace: the first token is the word,
    the remaining tokens are parsed as the components of its embedding.
    Blank lines (e.g. a trailing empty line at the end of the file) are skipped.

    This function only loads what is in the file; it does not create entries
    for unknown words. Callers that need a fallback for out-of-vocabulary
    words have to handle the missing key themselves.

    Args:
        text_file: Path of the GloVe ``.txt`` file. The file is read as UTF-8.

    Returns:
        A dict mapping each word string to its embedding, stored as a list of
        ``numpy.float32`` values. If a word occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: Raised by NumPy when a component cannot be parsed as a float.
    """
    embeddings_dict = {}

    with open(text_file, encoding="utf8") as file:
        for line in file:
            values = line.split()
            if not values:
                # Blank line: there is no word to index, so skip it instead of
                # failing on values[0].
                continue

            word, *components = values
            wordvec = np.array(components, dtype='float32')
            embeddings_dict[word] = list(wordvec)

    return embeddings_dict


def get_emojivecs(emoji_embeddings: dict, corpus: list, dims: int):
    """
    Look up the pre-trained embedding of every emoji in ``corpus``.

    Each emoji is represented as a sequence holding exactly one vector, so
    the result has one single-element sequence per entry of ``corpus``.
    An emoji that is not a key of ``emoji_embeddings`` is reported on stdout
    and represented by a vector of ``dims`` zeros.

    Args:
        emoji_embeddings: Mapping from emoji to its embedding vector.
        corpus: The emojis to look up, in output order.
        dims: Expected length of every embedding vector.

    Returns:
        ``np.array`` built from the list of one-vector sequences.

    Raises:
        AssertionError: If a stored embedding does not have length ``dims``.
    """
    expected_rows = len(corpus)
    stacked = []

    for symbol in corpus:
        try:
            vector = emoji_embeddings[symbol]
        except KeyError:
            # Unknown emoji: fall back to an all-zero vector and tell the user.
            stacked.append([[0] * dims])
            print("This {} does not exist in the pre-trained emoji embeddings.".format(symbol))
            continue

        assert len(vector) == dims
        stacked.append([vector])

    assert len(stacked) == expected_rows
    return np.array(stacked)

def get_wordvecs(word_embeddings: dict, corpus: list, dims: int, zeros_padding=False):
    """
    Return the sequence of word vectors for each tweet in ``corpus``.

    Each word is replaced by its embedding, so every tweet becomes a sequence
    with one vector (one time step) per word. Words that are not a key of
    ``word_embeddings`` are represented by a vector of ``dims`` zeros.

    Sequences can have variable length. By default they are returned as they
    are; with ``zeros_padding=True`` every sequence shorter than the longest
    tweet in the corpus is right-padded with zero vectors.

    Side effect: the module-level global ``max_length`` is (re)assigned to the
    length of the longest tweet in ``corpus`` (0 for an empty corpus).

    Args:
        word_embeddings: Mapping from word to its embedding vector.
        corpus: The tweets, each one a sequence of words.
        dims: Expected length of every embedding vector.
        zeros_padding: Pad all sequences to the same length when truthy.

    Returns:
        If all sequences end up with the same length, a regular ``np.array``
        built from the nested lists. Otherwise a 1-D ``np.array`` of dtype
        ``object`` whose elements are the per-tweet lists of vectors.

    Raises:
        AssertionError: If a stored embedding does not have length ``dims``.
    """
    global max_length

    N = len(corpus)
    # default=0 keeps an empty corpus from raising ValueError in max().
    max_length = max((len(sequence) for sequence in corpus), default=0)
    wordvecs_corpus = []

    # document = tweet; corpus = all tweets
    for document in corpus:
        wordvec_sequence = []
        for word in document:
            try:
                wordvec = word_embeddings[word]
            except KeyError:
                # Unknown word: use a fresh zero vector (never a shared list).
                wordvec_sequence.append([0] * dims)
            else:
                assert len(wordvec) == dims
                wordvec_sequence.append(wordvec)

        if zeros_padding:
            missing = max_length - len(wordvec_sequence)
            wordvec_sequence.extend([0] * dims for _ in range(missing))
            assert len(wordvec_sequence) == max_length

        wordvecs_corpus.append(wordvec_sequence)

    assert len(wordvecs_corpus) == N

    if len({len(sequence) for sequence in wordvecs_corpus}) <= 1:
        return np.array(wordvecs_corpus)

    # Variable-length sequences cannot form a rectangular array, and recent
    # NumPy versions refuse to build one implicitly. Build the 1-D object
    # array of per-tweet lists explicitly instead.
    ragged = np.empty(N, dtype=object)
    for index, sequence in enumerate(wordvecs_corpus):
        ragged[index] = sequence
    return ragged