### Notebook for the vector representation of the training set prior to the neural network training

In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile

from keras.utils.np_utils import to_categorical

import json

In [None]:
# Load the training set from its Feather file; the following cells read the
# `Tweet` and `Sentiment` columns of this frame.
TRAIN_SET_PATH = 'pickles/train_3.feather'
tweets = pd.read_feather(TRAIN_SET_PATH)

In [None]:
# Separate the tweet text from its sentiment label.
# (Alternative source kept for reference: tweets_16m['text'] / tweets_16m['target'].)
train_tweets = tweets['Tweet']
sentiments = tweets['Sentiment']

# Only the training tweets feed the vocabulary for now (test tweets are not added).
all_tweets = train_tweets

# filters=' ' replaces Keras' default punctuation filter: only the space
# character is removed from the text before it is split into tokens.
tokenizer = Tokenizer(filters=' ')
tokenizer.fit_on_texts(all_tweets)

# token -> integer index mapping learned by fit_on_texts.
word_index = tokenizer.word_index

In [None]:
# Encode each tweet as the list of integer indices of its tokens.
train_sequences = tokenizer.texts_to_sequences(train_tweets)

# Only the training sequences are considered for now (no test sequences yet).
sequences = train_sequences

# Length of the longest encoded tweet; 0 when there are no sequences at all.
MAX_SEQUENCE_LENGTH = max(map(len, sequences), default=0)

MAX_SEQUENCE_LENGTH

In [None]:
# Bring every sequence to the length of the longest tweet so that all rows
# have the same width.
train_sequences = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
train_sequences.shape

### Words embeddings

Currently, the word embeddings are loaded from pre-built vectors; the next step is to use our own word embeddings.

In [None]:
# Alternative embeddings (disabled): GoogleNews word2vec vectors.
# googlenews_w2v = KeyedVectors.load_word2vec_format('data/embeddings/GoogleNews-vectors-negative300.bin', binary=True)

# glove2word2vec rewrites the GloVe-format file into word2vec text format;
# the converted copy goes to a temporary file that KeyedVectors then loads.
GLOVE_PATH = 'data/embeddings/datastories.twitter.300d.txt'
tmp_file = get_tmpfile('datastories.300d.word2vec')
glove2word2vec(GLOVE_PATH, tmp_file)
w2v = KeyedVectors.load_word2vec_format(tmp_file)

### EmoLex

EmoLex is a text file that associates each word with a weight for each of 10 different sentiments.

To add EmoLex, we append these 10 values to the 300 values of the word's embedding vector.

If the word does not exist in the EmoLex database, we append an array of 10 values, all set to 0.1, to indicate that the word does not express any particular sentiment.

In [None]:
# EmoLex lexicon (tab-separated). Later cells look rows up through its `word`
# column and use the remaining columns as the sentiment weights.
EMOLEX_PATH = 'TP_transfer_learning_2018/EmoLex.txt'
emolex = pd.read_csv(EMOLEX_PATH, sep='\t')

### OLE

Opinion Lexicon English (OLE) is a database that contains a list of English words used in positive sentences and a list of English words used in negative sentences. If a word is present in neither list, it is considered neutral.

In [None]:
# Number of leading lines skipped in each lexicon file before the word list.
OLE_HEADER_LINES = 35


def _read_opinion_words(path, encoding=None, header_lines=OLE_HEADER_LINES):
    """Return the words listed in an Opinion Lexicon file, one word per line.

    The first `header_lines` lines are skipped. Every remaining line is
    stripped of surrounding whitespace -- in particular the trailing newline
    kept by file iteration -- so that entries compare equal to plain tokens.
    Blank lines are dropped.

    Raises:
        StopIteration: if the file has fewer than `header_lines` lines.
    """
    with open(path, encoding=encoding) as lexicon_file:
        for _ in range(header_lines):
            next(lexicon_file)
        stripped_lines = (line.strip() for line in lexicon_file)
        return [entry for entry in stripped_lines if entry]


# Kept as lists for compatibility with code that may index or iterate them.
positive_words = _read_opinion_words('data/positive-words.txt')
negative_words = _read_opinion_words('data/negative-words.txt', encoding='ISO-8859-1')

### Emoji valence and AFINN

Emoji Valence is a JSON file containing a score between -5 and 5 for emojis.
AFINN is a text file that also contains a score between -5 and 5, but for English words.

These two features are merged into a single value, since emojis are not present in the AFINN file and the emoji file does not contain words. Furthermore, they both use scores between -5 and 5.

In [None]:
# Parse the emoji valence JSON file into Python objects.
with open('./data/index.json') as emoji_file:
    emoji_valence = json.loads(emoji_file.read())

emoji_valence

In [None]:
# AFINN lexicon (tab-separated); the lookup below relies on `word` and `val` columns.
AFINN_PATH = 'data/AFINN-111.txt'
afinn = pd.read_csv(AFINN_PATH, sep='\t')

# Sanity check: show the score stored for a known word.
is_abandon = afinn['word'] == 'abandon'
print(afinn.loc[is_abandon, 'val'])

### Building embedding matrix

Here, we build the embedding matrix used later in the training steps. Each row starts with the 300 values of the word embedding, followed by the 10 EmoLex values, one value for the OLE, and one last value for the merged AFINN / emoji valence score.

In [None]:
# One-hot encode the sentiment labels into 3 classes.
targets = to_categorical(sentiments, 3)
# One extra row so that every index stored in word_index is a valid row;
# rows never assigned below stay all-zero.
nb_words = len(word_index) + 1

EMBEDDING_DIM = 300
EMOLEX_DIM = 10
OLE_DIM = 1
EMOJI_VALENCE_DIM = 1
AFINN_DIM = 1

# The emoji valence shares the AFINN column (a word gets one or the other),
# which is why EMOJI_VALENCE_DIM is not part of the row width.
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM + EMOLEX_DIM + OLE_DIM + AFINN_DIM))

# Out-of-vocabulary vector shared by every word missing from w2v: uniform
# values in [-1, 1) scaled to unit norm. Shape is (1, EMBEDDING_DIM).
# NOTE(review): no random seed is set, so this vector differs between runs.
oov = np.random.rand(1, EMBEDDING_DIM) * 2.0 - 1.0
oov = oov / np.linalg.norm(oov)

# Placeholder EmoLex weights for words absent from the lexicon.
empty_emolex = np.full(EMOLEX_DIM, 0.1)

print(empty_emolex)
print(oov.shape)

# Build each lexicon lookup once instead of scanning the DataFrames / lists for
# every vocabulary word.

# word -> EmoLex weights (every column after the first); first matching row wins.
emolex_lookup = {}
for emolex_word, emolex_values in zip(emolex['word'], emolex.values.tolist()):
    emolex_lookup.setdefault(emolex_word, emolex_values[1:])

# word -> scalar AFINN score; first matching row wins. Using a scalar (rather
# than the matching Series) keeps the row width fixed even if a word is duplicated.
afinn_lookup = {}
for afinn_word, afinn_score in zip(afinn['word'], afinn['val']):
    afinn_lookup.setdefault(afinn_word, afinn_score)

# emoji -> polarity; with duplicate emojis the last entry wins.
emoji_lookup = {entry['emoji']: entry['polarity'] for entry in emoji_valence}

# Lexicon entries read line by line may still carry their trailing newline;
# strip them so they can match plain tokens.
positive_set = {entry.strip() for entry in positive_words}
negative_set = {entry.strip() for entry in negative_words}

for word, i in word_index.items():
    # `in w2v` / `w2v[word]` are equivalent to `.vocab` / `.word_vec()` and
    # work on both gensim 3.x and 4.x.
    if word in w2v:
        embedding_part = w2v[word]
    else:
        embedding_part = oov.ravel()

    emolex_part = emolex_lookup.get(word, empty_emolex)

    if word in positive_set:
        ole_val = 5
    elif word in negative_set:
        ole_val = -5
    else:
        ole_val = 0

    # AFINN takes precedence; the emoji valence is only used as a fallback.
    if word in afinn_lookup:
        afinn_val = afinn_lookup[word]
    else:
        afinn_val = emoji_lookup.get(word, 0)

    embedding_matrix[i] = np.concatenate(
        [embedding_part, emolex_part, [ole_val], [afinn_val]]
    )

print(embedding_matrix.shape)