# Sentiment Analysis

This notebook is the main one for this project. It will help navigate through the implementation scripts and notebooks, without going into too much detail.

In [59]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import *
import matplotlib.pyplot as plt
from tokenizer import tokenizer as tweet_tokenizer
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import *
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.optimizers import * 
from keras.regularizers import *
from keras.models import load_model
import pickle
import json
from keras.callbacks import ModelCheckpoint
from src.preprocessing import standardization
import os


%matplotlib inline

# Root folder holding every dataset file read by this notebook.
PATH = './data'

# Tab-separated tweet files: 3-class train/test and 7-class train/validation.
path_train_3 = PATH + '/data_train_3.csv'
path_test_3 = PATH + '/data_test_3.csv'
path_train_7 = PATH + '/data_train_7.csv'
path_val_7 = PATH + '/data_val_7.csv'

## Preprocessing and Embeddings

The first part of this project was to find a big enough dataset. As mentioned in the notebook *"src/word2vec_training"*, different techniques were investigated. We finally decided to pick this dataset:

https://archive.org/details/archiveteam-twitter-stream-2017-11

From this base we filtered out non-English, truncated, retweeted or duplicate tweets. You can download the resulting dataset of 23M tweets we used here:

https://mega.nz/#!UI0ViKiZ!x6eBjFPmkKqDcV6Il-rpQj-DNcSJIOeL6Axk-vfuOyU

Here is a preprocessing pipeline example:

In [11]:
from functools import partial
from src.text_preprocessing import TweetTokenizer, NLTKStemmer, NLTKLemmatizer, CorpusWrapper, BatchMaker, Pipeline

# Toy corpus standing in for the real tweet stream.
input_stream = [
    'This is a tweet',
    'This is another tweet',
]

# Stage factories handed to Pipeline, in this order. The output printed by the
# following cell ('thi', 'be', 'anoth') shows the tokens come out stemmed and
# lemmatized.
factories = [
    TweetTokenizer,
    partial(CorpusWrapper, NLTKStemmer),
    partial(CorpusWrapper, NLTKLemmatizer),
    partial(BatchMaker, batch_size=100000),
]

# NOTE(review): Pipeline presumably chains the stages over input_stream and
# yields batches when iterated; confirm in src/text_preprocessing.
batch_pipeline = Pipeline(input_stream, factories)

In [12]:
# Walk the batches in order and show every processed tweet on its own line.
for tweet in (item for batch in batch_pipeline for item in batch):
    print(tweet)

['thi', 'be', 'a', 'tweet']
['thi', 'be', 'anoth', 'tweet']


### Loading and Preprocessing our datasets 

In [43]:
def _read_tweets(path):
    """Read a headerless, tab-separated tweet file into an ID/Class/Tweet frame.

    The Tweet column is forced to str for every file (the 3-class training
    file was previously read without it), so all three frames hand the same
    text dtype to standardization().
    """
    return pd.read_csv(path, sep='\t', names=['ID', 'Class', 'Tweet'], dtype={'Tweet': str})


tweets = _read_tweets(path_train_3)
tweets_7 = _read_tweets(path_train_7)
tweets_3_test = _read_tweets(path_test_3)

# Last expression of the cell: (rows, columns) of the three frames.
tweets.shape, tweets_7.shape, tweets_3_test.shape

((50333, 3), (1630, 3), (1630, 3))

In [44]:
# Encode the textual polarity label as an integer class id (an unknown label
# raises KeyError), then run the project's standardization() on every tweet.
label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
tweets['Sentiment'] = tweets['Class'].apply(lambda label: label_to_id[label])
tweets['Tweet'] = tweets['Tweet'].apply(standardization)

In [45]:
# Shift the numeric class by +3 to get the Sentiment id, then run the
# project's standardization() on every tweet.
tweets_7['Sentiment'] = tweets_7['Class'].add(3)
tweets_7['Tweet'] = tweets_7['Tweet'].apply(standardization)

In [46]:
# Same encoding as the 3-class training frame: polarity label -> integer id
# (an unknown label raises KeyError), followed by text standardization.
test_label_to_id = {'negative': 0, 'neutral': 1, 'positive': 2}
tweets_3_test['Sentiment'] = tweets_3_test['Class'].apply(lambda label: test_label_to_id[label])
tweets_3_test['Tweet'] = tweets_3_test['Tweet'].apply(standardization)

In [47]:
# Quick look at each frame: two random 3-class training rows, then the first
# two rows of the 7-class and 3-class test frames.
display(tweets.sample(2), tweets_7.head(2), tweets_3_test.head(2))

Unnamed: 0,ID,Class,Tweet,Sentiment
11552,281269284217421824,positive,watch nightmare christmas first time long time...,2
18098,634203696872579072,positive,snoop dogg gonna centennial game friday gonna ...,2


Unnamed: 0,ID,Class,Tweet,Sentiment
0,0,0,yeah ☺ ️ playing well,3
1,1,0,least not guy try discourage anymore want neve...,3


Unnamed: 0,ID,Class,Tweet,Sentiment
0,449,negative,site crash everytime try book help tell nothin...,0
1,450,negative,theme week ask lord strength perspective perse...,0


### Tokenizer

In [51]:
# Text / integer-label pairs for the three labelled frames.
train_x, train_y = tweets['Tweet'], tweets['Sentiment']
test_x, test_y = tweets_3_test['Tweet'], tweets_3_test['Sentiment']
train7_x, train7_y = tweets_7['Tweet'], tweets_7['Sentiment']

# The vocabulary is fitted on all three tweet sets at once, so word_index
# covers every word seen in any of them.
all_tweets = pd.concat([train_x, test_x, train7_x])
# filters=' ' replaces the Tokenizer's default filter characters with a lone space.
tokenizer = Tokenizer(filters=' ')
tokenizer.fit_on_texts(all_tweets)
# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

### Embedding Loading

We used Gensim to train embeddings on our dataset, using a previously defined pipeline. You can load our embeddings this way:

In [13]:
from gensim.models import Word2Vec

# model = Word2Vec.load('./data/trained_embeddings_23M.model')

However, after testing, we noticed that the model for 3 classes performed better with embeddings pre-trained on more than 330M tweets:

In [23]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile


# Convert the GloVe-format text file into word2vec text format inside a
# temporary file, then load that file as KeyedVectors.
tmp_file = get_tmpfile('datastories.300d.word2vec')
glove2word2vec('./data/embeddings/datastories.twitter.300d.txt', tmp_file)
w2v = KeyedVectors.load_word2vec_format(tmp_file)

## Enriching the embedding matrix

### EmoLex

In [24]:
# Tab-separated lexicon; the embedding-matrix cell matches rows on its 'word'
# column and uses every column after the first as features.
emolex = pd.read_csv('data/EmoLex.txt', sep='\t')

### OLE

In [25]:
def _read_opinion_words(path, encoding=None, header_lines=35):
    """Return the words of an opinion-lexicon file, one per non-blank line.

    The first `header_lines` lines are skipped. Every word is stripped of
    surrounding whitespace: without that each entry keeps its trailing
    newline and can never equal a token from the tokenizer's word_index,
    which silently left the positive/negative features at zero.
    """
    with open(path, encoding=encoding) as lexicon_file:
        for _ in range(header_lines):
            # A default avoids StopIteration on a file shorter than the header.
            next(lexicon_file, None)
        return [line.strip() for line in lexicon_file if line.strip()]


positive_words = _read_opinion_words('data/positive-words.txt')
negative_words = _read_opinion_words('data/negative-words.txt', encoding='ISO-8859-1')

### Emoji valence and AFINN

In [27]:
# NOTE(review): read_csv takes the first line of the file as the header here,
# and the code below relies on 'word' and 'val' columns; confirm the local copy
# of AFINN-111.txt starts with that header line.
afinn = pd.read_csv('data/AFINN-111.txt', sep='\t')

def val_to_list(x):
    """One-hot encode a valence score into an 11-long int vector.

    The score is shifted by +5 first, so scores from -5 to 5 map to
    positions 0 to 10.
    """
    x += 5
    return(to_categorical(x, num_classes=11, dtype='int'))


# Replace every scalar score in the 'val' column with its one-hot vector.
afinn[['val']] = afinn['val'].apply(val_to_list)

In [29]:
# Load the emoji valence entries from JSON.
with open('./data/index.json') as emoji_file:
    emoji_valence = json.load(emoji_file)

# Swap each entry's polarity for its one-hot encoding, in place.
for entry in emoji_valence:
    entry['polarity'] = val_to_list(entry['polarity'])

### Depeche Mood

In [30]:
# Two tab-separated DepecheMood tables. In the code visible in this notebook
# only DepecheMoodpp is read afterwards: the embedding-matrix cell matches rows
# on its 'word' column and takes columns 1..8 as features.
DepecheMoodpp = pd.read_csv('data/DepecheMood/DepecheMood_english_token_full.tsv', sep='\t')
DepecheMood = pd.read_csv('data/DepecheMood/DepecheMood_freq.txt', sep='\t')

### Embedding matrix

In [52]:
def _first_row_by_word(frame, start, stop=None):
    """Map each value of frame['word'] to columns [start:stop] of its first row.

    Built once, this replaces a full DataFrame scan per vocabulary word.
    """
    lookup = {}
    for key, row in zip(frame['word'].tolist(), frame.values.tolist()):
        # setdefault keeps the first occurrence, like `.values.tolist()[0]` did.
        lookup.setdefault(key, row[start:stop])
    return lookup


# One extra row: index 0 is never assigned by the loop below and stays all-zero.
nb_words = len(word_index) + 1

EMBEDDING_DIM = 300
EMOLEX_DIM = 10
OLE_DIM = 2
EMOJI_VALENCE_DIM = 11
AFINN_DIM = 11
DEPECHE_MOOD_DIM = 8
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM + EMOLEX_DIM + OLE_DIM + AFINN_DIM + DEPECHE_MOOD_DIM + EMOJI_VALENCE_DIM))

# Out of vocabulary: one random unit-norm vector shared by every unknown word.
# Kept with shape (1, EMBEDDING_DIM), as before; it is flattened on use.
oov = (np.random.rand(1, EMBEDDING_DIM) * 2.0) - 1.0
oov = oov / np.linalg.norm(oov)
empty_afinn = np.full(AFINN_DIM, 0)
empty_emoji = np.full(EMOJI_VALENCE_DIM, 0)
empty_emolex = np.full(EMOLEX_DIM, 0)
empty_depeche = np.full(DEPECHE_MOOD_DIM, 0)

# Lexicon lookups, built once instead of being searched for every word.
emolex_lookup = _first_row_by_word(emolex, 1)
depeche_lookup = _first_row_by_word(DepecheMoodpp, 1, 9)
# First occurrence wins; the previous `.item()` call raised on duplicated words.
afinn_lookup = {}
for afinn_word, afinn_vector in zip(afinn['word'], afinn['val']):
    afinn_lookup.setdefault(afinn_word, afinn_vector)
# Last occurrence wins, matching the former un-broken scan over the entries.
emoji_lookup = {entry['emoji']: entry['polarity'] for entry in emoji_valence}
# Strip so entries read with a trailing newline still match tokenizer words.
positive_lookup = {entry.strip() for entry in positive_words}
negative_lookup = {entry.strip() for entry in negative_words}

for word, i in word_index.items():
    word_vector = w2v.word_vec(word) if word in w2v.vocab else oov

    ole_val = [0, 0]
    if word in positive_lookup:
        ole_val = [1, 0]
    elif word in negative_lookup:
        ole_val = [0, 1]

    # The emoji valence is only consulted when the word has no AFINN entry.
    afinn_val = empty_afinn
    emoji_val = empty_emoji
    if word in afinn_lookup:
        afinn_val = afinn_lookup[word]
    elif word in emoji_lookup:
        emoji_val = emoji_lookup[word]

    # Feature order: embedding, EmoLex, DepecheMood, OLE, AFINN, emoji valence.
    parts = [
        word_vector,
        emolex_lookup.get(word, empty_emolex),
        depeche_lookup.get(word, empty_depeche),
        ole_val,
        afinn_val,
        emoji_val,
    ]
    embedding_matrix[i] = np.concatenate([np.ravel(part) for part in parts])


print(embedding_matrix.shape)

(37785, 342)


In [121]:
# Claim memory back from this very large object we don't use anymore.
# Running this cell again without reloading the embeddings raises NameError.
del w2v

## Source task (3 classes)

In [53]:
# Turn every tweet into a list of word indices.
train_sequences = tokenizer.texts_to_sequences(train_x)
test_sequences = tokenizer.texts_to_sequences(test_x)
train7_sequences = tokenizer.texts_to_sequences(train7_x)

# All sets are padded to the longest tweet found in any of them.
sequences = train_sequences + test_sequences + train7_sequences
MAX_SEQUENCE_LENGTH = max((len(seq) for seq in sequences), default=0)

train_sequences = pad_sequences(train_sequences, MAX_SEQUENCE_LENGTH)
test_sequences = pad_sequences(test_sequences, MAX_SEQUENCE_LENGTH)
# Fixed: this line used the undefined name `train7_pad_sequencesuences`.
train7_sequences = pad_sequences(train7_sequences, MAX_SEQUENCE_LENGTH)

train_sequences.shape, test_sequences.shape, train7_sequences.shape

((50333, 32), (1630, 32), (1630, 32))

In [54]:
# One-hot encode the sentiment ids into 3 columns and hold out 30% of the
# padded training tweets for validation.
targets = to_categorical(train_y, 3)

X_train, X_val, y_train, y_val = train_test_split(train_sequences, targets, test_size=0.3)

print(f'training set: {len(X_train)} samples')
print(f'validation set: {len(X_val)} samples')

for caption, split_array in (('x_train:', X_train), ('y_train:', y_train)):
    print(caption, split_array.shape)

training set: 35233 samples
validation set: 15100 samples
x_train: (35233, 32)
y_train: (35233, 3)


In [55]:
def model_mine():
    """Build and compile the 3-class sentiment model.

    The first layer is a frozen Embedding initialised from the notebook-level
    `embedding_matrix`, taking inputs of length `MAX_SEQUENCE_LENGTH`. It is
    followed by two bidirectional LSTMs (150 units per direction, full
    sequences returned) with dropout, a Flatten and a 3-way softmax.

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy, Adam, accuracy).
    The layer summary is printed as a side effect.
    """
    vocab_size = embedding_matrix.shape[0]
    embedding_size = embedding_matrix.shape[1]

    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], 
                        input_length=MAX_SEQUENCE_LENGTH, trainable=False, name='embedding_layer'))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(150, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(150, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()

    return model

In [56]:
# Build the 3-class source model; model_mine() also prints its layer summary.
model = model_mine()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 32, 342)           12922470  
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 342)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 32, 300)           591600    
_________________________________________________________________
dropout_2 (Dropout)          (None, 32, 300)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32, 300)           541200    
_________________________________________________________________
dropout_3 (Dropout)          (None, 32, 300)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9600)              0         
__________

In [57]:
# Train the 3-class model for 6 epochs in batches of 128, with (X_val, y_val)
# passed as validation data.
model.fit(X_train, y_train, batch_size=128, validation_data=(X_val, y_val), epochs=6)

Train on 35233 samples, validate on 15100 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7fc42400ecc0>

## Transfer learning (7 classes)

In [58]:
def reshape_model(model):
    """Turn the trained 3-class model into a 7-class one for transfer learning.

    The model is modified in place: its last two layers are removed (for a
    model built by model_mine() these are the Flatten and the 3-way softmax),
    every remaining layer is frozen, and a new trainable head ending in a
    7-way softmax is appended. The same, recompiled model object is returned.
    """
    # Remove the last two layers of the source model.
    for _ in range(2):
        model.pop()

    # Whatever is left is reused as a fixed feature extractor.
    for kept_layer in model.layers:
        kept_layer.trainable = False

    # New head, in the order it is stacked on the frozen layers.
    new_head = [
        Dense(150, activation='relu'),
        Dropout(0.2),
        Dense(80, activation='relu'),
        Dense(30, activation='relu'),
        Flatten(),
        Dense(7, activation='softmax'),
    ]
    for head_layer in new_head:
        model.add(head_layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [60]:
# reshape_model() edits the model in place, so running this cell twice would
# pop layers from the newly added head instead of the original model.
model = reshape_model(model)

In [61]:
# One-hot encode the 7-class Sentiment ids (Class + 3) into 7 columns.
targets = to_categorical(train7_y, 7)

In [62]:
# Hold out 10% of the padded 7-class tweets for validation.
X_train, X_val, y_train, y_val = train_test_split(train7_sequences, targets, test_size=0.1)

print(f'training set: {len(X_train)} samples')
print(f'validation set: {len(X_val)} samples')

for caption, split_array in (('x_train:', X_train), ('y_train:', y_train)):
    print(caption, split_array.shape)

training set: 1467 samples
validation set: 163 samples
x_train: (1467, 32)
y_train: (1467, 7)


In [76]:
# Train the reshaped model on the 7-class split for 30 epochs in batches of
# 128, with (X_val, y_val) passed as validation data.
model.fit(X_train, y_train, batch_size=128, validation_data=(X_val, y_val), epochs=30)

Train on 1467 samples, validate on 163 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fc1798eadd8>

# Model evaluation

In [64]:
# Development set; unlike the training files it is read with its header row.
dev_7 = pd.read_csv(path_val_7, sep='\t')

# Annotated label -> signed intensity, from -3 to 3.
seven_to_7 = {'-3: very negative emotional state can be inferred': -3,
              '-2: moderately negative emotional state can be inferred': -2,
              '-1: slightly negative emotional state can be inferred': -1,
              '3: very positive emotional state can be inferred': 3,
              '1: slightly positive emotional state can be inferred': 1,
              '2: moderately positive emotional state can be inferred': 2,
              '0: neutral or mixed emotional state can be inferred': 0}

# Same labels collapsed to polarity via the sign of the intensity:
# 0 = negative, 1 = neutral, 2 = positive.
seven_to_3 = {label: (score > 0) - (score < 0) + 1
              for label, score in seven_to_7.items()}


intensity_labels = dev_7['Intensity Class']
dev_7['Sentiment'] = intensity_labels.apply(lambda label: seven_to_3[label])
dev_7['Target'] = intensity_labels.apply(lambda label: seven_to_7[label])
# Shifted by +3 so the ids start at 0.
dev_7['Target_from_0'] = dev_7['Target'] + 3
dev_7['Tweet'] = dev_7['Tweet'].apply(standardization)

dev_x, dev_y = dev_7['Tweet'], dev_7['Target_from_0']

# Index the tweets with the already-fitted tokenizer and pad them like the training data.
dev_sequences = pad_sequences(tokenizer.texts_to_sequences(dev_x), MAX_SEQUENCE_LENGTH)

In [65]:
from scipy.stats import pearsonr
from sklearn.metrics import cohen_kappa_score

# Predicted class = index of the largest model output per tweet. dev_y holds
# Target_from_0, i.e. the signed intensity shifted by +3, so both start at 0.
y_pred = np.argmax(model.predict(dev_sequences, batch_size=128), axis=1)
y_true = dev_y

# Pearson correlation and quadratically weighted Cohen's kappa between the
# predictions and the gold labels.
p = pearsonr(y_true, y_pred)[0]
w = cohen_kappa_score(y_pred, y_true, weights='quadratic')

print(f'Pearson: {p}')
print(f'Quadratic kappa: {w}')

Pearson: 0.6448126409970354
Quadratic kappa: 0.6407632098871721


## Creating a submission file

In [69]:
def read_preprocess(filepath, test=False, maxlen=None):
    """Load a tab-separated tweet file and turn it into padded index sequences.

    Parameters
    ----------
    filepath : path of a TSV file with a header row and a 'Tweet' column;
        unless `test` is true it must also have an 'Intensity Class' column
        holding the annotated label strings.
    test : when true the file is treated as unlabelled: no target columns are
        added and the returned targets are None.
    maxlen : length the sequences are padded/truncated to. Defaults to the
        notebook-level MAX_SEQUENCE_LENGTH, i.e. the input length the model
        was built with (previously hard-coded as 32).

    Returns
    -------
    (df, sequences, targets): the frame with the added columns, the padded
    sequences produced by the notebook-level tokenizer, and the
    'Target_from_0' column (or None when `test` is true).

    Raises
    ------
    KeyError if a label is not one of the seven known intensity classes.
    """
    seven_to_7 = {'-3: very negative emotional state can be inferred': -3,
                  '-2: moderately negative emotional state can be inferred': -2,
                  '-1: slightly negative emotional state can be inferred': -1,
                  '3: very positive emotional state can be inferred': 3,
                  '1: slightly positive emotional state can be inferred': 1,
                  '2: moderately positive emotional state can be inferred': 2,
                  '0: neutral or mixed emotional state can be inferred': 0}

    if maxlen is None:
        maxlen = MAX_SEQUENCE_LENGTH

    df = pd.read_csv(filepath, sep='\t')

    # One consistent truthiness check replaces the `test == False` / `not test` mix.
    if not test:
        df['Target'] = df['Intensity Class'].apply(lambda x: seven_to_7[x])
        # Shifted by +3 so the ids start at 0.
        df['Target_from_0'] = df['Target'] + 3

    df['Tweet_standardized'] = df['Tweet'].apply(standardization)

    targets = None if test else df['Target_from_0']

    sequences = tokenizer.texts_to_sequences(df['Tweet_standardized'])
    sequences = pad_sequences(sequences, maxlen)

    return df, sequences, targets

def predict(sequences, targets, test=False):
    """Return the predicted class index for every padded sequence.

    Uses the notebook-level `model`. Unless `test` is true, the Pearson
    correlation between `targets` and the predictions is printed as well.
    """
    class_scores = model.predict(sequences, batch_size=128)
    y_pred = np.argmax(class_scores, axis=1)

    # No gold labels to compare against for a test set.
    if test:
        return y_pred

    p = pearsonr(targets, y_pred)[0]
    print(f"Pearson: {p}")
    return y_pred

def output_to_csv(df, y_pred, output_filename='test_output.tsv'):
    """Write predictions as a tab-separated submission file.

    Parameters
    ----------
    df : frame with at least 'ID', 'Tweet' and 'Affect Dimension' columns,
        one row per prediction. It is not modified.
    y_pred : iterable of predicted class ids in 0..6, i.e. the signed
        intensity shifted by +3.
    output_filename : path of the file to (over)write.

    The file holds the columns ID, Tweet, Affect Dimension and
    Intensity Class, the last one being the label string of each prediction.

    Raises
    ------
    KeyError if a prediction falls outside 0..6.
    """
    intensity_labels = {-3: '-3: very negative emotional state can be inferred',
                        -2: '-2: moderately negative emotional state can be inferred',
                        -1: '-1: slightly negative emotional state can be inferred',
                        3: '3: very positive emotional state can be inferred',
                        1: '1: slightly positive emotional state can be inferred',
                        2: '2: moderately positive emotional state can be inferred',
                        0: '0: neutral or mixed emotional state can be inferred'}

    # Undo the +3 shift to get back to the signed intensity, then its label.
    y_out = [intensity_labels[y - 3] for y in y_pred]

    submission = df.assign(ic=pd.Series(y_out).values)
    submission = submission[['ID', 'Tweet', 'Affect Dimension', 'ic']]
    submission = submission.rename(index=str, columns={"ic": "Intensity Class"})

    # The context manager closes the file even when to_csv raises.
    with open(output_filename, 'w') as handle:
        submission.to_csv(path_or_buf=handle, sep='\t', index=False)

In [70]:
# Unlabelled test file: with test=True, y_test comes back as None.
df, seq_test, y_test = read_preprocess('data/test.txt', test=True)

In [77]:
# test=True skips the Pearson report, which needs gold labels.
y_pred = predict(seq_test, y_test, test=True)

In [78]:
# Map the predicted class ids back to label strings and write them to answer.tsv.
output_to_csv(df, y_pred, 'answer.tsv')