In [1]:
# external libraries
import pandas as pd
import numpy as np
from collections import Counter
from ast import literal_eval
import time
import sys 
from shutil import copyfile
from sklearn.metrics import accuracy_score, f1_score
# tensorflow and keras
import keras.optimizers
from keras.datasets import imdb
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Concatenate, Bidirectional, Reshape
from keras.layers import GRU, CuDNNGRU, CuDNNLSTM
from keras.layers.embeddings import Embedding
from keras.constraints import maxnorm
from keras.regularizers import L1L2
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.backend import tile
import keras.backend as K
from keras.layers import Lambda
# fix random seed for reproducibility - only works for CPU version of tensorflow
np.random.seed(42)

Using TensorFlow backend.


In [2]:
sentences_df = pd.read_csv('../../../../data/processed/tok_sentence_baby_reviews_spell.csv')
reviews_df = pd.read_csv('../../../../data/processed/tok_baby_reviews.csv')

In [3]:
df = reviews_df.merge(sentences_df, on='uuid')

In [4]:
print("\nFiles read, converting tokens to lists.")
for col in ['summary_tokens', 'review_tokens', 'sentence_tokens']:
    df[col] = df[col].map(literal_eval)


Files read, converting tokens to lists.


In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,polarity,year,uuid,asin,reviewerID,review_tokens,summary_tokens,summary_wc,review_wc,summary_wc_std,review_wc_std,year_std,group_id,sentiment,sentence_tokens,sentence_wc,sentence_wc_std
0,0,negative,2005,1114646400B000056OUGA3FFDK09UJS1TD,B000056OUG,A3FFDK09UJS1TD,"[i, am, primarily, breastfeeding, ,, but, i, t...","[infrequent, bottle, user, ?, then, buy, steam...",10,84,1.560512,-0.494464,-0.845298,1,neg,"[i, am, primarily, breastfeeding, but, i, thou...",20,0.249451
1,0,negative,2005,1114646400B000056OUGA3FFDK09UJS1TD,B000056OUG,A3FFDK09UJS1TD,"[i, am, primarily, breastfeeding, ,, but, i, t...","[infrequent, bottle, user, ?, then, buy, steam...",10,84,1.560512,-0.494464,-0.845298,1,neu,"[i, discovered, medela, microwave, steam, clea...",16,-0.118286
2,0,negative,2005,1114646400B000056OUGA3FFDK09UJS1TD,B000056OUG,A3FFDK09UJS1TD,"[i, am, primarily, breastfeeding, ,, but, i, t...","[infrequent, bottle, user, ?, then, buy, steam...",10,84,1.560512,-0.494464,-0.845298,1,neu,"[i, feel, badly, for, the, waste, of, money, b...",16,-0.118286
3,0,negative,2005,1114646400B000056OUGA3FFDK09UJS1TD,B000056OUG,A3FFDK09UJS1TD,"[i, am, primarily, breastfeeding, ,, but, i, t...","[infrequent, bottle, user, ?, then, buy, steam...",10,84,1.560512,-0.494464,-0.845298,1,pos,"[and, not, space, consuming, .]",5,-1.129561
4,0,negative,2005,1114646400B000056OUGA3FFDK09UJS1TD,B000056OUG,A3FFDK09UJS1TD,"[i, am, primarily, breastfeeding, ,, but, i, t...","[infrequent, bottle, user, ?, then, buy, steam...",10,84,1.560512,-0.494464,-0.845298,1,pos,"[plus, you, can, use, them, during, travel, at...",11,-0.577956


In [6]:
### Preprocessing 
# declare the padding and unknown symbols
pad_mask_int = 0
pad_mask_sym = '==pad_mask=='
unknown_int = 1
unknown_sym = '==unknown_sym=='

# vocabulary set
vocab_counter = Counter()
for doc in df['sentence_tokens']:
    vocab_counter.update(doc)

min_times_word_used = 2 # if at least 2 then the model will be prepared for unknown words in test and validation sets
print(len(vocab_counter), "tokens before discarding those that appear less than {} times.".format(min_times_word_used))
for key in list(vocab_counter.keys()):
    if vocab_counter[key] < min_times_word_used: 
        vocab_counter.pop(key)
print(len(vocab_counter), "tokens after discarding those that appear less than {} times.".format(min_times_word_used))   
vocab_set = set(vocab_counter.keys())

# vocabulary list and int map
vocab_list = [pad_mask_sym, unknown_sym] + sorted(vocab_set)
vocab_map = {word: index for index, word in enumerate(vocab_list)}

# label set
label_set = set(df['sentiment'].unique())

# label list and int map
label_list = sorted(label_set)
label_map = {word: index for index, word in enumerate(label_list)}

# polarity feature set
polarity_set = set(df['polarity'].unique())

# polarity list and int map
polarity_list = sorted(polarity_set)
polarity_map = {word: index for index, word in enumerate(polarity_list)}

# group feature set
group_set = set(df['group_id'].unique())

# group list and int map
group_list = sorted(group_set)
group_map = {word: index for index, word in enumerate(group_list)}

9429 tokens before discarding those that appear less than 2 times.
5313 tokens after discarding those that appear less than 2 times.


In [7]:
# pretrained embeddings are from https://nlp.stanford.edu/projects/glove/
# start by loading in the embedding matrix
# load the whole embedding into memory
print("\nReading big ol' word embeddings")
count = 0
embeddings_index_1 = dict()
with open('../../../../data/external/glove.twitter.27B.50d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            print(values)
        embeddings_index_1[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index_1))

#embeddings_index_2 = dict()
#with open('../../../data/external/glove.twitter.27B.100d.txt') as f:
#    for line in f:
#        values = line.split()
#        word = values[0]
#        try:
#            coefs = np.asarray(values[1:], dtype='float32')
#        except:
#            print(values)
#        embeddings_index_2[word] = coefs
#print('Loaded %s word vectors.' % len(embeddings_index_2))

embedding_dim_1 = 50
embedding_dim_2 = 0

embedding_dim = embedding_dim_1 + embedding_dim_2

# create a weight matrix for words in training docs
if embedding_dim_2 > 0:
    embedding_matrix = np.zeros((len(vocab_list), embedding_dim))
    for i, word in enumerate(vocab_list):
        embedding_vector_1 = embeddings_index_1.get(word)
        embedding_vector_2 = embeddings_index_2.get(word)
        if embedding_vector_1 is not None and embedding_vector_2 is not None:
            embedding_matrix[i] = np.concatenate((embedding_vector_1, embedding_vector_2))
        elif embedding_vector_1 is None and embedding_vector_2 is not None:
            embedding_matrix[i] = np.concatenate((np.zeros(embedding_dim_1), embedding_vector_2))        
        elif embedding_vector_1 is not None and embedding_vector_2 is None:
            embedding_matrix[i] = np.concatenate((embedding_vector_1, np.zeros(embedding_dim_2)))
        else:
            print(word)
            count += 1 # maybe we should use fuzzywuzzy to get vector of nearest word? Instead of all zeros
else:
    embedding_matrix = np.zeros((len(vocab_list), embedding_dim))
    for i, word in enumerate(vocab_list):
        embedding_vector = embeddings_index_1.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            print(word)
            count += 1 # maybe we should use fuzzywuzzy to get vector of nearest word? Instead of all zeros

print(count)


Reading big ol' word embeddings
Loaded 1193514 word vectors.
==pad_mask==
==unknown_sym==
(8
):
);
->
..
0
00
06
07
1
1-2
1/2
1/3
1/4
10
10-12
10-15
100
10oz
11
12
12-18
120
13
130
14
15
150
159
16
17
18
18mo
19
1st
1yo
2
2-1
2-3
20
200
2002
2004
2005
2007
2009
2012
21
22
22lbs
23
24
25
26
27
29
2nd
2yo
2yr
3
3-4
3/4
30
30-45
300
32
34
35
37
39
3m
3rd
3yo
4
4-5
4-6
40
45
4oz
4th
5
5-10
50
500
6
6-7
600
6mo
7
7-8
70
75
77
7oz
8
8-9
80
8lbs
8oz
9
90
900
900mhz
99
9oz
:(
:)
:-)
::
:P
;)
;-)
=)
adiri
ameda
armholes
attatch
avents
babiesrus
baby-proofing
babyhawk
barely-used
baseboards
basinett
born-free
bouncenette
breast-fed
breastpump
breastshield
breastshields
brown's
car-seats
cd's
childproof
colapsed
colicky
compactly
cosleeper
cosleeping
cozyup
cushie
doens't
dreamscapes
drop-ins
dropins
engorged
engorgement
eurobath
evenflow
excema
exersaucer
flammability
fraying
freedislike
freshners
front-to-back
furnature
fussier
fussiness
heatable
highchairs
hygeia
i'm
i've
indentations
inverts

In [8]:
from scipy import sparse 
from typing import List, Set, Dict, Tuple, Optional
import numpy as np

def create_one_hot(labels, label_dict: dict):
    """
    
    Args:
        labels:        array of labels, e.g. NumPy array or Pandas Series
        label_dict:    dict of label indices
    Return:
        one_hot_numpy: sparse CSR 2d array of one-hot vectors
    """
    one_hot_numpy = sparse.dok_matrix((len(labels), len(label_dict)), dtype=np.int8)
    for i, label in enumerate(labels):
        one_hot_numpy[i, label_dict[label]] = 1
    return sparse.csr_matrix(one_hot_numpy) 

def undo_one_hot(pred, label_list: list) -> List[List[str]]:
    """
    
    Args: 
        pred:       NumPy array of one-hot predicted classes
        label_list: a list of the label strings
    Return:
        label_pred: a list of predicted labels
    """
    label_pred = [label_list[np.argmax(row)] for row in pred]
    return label_pred
    # this could probably be done awesomely fast as NumPy vectorised but it works


def word_index(los: List[List[str]], vocab_dict: Dict[str, int], unknown: int, reverse: bool=False) -> List[List[int]]:
    """
    Replaces words with integers from a vocabulary dictionary or else with the integer for unknown
    
    Args:
        los:     list of lists of split sentences
        pad_to:  how big to make the padded list
        unknown: the integer to put in for unknown tokens (either because they were pruned or not seen in training set)
        reverse: reverse the order of tokens in the sub-list 
    Returns: 
        new_los: list of lists of split sentences where each token is replaced by an integer
        
    Examples:
    >>> print(word_index([['one', 'two', 'three'], ['one', 'two']], {'one': 1, 'two': 2, 'three': 3}, unknown=4))
    [[1, 2, 3], [1, 2]]
    >>> print(word_index([['one', 'two', 'three'], ['one', 'two']], {'one': 1, 'two': 2, 'three': 3}, unknown=4, reverse=True))
    [[3, 2, 1], [2, 1]]
    """
    new_los = []
    if reverse:
        for sentence in los:
            new_los.append([vocab_dict[word] if word in vocab_dict else unknown for word in sentence][::-1])        
    else:
        for sentence in los:
            new_los.append([vocab_dict[word] if word in vocab_dict else unknown for word in sentence])
    return new_los



In [9]:
# create one-hot sparse matrix of labels
y = create_one_hot(df['sentiment'], label_map).todense()

# create one-hot of review polarity
polarity = create_one_hot(df['polarity'], polarity_map)[:, 0].todense()

# create one-hot of group number
group = create_one_hot(df['group_id'], group_map).todense()


# replace strings with ints (tokenization is done on the Series fed to word_index())
sentences = word_index(df['sentence_tokens'], vocab_map, unknown_int, reverse=False)

# pad / truncate 
from keras.preprocessing.sequence import pad_sequences

sentence_len = max(map(len, list(df['sentence_tokens'])))

sentences = pad_sequences(sequences=sentences, 
                              maxlen=sentence_len, 
                              dtype='int32', 
                              padding='pre', 
                              value=pad_mask_int)

#group = pad_sequences(sequences=group, 
#                              maxlen=embedding_dim, 
#                              dtype='int32', 
#                              padding='pre', 
#                              value=pad_mask_int)

In [10]:
print(sentences[:2])
print(polarity[:2])
print(group[:2])
print(y[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0 2268
   281 3490  665  750 2268 4718 4712 5260  493 4998 1852 4419 3546 3249
   295 4682 3079  629   20]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0

In [11]:
NAME = 'sentences-ablation-group-9-{}'.format(time.strftime('%y%m%d_%H%M', time.localtime(time.time())))

for g in range(6,9):
    training_mask = np.logical_or(df['group_id'] != g, df['group_id'] != 9)  
    validation_mask = df['group_id'] == g  
        
    input_s = Input(shape=(sentence_len,), dtype='int32', name='input_s')
    input_p = Input(shape=(1,), dtype='float32', name='input_p')
    input_g = Input(shape=(len(group_list),), dtype='float32', name='input_g')

    embedding_vector_length = embedding_dim
    GRU_nodes_sentences = 8

    emb = Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                        weights=[embedding_matrix], trainable=False)

    emb_s = emb(input_s)

    gru_s = GRU(GRU_nodes_sentences,
            kernel_initializer='glorot_uniform',
            recurrent_initializer='orthogonal',
            bias_initializer='zeros',
            kernel_regularizer=None,
            recurrent_regularizer=None,
            bias_regularizer=L1L2(l1=0.1, l2=0.0),
            activity_regularizer=L1L2(l1=1e-07, l2=0.0),
            kernel_constraint=maxnorm(3),
            recurrent_constraint=maxnorm(3),
            bias_constraint=None,
            return_sequences=False,
            return_state=False,
            go_backwards=False,
            stateful=False,
            dropout=0.3)(emb_s)

    concat_1 = Concatenate()([gru_s,  input_p, input_g]) # 
    output = Dense(len(label_set), activation='softmax')(gru_s)
    model = Model([input_s, input_p, input_g], output) # , ,  
    nadam = keras.optimizers.nadam(lr=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])

    print(model.summary())
#     es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
    tensorboard = TensorBoard(log_dir = './tb_logs/{}'.format('group_'+str(g)+'_'+NAME))

    hist1 = model.fit(x=[sentences[training_mask], polarity[training_mask], group[training_mask]], #  , 
                      y=y[training_mask], 
                      validation_data=([sentences[validation_mask],
                                        polarity[validation_mask],
                                        group[validation_mask]], #   
                                       y[validation_mask]), 
                      epochs=50, batch_size=64, callbacks=[tensorboard]) 
    pred = model.predict([sentences[validation_mask], 
                          polarity[validation_mask], 
                          group[validation_mask]]) #  
    pred = undo_one_hot(pred, label_list)
    true_sentiment = df.loc[validation_mask, 'sentiment']
    
    f1_micro = f1_score(true_sentiment, pred, average='micro')
    f1_macro = f1_score(true_sentiment, pred, average='macro')
    accu = accuracy_score(true_sentiment, pred)

    metrics_string = """
    Group {}
     Sklearn
      f1 micro {}
      f1 macro is {}
      Accuracy {}
     TF
      {}
      """.format(g, f1_micro, f1_macro, accu, [key + " " + str(hist1.history[key][-1])  for key in hist1.history.keys()])
    print(metrics_string)
    
    with open(NAME+'.txt', mode='a') as fp:
        fp.write(metrics_string)

copyfile('sentence_predictions.ipynb', './tb_logs/{}.ipynb'.format(NAME)) # sys.argv[0] for .py files

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_s (InputLayer)         (None, 159)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 159, 50)           265750    
_________________________________________________________________
gru_1 (GRU)                  (None, 8)                 1416      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 27        
Total params: 267,193
Trainable params: 1,443
Non-trainable params: 265,750
_________________________________________________________________
None
Train on 15654 samples, validate on 1749 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epo

Train on 15654 samples, validate on 1773 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

    Group 7
     Sklearn
      f1 micro 0.6717428087986463
      f1 macro is 0.6020591806324684
      Accuracy 0.6717428087986463
     TF
      ['val_loss 0.772277628844294', 'val_acc 0.671742809168444', 'loss 0.8615684076730541', 'acc 0.5942251181197347']
      
_________________________________________________________________
Layer (type)                 

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

    Group 8
     Sklearn
      f1 micro 0.47805171377029465
      f1 macro is 0.4817129306466874
      Accuracy 0.47805171377029465
     TF
      ['val_loss 0.9839780348578778', 'val_acc 0.4780517146663356', 'loss 0.8585387152761613', 'acc 0.5958221540211017']
      


'./tb_logs/sentences-ablation-group-9-190517_1842.ipynb'

In [None]:
f1_score(pred, true_sentiment, average='macro')

In [None]:
true_sentiment = df.loc[np.logical_not(group_mask), 'sentiment']

In [None]:
pred = undo_one_hot(pred, label_list)

In [13]:
[key + " " + str(hist1.history[key][-1])  for key in hist1.history.keys()]

['val_loss 0.7653999250259758',
 'val_acc 0.6525821599042472',
 'loss 0.7189908383824065',
 'acc 0.6974193549498008']