In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

In [2]:
# Seed NumPy's global RNG before sampling so the shuffle order is repeatable.
np.random.seed(42)
df = (
    pd.read_csv('../../data/raw/phase1_movie_reviews-train.csv')
    .sample(frac=1)            # frac=1 draws every row, i.e. a full shuffle
    .reset_index(drop=True)    # renumber rows 0..n-1 in the shuffled order
)

In [3]:
# Keep only the first 9000 (already shuffled) rows to keep experiments small.
df = df.iloc[:9000]

In [4]:
df.head()

Unnamed: 0,polarity,summary,reviewText,year
0,negative,Don't take this movie serious...,Rating System:1 star = abysmal; some books des...,2004
1,positive,Ridley Scott's Graphic Masterpiece Comes To DVD,Ridley Scott's Graphic War Film Black Hawk Dow...,2002
2,negative,It'll be forgotten in ten years.,"You know how every year, there are usually a f...",2003
3,negative,"""I miss you, Benny-boo-boo-boo!""","""How to Lose A Guy in Ten Days"" starts off awk...",2005
4,negative,Yawn,You could be forgiven after 15 minutes of this...,2003


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 4 columns):
polarity      9000 non-null object
summary       8999 non-null object
reviewText    9000 non-null object
year          9000 non-null int64
dtypes: int64(1), object(3)
memory usage: 281.3+ KB


In [6]:
df[df.isnull().any(axis=1)]

Unnamed: 0,polarity,summary,reviewText,year
1796,negative,,This show lacks many things when compared to C...,2004


In [7]:
print(df.at[1796, 'reviewText'])

This show lacks many things when compared to CSI: namely acting, and good writing.  Curuso is a total bore, no personality and acts exactly the same as he did on NYPD Blue.  A total let down, but that was to be expected.  I would go so far as to say that this is a poor show, period.  I watched the entire first season, always waiting for it to get better.  It never did.  The only decent acting comes from Procter and the M.E. from News Radio, who is utterly unbelieveable in her role.  This is what happens when TV execs recycle TV shows.  Suprisingly, CSI NY is even worse, and I have tremendous respect for Sinise as an actor.Don't fake the funk.


In [8]:
# social science coding random sampling code

# np.random.seed(1980)
# row_i = np.random.randint(0, high=len(df), size=20)
# for i in row_i:
#     print("Polarity: ", df.at[i, 'polarity'])
#     print("Year: ", df.at[i, 'year'])
#     print("Summary: ", df.at[i, 'summary'])
#     print()
#     print("Review: ", df.at[i, 'reviewText'])
#     print()

# Preprocessing

In [9]:
# Integer id and placeholder string for padding positions. The id is passed as
# `value=` to pad_sequences, and the Embedding layers are built with
# mask_zero=True, so it has to stay 0.
pad_mask_int = 0
pad_mask_sym = '==pad_mask=='
# padleft_int = 0
# padleft_sym = '==padleft_sym=='
# padright_int = 1
# padright_sym = '==padright_sym=='
# Integer id and placeholder string for out-of-vocabulary tokens.
# NOTE: vocab_list is built as [pad_mask_sym, unknown_sym] + sorted tokens, so
# these two ids must match list positions 0 and 1.
unknown_int = 1
unknown_sym = '==unknown_sym=='

In [10]:
# Replace missing values (e.g. the one review without a summary) by empty strings.
df = df.fillna('')

In [11]:
# lower-case?
# for col in ['polarity', 'summary', 'reviewText']:
#     df[col] = df[col].str.lower()
# tokenizer does this

In [12]:
# tokenization 

import re
from nltk.tokenize import TweetTokenizer

def tokenize(string, tknzr=None):
    """
    Split a string into a list of lower-cased token strings.

    HTML-escaped double quotes (``&quot;``) are converted back to ``"`` first.
    Runtime grows with the length of the input string.

    Args:
        string: raw text to tokenize.
        tknzr:  optional object exposing ``tokenize(str) -> list``. When omitted,
                a ``TweetTokenizer(preserve_case=False, strip_handles=True,
                reduce_len=True)`` is built on the first call and reused by all
                later calls, instead of being rebuilt for every string.
    Returns:
        list of token strings.
    """
    # Plain literal replacement; the previous regex pattern '\&quot;' relied on
    # an invalid string escape sequence to match the same text.
    string = string.replace('&quot;', '"')

    if tknzr is None:
        # Lazily create the default tokenizer once and cache it on the function.
        tknzr = getattr(tokenize, '_default_tknzr', None)
        if tknzr is None:
            tknzr = TweetTokenizer(preserve_case = False, strip_handles = True, reduce_len = True)
            tokenize._default_tknzr = tknzr
    return tknzr.tokenize(string)

print(tokenize('This movie is my favorite!!!!!!!! I &quot;love&quot; it so much'))

['this', 'movie', 'is', 'my', 'favorite', '!', '!', '!', 'i', '"', 'love', '"', 'it', 'so', 'much']


In [13]:
%%time
# Tokenize both free-text columns into new list-of-token columns.
for text_column, token_column in (("reviewText", "review_tokens"),
                                  ("summary", "summary_tokens")):
    df[token_column] = df[text_column].map(tokenize)

CPU times: user 6.48 s, sys: 31.1 ms, total: 6.51 s
Wall time: 6.51 s


In [14]:
# tokenization example
list(df.loc[1:1, 'review_tokens'])

[['ridley',
  "scott's",
  'graphic',
  'war',
  'film',
  'black',
  'hawk',
  'down',
  'is',
  'a',
  'masterpiece',
  'on',
  'every',
  'level',
  'of',
  'film',
  'making',
  '.',
  'it',
  'intorduces',
  'its',
  'charcters',
  'well',
  'even',
  'theo',
  'we',
  "don't",
  'know',
  'much',
  'about',
  'them',
  '.',
  'we',
  'feel',
  'what',
  'they',
  'half',
  'to',
  'go',
  'through',
  'in',
  'the',
  'film',
  '.',
  'what',
  'i',
  'liked',
  'most',
  'about',
  'the',
  'film',
  'is',
  'that',
  'it',
  'shows',
  'the',
  'level',
  'of',
  'cauos',
  'that',
  'war',
  'is',
  '.',
  'when',
  'something',
  'goes',
  'wrong',
  'these',
  'charcters',
  'act',
  'like',
  'real',
  'people',
  'would',
  'they',
  'act',
  'scared',
  ',',
  'afride',
  ',',
  'and',
  'nervous',
  '.',
  'you',
  'mite',
  'not',
  'know',
  'these',
  'charcters',
  'well',
  'enough',
  '.',
  'but',
  'you',
  'no',
  'them',
  'enough',
  'to',
  'feel',
  'for',
 

In [15]:
# Word counts = number of tokens in each tokenized summary / review.
for source in ('summary', 'review'):
    df[source + '_wc'] = df[source + '_tokens'].map(len)

In [16]:
# Standardise the continuous columns to zero mean / unit standard deviation.
# NOTE(review): the means and standard deviations are computed over every row
# of `df`, i.e. before the train/validation/test split performed later in the
# notebook, so the held-out rows contribute to these statistics.
# NOTE(review): np.std(...) is called on the Series here; Series.std() uses a
# different default ddof — confirm before swapping one for the other.
mean_summary_wc = np.mean(df['summary_wc'])
mean_review_wc = np.mean(df['review_wc'])
mean_year = np.mean(df['year'])

std_summary_wc = np.std(df['summary_wc'])
std_review_wc = np.std(df['review_wc'])
std_year = np.std(df['year'])

# After these assignments the three columns hold z-scores, not raw values.
df['summary_wc'] = (df['summary_wc'] - mean_summary_wc) / std_summary_wc
df['review_wc'] = (df['review_wc'] - mean_review_wc) / std_review_wc
df['year'] = (df['year'] - mean_year) / std_year

# Without this standardisation, training was observed to stay stuck at 0.5 accuracy.

In [17]:
# train, validation, test split
from sklearn.model_selection import train_test_split

print("Splitting into train/test/validation...")
# 80% goes to train; the held-out 20% is then halved into validation and test.
train, holdout = train_test_split(df, test_size=0.2, random_state=7)
validation, test = train_test_split(holdout, test_size=0.5, random_state=7)

Splitting into train/test/validation...


In [18]:
test['review_wc']

8770   -0.137212
3929   -0.840514
1096   -0.608838
4431   -0.331654
6784   -0.207542
3529    0.065505
8946    0.487486
5464   -0.414395
6401   -0.315106
2684    2.245740
5589   -0.439218
6947   -0.447492
2223   -0.364751
4898   -0.195131
4384   -0.869473
7798   -0.186857
3063   -0.675031
5174    0.218576
5841    0.408881
3722    1.459697
2551   -0.753635
6215   -0.368888
4310    0.156520
1257   -0.385436
934     2.729777
2169   -0.228227
8981   -0.848788
5072   -0.356477
2405    0.152383
842    -0.555056
          ...   
6736   -0.861199
3389    0.214439
8052   -0.389573
6304   -0.844651
5270    0.214439
187    -0.546782
5693   -0.749498
1587   -0.373025
788     1.219747
8250    0.185480
7632   -0.844651
1127   -0.799143
1901   -0.745361
6846   -0.724676
582    -0.865336
3709   -0.699853
4797   -0.782595
6623   -0.848788
2495   -0.737087
247     0.086190
732    -0.282009
2235   -0.037922
6913    3.966761
3213    2.489827
1538   -0.981174
1787   -0.509548
5577    0.479211
5448   -0.8694

In [19]:
# vocabulary set
# Count tokens over the training split only (summaries first, then reviews).
vocab_counter = Counter()
for column in ('summary_tokens', 'review_tokens'):
    for token_list in train[column]:
        vocab_counter.update(token_list)

# With a threshold of at least 2, some training tokens fall out of the vocabulary
# and get encoded as the unknown symbol, so the model meets that symbol during
# training and is prepared for unseen words in the validation and test sets.
min_times_word_used = 2
print(len(vocab_counter), "tokens before discarding those that appear less than {} times.".format(min_times_word_used))
rare_tokens = [token for token, seen in vocab_counter.items() if seen < min_times_word_used]
for token in rare_tokens:
    del vocab_counter[token]
print(len(vocab_counter), "tokens after discarding those that appear less than {} times.".format(min_times_word_used))
vocab_set = set(vocab_counter)

66147 tokens before discarding those that appear less than 2 times.
30097 tokens after discarding those that appear less than 2 times.


In [20]:
# vocabulary list and int map
# The padding and unknown symbols take positions 0 and 1; real tokens follow in sorted order.
vocab_list = [pad_mask_sym, unknown_sym]
vocab_list.extend(sorted(vocab_set))
vocab_map = dict(zip(vocab_list, range(len(vocab_list))))

In [21]:
print(vocab_list[1])
vocab_map['==unknown_sym==']

==unknown_sym==


1

In [22]:
# Distinct polarity labels present in the training split.
label_set = {label for label in train['polarity'].unique()}

In [23]:
# Sorted label names and the column index each one gets in the one-hot targets.
label_list = sorted(label_set)
label_map = dict(zip(label_list, range(len(label_list))))

In [24]:
# polarity to 0 / 1

In [25]:
from scipy import sparse 

def create_one_hot(labels, label_dict):
    """
    Encode labels as one-hot rows.

    Args:
        labels:        sized iterable of labels; every label must be a key of label_dict
        label_dict:    dict mapping each label to its column index
    Return:
        sparse CSR int8 matrix of shape (len(labels), len(label_dict)) with a
        single 1 per row, placed in the column given by label_dict
    """
    n_rows = len(labels)
    n_classes = len(label_dict)
    # Fill a DOK matrix cell by cell, then hand back a CSR copy.
    scratch = sparse.dok_matrix((n_rows, n_classes), dtype=np.int8)
    row = 0
    for label in labels:
        scratch[row, label_dict[label]] = 1
        row += 1
    return sparse.csr_matrix(scratch)

def undo_one_hot(pred, label_list):
    """
    Map rows of class scores back to label strings.

    Args:
        pred: 2-d array-like of per-class scores (or one-hot rows), one row per sample
        label_list: list of label strings, ordered by class column
    Return:
        NumPy array holding, for each row, the label of its highest-scoring column
    """
    winning_columns = np.argmax(pred, axis=1)
    return np.array(label_list)[winning_columns]

In [26]:
# One-hot targets for each split, all using the same label -> column mapping.
y_train, y_validation, y_test = (
    create_one_hot(split['polarity'], label_map)
    for split in (train, validation, test)
)

In [27]:
y_train.todense()

matrix([[0, 1],
        [1, 0],
        [0, 1],
        ...,
        [1, 0],
        [0, 1],
        [1, 0]], dtype=int8)

In [28]:
# replace strings with ints 
from typing import List, Set, Dict, Tuple, Optional

def word_index(los: List[List[str]], vocab_dict: Dict[str, int], unknown: int, reverse: bool=False) -> List[List[int]]:
    """
    Replaces each word with its integer from a vocabulary dictionary; words that
    are not in the dictionary are replaced with `unknown`.

    Args:
        los: list of lists of split sentences
        vocab_dict: maps every known word to its integer index
        unknown: integer substituted for any word missing from vocab_dict
        reverse: when True, each encoded sentence is returned in reversed word order
    Returns:
        new_los: list of lists of ints, one inner list per input sentence

    Examples:
    >>> word_index([['one', 'two', 'three'], ['one', 'two']], {'one': 1, 'two': 2, 'three': 3}, unknown=4)
    [[1, 2, 3], [1, 2]]
    >>> word_index([['one', 'two', 'nine']], {'one': 1, 'two': 2}, unknown=4, reverse=True)
    [[4, 2, 1]]
    """
    new_los = []
    for sentence in los:
        encoded = [vocab_dict.get(word, unknown) for word in sentence]
        if reverse:
            # Reverse in place so the result is still a real list; reversed()
            # would yield a one-shot iterator that has no len() and cannot be padded.
            encoded.reverse()
        new_los.append(encoded)
    return new_los

In [29]:
print(word_index([['one', 'two', 'three'], ['one', 'two']], {'one': 1, 'two': 2, 'three': 3}, unknown=4))

[[1, 2, 3], [1, 2]]


In [30]:
# Encode the token lists of every split as vocabulary indices; tokens missing
# from vocab_map become unknown_int.
train_summary, validation_summary, test_summary = [
    word_index(split['summary_tokens'], vocab_map, unknown_int)
    for split in (train, validation, test)
]
train_review, validation_review, test_review = [
    word_index(split['review_tokens'], vocab_map, unknown_int)
    for split in (train, validation, test)
]

In [31]:
# pad / truncate 
from keras.preprocessing.sequence import pad_sequences

# Summaries are padded to the longest training summary; reviews to a fixed 500 tokens.
summary_len = max(map(len, train['summary_tokens']))
review_len = 500


def _pad(sequences, maxlen, **extra):
    """Left-pad int sequences to `maxlen` with pad_mask_int; `extra` is forwarded to pad_sequences."""
    return pad_sequences(sequences=sequences,
                         maxlen=maxlen,
                         dtype='int32',
                         padding='pre',
                         value=pad_mask_int,
                         **extra)


# Summaries: padding only (no explicit truncating argument).
train_summary = _pad(train_summary, summary_len)
validation_summary = _pad(validation_summary, summary_len)
test_summary = _pad(test_summary, summary_len)

# Reviews: over-long reviews additionally lose tokens from the front.
train_review = _pad(train_review, review_len, truncating='pre')
validation_review = _pad(validation_review, review_len, truncating='pre')
test_review = _pad(test_review, review_len, truncating='pre')

Using TensorFlow backend.


In [32]:
# Longest summary, longest review (both in tokens) and the vocabulary size.
print(df['summary_tokens'].map(len).max())
print(df['review_tokens'].map(len).max())
print(len(vocab_list))
# The *_wc columns cannot be used for this: they were replaced by z-scores when
# the continuous columns were standardised.

39
5694
30099


In [33]:
# one-hot if not using embeddings (a Keras embedding layer can actually also do one-hot for you so...)

# Model

In [34]:
import keras.optimizers
from keras.datasets import imdb
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Concatenate 
from keras.layers import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.constraints import maxnorm
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
# fix random seed for reproducibility - only works for CPU version of tensorflow
np.random.seed(42)

In [35]:
import matplotlib.pyplot as plt
import sklearn.metrics
import itertools

def plot_results(losses, accuracies):
    """
    Draw two side-by-side line plots: `losses` on the left and `accuracies`
    on the right, each against its position in the sequence.

    Args:
        losses: sequence of loss values
        accuracies: sequence of accuracy values
    """
    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=[12, 2])
    panels = ((loss_ax, losses, 'loss'), (accuracy_ax, accuracies, 'accuracy'))
    for axis, values, y_label in panels:
        axis.plot(values)
        axis.set_ylabel(y_label)
        axis.set_xlabel('iteration')
    
    
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    From: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Draws a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-d confusion matrix (rows = true labels, columns = predicted labels)
        classes: class names used as tick labels on both axes
        normalize: when True, every row is divided by its row sum before plotting
        title: figure title
        cmap: matplotlib colour map for the heat map
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=90)
    plt.yticks(tick_positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            if value == 0:
                # Zero cells are left without annotation.
                continue
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color="white" if value > half_max else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    
    
def plot_confusion(y, y_pred, label_list) -> None:
    """
    Plot the confusion matrix of `y` against `y_pred` twice: once with raw
    counts and once row-normalized, each in its own 13x10 figure.

    Args:
        y: true labels
        y_pred: predicted labels
        label_list: ordered iterable of labels
    """
    # Compute confusion matrix
    cnf_matrix = sklearn.metrics.confusion_matrix(y, y_pred, labels=label_list)
    # Kept from the scikit-learn example; note it changes NumPy's global print precision.
    np.set_printoptions(precision=2)

    variants = ((False, 'Confusion matrix, without normalization'),
                (True, 'Normalized confusion matrix'))
    for normalize, title in variants:
        # One figure per variant. An extra bare plt.figure() call before this
        # one would only leave an empty figure behind.
        plt.figure(figsize=(13,10))
        plot_confusion_matrix(cnf_matrix, classes=label_list, normalize=normalize,
                              title=title)

    plt.show()

In [36]:
%%time
# pretrained embeddings are from https://nlp.stanford.edu/projects/glove/
# Load the whole embedding file into memory as {token: float32 vector}.
embeddings_index = dict()
# Decode as UTF-8 explicitly instead of relying on the platform's default encoding.
with open('../../data/external/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is "<token> <v1> <v2> ...", separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

# Vector length of the GloVe file loaded above (the "300d" in its name).
embedding_dim = 300

Loaded 1917494 word vectors.
CPU times: user 1min 38s, sys: 1.91 s, total: 1min 40s
Wall time: 1min 39s


In [37]:
# create a weight matrix for words in training docs
# Row i holds the GloVe vector of vocab_list[i]; tokens without a GloVe entry
# (the pad and unknown symbols among them) keep an all-zero row and are printed.
embedding_matrix = np.zeros((len(vocab_list), embedding_dim))
count = 0
for row, token in enumerate(vocab_list):
    vector = embeddings_index.get(token)
    if vector is None:
        count += 1
        print(token)
        continue
    embedding_matrix[row] = vector
print("Failed to find {} out of {} tokens.".format(count, len(vocab_list)))

==pad_mask==
==unknown_sym==
#10
#14
#16
#17
#24
#u-control
(8
)8
):
);
*
.  ...
. .
. . .
. ...
..
... ...
/8
1-2-
100.00+
2.the
6victoria
6vs
7vs
8)
8/
8:
8]
8vs
9vs
:(
:-(
::
:D
:P
<spoiler>
_cabaret_
_lolita_
a's
a-res
abc's
aboriginee
academybest
acornmedia
acting.overall
acting.the
action-fest
actor's
actor.it
adam's
add.the
administration's
adventures.reed
affleck's
africa's
agent's
ain't
akhenaton's
aki's
akio's
al's
alcott's
alec's
alex's
alexander's
algren's
ali's
alice's
alien's
alien.the
aliens-are-coming-to-kill-us
all's
allen's
alma's
almereyda's
alpha's
altman's
amazon's
ambril
amelia's
america's
american's
amidala's
amigo's
anakin's
anderson's
andoheb
andy's
angel's
angle's
ani's
animation's
anime's
aniston's
ann's
ann-margret's
anna's
anne's
anne-moss
annie's
anniston's
another's
anotonius
anti-normal
anton's
antonio's
antony's
anybody's
anyone's
aoyama's
architect's
aren't
argento's
ariannus
army's
arn't
arnetia
arnie's
arnold's
arquette's
arthur's
artist's
ash's
asim

In [38]:
%%time
# Best model - Combined 

# Keras functional API for joined model: one GRU branch reads the padded summary,
# another reads the padded review, and their final states are concatenated.
input_s = Input(shape=(summary_len,), dtype='int32', name='input_s')
input_r = Input(shape=(review_len,), dtype='int32', name='input_r')

embedding_vector_length = embedding_dim
GRU_nodes_summary = 100
GRU_nodes_review = 100

# Frozen GloVe embeddings (trainable=False). mask_zero=True makes index 0
# (pad_mask_int) a masked padding position for the layers that follow.
emb_s = Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                  input_length=summary_len, weights=[embedding_matrix], trainable=False)(input_s)
emb_r = Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                  input_length=review_len, weights=[embedding_matrix], trainable=False)(input_r)

# Summary branch: max-norm constraints on the kernel and recurrent weights.
gru_s = GRU(GRU_nodes_summary, activation='tanh', recurrent_activation='sigmoid', dropout=0.4, 
              recurrent_dropout=0.3, kernel_constraint=maxnorm(4), recurrent_constraint=maxnorm(5),
              unroll=True, 
            
              use_bias=True, kernel_initializer='glorot_uniform', 
              recurrent_initializer='orthogonal', bias_initializer='zeros', 
              kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, 
              activity_regularizer=None,  
              bias_constraint=None, implementation=1, return_sequences=False, return_state=False, 
              go_backwards=False, stateful=False, reset_after=False)(emb_s)
# Review branch: same settings but without weight constraints.
gru_r = GRU(GRU_nodes_review, activation='tanh', recurrent_activation='sigmoid', dropout=0.4, 
              recurrent_dropout=0.3, unroll=True, 
              
              kernel_constraint=None, recurrent_constraint=None,
              use_bias=True, kernel_initializer='glorot_uniform', 
              recurrent_initializer='orthogonal', bias_initializer='zeros', 
              kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, 
              activity_regularizer=None,  
              bias_constraint=None, implementation=1, return_sequences=False, return_state=False, 
              go_backwards=False, stateful=False, reset_after=False)(emb_r)

concat = Concatenate()([gru_s, gru_r])
#calc = Dense(GRU_nodes_summary+GRU_nodes_review+3, activation='relu')(concat) # this might be superfluous?
# One softmax unit per polarity label, matching the one-hot targets.
output = Dense(len(label_set), activation='softmax')(concat)
model = Model([input_s, input_r], output)
nadam = keras.optimizers.nadam(lr=0.0008)
model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
model.summary()
# Stop once validation loss has not improved for 6 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6)
hist = model.fit(x=[train_summary, train_review], 
                 y=y_train, 
                 validation_data=([validation_summary, validation_review], 
                                  y_validation), 
                 epochs=100, batch_size=128, callbacks=[es])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_s (InputLayer)            (None, 39)           0                                            
__________________________________________________________________________________________________
input_r (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 39, 300)      9029700     input_s[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 500, 300)     9029700     input_r[0][0]                    
__________________________________________________________________________________________________
gru_1 (GRU

In [None]:
# Training curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('loss', 'acc'))
print("Training loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Validation curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('val_loss', 'val_acc'))
print("Validation loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Class scores for the validation split, converted straight back to label strings.
y_pred = undo_one_hot(model.predict([validation_summary, validation_review]), label_list)
# True labels, in the same row order as the validation inputs.
y_orig = validation['polarity']

print("Validation data, confusion")
plot_confusion(y_orig, y_pred, label_list)

In [None]:
%%time
# Keras functional API for joined model w/ metadata: the two GRU branches
# (summary, review) are joined with a small dense branch fed by the standardised
# summary word count, review word count and year.
input_s = Input(shape=(summary_len,), dtype='int32', name='input_s')
input_r = Input(shape=(review_len,), dtype='int32', name='input_r')
input_wc_s = Input(shape=(1,), dtype='float32', name='input_wc_s')
input_wc_r = Input(shape=(1,), dtype='float32', name='input_wc_r')
input_year = Input(shape=(1,), dtype='float32', name='input_year')

embedding_vector_length = embedding_dim
GRU_nodes_summary = 100
GRU_nodes_review = 100

# Frozen GloVe embeddings (trainable=False). mask_zero=True makes index 0
# (pad_mask_int) a masked padding position for the layers that follow.
emb_s = Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                  input_length=summary_len, weights=[embedding_matrix], trainable=False)(input_s)
emb_r = Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                  input_length=review_len, weights=[embedding_matrix], trainable=False)(input_r)

# Summary branch: max-norm constraints on the kernel and recurrent weights.
gru_s = GRU(GRU_nodes_summary, activation='relu', recurrent_activation='sigmoid', dropout=0.3, 
              recurrent_dropout=0.3, kernel_constraint=maxnorm(4), recurrent_constraint=maxnorm(5),
              unroll=True, 
            
              use_bias=True, kernel_initializer='glorot_uniform', 
              recurrent_initializer='orthogonal', bias_initializer='zeros', 
              kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, 
              activity_regularizer=None,  
              bias_constraint=None, implementation=1, return_sequences=False, return_state=False, 
              go_backwards=False, stateful=False, reset_after=False)(emb_s)
# Review branch: same settings but without weight constraints.
gru_r = GRU(GRU_nodes_review, activation='relu', recurrent_activation='sigmoid', dropout=0.3, 
              recurrent_dropout=0.3, unroll=True, 
              
              kernel_constraint=None, recurrent_constraint=None,
              use_bias=True, kernel_initializer='glorot_uniform', 
              recurrent_initializer='orthogonal', bias_initializer='zeros', 
              kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, 
              activity_regularizer=None,  
              bias_constraint=None, implementation=1, return_sequences=False, return_state=False, 
              go_backwards=False, stateful=False, reset_after=False)(emb_r)

# Metadata branch: the three scalar inputs go through one dense layer.
concat1 = Concatenate()([input_wc_s, input_wc_r, input_year])
calc = Dense(32, activation='relu')(concat1) 

concat2 = Concatenate()([gru_s, gru_r, calc])

# One softmax unit per polarity label, matching the one-hot targets.
output = Dense(len(label_set), activation='softmax')(concat2)
model = Model([input_s, input_r, input_wc_s, input_wc_r, input_year], output)
nadam = keras.optimizers.nadam(lr=0.0006)
model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
model.summary()
# Stop once validation loss has not improved for 6 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6)
hist = model.fit(x=[train_summary, train_review, train['summary_wc'], train['review_wc'], train['year']], 
                 y=y_train, 
                 validation_data=([validation_summary, validation_review, 
                                   validation['summary_wc'], validation['review_wc'],
                                   validation['year']], 
                                  y_validation), 
                 epochs=60, batch_size=128, callbacks=[es])

In [None]:
# Training curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('loss', 'acc'))
print("Training loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Validation curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('val_loss', 'val_acc'))
print("Validation loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Validation inputs in the same order as the model's five input layers.
validation_inputs = [validation_summary, validation_review,
                     validation['summary_wc'], validation['review_wc'],
                     validation['year']]
# Class scores for the validation split, converted straight back to label strings.
y_pred = undo_one_hot(model.predict(validation_inputs), label_list)
# True labels, in the same row order as the validation inputs.
y_orig = validation['polarity']

print("Validation data, confusion")
plot_confusion(y_orig, y_pred, label_list)

In [None]:
# Print the rows of every weight array that contain at least one NaN.
# A dedicated name is used for the temporary frame: rebinding `df` here would
# clobber the review DataFrame that earlier cells depend on.
for weight in model.get_weights():
    weight_frame = pd.DataFrame(weight)
    print(weight_frame[weight_frame.isnull().any(axis=1)])

In [None]:
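# To train without the pretrained GloVe vectors, remove "weights=[embedding_matrix]" from the Embedding layers in the Keras code (and also "trainable=False", otherwise the randomly initialised embeddings stay frozen)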

In [None]:
%%time
# create and run the model (with summaries)
embedding_vector_length = embedding_dim
model = Sequential()
# Frozen GloVe embeddings (trainable=False). mask_zero=True makes index 0
# (pad_mask_int) a masked padding position for the GRU.
model.add(Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                    input_length=summary_len, weights=[embedding_matrix], trainable=False))
model.add(GRU(100, activation='relu', recurrent_activation='sigmoid', dropout=0.3, 
              recurrent_dropout=0.3, kernel_constraint=maxnorm(4), recurrent_constraint=maxnorm(5),

              use_bias=True, kernel_initializer='glorot_uniform', 
              recurrent_initializer='orthogonal', bias_initializer='zeros', 
              kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, 
              activity_regularizer=None,  
              bias_constraint=None, implementation=1, return_sequences=False, return_state=False, 
              go_backwards=False, stateful=False, unroll=True, reset_after=False))
# One softmax unit per polarity label, matching the one-hot targets.
model.add(Dense(len(label_set), activation='softmax'))
nadam = keras.optimizers.nadam(lr=0.0006)
model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
model.summary()
# Stop once validation loss has not improved for 6 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6)
hist = model.fit(x=train_summary, y=y_train, 
                 validation_data=(validation_summary, y_validation), 
                 epochs=50, batch_size=128, callbacks=[es])

In [None]:
# Training curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('loss', 'acc'))
print("Training loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Validation curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('val_loss', 'val_acc'))
print("Validation loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Class scores for the validation summaries, converted straight back to label strings.
y_pred = undo_one_hot(model.predict(validation_summary), label_list)
# True labels, in the same row order as the validation inputs.
y_orig = validation['polarity']

print("Validation data, confusion")
plot_confusion(y_orig, y_pred, label_list)

In [None]:
# Print the rows of every weight array that contain at least one NaN.
# A dedicated name is used for the temporary frame: rebinding `df` here would
# clobber the review DataFrame that earlier cells depend on.
for weight in model.get_weights():
    weight_frame = pd.DataFrame(weight)
    print(weight_frame[weight_frame.isnull().any(axis=1)])

In [None]:
%%time
# create and run the model (with reviews)
embedding_vector_length = embedding_dim
model = Sequential()
# Frozen GloVe embeddings (trainable=False). mask_zero=True makes index 0
# (pad_mask_int) a masked padding position for the GRU.
model.add(Embedding(len(vocab_list), embedding_vector_length, mask_zero=True,
                    input_length=review_len, weights=[embedding_matrix], trainable=False))
model.add(GRU(100, activation='relu', recurrent_activation='sigmoid', dropout=0.3, 
              recurrent_dropout=0.3, 
              
              kernel_constraint=None, recurrent_constraint=None,
              use_bias=True, kernel_initializer='glorot_uniform', 
              recurrent_initializer='orthogonal', bias_initializer='zeros', 
              kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, 
              activity_regularizer=None,  
              bias_constraint=None, implementation=1, return_sequences=False, return_state=False, 
              go_backwards=False, stateful=False, unroll=True, reset_after=False))
# One softmax unit per polarity label, matching the one-hot targets.
model.add(Dense(len(label_set), activation='softmax'))
# Unlike the other models in this notebook, gradients are clipped element-wise to [-0.5, 0.5].
nadam = keras.optimizers.nadam(lr=0.0006, clipvalue=0.5)
model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line.
model.summary()
# Stop once validation loss has not improved for 6 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6)
hist = model.fit(x=train_review, y=y_train, 
                 validation_data=(validation_review, y_validation), 
                 epochs=50, batch_size=128, callbacks=[es])

In [None]:
# Training curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('loss', 'acc'))
print("Training loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Validation curves stored in `hist` by the preceding model.fit call.
losses, accuracies = (hist.history[metric] for metric in ('val_loss', 'val_acc'))
print("Validation loss / accuracy")
plot_results(losses, accuracies)

In [None]:
# Print the rows of every weight array that contain at least one NaN.
# A dedicated name is used for the temporary frame: rebinding `df` here would
# clobber the review DataFrame that earlier cells depend on.
for weight in model.get_weights():
    weight_frame = pd.DataFrame(weight)
    print(weight_frame[weight_frame.isnull().any(axis=1)])

In [None]:
# Class scores for the validation reviews, converted straight back to label strings.
y_pred = undo_one_hot(model.predict(validation_review), label_list)
# True labels, in the same row order as the validation inputs.
y_orig = validation['polarity']

print("Validation data, confusion")
plot_confusion(y_orig, y_pred, label_list)