In [1]:
import sys
sys.path.append("/usr/local/lib/python2.7/dist-packages")

In [2]:
import json
import os
import math



import unitok3.unitok.configs.english
import unitok3.unitok as tok

In [3]:
train_data='Headline_Trainingdata.json'

In [4]:
test_data='Headlines_Testdata_withscores.json'
trail_data='Headline_Trialdata.json'

In [5]:
import sys
sys.path.append("/home/ubuntu/anaconda2/lib/python2.7/site-packages")

In [8]:
def unitok_tokens(text):
    '''Tokenises using unitok http://corpus.tools/wiki/Unitok the text. Given
    a string of text returns a list of strings (tokens) that are sub strings
    of the original text. It does not return any whitespace.
    String -> List of Strings
    '''

    tokens = tokenize(text, unitok3.unitok.configs.english)
    return [token for tag, token in tokens if token.strip()]

In [9]:
def max_length(texts):
    '''Given a list of strings it will return the length of the string with the
    most tokens. Where length is measured in number of tokens. unitok_tokens
    method is used to identify tokens.
    List of strings -> Integer
    '''

    max_token_length = 0
    for text in texts:
        tokens = unitok_tokens(text)
        if len(tokens) > max_token_length:
            max_token_length = len(tokens)
    return max_token_length

In [10]:
def process_data(texts, wordvec_model, max_token_length):
    '''Given a list of Strings a word2vec model and the maximum token length
    it will return a 3 dimensional numpy array of the following shape:
    (number of texts, word2vec model vector size, max token length).
    Each text will have each token mapped to a vector in the word2vec model. If
    the token does not exist then a vector of zeros will be inserted instead.
    The vector of zero applices when the text has no more tokens but has not
    reached the mex token length (This is also called padding).
    List of strings, gensim.models.Word2Vec, Integer -> 3D Numpy array.
    '''

    vector_length = wordvec_model.vector_size
    all_vectors = []

    for text in texts:
        vector_format = []
        tokens = unitok_tokens(text)[0:max_token_length]
        for token in tokens:
            if token in fin_word2vec_model.raw_vocab:
                vector_format.append(fin_word2vec_model[token].reshape(1,vector_length))
            else:
                vector_format.append(numpy.zeros(300).reshape(1,vector_length))
        while len(vector_format) != max_token_length:
            vector_format.append(numpy.zeros(vector_length).reshape(1,vector_length))
        all_vectors.append(numpy.vstack(vector_format))
    return numpy.asarray(all_vectors)

In [11]:
def whitespace_tokens(text):

    return text.split()

In [12]:
def analyzer(token):
 
    return token


In [13]:
def ngrams(token_list, n_range):
    '''Given a list of tokens will return a list of tokens that have been
    concatenated with the n closests tokens.'''

    def get_n_grams(temp_tokens, n):
        token_copy = list(temp_tokens)
        gram_tokens = []
        while(len(token_copy) >= n):
            n_list = []
            for i in range(0,n):
                n_list.append(token_copy[i])
            token_copy.pop(0)
            gram_tokens.append(' '.join(n_list))
        return gram_tokens

    all_n_grams = []
    for tokens in token_list:
        if n_range == (1,1):
            all_n_grams.append(tokens)
        else:
            all_tokens = []
            for n in range(n_range[0], n_range[1] + 1):
                all_tokens.extend(get_n_grams(tokens, n))
            all_n_grams.append(all_tokens)

    return all_n_grams

In [14]:
def __get_submitted_values():
    early_stop_path = ('Early Stopping',
                       config_path(['submitted_data', 'early_stopping']))
    tweeked_path = ('Tweeked', config_path(['submitted_data', 'tweeked']))

    for sub_name, sub_path in [early_stop_path, tweeked_path]:
        sentiment_values = []
        with open(sub_path, 'r') as fp:
            for data in json.load(fp):
                sentiment_values.append(data['sentiment score'])
        yield sub_name, sentiment_values

In [15]:
def __get_submitted_values():
    early_stop_path = ('Early Stopping',
                       config_path(['submitted_data', 'early_stopping']))
    tweeked_path = ('Tweeked', config_path(['submitted_data', 'tweeked']))

    for sub_name, sub_path in [early_stop_path, tweeked_path]:
        sentiment_values = []
        with open(sub_path, 'r') as fp:
            for data in json.load(fp):
                sentiment_values.append(data['sentiment score'])
        yield sub_name, sentiment_values

In [16]:
def compare(predicted_sentiments):


    for sub_name, sent_values in __get_submitted_values():
        sim_value = 1 - cosine(sent_values, predicted_sentiments)
        msg = ('Similarity between your predicted values and {}: {}'
              ).format(sub_name, sim_value)
        print(msg)

In [17]:
def __text_sentiment_company(all_data):
    '''Given a list of dicts will return a tuple of 3 lists containing:
    1. list of strings lower cased - text data
    2. numpy array (len(text data), 1) dimension of floats - sentiment values
    3. list of strings - company names associated to the text data
    list of dicts -> tuple(list of strings, numpy array, list of strings)
    '''

    text = []
    sentiment = []
    company = []
    for data in all_data:
        text.append(data['title'].lower())
        company.append(data['company'].lower())
        # This field does not exist in test dataset
        if 'sentiment' in data:
            sentiment.append(data['sentiment'])
        elif 'sentiment score' in data:
            sentiment.append(data['sentiment score'])
    return text, numpy.asarray(sentiment), company

In [18]:
def fin_data(data_type):
    '''Given either train, trail or test string as data type will retrieve
    those datasets that were given out in SEMEval task 5 track 2 2017 in the
    format of a tuple containing:
    1. list of strings lower cased - text data
    2. numpy array (len(text data), 1) dimension of floats - sentiment values
    3. list of strings - company names associated to the text data
    String -> tuple(list of strings, numpy array, list of strings)
    '''

    #data_path = config_path(['data', 'fin_data', data_type + '_data'])
    with open(data_type, 'r') as fp:
        return __text_sentiment_company(json.load(fp))


In [19]:
def cosine_score(predicted_values, true_values):
    '''Given two arrays of same length returns the cosine similarity where 1
    is most similar and 0 is not similar.
    list, list -> float
    '''

    return 1 - cosine(predicted_values, true_values)

In [20]:
def stats_report(clf, f_name):
  
    def convert_value(value):

        if callable(value):
            value = value.__name__
        return str(value)

    means  = clf.cv_results_['mean_test_score']
    stds   = clf.cv_results_['std_test_score']
    params = clf.cv_results_['params']
    with open(f_name, 'w') as fp:
        fp.write("Mean\tSD\t{} \n".format('\t'.join(params[0].keys())))
        for mean, std, param in zip(means, stds, params):
            param_values = []
            for key, value in param.items():
                if ('__words_replace' in key or '__disimlar' in key or
                    '__word2extract' in key):
                    param_values.append(convert_value(value[0]))
                else:
                    param_values.append(convert_value(value))
            fp.write("{}\t{}\t{}\n".format(str(mean), str(std), '\t'.join(param_values)))

In [21]:
def pred_true_diff(pred_values, true_values, score_function, mapping=None):
    results = []

    for i in range(len(pred_values)):
        mapped_value = i
        # This is to support both lists and numpy arrays
        if hasattr(mapping,'__index__') or hasattr(mapping, 'index'):
            mapped_value = mapping[i]
        results.append((mapped_value, pred_values[i],
                       score_function([pred_values[i]], [true_values[i]])))
    return results

In [22]:
from scipy.spatial.distance import cosine
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [23]:
def error_cross_validate(train_data, train_values, model, n_folds=10,
                         shuffle=True, score_function=mean_absolute_error):
 

    results = []

    train_data_array = numpy.asarray(train_data)
    train_values_array = numpy.asarray(train_values)

    kfold = KFold(n_splits=n_folds, shuffle=shuffle)
    for train, test in kfold.split(train_data_array, train_values_array):
        model.fit(train_data_array[train], train_values_array[train])

        predicted_values = model.predict(train_data_array[test])
        real_values = train_values_array[test]

        results.extend(pred_true_diff(predicted_values, real_values,
                                      score_function, mapping=test))
    return results


In [24]:
def top_n_errors(error_res, train_data, train_values, companies, n=10):
   
    

    error_res = sorted(error_res, key=lambda value: value[2], reverse=True)
    top_errors = error_res[:n]
    return [{'Sentence':train_data[index], 'Company':companies[index],
            'True value':train_values[index], 'Pred value':pred_value,
            'index':index} for index, pred_value, _ in top_errors]

In [25]:
def comps2sent(text_data, companies):
  

    sentence_compid = {}
    for i in range(len(text_data)):
        text = text_data[i]
        comp = companies[i]
        comps_indexs = sentence_compid.get(text, [])
        comps_indexs.append((comp,i))
        sentence_compid[text] = comps_indexs
    compscount_ids = {}
    for _, compsid in sentence_compid.items():
        ids = compscount_ids.get(len(compsid), [])
        ids.append([comp_id[1] for comp_id in compsid])
        compscount_ids[len(compsid)] = ids
    return compscount_ids

In [26]:
def sent_type_errors(top_errors, compscount_ids):
   

    ids_compscount = {}
    for compscount, ids_list in compscount_ids.items():
        for ids in ids_list:
            for a_id in ids:
                ids_compscount[a_id] = compscount

    comps_errors = {}
    for error in top_errors:
        sent_id = error['index']
        comp_count = ids_compscount[sent_id]
        errors = comps_errors.get(comp_count, [])
        errors.append(error)
        comps_errors[comp_count] = errors
    return comps_errors

In [27]:
def error_dist(comps_ids):
    return {k : len(v) for k, v in comps_ids.items()}

In [28]:
import sys
sys.path.append("/home/ubuntu/anaconda2/lib/python2.7/site-packages")

In [29]:
import gensim
import numpy
from scipy.spatial.distance import cosine
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [30]:
from keras.utils.vis_utils import plot_model
import numpy
from scipy.spatial.distance import cosine
from sklearn.model_selection import KFold

Using TensorFlow backend.


In [31]:
class LSTMModel:

    def __init__(self, word2vec_model):
        self._word2vec_model = word2vec_model
        self._max_length = 0
        self._model = None



    def cross_validate(self, train_text, train_sentiments, n_folds=10,
                       shuffle=True, score_function=cosine_score):

        all_results = []
        train_text_array = numpy.asarray(train_text)
        train_sentiments_array = numpy.asarray(train_sentiments)

        kfold = KFold(n_splits=n_folds, shuffle=shuffle)
        for train, test in kfold.split(train_text_array, train_sentiments_array):
            self.fit(train_text_array[train], train_sentiments_array[train])
            predicted_sentiments = self.predict(train_text_array[test])
            result = score_function(predicted_sentiments, train_sentiments_array[test])
            all_results.append(result)
        return all_results

    def _text2vector(self, texts):
        '''Given a list of Strings will convert to a numpy 3D array where each
        token in the text is reprsented as a vector from the self.word2vec_model.
        see semeval.helper.process_data for more details.
        list of strings -> 3D numpy array (len(texts), max_number_tokens,
        self.word2vec_model.vector_size)
        '''

        if self._max_length == 0:
            raise Exception('Your model requires training first')

        return process_data(texts, self._word2vec_model, self._max_length)

    def fit(self):
        '''All sub classes should overide this but pre-filter so that random
        seed can be set and allow all models to be more reprocible.
        '''

        # Required for reproducibility
        numpy.random.seed(1337)

    def predict(self, test_texts):
        '''Given a list of strings will return a list of predicted values based
        on what the LSTM has been trained on.
        List of strings -> list of predicted values.
        '''

        test_vectors = self._text2vector(test_texts)
        if self._model == None:
            raise Exception('Your model requires training first')
        return self._model.predict(test_vectors)

    def _set_max_length(self, texts):

        self._max_length = max_length(texts)
        return self._max_length

    def _set_model(self, model):

        self._model = model
        return model

    def visualise_model(self, f_name):
        '''Given a file path will visulaise the LSTM model.
        String -> Void
        '''

        if self._model == None:
            raise Exception('Your model requires training first')
        plot(self._model, to_file=f_name)

In [32]:
import numpy

from keras.models import Sequential
from keras.layers import Dense, Activation, Bidirectional, LSTM, Dropout
from keras.callbacks import EarlyStopping

class EarlyStoppingLSTM(LSTMModel):
    '''Model that can train an LSTM and apply the trainned model to unseen
    data. Inherits from LSTMModel.
    Instance Arguments:
    self._word2vec_model - gensim.models.Word2Vec required as an argument to __init__
    self._max_length = 0
    self._model = None
    public methods:
    train - trains a Bi-directional LSTM with dropout and early stopping on
    the texts and sentiment values given.
    test - Using the trained model saved at self._model will return a list of
    sentiment values given the texts in the argument of the method.
    '''

    def __init__(self, word2vec_model):
        super().__init__(word2vec_model)

    def fit(self, train_texts, sentiment_values):
 

        super().fit()

        max_length    = self._set_max_length(train_texts)
        vector_length = self._word2vec_model.vector_size

        train_vectors = self._text2vector(train_texts)

        model = Sequential()
        model.add(Dropout(0.5, input_shape=(max_length, vector_length)))
        # Output of this layer is of max_length by max_length * 2 dimension
        # instead of max_length, vector_length
        model.add(Bidirectional(LSTM(max_length, activation='softsign',
                                     return_sequences=True)))
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(max_length, activation='softsign')))
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('linear'))

        model.compile(loss='mse',
                      optimizer='rmsprop',
                      metrics=['cosine_proximity'])
                      #clipvalue=5



        early_stopping = EarlyStopping(monitor='val_loss', patience=10)

        model.fit(train_vectors, sentiment_values, validation_split=0.1,
                  callbacks=[early_stopping] , epochs=100)

        return self._set_model(model)
    
    def predict(self, test_texts):
        '''Given a list of strings will return a list of predicted values based
        on what the LSTM has been trained on.
        List of strings -> list of predicted values.
        '''

        test_vectors = self._text2vector(test_texts)
        if self._model == None:
            raise Exception('Your model requires training first')
        return self._model.predict(test_vectors)

In [33]:
def fin_word_vector():

    #fin_word2vec_path = config_path(['models', 'fin_word2vec'])
    return gensim.models.Word2Vec.load('all_fin_model_lower')

In [34]:
train_texts, train_sentiments, train_companies = fin_data(train_data)
trial_texts, trial_sentiments, trial_companies = fin_data(trail_data)

In [35]:
test_texts, test_sentiments, test_companies = fin_data(test_data)


In [36]:
fin_word2vec_model = fin_word_vector()


In [39]:
# Required to for the results on the test data
true_values = eval_format(test_texts, test_sentiments)

In [66]:
fin_word2vec_model.sorted_vocab

1

In [39]:
google_fin_word2vec_model=gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  


In [38]:
def eval_format(title_list, sentiment_list):
    '''Given a list of strings and a list of floats it will convert them into
    a list of dicts so that they can be a parameter in the eval_func.
    list of strings, list of floats -> list of dicts
    '''

    assert len(title_list) == len(sentiment_list), 'The two list have to be of the same length'

    return [{'title' : title_list[i], 'sentiment score' : sentiment_list[i]} for
            i in range(len(title_list))]

In [39]:
len(test_sentiments)

491

In [40]:
# Required to for the results on the test data
true_values = eval_format(trial_texts, trial_sentiments)

In [40]:
early_lstm = EarlyStoppingLSTM(fin_word2vec_model)


In [41]:
def tokenize_recursively(text, re_list, depth=0):
    if depth >= len(re_list):
        return [('*', text)]
    token_type, regular_expr = re_list[depth]
    tokens = []
    pos = 0
    while pos < len(text):
        m = regular_expr.search(text, pos)
        if not m:
            tokens.extend(tokenize_recursively(text[pos:], re_list, depth+1))
            break
        else:
            startpos, endpos = m.span()
            if startpos > pos:
                tokens.extend(tokenize_recursively(text[pos:startpos], re_list, depth+1))
            tokens.append((token_type, text[startpos:endpos]))
            pos = endpos
    return tokens


def tokenize(text, configuration):
    re_list = configuration.re_list
    return tokenize_recursively(text, re_list)

In [42]:
def error_analysis(data, values, comps, clf, text=False, cv=None, num_errors=50,
                   score_function=mean_absolute_error):

    compcount_id = None
    if text:
        compcount_id = comps2sent(text, comps)
    else:
        compcount_id = comps2sent(data, comps)
    error_results = None
    if cv:
        if isinstance(cv, dict):
            error_results = error_cross_validate(data, values, clf,
                                                 score_function=score_function, **cv)
        else:
            error_results = error_cross_validate(data, values, clf,
                                                 score_function=score_function)
    else:
        pred_values = clf.predict(data)
        error_results = pred_true_diff(pred_values, values, score_function)
    top_errors = top_n_errors(error_results, data, values,
                              comps, n=num_errors)
    error_details = sent_type_errors(top_errors, compcount_id)
    error_distribution = error_dist(error_details)
    return error_details, error_distribution

In [43]:
# Get the 10 fold cross validation results
early_res = early_lstm.cross_validate(train_texts, train_sentiments)

Train on 924 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 924 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Train on 925 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Train

Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Train on 925 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Train on 925 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Train on 925 samples, validate on 103 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Train on 925 samples, validate on 103 sample

In [44]:
early_lstm.fit(train_texts, train_sentiments)
early_error_details, early_error_dist = error_analysis(test_texts, test_sentiments,
                                                            test_companies, early_lstm)


Train on 1027 samples, validate on 115 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [45]:
early_error_details

{1: [{'Company': 'bp',
   'Pred value': array([ 0.03432425], dtype=float32),
   'Sentence': 'bp reports biggest ever annual loss',
   'True value': -0.98999999999999999,
   'index': 415},
  {'Company': 'glencore',
   'Pred value': array([ 0.03432425], dtype=float32),
   'Sentence': 'glencore shares in record crash as profit fears grow',
   'True value': -0.97099999999999997,
   'index': 142},
  {'Company': 'bp',
   'Pred value': array([ 0.03432425], dtype=float32),
   'Sentence': 'oil giant bp reports loss of $4.4 billion in 4th quarter of 2014',
   'True value': -0.96299999999999997,
   'index': 83},
  {'Company': 'aviva plc',
   'Pred value': array([ 0.03432425], dtype=float32),
   'Sentence': 'aviva posts forecast-beating 2015 operating profit of $3.8 bln',
   'True value': 0.94599999999999995,
   'index': 257},
  {'Company': 'persimmon',
   'Pred value': array([ 0.03432425], dtype=float32),
   'Sentence': 'update: persimmon profit up strongly, outlook positive',
   'True value': 0.

In [46]:
early_error_dist

{1: 47, 2: 3}

In [47]:
def metric3(pred_values, true_values):
    '''Given two lists finds the similarities between the list using the
    equation 5 on slide 20 of the presentation at:
    ./presentation/slides.pdf
    List of ints, List of ints -> int
    '''

    all_score = 0
    if len(pred_values) > 1:
        cosine_value = cosine_score(numpy.asarray(pred_values),
                                    numpy.asarray(true_values))
        if numpy.isnan(cosine_value):
            cosine_value = 0
        all_score = len(pred_values) * cosine_value

    if len(pred_values) == 1:
        pred_score = pred_values[0]
        test_score = true_values[0]
        if pred_score==0 and test_score==0:
            all_score = 1
        elif test_score==0 or (pred_score / test_score) > 0:
            all_score = 1 - math.fabs(true_values[0] - pred_values[0])
    return all_score

In [48]:
def eval_func(test_data, pred_data, metric=metric3):
    '''Takes a list of dicts where each dict contains two keys:
    'sentiment score' - a float value
    'title' - a string
    The function finds the mean cosine similarity between each titles sentiment values.
    (A title can have more than one sentiment value associated with it if it has more
    than one company mentioned.)
    Optional argument:
    metric - is a function which defines the metric that you would like to use.
    See the metric functions within this module.
    Default is metric3
    List of dicts, list of dicts -> float
    '''

    all_vals   = []
    title_id   = {}
    test_sents = []
    pred_sents = []
    for i in range(len(test_data)):
        data = test_data[i]
        ids = title_id.get(data['title'], [])
        ids.append(i)
        title_id[data['title']] = ids
        test_sents.append(test_data[i]['sentiment score'])
        pred_sents.append(pred_data[i]['sentiment score'])

    if metric == metric1:
        return metric1(pred_sents, test_sents)

    for _, ids in title_id.items():

        pred_sent_scores = []
        test_sent_scores = []
        for a_id in ids:
            pred_value = pred_data[a_id]['sentiment score']
            test_value = test_data[a_id]['sentiment score']

            pred_sent_scores.append(pred_value)
            test_sent_scores.append(test_value)

        all_vals.append(metric(pred_sent_scores, test_sent_scores))

    if metric == metric2:
        return sum(all_vals) / len(all_vals)
    elif metric == metric3:
        return sum(all_vals) / len(test_data)
    else:
        raise Exception('Cannot identify that metric function')

In [49]:
def metric1(pred_values, test_values):
    '''Wrapper for cosine_score, given two lists returns an int.
    Wrapper so that the function name matches the name in the presentation and
    paper.
    List of ints, List of ints -> int
    '''

    return cosine_score(pred_values, test_values)

In [50]:
def metric2(pred_values, test_values):
    '''Given two lists finds the similarities between the list using the
    equation 4 on slide 20 of the presentation at:
    ./presentation/slides.pdf
    List of ints, List of ints -> int
    '''

    all_score = 0
    cosine_value = cosine_score(numpy.asarray(pred_values),
                                numpy.asarray(test_values))
    if not numpy.isnan(cosine_value):
        all_score = cosine_value
    return all_score

In [51]:
pred_values = eval_format(test_texts, early_lstm.predict(test_texts))

print('Metric 1 {}'.format(eval_func(true_values, pred_values, metric=metric1)))
print('Metric 2 {}'.format(eval_func(true_values, pred_values, metric=metric2)))
print('Metric 3 {}'.format(eval_func(true_values, pred_values, metric=metric3)))

Metric 1 0.03579421126901983
Metric 2 0.12382685130371796
Metric 3 0.41390099818946685


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [52]:
#Tweaked LSTM
from keras.models import Sequential
from keras.layers import Dense, Activation, Bidirectional, LSTM

In [53]:
class TweekedLSTM(LSTMModel):
    '''Model that can train an LSTM and apply the trainned model to unseen
    data. Inherits from LSTMModel.
    Instance Arguments:
    self._word2vec_model - gensim.models.Word2Vec required as an argument to __init__
    self._max_length = 0
    self._model = None
    public methods:
    train - trains a Bi-directional LSTM with dropout and manually set stopping
    on the texts and sentiment values given.
    test - Using the trained model saved at self._model will return a list of
    sentiment values given the texts in the argument of the method.
    '''



    def __init__(self, word2vec_model):
        super().__init__(word2vec_model)

    def fit(self, train_texts, sentiment_values):
        '''Given a list of Strings and a list of floats (sentiments) or numpy
        array of floats. It will return a trained LSTM model and `save` the model to
        self._model for future use using self.test(texts).
        The model converts the list of strings into list of numpy matrixs
        which has the following dimensions:
        length of the longest train text broken down into tokens
        by
        the vector size of the word2vec model given in the constructor
        e.g. 21, 300 if the word2vec model vector size if 300 and the length of
        the longest train text in tokens is 21.
        For more details on the layers use read the source or after training
        visualise using visualise_model function.
        '''

        super().fit()

        # Required for any transformation of text latter.
        max_length    = self._set_max_length(train_texts)
        vector_length = self._word2vec_model.vector_size

        train_vectors = self._text2vector(train_texts)

        model = Sequential()
        # Output of this layer is of max_length by max_length * 2 dimension
        # instead of max_length, vector_length
        model.add(Bidirectional(LSTM(max_length, activation='softsign',
                                     dropout_W=0.2, dropout_U=0.2,
                                     return_sequences=True),
                                input_shape=(max_length, vector_length)))
        model.add(Bidirectional(LSTM(max_length, activation='softsign',
                                     dropout_W=0.2, dropout_U=0.2)))
        model.add(Dense(1))
        model.add(Activation('linear'))

        model.compile(loss='mse',
                      optimizer='rmsprop',
                      metrics=['cosine_proximity'])
                      #clipvalue=5

        model.fit(train_vectors, sentiment_values, nb_epoch=25)

        return self._set_model(model)

In [54]:
tweeked_lstm = TweekedLSTM(fin_word2vec_model)
tweeked_res = tweeked_lstm.cross_validate(train_texts, train_sentiments)
tweeked_lstm.fit(train_texts, train_sentiments)




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoc

Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoc

<keras.models.Sequential at 0x7f67fe8f9fd0>

In [56]:
tweek_error_details, tweek_error_dist = error_analysis(test_texts, test_sentiments,
                                                            test_companies, tweeked_lstm)

In [57]:
tweek_error_details

{1: [{'Company': 'bp',
   'Pred value': array([ 0.0399968], dtype=float32),
   'Sentence': 'bp reports biggest ever annual loss',
   'True value': -0.98999999999999999,
   'index': 415},
  {'Company': 'glencore',
   'Pred value': array([ 0.0399968], dtype=float32),
   'Sentence': 'glencore shares in record crash as profit fears grow',
   'True value': -0.97099999999999997,
   'index': 142},
  {'Company': 'bp',
   'Pred value': array([ 0.0399968], dtype=float32),
   'Sentence': 'oil giant bp reports loss of $4.4 billion in 4th quarter of 2014',
   'True value': -0.96299999999999997,
   'index': 83},
  {'Company': 'aviva plc',
   'Pred value': array([ 0.0399968], dtype=float32),
   'Sentence': 'aviva posts forecast-beating 2015 operating profit of $3.8 bln',
   'True value': 0.94599999999999995,
   'index': 257},
  {'Company': 'persimmon',
   'Pred value': array([ 0.0399968], dtype=float32),
   'Sentence': 'update: persimmon profit up strongly, outlook positive',
   'True value': 0.93700

In [58]:
tweek_error_dist

{1: 47, 2: 3}

In [59]:
pred_values = eval_format(test_texts, tweeked_lstm.predict(test_texts))
print('Metric 1 {}'.format(eval_func(true_values, pred_values, metric=metric1)))
print('Metric 2 {}'.format(eval_func(true_values, pred_values, metric=metric2)))
print('Metric 3 {}'.format(eval_func(true_values, pred_values, metric=metric3)))

Metric 1 0.03579421134515637
Metric 2 0.123826853165601
Metric 3 0.4163761234895398


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [60]:
# Print both LSTM's cross validation results
avg_tweek_percentage = (sum(tweeked_res) / len(tweeked_res)) * 100
print('Tweeked lstm cross val score {}'.format(avg_tweek_percentage))

avg_early_percentage = (sum(early_res) / len(early_res)) * 100
print('Early lstm cross val score {}'.format(avg_early_percentage))

Tweeked lstm cross val score 7.7931559069293765
Early lstm cross val score 7.820167801195983
