In [5]:
import numpy as np
from numpy import array
import pickle as plk
import pandas as pd
import torch
import os

In [6]:
df = pd.read_excel(os.getcwd()+'/Q&A_Database_new.xlsx','QA', skiprows=3)

In [8]:
# Load model
from models import InferSent
model_version = 1
MODEL_PATH = "/Users/cyian/encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

In [9]:
# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = '/Users/cyian/dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else '/Users/cyian/dataset/fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)
# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [32]:
l = df['Breakout'].unique() # class them with the BreakOut categories
dic = {} #create dictionary for questions
for category in l:
    list_ = list(df.loc[df['Breakout']  == category]['Question'])
    dic[category] = list_

In [43]:
import spacy
import re
import string
# Load English tokenizer, tagger, parser, NER and word vectors

# tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

#tokenize sentence by sentence
def question_split(input_):
    list_ = []
    for q in input_:
        q = q.split('\n')
        if len(q) ==1:
            list_.append(q)
    return list_

def tokenize(sent):
#   sent = re.sub('[^A-Za-z&]', ' ', sent) # replace non-letter with space
#   sent = re.sub(r'\b[a-zA-Z]\b', '', sent) #remove single letter 
    sent = re.sub('(\n)+', ' ', sent)
    sent = re.sub('y ou', 'you', sent)
    sent = re.sub('y es', 'yes', sent)
    sent = re.sub('v o', 'vo', sent)
    sent = re.sub("don't", 'dont', sent)
    sent = re.sub('[^A-Za-z&.!?,]', ' ', sent)
#     tokens = tokenizer(sent)
    return sent.split()

In [44]:
import nltk
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
raw_sent = []
for sent in dic['Balance sheet']:
    tmp = tokenizer.tokenize(sent)
    raw_sent+=tmp

In [45]:
len(raw_sent)

3117

In [46]:
all_tokens = []
for q in raw_sent:
    tokens = ' '.join(tokenize(q))
    all_tokens.append(tokens)

In [47]:
all_tokens

['On net interest income, do you have an outlook for how the net interest income dollars could trend from here, assuming that you don t get much help from higher rates, what are the key drivers?',
 'And what s kind of your outlook for NIM and NII dollars for the year?',
 'On your deposit discussion about pushing out, I think you said during your Investor Day that you would like to get about B of non core deposits off the balance sheet, and I know you said that you hope to make progress in Q .',
 'How should we model that out?',
 'And what type of benefit have you modeled out to the NIM with that B of deposits?',
 'Thank you so much for giving us a detailed rundown in terms of how you re thinking about your liquidity strategy.',
 'And I also appreciate the . B in deposit growth came from consumer and escrow.',
 'But taking a step back of the . trillion in period end deposits, how much would you classify as non operational corporate?',
 'And I guess, we re just wondering, is adopting a s

In [48]:
embeddings = model.encode(all_tokens, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))

Nb words kept : 55961/61603 (90.8%)
Speed : 73.8 sentences/s (cpu mode, bsize=128)
nb sentences encoded : 3117


In [36]:
embeddings[0]

array([0.09301893, 0.10547252, 0.14857621, ..., 0.03394615, 0.        ,
       0.03599665], dtype=float32)

In [51]:
embeddings.shape

(3117, 4096)

In [55]:
train, test = embeddings[:embeddings.shape[0]-500], embeddings[-500:]
nb_sequenced_sentences = 15
X_train = np.zeros((train.shape[0]-nb_sequenced_sentences,nb_sequenced_sentences, 4096), dtype=np.float)
y_train = np.zeros((train.shape[0]-nb_sequenced_sentences, 4096),  dtype=np.float)
X_test = np.zeros((test.shape[0]-nb_sequenced_sentences, nb_sequenced_sentences, 4096), dtype=np.float)
y_test = np.zeros((test.shape[0]-nb_sequenced_sentences, 4096),  dtype=np.float)
for idx in range(train.shape[0]-nb_sequenced_sentences):
    X_train[idx] = train[idx:idx+nb_sequenced_sentences]
    y_train[idx] = train[idx+nb_sequenced_sentences]
for idx in range(test.shape[0]-nb_sequenced_sentences):
    X_test[idx] = test[idx:idx+nb_sequenced_sentences]
    y_test[idx] = test[idx+nb_sequenced_sentences]

In [56]:
from __future__ import print_function
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Embedding, Flatten, Bidirectional, Input, LSTM
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy, mean_squared_error, mean_absolute_error, logcosh
from keras.layers.normalization import BatchNormalization

def bidirectional_lstm_model(seq_length, vector_dim):
    print('Building LSTM model...')
    model = Sequential()
    model.add(Bidirectional(LSTM(rnn_size, activation="relu"),input_shape=(seq_length, vector_dim)))
    model.add(Dropout(0.5))
    model.add(Dense(vector_dim))
    
    optimizer = Adam(lr=learning_rate)
    callbacks=[EarlyStopping(patience=2, monitor='val_loss')]
    model.compile(loss='logcosh', optimizer=optimizer, metrics=['acc'])
    print('LSTM model built.')
    return model

In [57]:
rnn_size = 512 # size of RNN
vector_dim = 4096
learning_rate = 0.0001 #learning rate

model_sequence = bidirectional_lstm_model(nb_sequenced_sentences, vector_dim)

Building LSTM model...
LSTM model built.


In [60]:
batch_size = 30 # minibatch size

callbacks=[EarlyStopping(patience=3, monitor='val_loss'),
           ModelCheckpoint(filepath='my_model_sequence_lstm.{epoch:02d}.hdf5',\
                           monitor='val_loss', verbose=1, mode='auto', period=5)]

history = model_sequence.fit(X_train, y_train,
                 batch_size=batch_size,
                 shuffle=True,
                 epochs=40,
                 callbacks=callbacks,
                 validation_split=0.1)

#save the model
model_sequence.save('my_model_sequence_lstm2.hdf5')

Train on 2341 samples, validate on 261 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40

Epoch 00005: saving model to my_model_sequence_lstm.05.hdf5
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40

Epoch 00010: saving model to my_model_sequence_lstm.10.hdf5
