In [35]:
import numpy as np
from numpy import array
import pickle as plk
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Bidirectional
from keras.layers import Dropout
from keras.layers import LSTM, GRU
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.utils import to_categorical
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
import os

In [4]:
# df = pd.read_excel('/Users/luyin/Desktop/project/Q&A.xlsx',header = 0)
df = pd.read_excel(os.getcwd()+'/Q&A_Database_new.xlsx','QA', skiprows=3)

In [5]:
l = df['Analyst name'].unique() # 79 unique analyst
dic = {} #create dictionary for questions

for name in l:
    list_ = list(df.loc[df['Analyst name']  == name]['Question'])
    dic[name] = list_

In [6]:
import spacy
import re
import string
# Load English tokenizer, tagger, parser, NER and word vectors

tokenizer = spacy.load('en_core_web_sm')
punctuations = string.punctuation

def tokenize(sent):
#   sent = re.sub('[^A-Za-z&]', ' ', sent) # replace non-letter with space
#   sent = re.sub(r'\b[a-zA-Z]\b', '', sent) #remove single letter 
    sent = re.sub('^[0-9]+', '', sent)
    tokens = tokenizer(sent)
    return [(token.text.lower()) for token in tokens if (token.text.lower() not in punctuations and token.is_alpha and token.text.lower() not in {'\xa0', ' '," "})]

# tokens = tokenize(" going to hit them one way or another strong dollar did seem to have a huge impact and y ou, what are you doing? I'm 's what do you and me think or like apples and Apple is looking at buying and bought U.K. startup for $1 billion. '\n' another sentence")
# for token in tokens:
#     print (token)

In [7]:
def tokenize_analyst_q(name):
    all_tokens = []
    for q in dic[name]:
        tokens = tokenize(q)
        all_tokens += tokens
    return all_tokens

In [8]:
tokens = tokenize_analyst_q('Glenn Schorr')

In [9]:
len(tokens)

12657

In [10]:
def build_vocab(input_):
    vocab = sorted(set(input_))
    vocab_to_int = dict((c, i) for i, c in enumerate(vocab))
    int_to_vocab = dict((i, c) for i, c in enumerate(vocab))
    n_total = len(input_)
    n_vocab = len(vocab)
    print ("Total Words: {}".format(n_total))
    print ("Total Vocab: {}".format(n_vocab))
    return n_total, n_vocab

In [11]:
# organize into sequences of tokens
def build_sequence(length, input_):
    sequences = list()
    for l in range(length, len(input_)):
        seq = input_[l-length:l]
        line = ' '.join(seq)
        sequences.append(line)
    print('Total Sequences: {}' .format(len(sequences)))
    return sequences

In [12]:
n_total, n_vocab = build_vocab(tokens)
sequences = build_sequence(20, tokens)

Total Words: 12657
Total Vocab: 1658
Total Sequences: 12637


In [13]:
def save_file(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

out_filename = 'Glenn_Schorr_sequences.txt'
save_file(sequences, out_filename)


In [15]:
# load doc into memory
def load_file(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text
 
in_filename = 'Glenn_Schorr_sequences.txt'
doc = load_file(in_filename)
lines = doc.split('\n')

In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines) #fit on texts
sequences = tokenizer.texts_to_sequences(lines)
vocab_size = len(tokenizer.word_index) + 1
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [22]:
X

array([[   1,  263,    6, ...,    2,   31,   21],
       [ 263,    6,  237, ...,   31,   21,   59],
       [   6,  237,   37, ...,   21,   59,  238],
       ...,
       [ 108,    4,   82, ...,   51, 1658,  329],
       [   4,   82,    4, ..., 1658,  329,    4],
       [  82,    4,  237, ...,  329,    4,   18]])

In [24]:
words_to_load = 50000
PAD_IDX = 0
UNK_IDX = 1
import numpy as np
# reserve the 1st 2nd token for padding and <UNK> respectively
with open('/Users/cyian/Desktop/NYU/FALL2018/DS-GA1001_NLP/hws/HW2/wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+2, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    ordered_words_ft.extend(['<pad>', '<unk>'])
    loaded_embeddings_ft[0,:] = np.zeros(300)
    loaded_embeddings_ft[1,:] = np.random.normal(size = 300)
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+2, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+2
        idx2words_ft[i+2] = s[0]
        ordered_words_ft.append(s[0])
    words_ft['<pad>'] = 0
    words_ft['<unk>'] = 1
    idx2words_ft[0] = '<pad>'
    idx2words_ft[1] = '<unk>'

In [26]:
def word_to_id(word_list):
    return [words_ft[x] if x in ordered_words_ft else UNK_IDX for x in word_list.split()]
def sent_to_id(sent_list):
    return [word_to_id(x) for x in sent_list]

In [25]:
lines

['the performance in equities was great and you mentioned it pretty much across the board do you think there any',
 'performance in equities was great and you mentioned it pretty much across the board do you think there any seasonality',
 'in equities was great and you mentioned it pretty much across the board do you think there any seasonality any',
 'equities was great and you mentioned it pretty much across the board do you think there any seasonality any one',
 'was great and you mentioned it pretty much across the board do you think there any seasonality any one time',
 'great and you mentioned it pretty much across the board do you think there any seasonality any one time events',
 'and you mentioned it pretty much across the board do you think there any seasonality any one time events block',
 'you mentioned it pretty much across the board do you think there any seasonality any one time events block trades',
 'mentioned it pretty much across the board do you think there any seas

In [55]:
len_list = []
for x in lines:
    len_list.append(len(x.split()))

In [56]:
sorted(len_list)[-1]

20

In [57]:
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 20

In [65]:
# integer encode sequences of words
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(lines) #fit on texts
# sequences = tokenizer.texts_to_sequences(lines)
# vocab_size = len(tokenizer.word_index) + 1
MAX_NUM_WORDS = words_to_load
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
X, y = data[:,:-1], data[:,-1]
vocab_size = len(words_ft)
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]
 
# define model
model = Sequential()
embedding_layer = Embedding(vocab_size,
                            EMBEDDING_DIM,
                            weights=[loaded_embeddings_ft],
                            input_length=MAX_SEQUENCE_LENGTH-1,
                            trainable=False)
model.add(embedding_layer)
model.add(Bidirectional(GRU(100, return_sequences=True)))
model.add(Bidirectional(GRU(100)))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

#import the checkpoint to save current model
filepath="GRU_0.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
# compile model

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# fit the model
model.fit(X, y, batch_size=200, epochs=100, callbacks=callbacks_list)

# categorical_crossentropy
 
# save the model to file
model.save('model.h5')
# save the tokenizer
plk.dump(tokenizer, open('tokenizer.pkl', 'wb'))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 19, 300)           15000600  
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 19, 200)           240600    
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 200)               180600    
_________________________________________________________________
dense_17 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_18 (Dense)             (None, 50002)             5050202   
Total params: 20,492,102
Trainable params: 5,491,502
Non-trainable params: 15,000,600
_________________________________________________________________
Epoch 1/100

Epoch 00001: loss improved from inf to 7.42839, saving model to LSTM_basline.hdf5
Epoch 2/100

Epoch 00002: l


Epoch 00039: loss improved from 2.36079 to 2.23857, saving model to LSTM_basline.hdf5
Epoch 40/100

Epoch 00040: loss improved from 2.23857 to 2.10347, saving model to LSTM_basline.hdf5
Epoch 41/100

Epoch 00041: loss improved from 2.10347 to 2.00631, saving model to LSTM_basline.hdf5
Epoch 42/100

Epoch 00042: loss improved from 2.00631 to 1.87989, saving model to LSTM_basline.hdf5
Epoch 43/100

Epoch 00043: loss improved from 1.87989 to 1.77014, saving model to LSTM_basline.hdf5
Epoch 44/100

Epoch 00044: loss improved from 1.77014 to 1.64470, saving model to LSTM_basline.hdf5
Epoch 45/100

Epoch 00045: loss improved from 1.64470 to 1.57237, saving model to LSTM_basline.hdf5
Epoch 46/100

Epoch 00046: loss improved from 1.57237 to 1.43255, saving model to LSTM_basline.hdf5
Epoch 47/100

Epoch 00047: loss improved from 1.43255 to 1.32829, saving model to LSTM_basline.hdf5
Epoch 48/100

Epoch 00048: loss improved from 1.32829 to 1.23555, saving model to LSTM_basline.hdf5
Epoch 49/100



Epoch 00082: loss did not improve from 0.04590
Epoch 83/100

Epoch 00083: loss improved from 0.04590 to 0.04075, saving model to LSTM_basline.hdf5
Epoch 84/100

Epoch 00084: loss improved from 0.04075 to 0.03447, saving model to LSTM_basline.hdf5
Epoch 85/100

Epoch 00085: loss improved from 0.03447 to 0.03094, saving model to LSTM_basline.hdf5
Epoch 86/100

Epoch 00086: loss improved from 0.03094 to 0.02711, saving model to LSTM_basline.hdf5
Epoch 87/100

Epoch 00087: loss improved from 0.02711 to 0.02630, saving model to LSTM_basline.hdf5
Epoch 88/100

Epoch 00088: loss improved from 0.02630 to 0.02336, saving model to LSTM_basline.hdf5
Epoch 89/100

Epoch 00089: loss improved from 0.02336 to 0.02038, saving model to LSTM_basline.hdf5
Epoch 90/100

Epoch 00090: loss did not improve from 0.02038
Epoch 91/100

Epoch 00091: loss did not improve from 0.02038
Epoch 92/100

Epoch 00092: loss did not improve from 0.02038
Epoch 93/100

Epoch 00093: loss did not improve from 0.02038
Epoch 94

In [37]:
from random import randint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
 
# load doc into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text
 
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range(n_words):
    # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
    # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [69]:
#test 1
# load cleaned text sequences
in_filename = 'Glenn_Schorr_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
 
# load the model
model = load_model('GRU_0.hdf5')
 
# load the tokenizer
tokenizer = plk.load(open('tokenizer.pkl', 'rb'))
 
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')
 
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

but in your text you point out low rates and slow economic growth as headwinds for ficc you talked about

lower market volumes and volatility and equities i just curious when you look at the quarter we just had in ficc particularly is it more like you had a nice pickup like and you setting us up for keep calm like things have returned back to normal or is this


In [85]:
model.load_weights(filepath)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
result_history = model.fit(X, y, batch_size=200, epochs= 500, callbacks=callbacks_list)

Epoch 1/500

Epoch 00001: loss did not improve from 3.25588
Epoch 2/500

Epoch 00002: loss improved from 3.25588 to 3.23858, saving model to LSTM_basline.hdf5
Epoch 3/500

Epoch 00003: loss improved from 3.23858 to 3.21107, saving model to LSTM_basline.hdf5
Epoch 4/500

Epoch 00004: loss improved from 3.21107 to 3.19372, saving model to LSTM_basline.hdf5
Epoch 5/500

Epoch 00005: loss improved from 3.19372 to 3.16126, saving model to LSTM_basline.hdf5
Epoch 6/500

Epoch 00006: loss improved from 3.16126 to 3.15166, saving model to LSTM_basline.hdf5
Epoch 7/500

Epoch 00007: loss improved from 3.15166 to 3.13680, saving model to LSTM_basline.hdf5
Epoch 8/500

Epoch 00008: loss improved from 3.13680 to 3.12772, saving model to LSTM_basline.hdf5
Epoch 9/500

Epoch 00009: loss improved from 3.12772 to 3.10034, saving model to LSTM_basline.hdf5
Epoch 10/500

Epoch 00010: loss improved from 3.10034 to 3.09116, saving model to LSTM_basline.hdf5
Epoch 11/500

Epoch 00011: loss did not improve 


Epoch 00047: loss improved from 2.66119 to 2.65046, saving model to LSTM_basline.hdf5
Epoch 48/500

Epoch 00048: loss did not improve from 2.65046
Epoch 49/500

Epoch 00049: loss improved from 2.65046 to 2.62651, saving model to LSTM_basline.hdf5
Epoch 50/500

Epoch 00050: loss improved from 2.62651 to 2.60288, saving model to LSTM_basline.hdf5
Epoch 51/500

Epoch 00051: loss did not improve from 2.60288
Epoch 52/500

Epoch 00052: loss did not improve from 2.60288
Epoch 53/500

Epoch 00053: loss improved from 2.60288 to 2.59475, saving model to LSTM_basline.hdf5
Epoch 54/500

Epoch 00054: loss improved from 2.59475 to 2.59408, saving model to LSTM_basline.hdf5
Epoch 55/500

Epoch 00055: loss improved from 2.59408 to 2.57952, saving model to LSTM_basline.hdf5
Epoch 56/500

Epoch 00056: loss improved from 2.57952 to 2.56503, saving model to LSTM_basline.hdf5
Epoch 57/500

Epoch 00057: loss did not improve from 2.56503
Epoch 58/500

Epoch 00058: loss improved from 2.56503 to 2.54420, sav


Epoch 00093: loss did not improve from 2.22497
Epoch 94/500

Epoch 00094: loss improved from 2.22497 to 2.18847, saving model to LSTM_basline.hdf5
Epoch 95/500

Epoch 00095: loss did not improve from 2.18847
Epoch 96/500

Epoch 00096: loss improved from 2.18847 to 2.16585, saving model to LSTM_basline.hdf5
Epoch 97/500

Epoch 00097: loss improved from 2.16585 to 2.14593, saving model to LSTM_basline.hdf5
Epoch 98/500

Epoch 00098: loss did not improve from 2.14593
Epoch 99/500

Epoch 00099: loss did not improve from 2.14593
Epoch 100/500

Epoch 00100: loss improved from 2.14593 to 2.12553, saving model to LSTM_basline.hdf5
Epoch 101/500

Epoch 00101: loss did not improve from 2.12553
Epoch 102/500

Epoch 00102: loss did not improve from 2.12553
Epoch 103/500

Epoch 00103: loss improved from 2.12553 to 2.09937, saving model to LSTM_basline.hdf5
Epoch 104/500

Epoch 00104: loss improved from 2.09937 to 2.07917, saving model to LSTM_basline.hdf5
Epoch 105/500

Epoch 00105: loss did not i


Epoch 00188: loss improved from 1.40204 to 1.37761, saving model to LSTM_basline.hdf5
Epoch 189/500

Epoch 00189: loss did not improve from 1.37761
Epoch 190/500

Epoch 00190: loss did not improve from 1.37761
Epoch 191/500

Epoch 00191: loss improved from 1.37761 to 1.37159, saving model to LSTM_basline.hdf5
Epoch 192/500

Epoch 00192: loss improved from 1.37159 to 1.36629, saving model to LSTM_basline.hdf5
Epoch 193/500

Epoch 00193: loss improved from 1.36629 to 1.34323, saving model to LSTM_basline.hdf5
Epoch 194/500

Epoch 00194: loss did not improve from 1.34323
Epoch 195/500

Epoch 00195: loss improved from 1.34323 to 1.33325, saving model to LSTM_basline.hdf5
Epoch 196/500

Epoch 00196: loss did not improve from 1.33325
Epoch 197/500

Epoch 00197: loss improved from 1.33325 to 1.32840, saving model to LSTM_basline.hdf5
Epoch 198/500

Epoch 00198: loss improved from 1.32840 to 1.31741, saving model to LSTM_basline.hdf5
Epoch 199/500

Epoch 00199: loss did not improve from 1.317


Epoch 00285: loss did not improve from 0.69694
Epoch 286/500

Epoch 00286: loss did not improve from 0.69694
Epoch 287/500

Epoch 00287: loss did not improve from 0.69694
Epoch 288/500

Epoch 00288: loss did not improve from 0.69694
Epoch 289/500

Epoch 00289: loss did not improve from 0.69694
Epoch 290/500

Epoch 00290: loss did not improve from 0.69694
Epoch 291/500

Epoch 00291: loss did not improve from 0.69694
Epoch 292/500

Epoch 00292: loss did not improve from 0.69694
Epoch 293/500

Epoch 00293: loss did not improve from 0.69694
Epoch 294/500

Epoch 00294: loss improved from 0.69694 to 0.64199, saving model to LSTM_basline.hdf5
Epoch 295/500

Epoch 00295: loss did not improve from 0.64199
Epoch 296/500

Epoch 00296: loss improved from 0.64199 to 0.63629, saving model to LSTM_basline.hdf5
Epoch 297/500

Epoch 00297: loss improved from 0.63629 to 0.60208, saving model to LSTM_basline.hdf5
Epoch 298/500

Epoch 00298: loss improved from 0.60208 to 0.59736, saving model to LSTM_bas



KeyboardInterrupt: 

In [93]:
# result_history.history["loss"]

In [87]:
model.save('model.h6')

In [73]:
#test 1
# load cleaned text sequences
in_filename = 'Glenn_Schorr_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
 
# load the model
# model = load_model('model.h6')

model.load_weights(filepath)
 
# load the tokenizer
tokenizer = plk.load(open('tokenizer.pkl', 'rb'))
 
# select a seed text
seed_text = lines[randint(0,len(lines))]
print('Initial text:')
print(seed_text + '\n')
 
# generate new text
print('Generated text:')
generated = generate_seq(model, tokenizer, seq_length, seed_text, 30)
print(generated)

Initial text:
competition and what we should expect on the deposit beta if there are more rate hikes and so maybe bluntly

Generated text:
if we get a couple more hate rate hikes do you capture the majority of it even without as you said a rate story do you have to hike along


In [80]:
#test 2

# load the tokenizer
tokenizer = plk.load(open('tokenizer.pkl', 'rb'))
 
# select a seed text
seed_text = lines[randint(0,len(lines))]
print('Initial text:')
print(seed_text + '\n')
 
# generate new text
print('Generated text:')
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

Initial text:
and are you still on track in your mind for the overhead ratio goals i do want to overly focus

Generated text:
on the dollar amount okay so i wanted to ask an nii question and i saw the comments on ex global markets the net interest yield being up basis points deposit betas are good so the core business that we all focus on is good i do want to talk


## Further improvement 

### 1. Improve the sequence build up based on each question, or expand training data to all questions from same category
### 2. Tune parameter to see if the performace is better in generating questions (batch size, epoch time etc.)
### 3. Research on the question generation process including tones and use of words
### 4. Improve the dropout (0.2, 0.4 0.6 etc.) to LSTM input layer to prevent overfitting for the model and may generate a more natural language

Dropout is a technique where randomly selected neurons are ignored during training. They are “dropped-out” randomly. This means that their contribution to the activation of downstream neurons is temporally removed on the forward pass and any weight updates are not applied to the neuron on the backward pass.
paper about dropout
http://jmlr.org/papers/volume15/srivastava14a/srivastava14a.pdf


