In [1]:
import os

import pickle as pkl
import re

import scipy as sp
import numpy as np
import pandas as pd

import nimfa 

from sklearn.preprocessing import normalize

from gensim.models.phrases import Phrases, Phraser

# Project root = the current working directory with its last two path
# components removed; the data folder and the textacy checkout hang off it.
project_root = '/'.join(os.getcwd().split("/")[:-2])
data_directory = project_root + '/data/'

# Add <project_root>/textacy to the module search path before importing
# textacy. It is inserted at position 3, so entries earlier on sys.path still
# take precedence.
import sys
sys.path.insert(3, project_root + '/textacy')

import textacy
from textacy import preprocess_text, Doc, Corpus
from textacy.vsm import Vectorizer, GroupVectorizer
from textacy.tm import TopicModel

# spaCy English pipeline shared by every Doc/Corpus below (parser disabled).
en = textacy.load_spacy("en_core_web_sm", disable='parser')


# EventNumber values excluded from the training documents further down
# (see the `not in test_set` filter when tokenized_docs is built).
test_set = [
    173, 74, 20, 101, 83, 1, 38, 39, 72, 50, 21, 164, 57,
    169, 8, 63, 102, 34, 80, 192, 139, 88, 112, 116, 61, 46,
    51, 165, 135, 89, 108, 7, 25, 15, 125, 93, 130, 71,
]

In [2]:

# Load the Q&A table; 'Date' is parsed as datetime so the .dt accessors work.
orig_data = pd.read_csv(data_directory + 'qaData.csv', parse_dates=['Date'])
orig_data['Year'] = orig_data['Date'].dt.year
orig_data['Month'] = orig_data['Date'].dt.month
# Calendar quarter derived from the month. The last threshold used to be
# `x < 9`, which put September into quarter 4; it must be `x < 10` so that
# Jul-Sep -> 3 and Oct-Dec -> 4. Because 'Month' is itself a grouping key
# below, this correction does not change which rows form an event or the
# order in which EventNumber is assigned.
orig_data['Quarter'] = orig_data['Month'].apply(lambda x: 1 if x < 4 else 2 if x < 7 else 3 if x < 10 else 4)

# Canonicalise the label columns: title-case, then strip spaces
# (so the values become single tokens such as 'GlennSchorr').
orig_data['Company'] = orig_data['Company'].str.title().str.replace(" ", "")
orig_data['AnalystName'] = orig_data['AnalystName'].str.title().str.replace(" ", "")
orig_data['Tag'] = orig_data['EarningTag2'].str.title().str.replace(" ", "")

# Keep only rows that have an analyst name.
orig_data = orig_data.loc[orig_data['AnalystName'].notna()].copy()

# Every distinct combination of these columns is treated as one event.
event_keys = ['Company', 'Participants', 'Month', 'Year', 'Quarter', 'EventType', 'Date']

groups = []
for i, (name, group) in enumerate(orig_data.groupby(event_keys)):
    # EventNumber is the group's position in groupby iteration order;
    # QuestionNumber is the row's 0-based position inside its event.
    g2 = group.assign(EventNumber=i).reset_index(drop=True)
    g2.index.name = "QuestionNumber"
    g2 = g2.reset_index()
    groups.append(g2)

indexed_data = pd.concat(groups)
#train_data = indexed_data.loc[~indexed_data['EventNumber'].isin(test_set)]
q_data = indexed_data[['Date', 'EventNumber', 'QuestionNumber', 'Year', 'Quarter', 'Company', 'AnalystName', 'EventType', 'Tag', 'Question']].copy()


# Columns of q_data copied verbatim into each Doc's metadata dict.
_doc_metadata_fields = ['Year', 'Quarter', 'Company', 'AnalystName', 'Tag', 'EventType', 'EventNumber', 'QuestionNumber']


def _clean_question(raw_text):
    """Run textacy's preprocess_text on a question and drop tokens of length <= 2."""
    cleaned = preprocess_text(text=raw_text, lowercase=True, no_punct=True, no_contractions=True,
                              no_accents=True, no_currency_symbols=True, no_numbers=True)
    return ' '.join(piece for piece in cleaned.split(' ') if len(piece) > 2)


def _question_row_to_doc(row):
    """Build a textacy Doc from one q_data row: cleaned question text plus row metadata."""
    row_metadata = {field: row[field] for field in _doc_metadata_fields}
    return Doc(content=_clean_question(row['Question']), lang=en, metadata=row_metadata)


# One Doc per question, in q_data row order.
docs = Corpus(lang=en, docs=q_data.apply(_question_row_to_doc, axis=1).tolist())

# [lemmatised unigram terms, metadata] pairs for every question whose event
# is NOT in the held-out test_set.
tokenized_docs = []
for doc in docs:
    if doc.metadata['EventNumber'] in test_set:
        continue
    terms = list(doc.to_terms_list(ngrams=1, as_strings=True, normalize='lemma', drop_determiners=True))
    tokenized_docs.append([terms, doc.metadata])


In [4]:
# tokenized_docs holds [terms, metadata] pairs; split them into parallel lists.
unigram_docs = [terms for terms, _ in tokenized_docs]
doc_metadata = [meta for _, meta in tokenized_docs]

# First phrase pass: learn collocations over the unigram term lists and merge
# them (joined with a space) in every document.
bigram_phraser = Phraser(Phrases(unigram_docs, min_count=10, threshold=20, delimiter=b' '))
bigram_docs = [bigram_phraser[terms] for terms in unigram_docs]

# Second pass over the already-merged documents, with lower count/threshold,
# so that longer phrases can form.
trigram_phraser = Phraser(Phrases(bigram_docs, min_count=5, threshold=10, delimiter=b' '))
trigram_docs = [trigram_phraser[terms] for terms in bigram_docs]

# Per-document group labels, aligned position-by-position with trigram_docs.
analysts = [meta['AnalystName'] for meta in doc_metadata]
tags = [meta['Tag'] for meta in doc_metadata]
companies = [meta['Company'] for meta in doc_metadata]

In [31]:
# Number of NMF components (topics) for the analyst model.
a_rank = 10

# BM25-weighted term matrix with documents grouped by analyst name.
analyst_vec = GroupVectorizer(
    tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear',
).fit(trigram_docs, analysts)
train_doc_term_matrix = analyst_vec.transform(trigram_docs, analysts)

# Factorise the matrix with NMF.
# NOTE(review): no random seed is set in this cell, so the factorisation may
# differ between runs — confirm whether that matters.
mod = nimfa.Nmf(V=train_doc_term_matrix, max_iter=200, rank=a_rank)
mod_fit = mod()

# Basis matrix -> sklearn normalize() -> NaNs filled with 0 -> squared,
# labelled with one row per group and one column per topic.
analyst_topic_columns = ['aTopic{}'.format(k) for k in range(a_rank)]
analyst_df = pd.SparseDataFrame(
    normalize(mod_fit.basis()),
    columns=analyst_topic_columns,
    index=analyst_vec.grps_list,
).fillna(0)**2
analyst_df.index.name = 'AnalystName'

# Append each analyst's highest-weighted topic column and write the table out.
analyst_top_topic = analyst_df.idxmax(axis=1).rename('aTopicMax')
analyst_topic_table = analyst_df.join(analyst_top_topic).reset_index()
analyst_topic_table.to_csv(data_directory+"analystTopic.csv", index=False)


In [32]:
# Number of NMF components (topics) for the tag model.
t_rank = 2

# BM25-weighted term matrix with documents grouped by tag.
tag_vec = GroupVectorizer(
    tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear',
).fit(trigram_docs, tags)
train_doc_term_matrix = tag_vec.transform(trigram_docs, tags)

# Factorise the matrix with NMF.
# NOTE(review): no random seed is set in this cell, so the factorisation may
# differ between runs — confirm whether that matters.
mod = nimfa.Nmf(V=train_doc_term_matrix, max_iter=200, rank=t_rank)
mod_fit = mod()

# Basis matrix -> sklearn normalize() -> NaNs filled with 0 -> squared,
# labelled with one row per group and one column per topic.
tag_topic_columns = ['tTopic{}'.format(k) for k in range(t_rank)]
tag_df = pd.SparseDataFrame(
    normalize(mod_fit.basis()),
    columns=tag_topic_columns,
    index=tag_vec.grps_list,
).fillna(0)**2
tag_df.index.name = 'Tag'

# Append each tag's highest-weighted topic column and write the table out.
tag_top_topic = tag_df.idxmax(axis=1).rename('tTopicMax')
tag_topic_table = tag_df.join(tag_top_topic).reset_index()
tag_topic_table.to_csv(data_directory+"tagTopic.csv", index=False)


In [33]:
# Number of NMF components (topics) for the company model.
c_rank = 2

# BM25-weighted term matrix with documents grouped by company.
company_vec = GroupVectorizer(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear').fit(trigram_docs, companies)
train_doc_term_matrix = company_vec.transform(trigram_docs, companies)

# Factorise the matrix with NMF.
# NOTE(review): no random seed is set in this cell, so the factorisation may
# differ between runs — confirm whether that matters.
mod = nimfa.Nmf(V=train_doc_term_matrix, max_iter=200, rank=c_rank)
mod_fit = mod()

# Basis matrix -> sklearn normalize() -> NaNs filled with 0 -> squared,
# labelled with one row per group and one column per topic.
company_df = pd.SparseDataFrame(normalize(mod_fit.basis()), columns = ['cTopic'+str(i) for i in range(c_rank)], index=company_vec.grps_list).fillna(0)**2
company_df.index.name = 'Company'

# Append each company's highest-weighted topic column and write the table out.
# The dominant topic must come from company_df itself: this line previously
# joined tag_df.idxmax(...), which is indexed by tag and holds 'tTopic*'
# labels, so 'cTopicMax' did not describe the company rows.
company_df.join(company_df.idxmax(axis=1).rename('cTopicMax')).reset_index().to_csv(data_directory+"companyTopic.csv", index=False)


In [5]:
# Identical BM25 weighting options for all three vectorizers, so that the
# resulting matrices are built the same way.
bm25_options = dict(tf_type='bm25', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='linear')

# Documents grouped by analyst.
analyst_vec = GroupVectorizer(**bm25_options).fit(trigram_docs, analysts)
analyst_doc_term_matrix = analyst_vec.transform(trigram_docs, analysts)


# Documents grouped by tag.
tag_vec = GroupVectorizer(**bm25_options).fit(trigram_docs, tags)
tag_doc_term_matrix = tag_vec.transform(trigram_docs, tags)

# Ungrouped: one row per individual question.
question_vec = Vectorizer(**bm25_options).fit(trigram_docs)
question_doc_term_matrix = question_vec.transform(trigram_docs)

In [6]:
# Densify the question matrix once; it is compared against both group matrices.
question_dense = question_doc_term_matrix.toarray()

# Affinity = 1 - cosine distance, i.e. cosine similarity between every
# question row and every tag row. Columns are labelled with the tag names.
tag_cosine_distance = sp.spatial.distance.cdist(question_dense, tag_doc_term_matrix.toarray(), 'cosine')
tag_affinity_mat = 1 - tag_cosine_distance
tag_affinity = pd.SparseDataFrame(tag_affinity_mat, columns=tag_vec.grps_list)

# Same computation against the analyst rows.
analyst_cosine_distance = sp.spatial.distance.cdist(question_dense, analyst_doc_term_matrix.toarray(), 'cosine')
analyst_affinity_mat = 1 - analyst_cosine_distance
analyst_affinity = pd.SparseDataFrame(analyst_affinity_mat, columns=analyst_vec.grps_list)

In [21]:
# Mean of the analyst and tag affinities per question; keep the positions of
# the three highest-scoring questions. Positions index into tokenized_docs,
# because the affinity rows were built from trigram_docs in that same order.
combined_affinity = (analyst_affinity['GlennSchorr'] + tag_affinity['Cib'])/2
top3 = combined_affinity.nlargest(3).reset_index()['index'].values

sample_key = 'GlennSchorr__Cib'
sample_questions = {sample_key: []}

for val in top3:
    # Recover the original question text via the (EventNumber, QuestionNumber)
    # pair stored in the document's metadata.
    doc_meta = tokenized_docs[val][1]
    same_event = indexed_data['EventNumber'] == doc_meta['EventNumber']
    same_question = indexed_data['QuestionNumber'] == doc_meta['QuestionNumber']
    question_text = indexed_data.loc[same_event & same_question, "Question"].item()
    sample_questions[sample_key].append(question_text)

In [22]:
# Display the questions collected above under the 'GlennSchorr__Cib' key.
sample_questions["GlennSchorr__Cib"]

['I appreciate the color around things related to FICC, and kind of a question of overall backdrop. In Q1 2015, the Swiss re-pegged, stuff went bonkers for a couple of weeks and you made tons of money, even more than the overall industry. This quarter, it didn’t seem like there was too many a-ha moments. I mean, credit spreads tightened and we had the aftermath of Brexit and stuff. But in your text, you point out low rates and slow economic growth as headwinds for FICC. You’ve talked about lower market volumes and volatility and equities. I’m just curious, when you look at the quarter we just had in FICC particularly, is it more like you had a nice pickup like Q1 2015 and you’re setting us up for keep calm, like things have returned back to normal? Or is this possibly a little bit higher activity environment given the uncertainty?',
 'So in the past, better DCM revenues eventually led to better secondary revenue. And I heard all the comments on this really low vol in just about every a