In [33]:
from collections import Counter

import pickle as pkl
import re

import numpy as np
import pandas as pd

from gensim.models.phrases import Phrases, Phraser

import textacy
from textacy import preprocess_text, Doc, Corpus
from textacy.vsm import Vectorizer, GroupVectorizer
from textacy.tm import TopicModel
# Load the small English spaCy pipeline with the dependency parser turned off.
# `disable` is given as a tuple of component names: a bare string only worked
# through substring matching against each pipeline component name.
en = textacy.load_spacy("en_core_web_sm", disable=('parser',))


In [2]:
# Load the raw Q&A table; 'Date' is parsed into datetimes on read.
data = pd.read_csv('data/qaData.csv', parse_dates=['Date'])

# Restrict to earnings-call rows and to the columns used further down.
source_columns = ['Date', 'Company', 'Participants', 'AnalystName',
                  'AnalystCompany', 'EventName', 'EarningTag2', "Question"]
is_earnings_call = data['EventType'] == "Earnings call"
ec_data = data.loc[is_earnings_call, source_columns].copy()

# 'Quarter' is whatever precedes the first "Q" in EventName.
ec_data['Quarter'] = ec_data['EventName'].str.split("Q").str[0]

# Within each call, reset_index() moves the current row label into a leading
# column; the outer reset_index(drop=True) then discards the group keys from
# the index and renumbers the rows 0..n-1.
# NOTE(review): that leading column is later renamed to 'QuestionOrder', so it
# holds the row label carried over from `data`, not a 0-based counter within
# each call -- confirm this is what is wanted.
group_keys = ['Date', "Company", "Participants", "EventName", "Quarter"]
ec_data = (
    ec_data
    .groupby(group_keys)
    .apply(lambda group: group.reset_index())
    .reset_index(drop=True)
)

# Positional rename (depends on the column order produced above), followed by
# the final column selection and ordering.
ec_data.columns = ["QuestionOrder", "Date", "Company", "Participants",
                   "AnalystName", "AnalystCompany", "EventName", "Tag",
                   "Question", "Quarter"]
output_columns = ["Date", "Quarter", "Company", "Participants",
                  "AnalystCompany", "AnalystName", "QuestionOrder", "Tag",
                  "Question"]
ec_data = ec_data[output_columns]

In [63]:
def _clean_question(raw_text):
    """Preprocess one question and drop tokens of two characters or fewer."""
    cleaned = preprocess_text(text=raw_text, lowercase=True, no_punct=True,
                              no_contractions=True, no_accents=True,
                              no_currency_symbols=True, no_numbers=True)
    return ' '.join(tok for tok in cleaned.split(' ') if len(tok) > 2)


def _row_to_doc(row):
    """Build a textacy Doc from one ec_data row, carrying selected columns as metadata."""
    return Doc(content=_clean_question(row['Question']),
               lang=en,
               metadata={'Quarter': row['Quarter'],
                         'Company': row['Company'],
                         'QuestionOrder': row['QuestionOrder'],
                         'AnalystName': row["AnalystName"],
                         'Tag': row['Tag']})


# One Doc per question, in ec_data row order.
docs = Corpus(lang=en, docs=ec_data.apply(_row_to_doc, axis=1).tolist())

# Unigram terms per document, requested as lemma strings with determiners dropped.
tokenized_docs = [
    list(doc.to_terms_list(ngrams=1, as_strings=True, normalize='lemma',
                           drop_determiners=True))
    for doc in docs
]

# First phrase pass over the unigram token lists; merged tokens are joined with a space.
bigram_model = Phrases(tokenized_docs, min_count=10, threshold=25, delimiter=b' ')
bigram_phraser = Phraser(bigram_model)
bigram_docs = [bigram_phraser[tokens] for tokens in tokenized_docs]

# Second phrase pass over the output of the first, with lower count/threshold settings.
trigram_model = Phrases(bigram_docs, min_count=5, threshold=10, delimiter=b' ')
trigram_phraser = Phraser(trigram_model)
trigram_docs = [trigram_phraser[tokens] for tokens in bigram_docs]

# Persist the phrase-merged token lists.
# NOTE(review): the file name is spelled "tokeized"; left as-is because other
# code may read this exact path.
with open("data/tokeizedQuestion.p", "wb") as out_file:
    pkl.dump(trigram_docs, out_file)

In [65]:
# Vectorizer configured with linear term frequency and both IDF weighting and
# document-length scaling switched off.
count_vec = Vectorizer(tf_type='linear', apply_idf=False, apply_dl=False)
# The original chained `.fit(...)` onto the constructor and kept its return
# value, i.e. it relies on fit() returning the vectorizer itself.
count_vec = count_vec.fit(trigram_docs)

# Document-term matrix: one row per entry of trigram_docs.
doc_term_matrix = count_vec.transform(trigram_docs)

In [64]:
aa = np.sum(doc_term_matrix[ec_data.loc[ec_data['AnalystName']=="Glenn Schorr"].index.tolist(),:], axis=1)

array([18, 13, 37, 23, 30, 27, 27, 20, 18, 34, 31, 30, 17, 14, 35, 21, 20,
       28, 20, 25, 24, 30, 41, 40, 35, 35, 20, 34, 28, 53, 16, 20, 23, 33,
       54, 28, 58,  9, 56, 36, 34, 37, 55, 38, 49, 20, 29, 14, 30, 29, 12,
       12, 33, 20, 30, 45, 47, 25, 31, 30, 18, 39, 47, 35, 21, 41, 21, 22,
       26, 14, 54, 19, 50, 59, 50, 27, 29, 17, 24, 38, 67, 44,  5, 40, 35,
       31, 30, 28, 18, 53, 48, 26, 14, 25, 40, 30, 42,  6, 32, 20, 11, 35,
       19, 16, 31, 14, 22, 33, 27, 24, 47, 41, 17, 15, 24, 17,  7, 30, 28,
       32, 20, 15, 33, 13, 11, 35, 35, 33, 34, 35, 19, 23, 14, 54, 34, 26,
       18, 19, 38, 39, 28, 38, 23, 37, 30, 35, 39, 30, 41, 29, 14, 39, 26,
       50, 10, 19, 24, 27, 24, 52, 34, 20, 25, 31, 37, 16, 32, 12])

In [51]:
np.argsort(np.ravel(aa))[::-1][:10]

array([ 80,  73,  36,  38,  42,  34, 133,  70,  89,  29])

In [62]:
count_vec.id_to_term[29]

'130bps'