In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import datetime
from collections import defaultdict

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk import ngrams

# Gensim
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt 

from scipy.stats import pearsonr

import statsmodels
from statsmodels.tsa.stattools import grangercausalitytests

  from collections import Mapping


In [2]:
stop_words = stopwords.words('english')
# stop_words.extend(['mr', 'ms', 'said'])

  and should_run_async(code)


In [3]:
# def lemmatize(content, tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     nlp = spacy.load('en', disable=['parser', 'ner'])
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

# Tokenize and remove stop words from content
def tokenize(content, lemmatize=False):
    words = gensim.utils.simple_preprocess(content, deacc=True)  # tokenizes
    return words

def remove_stopwords(content):
    words = []
    for word in content:
        if word in stop_words:
            continue
        words.append(word)
    return words

  and should_run_async(code)


We are not lemmatizing or stemming. If we need to increase accuracy in the future, we can consider it.

In [4]:
# New York Times Data
rows = []
dates = []
articles = []
for month in range(5, 11):
    with open("Data/NYTimes/"+ str(month) + ".txt") as f:
        for i, line in enumerate(f):
            date, article = line.split(",", 1)
            timestamp = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z").date()
            tokenized = tokenize(article)
            destopped = remove_stopwords(tokenized)

            articles.append(destopped)
            dates.append(timestamp)
            rows.append([timestamp, destopped])

nytimes = pd.DataFrame(rows, columns=["Date", "Content"]) 
unique_dates = sorted(list(set(nytimes["Date"])))
# print (unique_dates)
nytimes

  and should_run_async(code)


Unnamed: 0,Date,Content
0,2000-05-03,"[two, years, ago, homer, bush, came, yankee, b..."
1,2000-05-02,"[texas, record, tell, op, ed, april, paul, bur..."
2,2000-05-01,"[top, foreign, policy, adviser, gov, george, b..."
3,2000-05-03,"[aides, gov, george, bush, fought, back, today..."
4,2000-05-03,"[gov, tommy, thompson, wisconsin, named, chair..."
...,...,...
5801,2000-10-31,"[new, york, times, cbs, news, poll, var, strin..."
5802,2000-10-31,"[tick, tock, diner, ted, friedrich, stockbroke..."
5803,2000-11-01,"[difference, us, vital, issue, would, go, wash..."
5804,2000-11-01,"[bush, administration, wanted, overturn, would..."


There are 3 days missing from the stock market data: 6/07, 6/08, 11/01. There are several ways we can deal with this. 
1. Toss out the three days from the NYTimes data
2. Condense 6/07 --> 6/06; 6/08 --> 6/09 (or something similar) and toss out 11/01. 
3. Something else that I can't think of at the moment

I also haven't looked at the paper to see how they deal with it yet.

Edit: Reading over some articles about time series, it seems that we should pad the missing datapoints with previous days

In [5]:
# Time Series Data
ts_months = ["May", "Jun", "Jul", "Aug", "Sep", "Oct"]
cols = ['Date', 'LastPrice']
stock_prices = pd.DataFrame()
for month in ts_months:
    ts_df = pd.read_csv("Data/PriceHistory/" + month + ".txt", delim_whitespace=True)
    ts_df['Date'] =  ts_df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%y").date())
    
    Gore = ts_df.loc[ts_df['Contract'] == 'Dem'][['Date', 'LastPrice']].fillna(0).reset_index()
    Bush = ts_df.loc[ts_df['Contract'] == 'Rep'][['Date', 'LastPrice']].fillna(0).reset_index()

    # Gore/(Gore + Bush)
    relation = list(zip(Gore['Date'], (Gore['LastPrice']/(Gore['LastPrice'] + Bush['LastPrice'])).fillna(0)))
    stock_prices = stock_prices.append(relation, ignore_index=True)

stock_prices.columns = cols
stock_prices

  and should_run_async(code)


Unnamed: 0,Date,LastPrice
0,2000-05-01,0.523810
1,2000-05-02,0.504970
2,2000-05-03,0.509491
3,2000-05-04,0.511466
4,2000-05-05,0.520875
...,...,...
177,2000-10-27,0.384310
178,2000-10-28,0.296488
179,2000-10-29,0.345703
180,2000-10-30,0.380711


In [6]:
# for i in range(len(unique_dates)):
#     if unique_dates[i] not in list(stock_prices[0]):
#         print (unique_dates[i])

# bigram = Phrases(articles, min_count=1)
# bigrams = [b for b in bigram[articles]]
# articles = bigrams
# bigrams

  and should_run_async(code)


In [7]:
# Create Dictionary
id2word = corpora.Dictionary(articles)

# Attempt at filtering out words that appear too frequently
# id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# id2word.filter_extremes(no_above=0.5)


# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in articles]
doc_word_cnts = (np.array([np.array([(id2word[id], freq) for id, freq in cp]) for cp in corpus]))

# TF-IDF seems to give better coherence (but it wasn't in the paper...)
tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in tfidf_corpus[:1]]

  and should_run_async(code)
  doc_word_cnts = (np.array([np.array([(id2word[id], freq) for id, freq in cp]) for cp in corpus]))


[[('ago', 0.07712049873418031),
  ('awesome', 0.23220574510227418),
  ('backup', 0.2198823985449398),
  ('backups', 0.2515408271170864),
  ('bases', 0.19264069440348208),
  ('bellinger', 0.27548950382241366),
  ('bench', 0.1896343958919212),
  ('bush', 0.007894722475376273),
  ('came', 0.08612993720379283),
  ('catcher', 0.26148042600790294),
  ('clay', 0.2135830725972484),
  ('games', 0.1562902360625982),
  ('girardi', 0.27548950382241366),
  ('homer', 0.21658937110880933),
  ('jim', 0.1245222966630543),
  ('joe', 0.1146922085996351),
  ('leyritz', 0.27548950382241366),
  ('speed', 0.17969479700110466),
  ('stole', 0.20587332073042908),
  ('strength', 0.13402729061444735),
  ('turner', 0.2108175460504276),
  ('two', 0.04788545375938528),
  ('versatility', 0.27548950382241366),
  ('whose', 0.0887458288821732),
  ('yankee', 0.20825706839694694),
  ('yankees', 0.19264069440348208),
  ('years', 0.05159983565074285)]]

In [8]:
date_term_cnts = defaultdict(lambda: [])

for index, row in nytimes.iterrows():
    date = row["Date"]
    content = row["Content"]
    
    date_term_cnts[date] += content
    
date_term_cnts = dict(date_term_cnts)
date_term_cnts = [id2word.doc2bow(text) for text in date_term_cnts.values()]
date_term_cnts = [[(id2word[id], freq) for id, freq in date] for date in date_term_cnts]
date_term_cnts[0]

  and should_run_async(code)


[('ago', 4),
 ('awesome', 2),
 ('backup', 2),
 ('backups', 2),
 ('bases', 2),
 ('bellinger', 2),
 ('bench', 2),
 ('bush', 150),
 ('came', 4),
 ('catcher', 2),
 ('clay', 2),
 ('games', 2),
 ('girardi', 2),
 ('homer', 2),
 ('jim', 2),
 ('joe', 4),
 ('leyritz', 2),
 ('speed', 2),
 ('stole', 2),
 ('strength', 2),
 ('turner', 2),
 ('two', 8),
 ('versatility', 2),
 ('whose', 4),
 ('yankee', 2),
 ('yankees', 2),
 ('years', 16),
 ('across', 2),
 ('also', 14),
 ('april', 2),
 ('chief', 4),
 ('conservative', 8),
 ('george', 22),
 ('get', 6),
 ('gov', 18),
 ('governor', 24),
 ('held', 2),
 ('little', 2),
 ('long', 2),
 ('name', 2),
 ('one', 20),
 ('part', 2),
 ('perhaps', 2),
 ('point', 2),
 ('president', 38),
 ('question', 2),
 ('record', 2),
 ('republican', 24),
 ('running', 10),
 ('social', 16),
 ('state', 12),
 ('states', 8),
 ('texas', 12),
 ('trying', 2),
 ('would', 46),
 ('year', 12),
 ('accounts', 6),
 ('administration', 14),
 ('adviser', 4),
 ('al', 22),
 ('argued', 4),
 ('assertion', 2)

In [9]:
# Build LDA model
k = 10
lda_model = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus,
                                           id2word=id2word,
                                           num_topics=k, 
#                                            minimum_phi_value=0.5, # min threshold for word probabilities
                                           passes=2,
                                           alpha='auto',  # assuming that topic distribution is assymetric. Not all topics equally represented in corpus.
                                           eta='auto',
                                           update_every=1, # online or batch processing (everything is on disk, so use online)
                                           per_word_topics=True)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=articles, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -8.636790510956109

Coherence Score:  0.5518203637253917


In [10]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis

  and should_run_async(code)


Initial thoughts:

We need to de-pluralize the words (governments vs government).
Get the coherence score above 50 would be a good start probably.

Need to extend stop words to include mr.

But topic coherency is still very low

Also, we can double check our topic coherence by comparing with Wikipedia (and other checks the paper did)

In [11]:
# Select the model and print the topics
def get_topics(lda_model, num_topics=-1, num_words=100, prob_thresh=0.8):
    topics = []
    for topic, topic_words in lda_model.print_topics(num_topics=num_topics, num_words=num_words):
        words = topic_words.split(" + ")
        all_words = []
        all_prob = 0
        for elem in words:
            prob, word = elem.split("*")
            all_prob += float(prob)
            all_words.append(word.split('"')[1])

            if all_prob >= prob_thresh:
                break
        topics.append((topic, all_words))

    return topics
topics = get_topics(lda_model)
topics

  and should_run_async(code)


[(0,
  ['mr',
   'lieberman',
   'military',
   'visited',
   'gore',
   'said',
   'clinton',
   'misstated',
   'bush',
   'states',
   'president',
   'yesterday',
   'administration',
   'vice',
   'defense',
   'would',
   'texas',
   'vidal',
   'campaign',
   'page',
   'united',
   'republican',
   'sept',
   'gov',
   'presidential',
   'balkans',
   'russia',
   'abortion',
   'gun',
   'troops',
   'article',
   'front',
   'environmental',
   'national',
   'hillary',
   'cheney',
   'policy',
   'missile',
   'rights',
   'math',
   'support',
   'al',
   'entertainment',
   'senate',
   'democrats',
   'chernomyrdin',
   'law',
   'state',
   'fuzzy',
   'governor',
   'george',
   'foreign',
   'franks',
   'republicans',
   'american',
   'candidates',
   'roe',
   'senator',
   'new',
   'style',
   'weapons',
   'nominee',
   'first',
   'world',
   'food',
   'system',
   'standards',
   'convention',
   'ones',
   'error',
   'running',
   'california',
   'issue',


In [12]:
# Select the model and print the topics

pd.options.display.max_colwidth = None
display(pd.DataFrame(lda_model.print_topics()))

  and should_run_async(code)


Unnamed: 0,0,1
0,0,"0.003*""mr"" + 0.002*""lieberman"" + 0.002*""military"" + 0.002*""visited"" + 0.002*""gore"" + 0.001*""said"" + 0.001*""clinton"" + 0.001*""misstated"" + 0.001*""bush"" + 0.001*""states"""
1,1,"0.002*""nato"" + 0.002*""bosses"" + 0.002*""bosnia"" + 0.001*""resent"" + 0.001*""lindsey"" + 0.001*""somalia"" + 0.001*""weakening"" + 0.001*""haiti"" + 0.001*""caption"" + 0.001*""upstate"""
2,2,"0.002*""glamorous"" + 0.002*""gorey"" + 0.002*""milosevic"" + 0.002*""atlantic"" + 0.001*""madison"" + 0.001*""bushnell"" + 0.001*""anthony"" + 0.001*""embarrass"" + 0.001*""lewis"" + 0.001*""playboy"""
3,3,"0.003*""string"" + 0.002*""voucher"" + 0.001*""refuge"" + 0.001*""arctic"" + 0.001*""kyoto"" + 0.001*""kelly"" + 0.000*""indian"" + 0.000*""indians"" + 0.000*""glover"" + 0.000*""diary"""
4,4,"0.001*""cuba"" + 0.001*""castro"" + 0.001*""bethany"" + 0.001*""ore"" + 0.001*""revival"" + 0.001*""dee"" + 0.001*""herbert"" + 0.001*""greens"" + 0.000*""monument"" + 0.000*""influencing"""
5,5,"0.002*""peacekeeping"" + 0.001*""ridgewood"" + 0.001*""sheriff"" + 0.001*""leone"" + 0.001*""biting"" + 0.001*""minimize"" + 0.001*""taylor"" + 0.000*""holbrooke"" + 0.000*""criner"" + 0.000*""henryk"""
6,6,"0.018*""var"" + 0.016*""string"" + 0.012*""else"" + 0.004*""pat"" + 0.003*""buchanan"" + 0.003*""ralph"" + 0.003*""nader"" + 0.002*""gore"" + 0.002*""cbs"" + 0.002*""bush"""
7,7,"0.003*""heating"" + 0.001*""entertainment"" + 0.001*""leno"" + 0.001*""wilensky"" + 0.001*""ploy"" + 0.001*""tame"" + 0.001*""lieberman"" + 0.001*""supplies"" + 0.001*""ordered"" + 0.001*""secondary"""
8,8,"0.006*""mr"" + 0.003*""gore"" + 0.003*""said"" + 0.003*""debate"" + 0.003*""bush"" + 0.002*""campaign"" + 0.002*""would"" + 0.002*""tax"" + 0.002*""clinton"" + 0.002*""percent"""
9,9,"0.003*""bushnell"" + 0.002*""blondes"" + 0.002*""candace"" + 0.002*""monthly"" + 0.002*""sex"" + 0.002*""crude"" + 0.002*""author"" + 0.001*""manhattan"" + 0.001*""surname"" + 0.001*""collins"""


In [13]:
document_topics = lda_model.get_document_topics(corpus)
date_doc_topics = list(zip(nytimes["Date"], lda_model.get_document_topics(corpus)))
for l in document_topics[:10]:
    print (l)

  and should_run_async(code)


[(2, 0.04045957), (8, 0.9339579)]
[(0, 0.3243524), (8, 0.6666913)]
[(0, 0.30909115), (8, 0.6894434)]
[(0, 0.04109392), (6, 0.026417011), (8, 0.93162614)]
[(0, 0.2568817), (8, 0.74014485)]
[(2, 0.036716957), (8, 0.95034015)]
[(0, 0.18049702), (7, 0.011881296), (8, 0.80311793)]
[(0, 0.08385625), (1, 0.0243626), (5, 0.019589037), (8, 0.86802256)]
[(0, 0.06893439), (6, 0.032124456), (8, 0.89596695)]
[(0, 0.13616636), (8, 0.84633857)]


In [14]:
# for any given day, you look at all the diff topics and identify the prob of that topic
# should I normalize? Paper doesn't seem to normalize...
date_topic_prob = np.zeros((len(unique_dates), k))
for date, article in date_doc_topics:
    i = unique_dates.index(date)
    for topic, prob in article:
        date_topic_prob[i][topic] += prob 

# Figure out how to normalize [reread paper/rewatch lecture]
# date_topic_prob = date_topic_prob/date_topic_prob.max(axis=0)    

  and should_run_async(code)


In [15]:
date_topic = pd.DataFrame(date_topic_prob, index=unique_dates)
date_topic["Date"] = unique_dates
date_topic

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Date
2000-05-01,1.621332,0.102854,0.094026,0.025079,0.000000,0.019590,0.079944,0.011881,9.911343,0.042537,2000-05-01
2000-05-02,0.808829,0.015675,0.063167,0.027439,0.000000,0.000000,0.017249,0.000000,7.951341,0.000000,2000-05-02
2000-05-03,3.981537,0.000000,0.080919,0.660461,0.000000,0.000000,0.361854,1.318624,25.241359,0.000000,2000-05-03
2000-05-04,3.127851,0.029079,0.000000,0.000000,0.000000,0.234628,0.277770,0.022733,14.108353,0.000000,2000-05-04
2000-05-05,2.719636,0.178353,0.000000,0.000000,0.000000,0.000000,0.344190,0.123833,22.396716,0.000000,2000-05-05
...,...,...,...,...,...,...,...,...,...,...,...
2000-10-28,2.505886,0.000000,0.000000,0.000000,0.000000,0.026498,14.066949,0.109456,26.743063,0.046997,2000-10-28
2000-10-29,10.246954,0.136531,0.313021,0.000000,0.198791,0.317390,1.180757,0.077100,58.986601,1.632303,2000-10-29
2000-10-30,2.568369,0.360951,0.224304,0.018136,0.013114,0.000000,8.405873,0.010417,20.968395,0.031752,2000-10-30
2000-10-31,5.339139,0.000000,0.106858,0.000000,0.000000,0.048223,23.992102,0.050947,63.308257,0.000000,2000-10-31


In [16]:
date_topic_prices = date_topic.set_index('Date').join(stock_prices.set_index('Date')).dropna()
date_topic_prices

  and should_run_async(code)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,LastPrice
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-05-01,1.621332,0.102854,0.094026,0.025079,0.000000,0.019590,0.079944,0.011881,9.911343,0.042537,0.523810
2000-05-02,0.808829,0.015675,0.063167,0.027439,0.000000,0.000000,0.017249,0.000000,7.951341,0.000000,0.504970
2000-05-03,3.981537,0.000000,0.080919,0.660461,0.000000,0.000000,0.361854,1.318624,25.241359,0.000000,0.509491
2000-05-04,3.127851,0.029079,0.000000,0.000000,0.000000,0.234628,0.277770,0.022733,14.108353,0.000000,0.511466
2000-05-05,2.719636,0.178353,0.000000,0.000000,0.000000,0.000000,0.344190,0.123833,22.396716,0.000000,0.520875
...,...,...,...,...,...,...,...,...,...,...,...
2000-10-27,5.295602,0.713348,0.398398,0.000000,0.000000,0.000000,22.628495,0.260327,63.268936,0.156157,0.384310
2000-10-28,2.505886,0.000000,0.000000,0.000000,0.000000,0.026498,14.066949,0.109456,26.743063,0.046997,0.296488
2000-10-29,10.246954,0.136531,0.313021,0.000000,0.198791,0.317390,1.180757,0.077100,58.986601,1.632303,0.345703
2000-10-30,2.568369,0.360951,0.224304,0.018136,0.013114,0.000000,8.405873,0.010417,20.968395,0.031752,0.380711


In [17]:
# temp_df = date_topic_prices[[0, 'LastPrice']].copy()
# significance = 0.1

# dt = pd.DataFrame(np.zeros((len(temp_df.columns), len(temp_df.columns))), columns=temp_df.columns, index=temp_df.columns)

# relevant_topics = []
# for c in temp_df.columns:
#     for r in temp_df.columns:
#         test_result = grangercausalitytests(temp_df[[r, c]], maxlag=5, verbose=False)
#         p_values = [round(test_result[i+1][0]['ssr_ftest'][1], 4) for i in range(5)]
        
#         max_p_value_i = np.argmax(p_values)
#         max_p_value = p_values[max_p_value_i]
#         dt.loc[r, c] = max_p_value
        
# if dt.iloc[0, 1] > significance or dt.iloc[1, 0] > significance:
#     relevant_topics.append(dt.columns[0])

# dt.iloc[0, 1] * -1,dt, relevant_topics

  and should_run_async(code)


In [18]:
# https://stackoverflow.com/questions/58005681/is-it-possible-to-run-a-vector-autoregression-analysis-on-a-large-gdp-data-with
def grangers_causality_matrix(data, variables, maxlag=5, test='ssr_ftest', verbose=False):
    dataset = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    lags    = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    
    for c in dataset.columns:
        for r in dataset.index:            
            test_result = grangercausalitytests(data[[r,c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1], 4) for i in range(maxlag)]
            
            if verbose: 
                print(f'Y = {r}, X = {c}, P Values = {p_values}')

            max_p_value_i = np.argmax(p_values)
            max_p_value = p_values[max_p_value_i]
            dataset.loc[r, c] = max_p_value
            
            lags.loc[r, c] = max_p_value_i
    
    return dataset, lags

# grangers_causality_matrix(dataset, variables = dataset.columns)
causality, lags = grangers_causality_matrix(date_topic_prices, variables=date_topic_prices.columns, verbose=False)
display(causality)
display(lags)

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,LastPrice
0,1.0,0.4773,0.4331,0.9868,0.6332,0.586,0.1958,0.3013,0.8192,0.5624,0.8051
1,0.1746,1.0,0.6159,0.3993,0.4145,0.9056,0.9589,0.4811,0.1122,0.8977,0.3061
2,0.0555,0.864,1.0,0.2492,0.7706,0.8359,0.9347,0.7319,0.0143,0.1878,0.9954
3,0.9602,0.7892,0.9448,1.0,0.9686,0.6534,0.9742,0.8327,0.9615,0.7886,0.6963
4,0.4507,0.4466,0.6567,0.8875,1.0,0.6834,0.9614,0.884,0.5303,0.5881,0.986
5,0.4801,0.8156,0.2516,0.7899,0.5134,1.0,0.6086,0.4321,0.5463,0.4863,0.0231
6,0.3561,0.8975,0.5211,0.8655,0.6051,0.4055,1.0,0.2722,0.1119,0.7538,0.9456
7,0.1512,0.7041,0.9861,0.9717,0.6293,0.5585,0.8922,1.0,0.4557,0.6103,0.9181
8,0.646,0.5934,0.113,0.9833,0.3455,0.4971,0.5629,0.8804,1.0,0.2143,0.9283
9,0.4072,0.5711,0.7885,0.9204,0.9275,0.9502,0.232,0.5567,0.1296,1.0,0.89


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,LastPrice
0,0.0,4.0,4.0,4.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0
1,2.0,0.0,0.0,2.0,1.0,4.0,3.0,2.0,4.0,2.0,4.0
2,2.0,2.0,0.0,2.0,0.0,3.0,0.0,1.0,0.0,0.0,4.0
3,4.0,1.0,4.0,0.0,0.0,4.0,4.0,3.0,4.0,3.0,4.0
4,3.0,4.0,1.0,4.0,0.0,4.0,4.0,1.0,4.0,0.0,1.0
5,1.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0
6,0.0,0.0,1.0,4.0,1.0,0.0,0.0,4.0,0.0,0.0,4.0
7,2.0,4.0,0.0,1.0,2.0,0.0,4.0,0.0,4.0,0.0,0.0
8,2.0,4.0,4.0,3.0,1.0,4.0,3.0,4.0,0.0,2.0,0.0
9,2.0,1.0,4.0,4.0,2.0,4.0,0.0,3.0,0.0,0.0,1.0


In [42]:
def get_causal_vars(data, significance=0.95, getLags=False, getCausalSig=False, verbose=False):
    cols = data.columns[:-1]
    causal_vars = []
    causal_lags = []
    
#     i = 0
    for col in cols:
        gc, lags = grangers_causality_matrix(data[[col, 'LastPrice']], 
                                             variables=[col, 'LastPrice'], 
                                             verbose=False)
        
#         if i < 10:
#             display(gc)
#             i += 1
        
        col_causes = gc.loc['LastPrice', col] >= significance
        col_causedBy =  gc.loc[col, 'LastPrice'] >= significance
        if col_causes or col_causedBy:
            if getCausalSig:
                causal_vars.append((col, max(gc.loc['LastPrice', col], gc.loc[col, 'LastPrice'])))
            else:
                causal_vars.append(col)
            
            if getLags:
                # if sig. granger causality for topic causing ts and ts causing topic, choose whichever is higher
                if col_causes and col_causedBy:
                    if gc.loc['LastPrice', col] >= gc.loc[col, 'LastPrice']:
                        causal_lags.append(lags.loc['LastPrice', col])
                    else:
                        causal_lags.append(lags.loc[col, 'LastPrice'] * -1)
                elif col_causes:
                    causal_lags.append(lags.loc['LastPrice', col])
                else:
                    causal_lags.append(lags.loc[col, 'LastPrice'] * -1)
    if getLags:
        return causal_vars, causal_lags
    return causal_vars
                
causal_topics, ct_lags = get_causal_vars(date_topic_prices, getLags=True, getCausalSig=True)
causal_topics, ct_lags

  and should_run_async(code)


Unnamed: 0,0,LastPrice
0,1.0,0.8051
LastPrice,0.8799,1.0


Unnamed: 0,1,LastPrice
1,1.0,0.3061
LastPrice,0.7234,1.0


Unnamed: 0,2,LastPrice
2,1.0,0.9954
LastPrice,0.2516,1.0


Unnamed: 0,3,LastPrice
3,1.0,0.6963
LastPrice,0.4499,1.0


Unnamed: 0,4,LastPrice
4,1.0,0.986
LastPrice,0.5516,1.0


Unnamed: 0,5,LastPrice
5,1.0,0.0231
LastPrice,0.7473,1.0


Unnamed: 0,6,LastPrice
6,1.0,0.9456
LastPrice,0.5313,1.0


Unnamed: 0,7,LastPrice
7,1.0,0.9181
LastPrice,0.8637,1.0


Unnamed: 0,8,LastPrice
8,1.0,0.9283
LastPrice,0.8776,1.0


Unnamed: 0,9,LastPrice
9,1.0,0.89
LastPrice,0.6199,1.0


([(2, 0.9954), (4, 0.986)], [-4.0, -1.0])

In [20]:
# def get_causal_topics(gc, lags, significance=0.95):
#     keep_topics = gc[gc['LastPrice' ] > significance].index[:-1] 
#     keep_lags = list(lags.loc[keep_topics, 'LastPrice'])

#     keep_topics_temp = (gc.loc["LastPrice", gc.loc["LastPrice"] > significance].index[:-1])
#     keep_topics = keep_topics.append(keep_topics_temp)
#     keep_lags += list(lags.loc["LastPrice", keep_topics_temp])

#     keep_topics = list(keep_topics)
#     return list(zip(keep_topics, keep_lags))

# causal_topics = get_causal_topics(causality, lags)
# causal_topics

  and should_run_async(code)


In [21]:
# # for all topics, if a majority of docs from one date are in that topic, you're going to label that topic with that date
# def get_topic_date(date_doc_topics, causal_topics):
#     topic_date_cnts = {key: {} for key in causal_topics}
    
#     for date, doc in date_doc_topics:
#         for topic, prob in doc:
#             if topic in causal_topics:
#                 try:
#                     topic_date_cnts[topic][date] += 1
#                 except KeyError:
#                     topic_date_cnts[topic][date] = 1
    
#     topic_date = []
    
#     for topic in topic_date_cnts:
#         max_date = max(topic_date_cnts[topic], key=lambda key: topic_date_cnts[topic][key])
#         topic_date += [(topic, max_date)]
        
#     return topic_date
    
# causalTopic_dates = get_topic_date(date_doc_topics, [topic for topic, lag in causal_topics])
# causalTopic_dates

  and should_run_async(code)


In [23]:
def get_word_stream(nytimes, topics, causal_topics):
    ct_ws = []
    for ct in causal_topics:
        causal_vocab = list(set(topics[ct][1]))
        date_terms = pd.DataFrame(np.zeros((len(unique_dates), len(causal_vocab))), index=unique_dates, columns=causal_vocab)

        for date, doc in zip(nytimes['Date'], date_term_cnts):
            for word, count in doc:
                try:
                     date_terms.loc[date, word] += int(count)
                except KeyError:
                    pass
        ct_ws.append((ct, date_terms))
    
    return ct_ws

ct_ws = get_word_stream(nytimes, topics, causal_topics)
ct_ws

  and should_run_async(code)


[(2,
                said  weakening  hypothetical  baron  bushehr  iranian  memoir  \
  2000-05-01   574.0        0.0           1.0    0.0      0.0      0.0     0.0   
  2000-05-02   287.0        0.0           0.0    6.0      0.0      0.0     0.0   
  2000-05-03  1552.0        1.0           1.0    0.0     13.0      3.0     1.0   
  2000-05-04   912.0        2.0           1.0    0.0      1.0      0.0     2.0   
  2000-05-05  1485.0        2.0           1.0    6.0      0.0      0.0     6.0   
  ...            ...        ...           ...    ...      ...      ...     ...   
  2000-10-28     0.0        0.0           0.0    0.0      0.0      0.0     0.0   
  2000-10-29     0.0        0.0           0.0    0.0      0.0      0.0     0.0   
  2000-10-30     0.0        0.0           0.0    0.0      0.0      0.0     0.0   
  2000-10-31     0.0        0.0           0.0    0.0      0.0      0.0     0.0   
  2000-11-01     0.0        0.0           0.0    0.0      0.0      0.0     0.0   
  
        

In [43]:
def get_impact_words(topic_wordstream, significance=0.95, verbose=False):
    topic_impact_words = []
    
    first = True
    for topic, ws in topic_wordstream:
        ws_prices = ws.join(stock_prices.set_index('Date')).dropna()        
        ws_gc = get_causal_vars(ws_prices, significance=significance, getCausalSig=True, verbose=verbose)
        
#         if first:
#             display(ws_gc)
#             first = False
        
        pos = []
        neg = []
        for word, sig in ws_gc:                
            corr = pearsonr(ws_prices[word], stock_prices['LastPrice'])[0]
            if corr > 0:
                pos.append((word, sig))
            else:
                neg.append((word, sig))
                
        topic_impact_words.append((topic, pos, neg))
    
    return topic_impact_words
        

impact_words = get_impact_words(ct_ws)
impact_words

  and should_run_async(code)


Unnamed: 0,said,LastPrice
said,1.0,0.8869
LastPrice,0.9901,1.0


Unnamed: 0,weakening,LastPrice
weakening,1.0,0.9843
LastPrice,0.7032,1.0


Unnamed: 0,hypothetical,LastPrice
hypothetical,1.0,0.9255
LastPrice,0.8779,1.0


  (res2down.ssr - res2djoint.ssr)
  fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  lr = -2 * (res2down.llf - res2djoint.llf)
  F /= J


Unnamed: 0,baron,LastPrice
baron,,
LastPrice,0.9171,1.0


Unnamed: 0,bushehr,LastPrice
bushehr,1.0,0.9151
LastPrice,0.9909,1.0


Unnamed: 0,iranian,LastPrice
iranian,1.0,0.209
LastPrice,0.9964,1.0


  (res2down.ssr - res2djoint.ssr)
  fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  lr = -2 * (res2down.llf - res2djoint.llf)
  F /= J


Unnamed: 0,memoir,LastPrice
memoir,,
LastPrice,0.8972,1.0


Unnamed: 0,leak,LastPrice
leak,1.0,0.9942
LastPrice,0.9469,1.0


Unnamed: 0,unsustainable,LastPrice
unsustainable,1.0,0.0338
LastPrice,0.9996,1.0


Unnamed: 0,muted,LastPrice
muted,1.0,0.4837
LastPrice,0.2544,1.0


Unnamed: 0,court,LastPrice
court,1.0,0.8544
LastPrice,0.802,1.0


Unnamed: 0,dee,LastPrice
dee,1.0,1.0
LastPrice,0.9792,1.0


Unnamed: 0,campaign,LastPrice
campaign,1.0,0.8514
LastPrice,0.9762,1.0


  (res2down.ssr - res2djoint.ssr)
  fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  lr = -2 * (res2down.llf - res2djoint.llf)
  F /= J


Unnamed: 0,profitably,LastPrice
profitably,,
LastPrice,0.936,1.0


Unnamed: 0,defying,LastPrice
defying,1.0,0.6302
LastPrice,0.8843,1.0


Unnamed: 0,said,LastPrice
said,1.0,0.8869
LastPrice,0.9901,1.0


Unnamed: 0,improbable,LastPrice
improbable,1.0,0.9809
LastPrice,0.9799,1.0


Unnamed: 0,greens,LastPrice
greens,1.0,0.9077
LastPrice,0.9952,1.0


Unnamed: 0,kiss,LastPrice
kiss,1.0,0.8741
LastPrice,0.9433,1.0


Unnamed: 0,standardized,LastPrice
standardized,1.0,0.9923
LastPrice,0.6147,1.0


  (res2down.ssr - res2djoint.ssr)
  fgc2 = res2down.nobs * (res2down.ssr - res2djoint.ssr) / res2djoint.ssr
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  lr = -2 * (res2down.llf - res2djoint.llf)
  F /= J


[(2,
  [('iranian', 0.9964),
   ('unsustainable', 0.9996),
   ('replacing', 0.9689),
   ('clemens', 0.9835),
   ('honda', 0.973),
   ('parking', 0.9977),
   ('ministers', 0.9885),
   ('graeme', 0.9947),
   ('logo', 0.9961)],
  [('said', 0.9901),
   ('weakening', 0.9843),
   ('bushehr', 0.9909),
   ('leak', 0.9942),
   ('glamorous', 0.9884),
   ('pure', 0.9876),
   ('publicized', 0.9946),
   ('georgewbush', 0.9701),
   ('mr', 0.9506),
   ('gorey', 0.9999),
   ('innings', 0.9993),
   ('bushnell', 0.9987),
   ('milosevic', 0.9927),
   ('embarrassing', 0.9977),
   ('zoo', 0.992),
   ('embarrass', 0.9839),
   ('wells', 0.9701),
   ('pushing', 0.9939),
   ('bomb', 0.9873),
   ('erased', 0.9736),
   ('arnold', 0.9533),
   ('visiting', 0.9933),
   ('anthony', 0.9968),
   ('hefner', 0.9964),
   ('anachronism', 0.9782),
   ('concerts', 1.0),
   ('isolationist', 0.9991),
   ('triumphs', 0.9996),
   ('nassau', 0.9993),
   ('london', 0.9888),
   ('toronto', 0.9891),
   ('morris', 0.9983),
   ('mode

In [None]:
ws_prices = ct_ws.join(stock_prices.set_index('Date')).dropna()
ws_impact, ws_lags = grangers_causality_matrix(ws_prices, variables=ws_prices.columns, verbose=False)

display(ws_prices)
display(ws_impact)
display(ws_lags)

In [None]:
corr = pearsonr(ws_prices["cleansing"], stock_prices['LastPrice'])[0]
corr

In [None]:
stock_prices

In [None]:
ct_ws[0][1].columns

In [None]:
# pass prob to 