In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import datetime

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk import ngrams

# Gensim
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases # TODO: to create bigrams with

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt 

from scipy.stats import pearsonr

import statsmodels
from statsmodels.tsa.stattools import grangercausalitytests

  from collections import Mapping


In [2]:
stop_words = stopwords.words('english')
# stop_words.extend(['mr', 'ms', 'said'])

  and should_run_async(code)


In [3]:
# def lemmatize(content, tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     nlp = spacy.load('en', disable=['parser', 'ner'])
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

# Tokenize and remove stop words from content
def tokenize(content, lemmatize=False):
    words = gensim.utils.simple_preprocess(content, deacc=True)  # tokenizes
    return words

def remove_stopwords(content):
    words = []
    for word in content:
        if word in stop_words:
            continue
        words.append(word)
    return words

  and should_run_async(code)


We are not lemmatizing or stemming. If we need to increase accuracy in the future, we can consider it.

In [4]:
# New York Times Data
rows = []
dates = []
articles = []
for month in range(5, 11):
    with open("Data/NYTimes/"+ str(month) + ".txt") as f:
        for i, line in enumerate(f):
            date, article = line.split(",", 1)
            timestamp = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z").date()
            tokenized = tokenize(article)
            destopped = remove_stopwords(tokenized)

            articles.append(destopped)
            dates.append(timestamp)
            rows.append([timestamp, destopped])

nytimes = pd.DataFrame(rows, columns=["Date", "Content"]) 
unique_dates = sorted(list(set(nytimes["Date"])))
# print (unique_dates)
nytimes

  and should_run_async(code)


Unnamed: 0,Date,Content
0,2000-05-03,"[two, years, ago, homer, bush, came, yankee, b..."
1,2000-05-02,"[texas, record, tell, op, ed, april, paul, bur..."
2,2000-05-01,"[top, foreign, policy, adviser, gov, george, b..."
3,2000-05-03,"[aides, gov, george, bush, fought, back, today..."
4,2000-05-03,"[gov, tommy, thompson, wisconsin, named, chair..."
...,...,...
5801,2000-10-31,"[new, york, times, cbs, news, poll, var, strin..."
5802,2000-10-31,"[tick, tock, diner, ted, friedrich, stockbroke..."
5803,2000-11-01,"[difference, us, vital, issue, would, go, wash..."
5804,2000-11-01,"[bush, administration, wanted, overturn, would..."


There are 3 days missing from the stock market data: 6/07, 6/08, 11/01. There are several ways we can deal with this. 
1. Toss out the three days from the NYTimes data
2. Condense 6/07 --> 6/06; 6/08 --> 6/09 (or something similar) and toss out 11/01. 
3. Something else that I can't think of at the moment

I also haven't looked at the paper to see how they deal with it yet.

Edit: Reading over some articles about time series, it seems that we should pad the missing datapoints with previous days

In [5]:
# Time Series Data
ts_months = ["May", "Jun", "Jul", "Aug", "Sep", "Oct"]
cols = ['Date', 'LastPrice']
stock_prices = pd.DataFrame()
for month in ts_months:
    ts_df = pd.read_csv("Data/PriceHistory/" + month + ".txt", delim_whitespace=True)
    ts_df['Date'] =  ts_df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%y").date())
    
    Gore = ts_df.loc[ts_df['Contract'] == 'Dem'][['Date', 'LastPrice']].fillna(0).reset_index()
    Bush = ts_df.loc[ts_df['Contract'] == 'Rep'][['Date', 'LastPrice']].fillna(0).reset_index()

    # Gore/(Gore + Bush)
    relation = list(zip(Gore['Date'], (Gore['LastPrice']/(Gore['LastPrice'] + Bush['LastPrice'])).fillna(0)))
    stock_prices = stock_prices.append(relation, ignore_index=True)

stock_prices.columns = cols
stock_prices

  and should_run_async(code)


Unnamed: 0,Date,LastPrice
0,2000-05-01,0.523810
1,2000-05-02,0.504970
2,2000-05-03,0.509491
3,2000-05-04,0.511466
4,2000-05-05,0.520875
...,...,...
177,2000-10-27,0.384310
178,2000-10-28,0.296488
179,2000-10-29,0.345703
180,2000-10-30,0.380711


In [6]:
# for i in range(len(unique_dates)):
#     if unique_dates[i] not in list(stock_prices[0]):
#         print (unique_dates[i])

# bigram = Phrases(articles, min_count=1)
# bigrams = [b for b in bigram[articles]]
# articles = bigrams
# bigrams

  and should_run_async(code)


In [91]:
# Create Dictionary
id2word = corpora.Dictionary(articles)

# Attempt at filtering out words that appear too frequently
# id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# id2word.filter_extremes(no_above=0.5)


# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in articles]
doc_word_cnts = (np.array([np.array([(id2word[id], freq) for id, freq in cp]) for cp in corpus]))

# TF-IDF seems to give better coherence (but it wasn't in the paper...)
tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in tfidf_corpus[:1]]

  and should_run_async(code)
  doc_word_cnts = (np.array([np.array([(id2word[id], freq) for id, freq in cp]) for cp in corpus]))


[[('ago', 0.07712049873418031),
  ('awesome', 0.23220574510227418),
  ('backup', 0.2198823985449398),
  ('backups', 0.2515408271170864),
  ('bases', 0.19264069440348208),
  ('bellinger', 0.27548950382241366),
  ('bench', 0.1896343958919212),
  ('bush', 0.007894722475376273),
  ('came', 0.08612993720379283),
  ('catcher', 0.26148042600790294),
  ('clay', 0.2135830725972484),
  ('games', 0.1562902360625982),
  ('girardi', 0.27548950382241366),
  ('homer', 0.21658937110880933),
  ('jim', 0.1245222966630543),
  ('joe', 0.1146922085996351),
  ('leyritz', 0.27548950382241366),
  ('speed', 0.17969479700110466),
  ('stole', 0.20587332073042908),
  ('strength', 0.13402729061444735),
  ('turner', 0.2108175460504276),
  ('two', 0.04788545375938528),
  ('versatility', 0.27548950382241366),
  ('whose', 0.0887458288821732),
  ('yankee', 0.20825706839694694),
  ('yankees', 0.19264069440348208),
  ('years', 0.05159983565074285)]]

In [78]:
# Build LDA model
k = 10
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=k, 
#                                            minimum_phi_value=0.5, # min threshold for word probabilities
                                           passes=2,
                                           alpha='auto',  # assuming that topic distribution is assymetric. Not all topics equally represented in corpus.
                                           eta='auto',
                                           update_every=1, # online or batch processing (everything is on disk, so use online)
                                           per_word_topics=True)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=articles, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Perplexity:  -8.051365068593375

Coherence Score:  0.35735223993332144


In [9]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis

  and should_run_async(code)


Initial thoughts:

We need to de-pluralize the words (governments vs government).
Get the coherence score above 50 would be a good start probably.

Need to extend stop words to include mr.

But topic coherency is still very low

Also, we can double check our topic coherence by comparing with Wikipedia (and other checks the paper did)

In [43]:
# Select the model and print the topics
def get_topics(lda_model, num_topics=-1, num_words=100, prob_thresh=0.8):
    topics = []
    for topic, topic_words in lda_model.print_topics(num_topics=num_topics, num_words=num_words):
        words = topic_words.split(" + ")
        all_words = []
        all_prob = 0
        for elem in words:
            prob, word = elem.split("*")
            all_prob += float(prob)
            all_words.append(word.split('"')[1])

            if all_prob >= prob_thresh:
                break
        topics.append((topic, all_words))

    return topics
topics = get_topics(lda_model)
topics

  and should_run_async(code)


[(0,
  ['kiss',
   'simon',
   'arthritis',
   'puzzlement',
   'minimize',
   'clemens',
   'anthony',
   'disconcerting',
   'casey',
   'flash',
   'inexperienced',
   'medicine',
   'cherished',
   'diary',
   'bigotry',
   'ethics',
   'spells',
   'clark',
   'pledges',
   'navy',
   'import',
   'worthy',
   'wells',
   'salt',
   'lieberman',
   'pryor',
   'homer',
   'teams',
   'manufacturer',
   'wooing',
   'glass',
   'claritin',
   'eliot',
   'persuasion',
   'stained',
   'abm',
   'performances',
   'uncertainty',
   'lent',
   'canal',
   'lovers',
   'cleansing',
   'techniques',
   'diane',
   'shiloh',
   'negotiators',
   'page',
   'schering',
   'upset',
   'accepts',
   'plough',
   'front',
   'humans',
   'aug',
   'evaluate',
   'toronto',
   'mr',
   'medication',
   'dissolved',
   'governorship',
   'deepest',
   'raiservice',
   'pet',
   'jews',
   'treaty',
   'beans',
   'menu',
   'lodine',
   'identity',
   'restaurant',
   'delayed',
   'torricell

In [44]:
# Select the model and print the topics

pd.options.display.max_colwidth = None
display(pd.DataFrame(lda_model.print_topics()))

  and should_run_async(code)


Unnamed: 0,0,1
0,0,"0.001*""kiss"" + 0.001*""simon"" + 0.001*""arthritis"" + 0.001*""puzzlement"" + 0.000*""minimize"" + 0.000*""clemens"" + 0.000*""anthony"" + 0.000*""disconcerting"" + 0.000*""casey"" + 0.000*""flash"""
1,1,"0.001*""kyoto"" + 0.001*""jesus"" + 0.001*""biting"" + 0.000*""terminal"" + 0.000*""allen"" + 0.000*""philosopher"" + 0.000*""buildings"" + 0.000*""waterfront"" + 0.000*""juries"" + 0.000*""humiliated"""
2,2,"0.005*""mr"" + 0.003*""said"" + 0.003*""gore"" + 0.003*""tax"" + 0.003*""would"" + 0.003*""plan"" + 0.002*""bush"" + 0.002*""oil"" + 0.002*""campaign"" + 0.002*""debate"""
3,3,"0.002*""mcginn"" + 0.002*""muslim"" + 0.001*""reprieve"" + 0.001*""dna"" + 0.001*""execution"" + 0.001*""surname"" + 0.001*""inmate"" + 0.001*""dow"" + 0.001*""evasions"" + 0.001*""misspelled"""
4,4,"0.001*""haass"" + 0.001*""location"" + 0.001*""goldsmith"" + 0.001*""upstate"" + 0.001*""prescribed"" + 0.001*""awarded"" + 0.001*""leone"" + 0.001*""holbrooke"" + 0.001*""chronicle"" + 0.001*""india"""
5,5,"0.002*""bushwick"" + 0.002*""gail"" + 0.001*""corzine"" + 0.001*""brooklyn"" + 0.001*""diner"" + 0.001*""quincy"" + 0.001*""ridgewood"" + 0.001*""detective"" + 0.001*""mccollum"" + 0.001*""adams"""
6,6,"0.002*""gorey"" + 0.002*""bosses"" + 0.001*""rats"" + 0.001*""choose"" + 0.001*""geeks"" + 0.001*""cuomo"" + 0.001*""purely"" + 0.001*""entirety"" + 0.000*""candace"" + 0.000*""mccall"""
7,7,"0.029*""string"" + 0.025*""var"" + 0.018*""else"" + 0.006*""pat"" + 0.006*""buchanan"" + 0.005*""ralph"" + 0.005*""nader"" + 0.003*""gore"" + 0.002*""cbs"" + 0.002*""green"""
8,8,"0.005*""mr"" + 0.004*""lieberman"" + 0.003*""gore"" + 0.003*""nader"" + 0.002*""said"" + 0.002*""debate"" + 0.002*""percent"" + 0.002*""voters"" + 0.002*""campaign"" + 0.002*""bush"""
9,9,"0.002*""barrels"" + 0.001*""carbon"" + 0.001*""leno"" + 0.001*""fires"" + 0.001*""retarded"" + 0.001*""yorkers"" + 0.001*""moscow"" + 0.001*""madison"" + 0.001*""misidentified"" + 0.001*""ordered"""


In [45]:
document_topics = lda_model.get_document_topics(corpus)
date_doc_topics = list(zip(nytimes["Date"], lda_model.get_document_topics(corpus)))
for l in document_topics[:10]:
    print (l)

  and should_run_async(code)


[(0, 0.010476502), (2, 0.11714508), (3, 0.0112605505), (4, 0.010195531), (5, 0.0109097), (7, 0.6463436), (8, 0.16330336), (9, 0.011147026)]
[(2, 0.59236306), (3, 0.010329096), (5, 0.010007265), (7, 0.011145482), (8, 0.32933918), (9, 0.01022496)]
[(2, 0.6083541), (8, 0.35104874)]
[(2, 0.85344124), (8, 0.11085776)]
[(2, 0.03919201), (8, 0.9143367)]
[(2, 0.07631529), (8, 0.85570115)]
[(2, 0.66931486), (8, 0.28913808)]
[(2, 0.29158968), (3, 0.041484423), (8, 0.61582947)]
[(2, 0.50365764), (8, 0.4427186)]
[(0, 0.012239824), (1, 0.011007043), (2, 0.07444505), (3, 0.013155841), (4, 0.0119115645), (5, 0.012745936), (6, 0.011446362), (7, 0.014195635), (8, 0.82582957), (9, 0.013023207)]


In [46]:
# for any given day, you look at all the diff topics and identify the prob of that topic
# should I normalize? Paper doesn't seem to normalize...
date_topic_prob = np.zeros((len(unique_dates), k))
for date, article in date_doc_topics:
    i = unique_dates.index(date)
    for topic, prob in article:
        date_topic_prob[i][topic] += prob 

# Figure out how to normalize [reread paper/rewatch lecture]
# date_topic_prob = date_topic_prob/date_topic_prob.max(axis=0)    

  and should_run_async(code)


In [47]:
date_topic = pd.DataFrame(date_topic_prob, index=unique_dates)
date_topic["Date"] = unique_dates
date_topic

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Date
2000-05-01,0.024491,0.022024,6.592127,0.077900,0.023834,0.025503,0.022903,0.050060,4.373290,0.269631,2000-05-01
2000-05-02,0.038868,0.083333,2.914256,0.062829,0.295377,0.060871,0.036349,0.067795,5.089809,0.062195,2000-05-02
2000-05-03,0.129160,0.041622,16.590956,0.602970,0.086395,1.166008,0.782960,2.514574,8.791069,0.179511,2000-05-03
2000-05-04,0.088061,0.031278,10.866314,0.115434,0.085700,0.724263,0.063018,0.145146,5.017906,0.283471,2000-05-04
2000-05-05,0.200214,0.000000,12.561383,0.489530,0.041574,0.515298,0.020171,0.117207,10.978853,0.087822,2000-05-05
...,...,...,...,...,...,...,...,...,...,...,...
2000-10-28,0.445706,0.243035,17.024804,0.426414,0.283709,0.324562,0.262792,10.955701,12.338105,0.470065,2000-10-28
2000-10-29,0.599238,0.395619,31.018345,0.598833,0.351309,1.139376,0.337588,3.085665,33.364619,0.925718,2000-10-29
2000-10-30,0.231184,0.169742,12.074686,0.258669,0.224984,0.240744,0.770272,6.368464,11.529713,0.256061,2000-10-30
2000-10-31,1.119083,0.474256,37.445542,1.131816,0.576392,1.020954,0.533998,17.982314,30.437469,0.901880,2000-10-31


In [48]:
joined = date_topic.set_index('Date').join(stock_prices.set_index('Date')).dropna()
joined

  and should_run_async(code)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,LastPrice
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-05-01,0.024491,0.022024,6.592127,0.077900,0.023834,0.025503,0.022903,0.050060,4.373290,0.269631,0.523810
2000-05-02,0.038868,0.083333,2.914256,0.062829,0.295377,0.060871,0.036349,0.067795,5.089809,0.062195,0.504970
2000-05-03,0.129160,0.041622,16.590956,0.602970,0.086395,1.166008,0.782960,2.514574,8.791069,0.179511,0.509491
2000-05-04,0.088061,0.031278,10.866314,0.115434,0.085700,0.724263,0.063018,0.145146,5.017906,0.283471,0.511466
2000-05-05,0.200214,0.000000,12.561383,0.489530,0.041574,0.515298,0.020171,0.117207,10.978853,0.087822,0.520875
...,...,...,...,...,...,...,...,...,...,...,...
2000-10-27,0.824211,0.624444,38.928351,1.926377,0.782506,1.223071,0.960232,17.073540,28.986691,1.087956,0.384310
2000-10-28,0.445706,0.243035,17.024804,0.426414,0.283709,0.324562,0.262792,10.955701,12.338105,0.470065,0.296488
2000-10-29,0.599238,0.395619,31.018345,0.598833,0.351309,1.139376,0.337588,3.085665,33.364619,0.925718,0.345703
2000-10-30,0.231184,0.169742,12.074686,0.258669,0.224984,0.240744,0.770272,6.368464,11.529713,0.256061,0.380711


In [64]:
# https://stackoverflow.com/questions/58005681/is-it-possible-to-run-a-vector-autoregression-analysis-on-a-large-gdp-data-with
# Can I rewrite this so that I only measure stock_price on topic and not topic on topic
# 11/24 - changed from chi^2 to f
def grangers_causality_matrix(data, variables, maxlag=5, test='ssr_ftest', verbose=False):
    dataset = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    lags    = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    
    for c in dataset.columns:
        for r in dataset.index:            
            test_result = grangercausalitytests(data[[r,c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1], 4) for i in range(maxlag)]
            
            if verbose: 
                print(f'Y = {r}, X = {c}, P Values = {p_values}')

            max_p_value_i = np.argmax(p_values)
            max_p_value = p_values[max_p_value_i]
            dataset.loc[r, c] = max_p_value
            
            lags.loc[r, c] = max_p_value_i
    
    return dataset, lags

# grangers_causality_matrix(dataset, variables = dataset.columns)
causality, lags = grangers_causality_matrix(joined, variables=joined.columns, verbose=False)
display(causality)
display(lags)

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,LastPrice
0,1.0,0.9159,0.0223,0.2516,0.642,0.51,0.9862,0.1433,0.1708,0.5967,0.9832
1,0.8656,1.0,0.2317,0.1685,0.94,0.4512,0.6934,0.0683,0.4399,0.6672,0.5237
2,0.5992,0.6411,1.0,0.7782,0.8251,0.3832,0.9611,0.4004,0.3621,0.1449,0.8541
3,0.8043,0.2619,0.2321,1.0,0.7057,0.5495,0.6683,0.4888,0.1535,0.1931,0.3009
4,0.5951,0.7258,0.4393,0.3318,1.0,0.2978,0.6507,0.6678,0.9207,0.657,0.9249
5,0.3412,0.0692,0.1614,0.2148,0.6802,1.0,0.8395,0.0383,0.0905,0.1102,0.1438
6,0.9873,0.5242,0.8196,0.9135,0.6386,0.6923,1.0,0.0853,0.8756,0.3177,0.0068
7,0.0139,0.0761,0.0118,0.0278,0.0708,0.2113,0.8127,1.0,0.0163,0.0047,0.8669
8,0.8207,0.9228,0.1371,0.8602,0.7071,0.7979,0.7649,0.5049,1.0,0.3198,0.924
9,0.5745,0.6655,0.1161,0.4774,0.9286,0.5451,0.6443,0.3038,0.1192,1.0,0.3419


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,LastPrice
0,0.0,2.0,0.0,4.0,3.0,0.0,0.0,2.0,0.0,3.0,4.0
1,0.0,0.0,4.0,4.0,0.0,1.0,0.0,2.0,0.0,3.0,2.0
2,4.0,4.0,0.0,3.0,2.0,2.0,1.0,4.0,0.0,4.0,0.0
3,4.0,0.0,4.0,0.0,1.0,4.0,0.0,4.0,4.0,3.0,0.0
4,1.0,4.0,2.0,1.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0
5,4.0,0.0,0.0,4.0,3.0,0.0,0.0,2.0,0.0,4.0,2.0
6,4.0,2.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0
7,3.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,1.0,2.0
8,4.0,4.0,3.0,4.0,2.0,3.0,2.0,2.0,0.0,4.0,3.0
9,4.0,0.0,0.0,0.0,4.0,4.0,2.0,2.0,0.0,0.0,4.0


In [62]:
def get_causal_topics(gc, lags, significance=0.95):
    keep_topics = gc[gc['LastPrice' ] > significance].index[:-1] 
    keep_lags = list(lags.loc[keep_topics, 'LastPrice'])

    keep_topics_temp = (gc.loc["LastPrice", gc.loc["LastPrice"] > significance].index[:-1])
    keep_topics = keep_topics.append(keep_topics_temp)
    keep_lags += list(lags.loc["LastPrice", keep_topics_temp])

    keep_topics = list(keep_topics)
    return list(zip(keep_topics, keep_lags))

causal_topics = get_causal_topics(causality, lags)
causal_topics

  and should_run_async(code)


[(0, 4.0), (6, 2.0)]

In [51]:
# # for all topics, if a majority of docs from one date are in that topic, you're going to label that topic with that date
# def get_topic_date(date_doc_topics, causal_topics):
#     topic_date_cnts = {key: {} for key in causal_topics}
    
#     for date, doc in date_doc_topics:
#         for topic, prob in doc:
#             if topic in causal_topics:
#                 try:
#                     topic_date_cnts[topic][date] += 1
#                 except KeyError:
#                     topic_date_cnts[topic][date] = 1
    
#     topic_date = []
    
#     for topic in topic_date_cnts:
#         max_date = max(topic_date_cnts[topic], key=lambda key: topic_date_cnts[topic][key])
#         topic_date += [(topic, max_date)]
        
#     return topic_date
    
# causalTopic_dates = get_topic_date(date_doc_topics, [topic for topic, lag in causal_topics])
# causalTopic_dates

  and should_run_async(code)


[(0, datetime.date(2000, 10, 27)), (6, datetime.date(2000, 10, 27))]

In [104]:
def get_word_stream(nytimes, topics, causal_topics):
    ct_ws = []
    for ct in causal_topics:
        causal_vocab = list(set(topics[ct][1]))
        date_terms = pd.DataFrame(np.zeros((len(unique_dates), len(causal_vocab))), index=unique_dates, columns=causal_vocab)

        for date, doc in zip(nytimes['Date'], doc_word_cnts):
            for word, count in doc:
                try:
                     date_terms.loc[date, word] += int(count)
                except KeyError:
                    pass
        ct_ws.append((ct, date_terms))
    
    return ct_ws

ct_ws = get_word_stream(nytimes, topics, [topic for topic, lag in causal_topics])
ct_ws

  and should_run_async(code)


[(0,
              cleansing  willey  legendary  woolsey  recapture  evaluate  \
  2000-05-01        2.0     0.0        0.0      0.0        0.0       0.0   
  2000-05-02        0.0     0.0        0.0      0.0        0.0       0.0   
  2000-05-03        0.0     0.0        0.0      0.0        0.0       0.0   
  2000-05-04        0.0     0.0        0.0      0.0        0.0       0.0   
  2000-05-05        0.0     0.0        0.0      0.0        0.0       0.0   
  ...               ...     ...        ...      ...        ...       ...   
  2000-10-28        0.0     0.0        0.0      0.0        0.0       1.0   
  2000-10-29        0.0     0.0        0.0      0.0        0.0       0.0   
  2000-10-30        0.0     0.0        0.0      0.0        0.0       0.0   
  2000-10-31        0.0     0.0        2.0      0.0        0.0       0.0   
  2000-11-01        0.0     0.0        0.0      0.0        0.0       0.0   
  
              lovers  humans  glorious  salt  ...  negotiating  claritin  pryor 

In [116]:
def get_impact_words(topic_wordstream, significance=0.95):
    topic_impact_words = []
    
    for topic, ws in topic_wordstream:
        ws_prices = ws.join(stock_prices.set_index('Date')).dropna()
        ws_gc, _ = grangers_causality_matrix(ws_prices, variables=ws_prices.columns, verbose=False)
        
#         display(ws_gc)
        keep_words = ws_gc[ws_gc['LastPrice' ] > significance].index[:-1] 

        keep_words_temp = (ws_gc.loc["LastPrice", ws_gc.loc["LastPrice"] > significance].index[:-1])
        keep_words = keep_words.append(keep_words_temp)
        
        pos = []
        neg = []
        for word in keep_words:
            corr = pearsonr(ws_prices[word], stock_prices['LastPrice'])[0]
            if abs(corr) >= significance:
                print (word, corr)
                if corr > 0:
                    pos.append((word, corr))
                else:
                    neg.append((word, corr))
        topic_impact_words.append((topic, pos, neg))
    
    return topic_impact_words
        

impact_words = get_impact_words(ct_ws)
impact_words

  and should_run_async(code)


[(0, [], []), (6, [], [])]

In [59]:
ws_prices = ct_ws.join(stock_prices.set_index('Date')).dropna()
ws_impact, ws_lags = grangers_causality_matrix(ws_prices, variables=ws_prices.columns, verbose=False)

display(ws_prices)
display(ws_impact)
display(ws_lags)

  and should_run_async(code)


Unnamed: 0,cleansing,willey,oversight,legendary,imports,woolsey,recapture,penn,evaluate,lovers,...,sizzle,dissolved,persuasion,choose,peek,underpaid,racicot,candace,publicized,LastPrice
2000-05-01,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.523810
2000-05-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.504970
2000-05-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.509491
2000-05-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511466
2000-05-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.520875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000-10-27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.384310
2000-10-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.296488
2000-10-29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.345703
2000-10-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.380711


Unnamed: 0,cleansing,willey,oversight,legendary,imports,woolsey,recapture,penn,evaluate,lovers,...,sizzle,dissolved,persuasion,choose,peek,underpaid,racicot,candace,publicized,LastPrice
cleansing,1.0000,1.0000,0.9721,0.8977,0.0000,1.0000,0.9995,1.0000,0.2825,0.9998,...,1.0000,0.9999,0.8639,0.0241,0.8763,1.0000,0.9983,0.8490,0.7411,0.5432
willey,0.9997,1.0000,0.9998,0.9981,0.9995,1.0000,0.8549,1.0000,0.9993,0.9998,...,0.9999,0.9998,0.9999,0.9427,0.9997,1.0000,0.9999,0.9979,0.9988,0.9999
oversight,0.8024,0.9999,1.0000,0.8517,0.9693,0.9998,0.9841,0.9996,0.9326,0.9981,...,0.9994,0.9904,0.8078,0.8801,0.9974,0.9999,0.9223,0.9514,0.8684,0.8391
legendary,0.0638,0.9956,0.9571,1.0000,0.6603,0.9253,0.9027,0.0000,0.7894,0.9525,...,0.9931,0.9490,0.6402,0.9061,0.7268,0.9798,0.7368,0.6727,0.7776,0.9877
imports,0.0099,0.9996,0.9792,0.4114,1.0000,0.9559,0.9763,0.9980,0.7877,0.9294,...,0.9914,0.9633,0.9958,0.8559,0.7863,0.9996,0.9861,0.8753,0.2851,0.9984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
underpaid,0.9997,1.0000,0.9998,0.9983,0.9995,1.0000,0.9993,1.0000,0.9993,0.9998,...,0.9999,0.9998,0.9999,0.8082,0.9997,1.0000,0.9999,0.9979,0.9988,0.9950
racicot,0.9975,1.0000,0.8146,0.8918,0.9944,0.9999,0.7644,0.9695,0.9985,0.9446,...,0.9998,0.8301,0.9997,0.0887,0.9468,1.0000,1.0000,0.7221,0.9874,0.6653
candace,0.7297,0.9973,0.9206,0.9033,0.0000,0.9965,0.9487,0.9865,0.9386,0.9491,...,0.9807,0.9669,0.9964,0.7863,0.9329,0.9973,0.7908,1.0000,0.7252,0.9957
publicized,0.5756,0.9993,0.9064,0.6458,0.7762,0.9366,0.9552,0.9961,0.8131,0.7951,...,0.9832,0.8723,0.9816,0.6685,0.9533,0.9993,0.9271,0.7581,1.0000,0.9793


Unnamed: 0,cleansing,willey,oversight,legendary,imports,woolsey,recapture,penn,evaluate,lovers,...,sizzle,dissolved,persuasion,choose,peek,underpaid,racicot,candace,publicized,LastPrice
cleansing,0.0,4.0,4.0,3.0,0.0,4.0,4.0,4.0,0.0,4.0,...,4.0,4.0,0.0,0.0,1.0,4.0,4.0,1.0,1.0,4.0
willey,4.0,0.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
oversight,4.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,...,4.0,2.0,0.0,4.0,4.0,4.0,4.0,2.0,1.0,4.0
legendary,4.0,4.0,4.0,0.0,4.0,1.0,4.0,0.0,1.0,4.0,...,4.0,4.0,4.0,1.0,2.0,2.0,0.0,4.0,3.0,1.0
imports,0.0,4.0,4.0,3.0,0.0,1.0,4.0,4.0,1.0,3.0,...,4.0,3.0,3.0,1.0,2.0,4.0,4.0,3.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
underpaid,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,1.0
racicot,4.0,4.0,0.0,1.0,4.0,4.0,0.0,4.0,4.0,4.0,...,4.0,0.0,4.0,2.0,1.0,4.0,0.0,0.0,4.0,2.0
candace,1.0,4.0,2.0,4.0,0.0,4.0,4.0,4.0,4.0,2.0,...,4.0,2.0,4.0,2.0,0.0,4.0,1.0,0.0,1.0,0.0
publicized,4.0,4.0,4.0,2.0,2.0,1.0,4.0,4.0,2.0,2.0,...,4.0,2.0,4.0,1.0,3.0,4.0,4.0,4.0,0.0,4.0


In [71]:
corr = pearsonr(ws_prices["cleansing"], stock_prices['LastPrice'])[0]
corr

  and should_run_async(code)


0.06858530091902565

In [68]:
stock_prices

  and should_run_async(code)


Unnamed: 0,Date,LastPrice
0,2000-05-01,0.523810
1,2000-05-02,0.504970
2,2000-05-03,0.509491
3,2000-05-04,0.511466
4,2000-05-05,0.520875
...,...,...
177,2000-10-27,0.384310
178,2000-10-28,0.296488
179,2000-10-29,0.345703
180,2000-10-30,0.380711


In [115]:
ct_ws[0][1].columns

  and should_run_async(code)


Index(['cleansing', 'willey', 'legendary', 'woolsey', 'recapture', 'evaluate',
       'lovers', 'humans', 'glorious', 'salt', 'kicking', 'page', 'spells',
       'casey', 'glass', 'toronto', 'andrea', 'mr', 'flash', 'torricelli',
       'lent', 'techniques', 'diane', 'renews', 'redesign', 'plough',
       'accepts', 'arthritis', 'deepest', 'identity', 'embassies', 'schering',
       'lieberman', 'panama', 'convention', 'fated', 'generosity',
       'restaurant', 'worthy', 'aug', 'shiloh', 'speculation', 'laboratories',
       'performances', 'beans', 'clemens', 'ratified', 'abm', 'minimize',
       'pledges', 'bigotry', 'kiss', 'wells', 'puzzlement', 'stained',
       'graeme', 'inexperienced', 'compares', 'simon', 'clark', 'diary',
       'upset', 'eliot', 'said', 'antonetty', 'teams', 'cherished', 'anthony',
       'canal', 'raiservice', 'treaty', 'menu', 'negotiators', 'lodine',
       'ethics', 'lloyd', 'import', 'wooing', 'homer', 'carrier',
       'governorship', 'gentle', 'medic

In [None]:
m = {1: (
        {'ssr_ftest': (0.0, 1.0, 179.0, 1), 
         'ssr_chi2test': (0.0, 1.0, 1), 
         'lrtest': (-0.0, 1.0, 1), 
         'params_ftest': (2.6551965154348363, 0.1049705333686674, 179.0, 1.0)}, 
         [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9c0120e80>, 
          <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9c0120640>, 
          array([[0., 1., 0.]])]), 
     2: ({'ssr_ftest': (0.0, 1.0, 177.0, 2), 
          'ssr_chi2test': (0.0, 1.0, 2), 
          'lrtest': (-0.0, 1.0, 2), 
          'params_ftest': (28.669712759208675, 1.6380216282244424e-11, 177.0, 2.0)}, 
         [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9c0120940>, 
          <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26c160>, 
          array([[0., 0., 1., 0., 0.],
                 [0., 0., 0., 1., 0.]])]), 
     3: ({'ssr_ftest': (0.0, 1.0, 175.0, 3), 
          'ssr_chi2test': (0.0, 1.0, 3), 
          'lrtest': (-0.0, 1.0, 3), 
          'params_ftest': (18.935898175907948, 1.1078151069488875e-10, 175.0, 3.0)}, 
         [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26cc40>, 
          <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26cac0>, 
          array([[0., 0., 0., 1., 0., 0., 0.],
                 [0., 0., 0., 0., 1., 0., 0.],
                 [0., 0., 0., 0., 0., 1., 0.]])]), 
     4: ({'ssr_ftest': (0.0, 1.0, 173.0, 4), 
          'ssr_chi2test': (0.0, 1.0, 4), 
          'lrtest': (-0.0, 1.0, 4), 
          'params_ftest': (24.90653594920218, 2.677610622848283e-16, 173.0, 4.0)}, [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26c760>, <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26c5e0>, array([[0., 0., 0., 0., 1., 0., 0., 0., 0.],/
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],/
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],/
       [0., 0., 0., 0., 0., 0., 0., 1., 0.]])]), 
     5: ({'ssr_ftest': (-3.955967903576136e-15, 1.0, 171.0, 5), 
          'ssr_chi2test': (-2.047386897464842e-14, 1.0, 5), 
          'lrtest': (-0.0, 1.0, 5), 
          'params_ftest': (20.40580635942483, 6.031245590056417e-16, 171.0, 5.0)}, 
         [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26c580>, 
          <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26c610>, 
          array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],/
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],/
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],/
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],/
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])])}

{1: ({'ssr_ftest': (6.669662363748857, 0.010610382296105314, 178.0, 1), 
      'ssr_chi2test': (6.78207240358732, 0.009207792227606254, 1), 
      'lrtest': (6.658097641379641, 0.009870623339659977, 1), 
      'params_ftest': (6.669662363748866, 0.010610382296105314, 178.0, 1.0)}, 
     [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc98e26c370>, 
      <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9ed2ef220>, 
      array([[0., 1., 0.]])]), 
 2: ({'ssr_ftest': (11.169212517137323, 2.7215054361967665e-05, 175.0, 2), 
      'ssr_chi2test': (22.976665749539638, 1.0248974821774139e-05, 2), 
      'lrtest': (21.62415140965004, 2.015464641527244e-05, 2), 
      'params_ftest': (11.169212517137293, 2.721505436196856e-05, 175.0, 2.0)},
     [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9ed2ef550>, 
      <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9ed2ef5b0>, 
      array([[0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.]])]), 
 3: ({'ssr_ftest': (21.28195759280595, 8.991511901217905e-12, 172.0, 3), 
      'ssr_chi2test': (66.44425132172555, 2.4625076931282265e-14, 3), 
      'lrtest': (56.50744198114023, 3.2737765892104785e-12, 3), 
      'params_ftest': (21.28195759280595, 8.991511901217905e-12, 172.0, 3.0)}, 
     [<statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x7fc9ed