In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import datetime
from collections import defaultdict

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk import ngrams

# Gensim
import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt 

from scipy.stats import pearsonr

import statsmodels
from statsmodels.tsa.stattools import grangercausalitytests

  and should_run_async(code)


In [3]:
stop_words = stopwords.words('english')
# stop_words.extend(['mr', 'ms', 'said'])

  and should_run_async(code)


In [4]:
# def lemmatize(content, tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     nlp = spacy.load('en', disable=['parser', 'ner'])
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

# Tokenize and remove stop words from content
def tokenize(content, lemmatize=False):
    words = gensim.utils.simple_preprocess(content, deacc=True)  # tokenizes
    return words

def remove_stopwords(content):
    words = []
    for word in content:
        if word in stop_words:
            continue
        words.append(word)
    return words

  and should_run_async(code)


We are not lemmatizing or stemming. If we need to increase accuracy in the future, we can consider it.

In [5]:
# New York Times Data
rows = []
dates = []
articles = []
for month in range(5, 11):
    with open("Data/NYTimes/"+ str(month) + ".txt") as f:
        for i, line in enumerate(f):
            date, article = line.split(",", 1)
            timestamp = datetime.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z").date()
            tokenized = tokenize(article)
            destopped = remove_stopwords(tokenized)

            articles.append(destopped)
            dates.append(timestamp)
            rows.append([timestamp, destopped])

nytimes = pd.DataFrame(rows, columns=["Date", "Content"]) 
unique_dates = sorted(list(set(nytimes["Date"])))
# print (unique_dates)
nytimes

  and should_run_async(code)


Unnamed: 0,Date,Content
0,2000-05-03,"[two, years, ago, homer, bush, came, yankee, b..."
1,2000-05-02,"[texas, record, tell, op, ed, april, paul, bur..."
2,2000-05-01,"[top, foreign, policy, adviser, gov, george, b..."
3,2000-05-03,"[aides, gov, george, bush, fought, back, today..."
4,2000-05-03,"[gov, tommy, thompson, wisconsin, named, chair..."
...,...,...
5801,2000-10-31,"[new, york, times, cbs, news, poll, var, strin..."
5802,2000-10-31,"[tick, tock, diner, ted, friedrich, stockbroke..."
5803,2000-11-01,"[difference, us, vital, issue, would, go, wash..."
5804,2000-11-01,"[bush, administration, wanted, overturn, would..."


There are 3 days missing from the stock market data: 6/07, 6/08, 11/01. There are several ways we can deal with this. 
1. Toss out the three days from the NYTimes data
2. Condense 6/07 --> 6/06; 6/08 --> 6/09 (or something similar) and toss out 11/01. 
3. Something else that I can't think of at the moment

I also haven't looked at the paper to see how they deal with it yet.

Edit: Reading over some articles about time series, it seems that we should pad the missing datapoints with previous days

In [6]:
# Time Series Data
ts_months = ["May", "Jun", "Jul", "Aug", "Sep", "Oct"]
cols = ['Date', 'LastPrice']
stock_prices = pd.DataFrame()
for month in ts_months:
    ts_df = pd.read_csv("Data/PriceHistory/" + month + ".txt", delim_whitespace=True)
    ts_df['Date'] =  ts_df['Date'].apply(lambda x: datetime.datetime.strptime(x, "%m/%d/%y").date())
    
    Gore = ts_df.loc[ts_df['Contract'] == 'Dem'][['Date', 'LastPrice']].fillna(0).reset_index()
    Bush = ts_df.loc[ts_df['Contract'] == 'Rep'][['Date', 'LastPrice']].fillna(0).reset_index()

    # Gore/(Gore + Bush)
    relation = list(zip(Gore['Date'], (Gore['LastPrice']/(Gore['LastPrice'] + Bush['LastPrice'])).fillna(0.001)))
    stock_prices = stock_prices.append(relation, ignore_index=True)

stock_prices.columns = cols
stock_prices

  and should_run_async(code)


Unnamed: 0,Date,LastPrice
0,2000-05-01,0.523810
1,2000-05-02,0.504970
2,2000-05-03,0.509491
3,2000-05-04,0.511466
4,2000-05-05,0.520875
...,...,...
177,2000-10-27,0.384310
178,2000-10-28,0.296488
179,2000-10-29,0.345703
180,2000-10-30,0.380711


In [7]:
# for i in range(len(unique_dates)):
#     if unique_dates[i] not in list(stock_prices[0]):
#         print (unique_dates[i])

# bigram = Phrases(articles, min_count=1)
# bigrams = [b for b in bigram[articles]]
# articles = bigrams
# bigrams

  and should_run_async(code)


In [8]:
# Create Dictionary
id2word = corpora.Dictionary(articles)

# Attempt at filtering out words that appear too frequently
# id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
# id2word.filter_extremes(no_above=0.5)


# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in articles]
# doc_word_cnts = (np.array([np.array([(id2word[id], freq) for id, freq in cp]) for cp in corpus]))

# TF-IDF seems to give better coherence (but it wasn't in the paper...)
tfidf = models.TfidfModel(corpus)
tfidf_corpus = tfidf[corpus]

# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in tfidf_corpus[:1]][0][:5]

  and should_run_async(code)


[('ago', 0.07712049873418031),
 ('awesome', 0.23220574510227418),
 ('backup', 0.2198823985449398),
 ('backups', 0.2515408271170864),
 ('bases', 0.19264069440348208)]

In [9]:
# bow by date?
date_term_cnts = defaultdict(lambda: [])

for index, row in nytimes.iterrows():
    date = row["Date"]
    content = row["Content"]
    
    date_term_cnts[date] += content
    
date_term_cnts = list(date_term_cnts.items())
# date_term_cnts
date_term_cnts = [(date, {id2word[id]: freq for id, freq in id2word.doc2bow(text)}) for date, text in date_term_cnts]
date_term_cnts = sorted(date_term_cnts, key=lambda x: x[0])
date_term_cnts = pd.DataFrame([date_term_cnts[i][1] for i in range(len(date_term_cnts))], index=[date_term_cnts[i][0] for i in range(len(date_term_cnts))]).fillna(0.001)
date_term_cnts

  and should_run_async(code)


Unnamed: 0,ago,bush,two,years,also,chief,conservative,executive,george,get,...,pitted,reappeared,actuary,deerfield,issimmee,madame,subtraction,tussaud,unchartered,buffett
2000-05-01,1.000,74,2.0,6.0,6.0,2.0,1.000,2.000,16,5.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-05-02,0.001,13,2.0,1.0,4.0,1.0,1.000,1.000,8,2.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-05-03,4.000,150,8.0,16.0,14.0,4.0,8.000,0.001,22,6.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-05-04,0.001,86,2.0,2.0,7.0,1.0,2.000,0.001,19,0.001,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-05-05,6.000,246,14.0,16.0,14.0,2.0,0.001,0.001,38,16.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2000-10-28,1.000,368,15.0,7.0,23.0,2.0,13.000,2.000,94,7.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-10-29,24.000,464,50.0,42.0,32.0,8.0,22.000,4.000,76,16.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-10-30,6.000,271,13.0,9.0,11.0,4.0,0.001,2.000,43,9.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001
2000-10-31,2.000,592,30.0,26.0,24.0,14.0,0.001,4.000,134,16.000,...,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001,0.001


In [46]:
# Build LDA model
k = 10
lda_model = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus,
                                           id2word=id2word,
                                           num_topics=k, 
                                           passes=2,
                                           alpha='auto',  # assuming that topic distribution is assymetric. Not all topics equally represented in corpus.
                                           eta='auto',
                                           update_every=1, # online or batch processing (everything is on disk, so use online)
                                           per_word_topics=True)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=articles, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)


TypeError: bad operand type for unary -: 'NoneType'

In [11]:
# Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis

  and should_run_async(code)


Initial thoughts:

We need to de-pluralize the words (governments vs government).
Get the coherence score above 50 would be a good start probably.

Need to extend stop words to include mr.

But topic coherency is still very low

Also, we can double check our topic coherence by comparing with Wikipedia (and other checks the paper did)

In [12]:
# Select the model and print the topics
def get_topics(lda_model, num_topics=-1, num_words=100, prob_thresh=0.8):
    topics = []
    for topic, topic_words in lda_model.print_topics(num_topics=num_topics, num_words=num_words):
        words = topic_words.split(" + ")
        all_words = []
        all_prob = 0
        for elem in words:
            prob, word = elem.split("*")
            all_prob += float(prob)
            all_words.append(word.split('"')[1])

            if all_prob >= prob_thresh:
                break
        topics.append((topic, all_words))

    return topics
topics = get_topics(lda_model, prob_thresh=0.3)
topics

  and should_run_async(code)


[(0,
  ['gorey',
   'intervention',
   'bushwick',
   'wrongful',
   'stuyvesant',
   'bedford',
   'predicting',
   'insults',
   'derby',
   'monetary',
   'patriot',
   'exemptions',
   'assent',
   'intervene',
   'stewart',
   'chernobyl',
   'greenhouse',
   'hale',
   'wyly',
   'trent',
   'harlem',
   'inmate',
   'shalom',
   'mr',
   'riot',
   'dance',
   'exposure',
   'enron',
   'passengers',
   'solicitor',
   'jewish',
   'factors',
   'depart',
   'brooklyn',
   'marshall',
   'humanities',
   'recycling',
   'trash',
   'reforms',
   'lockstep',
   'interventionist',
   'said',
   'goren',
   'bone',
   'olympic',
   'gore',
   'counts',
   'amusing',
   'drily',
   'cultural',
   'police',
   'neighborhood',
   'sgt',
   'capelludo',
   'priest',
   'nikas',
   'elias',
   'lourdes',
   'sodomy',
   'busing',
   'starr',
   'presumptive',
   'absence',
   'insistence',
   'opera',
   'president',
   'wrongly',
   'rectify',
   'humanitarian',
   'beautiful',
   'tri

In [13]:
# Select the model and print the topics

pd.options.display.max_colwidth = None
display(pd.DataFrame(lda_model.print_topics()))

  and should_run_async(code)


Unnamed: 0,0,1
0,0,"0.002*""gorey"" + 0.000*""intervention"" + 0.000*""bushwick"" + 0.000*""wrongful"" + 0.000*""stuyvesant"" + 0.000*""bedford"" + 0.000*""predicting"" + 0.000*""insults"" + 0.000*""derby"" + 0.000*""monetary"""
1,1,"0.006*""mr"" + 0.003*""gore"" + 0.003*""said"" + 0.003*""bush"" + 0.003*""debate"" + 0.002*""campaign"" + 0.002*""would"" + 0.002*""clinton"" + 0.002*""tax"" + 0.002*""president"""
2,2,"0.003*""heating"" + 0.003*""barrels"" + 0.002*""supply"" + 0.002*""petroleum"" + 0.001*""carbon"" + 0.001*""lieberman"" + 0.001*""reserves"" + 0.001*""coal"" + 0.001*""tame"" + 0.001*""kelly"""
3,3,"0.002*""sanctions"" + 0.002*""franks"" + 0.001*""letterman"" + 0.001*""miller"" + 0.001*""upstate"" + 0.001*""location"" + 0.001*""corzine"" + 0.001*""fires"" + 0.001*""nelson"" + 0.001*""reserves"""
4,4,"0.001*""martinez"" + 0.001*""ridgewood"" + 0.001*""queens"" + 0.001*""india"" + 0.001*""judiciary"" + 0.001*""clemens"" + 0.000*""musicals"" + 0.000*""certification"" + 0.000*""bushwick"" + 0.000*""brooklyn"""
5,5,"0.002*""mcginn"" + 0.001*""surname"" + 0.001*""misspelled"" + 0.001*""gail"" + 0.001*""reprieve"" + 0.001*""dow"" + 0.001*""dna"" + 0.001*""wilensky"" + 0.001*""ricky"" + 0.001*""misidentified"""
6,6,"0.003*""vidal"" + 0.002*""glamorous"" + 0.002*""atlantic"" + 0.001*""friedman"" + 0.001*""revival"" + 0.001*""quotation"" + 0.001*""bipartisanship"" + 0.001*""theater"" + 0.001*""overlooked"" + 0.001*""goldsmith"""
7,7,"0.016*""string"" + 0.014*""var"" + 0.008*""else"" + 0.003*""pat"" + 0.003*""lieberman"" + 0.003*""nader"" + 0.003*""buchanan"" + 0.002*""ralph"" + 0.002*""gore"" + 0.002*""sept"""
8,8,"0.005*""bushnell"" + 0.003*""candace"" + 0.002*""blondes"" + 0.002*""milosevic"" + 0.002*""sex"" + 0.002*""monthly"" + 0.002*""author"" + 0.001*""manhattan"" + 0.001*""love"" + 0.001*""carnahan"""
9,9,"0.001*""bat"" + 0.001*""madison"" + 0.001*""bauer"" + 0.001*""gorelick"" + 0.001*""hills"" + 0.001*""beverly"" + 0.001*""exhibited"" + 0.001*""ellis"" + 0.001*""lacks"" + 0.001*""resident"""


In [14]:
document_topics = lda_model.get_document_topics(corpus)
date_doc_topics = list(zip(nytimes["Date"], lda_model.get_document_topics(corpus)))
for l in document_topics[:10]:
    print (l)

  and should_run_async(code)


[(1, 0.97512877)]
[(1, 0.8761766), (7, 0.115408726)]
[(1, 0.99818856)]
[(1, 0.98162955), (7, 0.017368853)]
[(1, 0.9553608), (3, 0.021237914), (7, 0.020965328)]
[(1, 0.9356058), (7, 0.055053487)]
[(1, 0.9701896), (7, 0.012223703), (8, 0.01579854)]
[(1, 0.9935671)]
[(1, 0.95881176), (7, 0.018327104), (9, 0.019884065)]
[(1, 0.97860914)]


In [15]:
# for any given day, you look at all the diff topics and identify the prob of that topic
# should I normalize? Paper doesn't seem to normalize...
date_topic_prob = np.zeros((len(unique_dates), k))
for date, article in date_doc_topics:
    i = unique_dates.index(date)
    for topic, prob in article:
        date_topic_prob[i][topic] += prob 

# Figure out how to normalize [reread paper/rewatch lecture]
# date_topic_prob = date_topic_prob/date_topic_prob.max(axis=0)    

  and should_run_async(code)


In [16]:
date_topic = pd.DataFrame(date_topic_prob, index=unique_dates)
date_topic["Date"] = unique_dates
date_topic

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Date
2000-05-01,0.000000,11.291009,0.209941,0.011937,0.000000,0.000000,0.000000,0.376509,0.015799,0.000000,2000-05-01
2000-05-02,0.000000,8.359794,0.027041,0.109103,0.000000,0.000000,0.000000,0.273454,0.000000,0.114625,2000-05-02
2000-05-03,0.000000,27.681651,0.134054,0.392987,0.000000,0.000000,1.382118,1.227326,0.055857,0.784119,2000-05-03
2000-05-04,0.104755,16.353308,0.078732,0.025968,0.000000,0.040312,0.511124,0.574773,0.000000,0.111492,2000-05-04
2000-05-05,0.000000,24.959853,0.000000,0.000000,0.000000,0.000000,0.000000,0.825362,0.000000,0.000000,2000-05-05
...,...,...,...,...,...,...,...,...,...,...,...
2000-10-28,0.000000,27.917017,0.082861,0.144940,0.014141,0.000000,0.000000,15.463583,0.000000,0.032488,2000-10-28
2000-10-29,0.137953,65.868831,0.150507,0.262732,0.126084,0.000000,0.700213,3.704698,1.635098,0.433084,2000-10-29
2000-10-30,0.000000,22.511357,0.030421,0.047213,0.040909,0.000000,0.030712,9.893672,0.000000,0.133887,2000-10-30
2000-10-31,0.000000,67.429348,0.000000,0.334038,0.000000,0.000000,0.130364,24.906475,0.054707,0.237854,2000-10-31


In [17]:
date_topic_prices = date_topic.set_index('Date').join(stock_prices.set_index('Date')).dropna()
date_topic_prices

  and should_run_async(code)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,LastPrice
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-05-01,0.000000,11.291009,0.209941,0.011937,0.000000,0.000000,0.000000,0.376509,0.015799,0.000000,0.523810
2000-05-02,0.000000,8.359794,0.027041,0.109103,0.000000,0.000000,0.000000,0.273454,0.000000,0.114625,0.504970
2000-05-03,0.000000,27.681651,0.134054,0.392987,0.000000,0.000000,1.382118,1.227326,0.055857,0.784119,0.509491
2000-05-04,0.104755,16.353308,0.078732,0.025968,0.000000,0.040312,0.511124,0.574773,0.000000,0.111492,0.511466
2000-05-05,0.000000,24.959853,0.000000,0.000000,0.000000,0.000000,0.000000,0.825362,0.000000,0.000000,0.520875
...,...,...,...,...,...,...,...,...,...,...,...
2000-10-27,0.115797,67.148907,0.000000,0.349068,0.221768,0.193153,0.271317,24.210089,0.202570,0.196642,0.384310
2000-10-28,0.000000,27.917017,0.082861,0.144940,0.014141,0.000000,0.000000,15.463583,0.000000,0.032488,0.296488
2000-10-29,0.137953,65.868831,0.150507,0.262732,0.126084,0.000000,0.700213,3.704698,1.635098,0.433084,0.345703
2000-10-30,0.000000,22.511357,0.030421,0.047213,0.040909,0.000000,0.030712,9.893672,0.000000,0.133887,0.380711


In [19]:
# https://stackoverflow.com/questions/58005681/is-it-possible-to-run-a-vector-autoregression-analysis-on-a-large-gdp-data-with
def grangers_causality_matrix(data, variables, maxlag=5, test='ssr_ftest', verbose=False):
    dataset = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    lags    = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    
    for c in dataset.columns:
        for r in dataset.index:            
            test_result = grangercausalitytests(data[[r,c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1], 4) for i in range(maxlag)]
            
            if verbose: 
                print(f'Y = {r}, X = {c}, P Values = {p_values}')

            min_p_value_i = np.argmin(p_values)
            min_p_value = p_values[min_p_value_i]
            dataset.loc[r, c] = min_p_value
            
            lags.loc[r, c] = min_p_value_i
    
    return dataset, lags

# grangers_causality_matrix(dataset, variables = dataset.columns)
c, l = grangers_causality_matrix(date_topic_prices, variables=date_topic_prices.columns, verbose=False)
display(c)
display(l)

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,LastPrice
0,1.0,0.0218,0.3601,0.1663,0.5077,0.0637,0.1733,0.4518,0.6023,0.0005,0.6692
1,0.0259,1.0,0.0038,0.009,0.0138,0.0362,0.0002,0.0197,0.0431,0.083,0.434
2,0.3061,0.0217,1.0,0.1437,0.2862,0.7057,0.1844,0.1384,0.3022,0.3477,0.0986
3,0.1269,0.0335,0.2073,1.0,0.5669,0.7322,0.0006,0.5505,0.3216,0.075,0.1788
4,0.1769,0.0459,0.1006,0.0608,1.0,0.2151,0.0307,0.2922,0.134,0.5107,0.0173
5,0.0162,0.2134,0.165,0.0101,0.2922,1.0,0.0006,0.2404,0.6789,0.0895,0.0339
6,0.3582,0.0001,0.1267,0.0439,0.0797,0.2238,1.0,0.04,0.1066,0.111,0.7078
7,0.6616,0.0,0.0048,0.0002,0.0563,0.0399,0.0011,1.0,0.0121,0.0001,0.373
8,0.5174,0.003,0.0572,0.231,0.7419,0.0551,0.27,0.0112,1.0,0.1759,0.708
9,0.4426,0.0058,0.0056,0.0608,0.2126,0.3692,0.0018,0.1294,0.0095,1.0,0.0778


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,LastPrice
0,0.0,0.0,0.0,1.0,4.0,1.0,0.0,2.0,4.0,1.0,2.0
1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0,2.0
2,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0,3.0,2.0
3,0.0,1.0,3.0,0.0,0.0,3.0,3.0,1.0,0.0,3.0,0.0
4,2.0,0.0,0.0,0.0,0.0,4.0,3.0,1.0,0.0,0.0,0.0
5,0.0,0.0,3.0,4.0,2.0,0.0,3.0,0.0,0.0,1.0,0.0
6,0.0,1.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,2.0
7,0.0,1.0,2.0,2.0,0.0,0.0,2.0,0.0,2.0,2.0,0.0
8,0.0,3.0,3.0,0.0,3.0,4.0,0.0,1.0,0.0,3.0,3.0
9,0.0,1.0,3.0,0.0,0.0,0.0,3.0,3.0,1.0,0.0,2.0


In [20]:
def get_causal_vars(data, significance=0.95, getLags=False, getCausalSig=False, verbose=False):
    cols = data.columns[:-1]
    causal_vars = []
    causal_lags = []
    
#     i = 0
    for col in cols:
        try:
            gc, lags = grangers_causality_matrix(data[[col, 'LastPrice']], 
                                             variables=[col, 'LastPrice'], 
                                             verbose=False)
        except:
            raise Exception(data[[col, 'LastPrice']])
        
        gc = 1 - gc
#         if i < 10:
#             display(gc)
#             i += 1
        
        col_causes = gc.loc['LastPrice', col] >= significance
        col_causedBy = gc.loc[col, 'LastPrice'] >= significance
        if col_causes or col_causedBy:
            if getCausalSig:
                causal_vars.append((col, max(gc.loc['LastPrice', col], gc.loc[col, 'LastPrice'])))
            else:
                causal_vars.append(col)
            
            if getLags:
                # if sig. granger causality for topic causing ts and ts causing topic, choose whichever is higher
                if col_causes and col_causedBy:
                    if gc.loc['LastPrice', col] >= gc.loc[col, 'LastPrice']:
                        causal_lags.append(lags.loc['LastPrice', col])
                    else:
                        causal_lags.append(lags.loc[col, 'LastPrice'] * -1)
                elif col_causes:
                    causal_lags.append(lags.loc['LastPrice', col])
                else:
                    causal_lags.append(lags.loc[col, 'LastPrice'] * -1)
    if getLags:
        return causal_vars, causal_lags
    return causal_vars
                
causal_topics, ct_lags = get_causal_vars(date_topic_prices, getLags=True)
causal_topics, ct_lags

  and should_run_async(code)


([4, 5, 9], [-0.0, -0.0, 3.0])

In [23]:
%%time
def get_word_stream(nytimes, topics, causal_topics):
    ct_ws = []
    first = True
    for ct in causal_topics:
        causal_vocab = list(set(topics[ct][1]))
        date_terms = pd.DataFrame(np.zeros((len(unique_dates), len(causal_vocab))), index=unique_dates, columns=causal_vocab)

        
        for word in causal_vocab:
            if first:
                first = False
                print (word)
            date_terms[word] = date_term_cnts[word]
            
        ct_ws.append((ct, date_terms))
    
    return ct_ws

ct_ws = get_word_stream(nytimes, topics, causal_topics)
ct_ws

section
CPU times: user 71.4 ms, sys: 10.2 ms, total: 81.6 ms
Wall time: 69.1 ms


  and should_run_async(code)


[(4,
              section  iranian  broadway  homer  cooee  dolls  antonetty  \
  2000-05-01    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  2000-05-02    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  2000-05-03    0.001    0.001     0.001  2.000  0.001  0.001      0.001   
  2000-05-04    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  2000-05-05    2.000    0.001     0.001  0.001  0.001  0.001      0.001   
  ...             ...      ...       ...    ...    ...    ...        ...   
  2000-10-28    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  2000-10-29    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  2000-10-30    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  2000-10-31    0.001    0.001     2.000  0.001  0.001  0.001      0.001   
  2000-11-01    0.001    0.001     0.001  0.001  0.001  0.001      0.001   
  
              reinvigorated  armor  clemens  ...  productivity  housing  \
  200

In [24]:
%%time
def get_impact_words(topic_wordstream, significance=0.95, verbose=False):
    topic_impact_words = []
    
    first = True
    for topic, ws in topic_wordstream:
        ws_prices = ws.join(stock_prices.set_index('Date')).dropna()        
        ws_gc = get_causal_vars(ws_prices, significance=significance, getCausalSig=True, verbose=verbose)
        
#         if first:
#             display(ws_gc)
#             first = False
        
        pos = []
        neg = []
        for word, sig in ws_gc:                
            corr = pearsonr(ws_prices[word], stock_prices['LastPrice'])[0]
            if corr > 0:
                pos.append((word, sig))
            else:
                neg.append((word, sig))
                
        topic_impact_words.append((topic, pos, neg))
    
    return topic_impact_words
        

impact_words = get_impact_words(ct_ws)
impact_words

  and should_run_async(code)


CPU times: user 16.9 s, sys: 46.9 ms, total: 16.9 s
Wall time: 16.9 s


[(4,
  [('iranian', 0.9678),
   ('cooee', 0.9824),
   ('graeme', 0.9812),
   ('observer', 0.9857),
   ('duke', 0.9518),
   ('jamaica', 0.9513),
   ('rows', 0.9998),
   ('drastically', 0.9887),
   ('branch', 0.9919),
   ('certification', 0.9859),
   ('nuclear', 0.9947),
   ('donation', 0.9675),
   ('stealth', 0.9806),
   ('lloyd', 0.9805),
   ('wyden', 0.9894),
   ('bushehr', 0.9844),
   ('wells', 0.9956),
   ('lumia', 0.9824),
   ('squabble', 0.9645)],
  [('dolls', 1.0),
   ('reinvigorated', 0.9949),
   ('armor', 0.9811),
   ('makersvice', 0.9746),
   ('atomic', 1.0),
   ('reporting', 0.9973),
   ('plopped', 0.982),
   ('bushwick', 0.9804),
   ('souvenir', 0.982),
   ('stardom', 0.9648),
   ('walking', 0.9691),
   ('courteously', 0.982),
   ('sleeping', 0.9903),
   ('judiciary', 0.9976),
   ('queens', 0.9765)]),
 (5,
  [('hits', 0.9706),
   ('unwelcome', 0.9881),
   ('kanchanalak', 0.9999),
   ('medicare', 0.9951),
   ('raping', 0.9624),
   ('absentee', 0.9732),
   ('chagrin', 0.9835),

In [39]:
# ws_prices = ct_ws.join(stock_prices.set_index('Date')).dropna()
# ws_impact, ws_lags = grangers_causality_matrix(ws_prices, variables=ws_prices.columns, verbose=False)

# display(ws_prices)
# display(ws_impact)
# display(ws_lags)

  and should_run_async(code)


In [44]:
def calculate_purity(pWords, nWords):
    n = float(len(pWords) + len(nWords))
    pProb = len(pWords)/n
    nProb = len(nWords)/n
        
    entropy = pProb * np.log2(pProb) + nProb * np.log2(nProb)
    purity = 100 + 100 * entropy
    return purity

  and should_run_async(code)


In [45]:
calculate_purity(impact_words[0][1], impact_words[0][2])

  and should_run_async(code)


1.0007208442481357

In [36]:
impact_words[0]

  and should_run_async(code)


(4,
 [('iranian', 0.9678),
  ('cooee', 0.9824),
  ('graeme', 0.9812),
  ('observer', 0.9857),
  ('duke', 0.9518),
  ('jamaica', 0.9513),
  ('rows', 0.9998),
  ('drastically', 0.9887),
  ('branch', 0.9919),
  ('certification', 0.9859),
  ('nuclear', 0.9947),
  ('donation', 0.9675),
  ('stealth', 0.9806),
  ('lloyd', 0.9805),
  ('wyden', 0.9894),
  ('bushehr', 0.9844),
  ('wells', 0.9956),
  ('lumia', 0.9824),
  ('squabble', 0.9645)],
 [('dolls', 1.0),
  ('reinvigorated', 0.9949),
  ('armor', 0.9811),
  ('makersvice', 0.9746),
  ('atomic', 1.0),
  ('reporting', 0.9973),
  ('plopped', 0.982),
  ('bushwick', 0.9804),
  ('souvenir', 0.982),
  ('stardom', 0.9648),
  ('walking', 0.9691),
  ('courteously', 0.982),
  ('sleeping', 0.9903),
  ('judiciary', 0.9976),
  ('queens', 0.9765)])

In [71]:
def construct_prior(impact_words, curr_k, sig=0.95):
    # find number of topics that we are splitting
    new_k = curr_k + len(impact_words)
    word_priors = np.zeros((new_k, date_term_cnts.shape[1])) + 0.01

    i = 0
    for num, pos, neg in impact_words:
        print (i)
        pos_denom = sum([granger-sig for word, granger in pos])
        neg_denom = sum([granger-sig for word, granger in neg])
        
        if len(pos) < 0.1 * len(neg):
            # num neg words >> num pos
            for word, granger in pos:              
                word_priors[i, id2word.token2id[word]] = 0
            for word, granger in neg:
                word_priors[i, id2word.token2id[word]] = (granger-sig)/neg_denom 
            
        elif len(neg) < 0.1 * len(pos):
            # num pos words >> num neg
            for word, granger in pos:              
                word_priors[i, id2word.token2id[word]] = (granger-sig)/pos_denom 
            for word, granger in neg:
                word_priors[i, id2word.token2id[word]] = 0
            

        for word, granger in pos:              
            word_priors[i, id2word.token2id[word]] = (granger-sig)/pos_denom 
        
        for word, granger in neg:
            word_priors[i + 1, id2word.token2id[word]] = (granger-sig)/neg_denom 
        
        i += 2
    return word_priors
            
        

  and should_run_async(code)


In [73]:
word_priors = construct_prior(impact_words, 10)
word_priors

0
2
4


  and should_run_async(code)


array([[0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01],
       [0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01],
       [0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01],
       ...,
       [0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01],
       [0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01],
       [0.01, 0.01, 0.01, ..., 0.01, 0.01, 0.01]])

In [74]:
set(word_priors[0])

  and should_run_async(code)


{0.002256552681826204,
 0.003124457559451519,
 0.01,
 0.025169241451137022,
 0.03037667071688949,
 0.03089741364346468,
 0.052942197535150184,
 0.05311577851067525,
 0.054157264363825626,
 0.056240236070126765,
 0.05971185558062841,
 0.06196840826245442,
 0.06231557021350455,
 0.06717583752820688,
 0.06839090435688233,
 0.07273042874500948,
 0.07759069605971182,
 0.07915292483943759,
 0.086443325811491}