In [17]:
# Preprocess

import pandas as pd
import os
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk

def pre_process(session):
    raw_path = 'raw/'
    speech_tab = f'{raw_path}speeches_{session}.tab'
    speaker_path = 'speaker_names/'
    house_path = 'house/'
    senate_path = 'senate/'
    house_xlsx = f'House{session}Speakers.xlsx'
    senate_xlsx = f'Senate{session}Speakers.xlsx'
    rt = pd.read_csv(speech_tab, sep = '\t')
    rt.loc[:,'doc_id'] = speech_tab
    rt.speaker = rt.speaker.str.lower()
    rt.speaker = rt.speaker.str.replace('[^\w\s]','')
    house = pd.read_excel(os.path.join(speaker_path,house_path,house_xlsx))
    house.speaker = house.speaker.str.lower()
    house_data = pd.merge(left=rt,right=house,on='speaker')
    senate = pd.read_excel(os.path.join(speaker_path,senate_path,senate_xlsx))
    senate.speaker = senate.speaker.str.lower()
    senate_data = pd.merge(left=rt, right=senate, on='speaker')
    rt = rt[~rt.speaker.isin(house.speaker) & ~rt.speaker.isin(senate.speaker)]
    return rt, house_data, senate_data

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def tokenize_lemmatize(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        token = lemmatize_stemming(token)
        if token not in stoplist and len(token)>3:
            result.append(token)
    return result



In [18]:
import blingfire as bf

In [19]:
rt = pd.DataFrame()
house_data = pd.DataFrame()
senate_data = pd.DataFrame()
for i in range(103,105):
    rt_temp, house_data_temp, senate_data_temp = pre_process(i)
    rt = pd.concat([rt,rt_temp])
    house_data = pd.concat([house_data, house_data_temp])
    senate_data = pd.concat([senate_data, senate_data_temp])
    

In [20]:
#rt, house_data = pre_process(103)

In [21]:
# Contain only speech
house_data = house_data[house_data.speech.str.len()>8]

In [22]:
# Legislator and Congress
house_data = house_data.groupby(['Legislator ID', 'Congress', 'District', 'Party', 'State ID', 'speaker' 
                                #, 'day', 'month', 'year'
                                ])['speech'].apply(','.join).reset_index()

stemmer = SnowballStemmer("english")

stoplist = ["absent", "amend", "adjourn", "ask", "can", "chairman",
              "committee", "con", "democrat", "etc", "gentleladies",
              "gentlelady","gentleman","gentlemen","gentlewoman","gentlewomen",
              "hereabout","hereafter","hereat","hereby","herein",
              "hereinafter","hereinbefore","hereinto","hereof","hereon",
              "hereto","heretofore","hereunder","hereunto","hereupon",
              "herewith","madam", "month","move","mr","mrs","nai","nay","none","now","part","per", "peopl",
              "pro","rise","republican","say","senator","shall","sir","speak",
              "speaker","tell","thank","thereabout","thereafter","thereagainst","thereat","therebefore",
              "therebeforn","thereby","therefor","therefore","therefrom",
              "therein","thereinafter","thereof","thereon","thereto","theretofore",
              "thereunder","thereunto","thereupon","therewith",
              "therewithal","time", "today","use","whereabouts","whereafter","whereas",
              "whereat","whereby","wherefore","wherefrom","wherein",
              "whereinto","whereof","whereon","whereto","whereunder",
              "whereupon","wherever","wherewith","wherewithal","will",
              "yea","year","yes","yield"]

stoplist += list(gensim.parsing.preprocessing.STOPWORDS)

In [11]:
def bf_token(text):
    return bf.text_to_words(text).split(',')

In [7]:
house_data.head(10).speech.apply(tokenize_lemmatize)

0    [correct, consum, brook, permiss, revis, exten...
1    [dummi, prop, befor, want, know, critic, fello...
2                     [minut, distinguish, york, forb]
3    [good, friend, becaus, like, attent, good, fri...
4    [extraordinari, event, hear, whatsoev, airport...
5                                   [dingel, michigan]
6    [hear, great, deal, talk, differ, better, cong...
7    [join, colleagu, fellow, veteran, lane, evan, ...
8    [implement, author, feder, trade, commiss, col...
9    [strong, opposit, real, onli, hous, fail, cons...
Name: speech, dtype: object

In [12]:
house_data.head(10).speech.apply(bf_token)

0    [The Chairman is correct . ,  Mr . Chairman , ...
1    [Mr . Speaker ,  I do n't have any dummies or ...
2    [Mr . Speaker ,  I yield 4 minutes to the dist...
3    [Mr . Speaker ,  I thank my good friend for yi...
4    [Mr . Chairman ,  this is a most extraordinary...
5                         [Mr . Dingell of Michigan .]
6    [Mr . Speaker ,  I have heard a great deal of ...
7    [Mr . Speaker ,  I rise to join my colleague a...
8    [Madam Speaker ,  H.R. 395 ,  the Do - Not - C...
9    [Mr . Chairman ,  I rise in strong opposition ...
Name: speech, dtype: object

In [15]:
gensim.utils.simple_preprocess(house_data.loc[0,'speech'])

['the',
 'chairman',
 'is',
 'correct',
 'mr',
 'chairman',
 'yield',
 'myself',
 'such',
 'time',
 'as',
 'may',
 'consume',
 'mr',
 'brooks',
 'asked',
 'and',
 'was',
 'given',
 'permission',
 'to',
 'revise',
 'and',
 'extend',
 'his',
 'remarks',
 'mr',
 'chairman',
 'rise',
 'in',
 'opposition',
 'to',
 'this',
 'amendment',
 'offered',
 'by',
 'my',
 'good',
 'friend',
 'and',
 'distinguished',
 'member',
 'of',
 'the',
 'committee',
 'mr',
 'ramstad',
 'the',
 'gentleman',
 'from',
 'minnesota',
 'because',
 'the',
 'attorney',
 'general',
 'already',
 'has',
 'the',
 'power',
 'to',
 'remove',
 'any',
 'independent',
 'counsel',
 'for',
 'good',
 'cause',
 'this',
 'amendment',
 'is',
 'unnecessary',
 'but',
 'of',
 'equally',
 'great',
 'concern',
 'to',
 'me',
 'is',
 'that',
 'this',
 'amendment',
 'spells',
 'out',
 'two',
 'but',
 'only',
 'two',
 'of',
 'the',
 'grounds',
 'which',
 'might',
 'constitute',
 'good',
 'cause',
 'under',
 'the',
 'statute',
 'because',
 'go

In [23]:
house_data.loc[:,'processed_speeches'] = house_data.speech.map(tokenize_lemmatize)


In [24]:
dictionary = gensim.corpora.Dictionary(house_data.loc[:,'processed_speeches'])

# eliminate 5% most common and words that appear 10 or fewer times. Also eliminate words that appear in 80% of topics

dictionary.filter_extremes(no_below=4, no_above=0.8)

In [25]:
house_data.loc[:,'bow_corpus'] = [dictionary.doc2bow(speech) for speech in house_data.loc[:,'processed_speeches']]

tfidf_model = gensim.models.TfidfModel(house_data.loc[:,'bow_corpus'])

house_data.loc[:,'corpus_tfidf'] = tfidf_model[house_data.loc[:,'bow_corpus']]

In [26]:
# lda_model = gensim.models.LdaMulticore(house_data.loc[:,'bow_corpus'], 
#                                        id2word = dictionary, 
#                                        passes = 2, 
#                                        workers=4)

In [27]:
import os
duration = 1  # seconds
freq = 440  # Hz
os.system('play -nq -t alsa synth {} sine {}'.format(duration, freq))

32512

In [28]:
lda_model_tfidf = gensim.models.LdaMulticore(house_data.loc[:,'corpus_tfidf'], 
                                       id2word = dictionary, 
                                       passes = 2, 
                                       workers=4)


# rt = rt.drop_duplicates('doc_id')

In [29]:
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

NameError: name 'lda_model' is not defined

In [30]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

Topic: 0 Word: 0.003*"ought" + 0.003*"wetland" + 0.003*"crime" + 0.003*"rule" + 0.003*"fund" + 0.003*"attorney" + 0.003*"court" + 0.003*"feder" + 0.003*"prison" + 0.003*"senat"


Topic: 1 Word: 0.003*"coleman" + 0.002*"tyndal" + 0.002*"peanut" + 0.002*"waiver" + 0.002*"collin" + 0.002*"cleveland" + 0.001*"african" + 0.001*"diego" + 0.001*"coastal" + 0.001*"bootl"


Topic: 2 Word: 0.004*"cyprus" + 0.004*"greek" + 0.003*"peacekeep" + 0.002*"turkish" + 0.002*"porter" + 0.002*"pittsburgh" + 0.002*"picatinni" + 0.002*"natcher" + 0.002*"missil" + 0.002*"cypriot"


Topic: 3 Word: 0.003*"wyom" + 0.002*"saif" + 0.002*"thrift" + 0.001*"graffiti" + 0.001*"rainey" + 0.001*"ranch" + 0.001*"gambl" + 0.001*"leas" + 0.001*"livestock" + 0.001*"bishop"


Topic: 4 Word: 0.002*"superfund" + 0.002*"monterey" + 0.002*"salmon" + 0.002*"pipelin" + 0.001*"plainfield" + 0.001*"woodbridg" + 0.001*"crime" + 0.001*"liabil" + 0.001*"secondhand" + 0.001*"supermand"


Topic: 5 Word: 0.004*"agricultur" + 0.003*"stamp"

In [31]:
import os
import os.path
import pickle
import time
import shelve

import chainer
from chainer import cuda
from chainer import serializers
import chainer.optimizers as O
import numpy as np

import lda2vec as l2v
import matplotlib.pyplot as plt
import seaborn as sns
from lda2vec import utils, prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec_model import LDA2Vec

Using TensorFlow backend.


ImportError: cannot import name 'prepare_topics'

In [37]:
gpu_id = int(os.getenv('CUDA_GPU', 0))
cuda.get_device(gpu_id).use()
print "Using GPU " + str(gpu_id)

SyntaxError: invalid syntax (<ipython-input-37-d70cf0ba4b05>, line 3)

In [None]:
data_dir = os.getenv('data_dir', '../data/')
fn_vocab = '{data_dir:s}/vocab.pkl'.format(data_dir=data_dir)
fn_corpus = '{data_dir:s}/corpus.pkl'.format(data_dir=data_dir)
fn_flatnd = '{data_dir:s}/flattened.npy'.format(data_dir=data_dir)
fn_docids = '{data_dir:s}/doc_ids.npy'.format(data_dir=data_dir)
fn_vectors = '{data_dir:s}/vectors.npy'.format(data_dir=data_dir)
vocab = pickle.load(open(fn_vocab, 'r'))
corpus = pickle.load(open(fn_corpus, 'r'))
flattened = np.load(fn_flatnd)
doc_ids = np.load(fn_docids)
vectors = np.load(fn_vectors)

# Model Parameters
# Number of documents
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# 'Strength' of the dircihlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = int(os.getenv('n_topics', 20))
batchsize = 4096
# Power for neg sampling
power = float(os.getenv('power', 0.75))
# Intialize with pretrained word vectors
pretrained = bool(int(os.getenv('pretrained', True)))
# Sampling temperature
temperature = float(os.getenv('temperature', 1.0))
# Number of dimensions in a single word vector
n_units = int(os.getenv('n_units', 300))
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq

for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        print key, val

model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15, power=power, temperature=temperature)
if os.path.exists('lda2vec.hdf5'):
    print "Reloading from saved"
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)

j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for j in range(n_topics):
            print j, coherence[(j, 'cv')]
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print msg.format(**logs)
        j += 1
serializers.save_hdf5("lda2vec.hdf5", model)