In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import utils, vocabulary, tf_embed_viz



### Parse data from ../data/processed

In [59]:
def get_vocab2(raw):
    tokens = nltk.word_tokenize(raw)
    words = [w.lower() for w in tokens]
    vocab = sorted(set(words))
    sents = nltk.sent_tokenize(raw.lower())
    return vocab, sents

    
def get_vocab(filename):
    raw = open(filename).read()
    return get_vocab2(raw)

dir = '../data/processed/'
pres_dict = {}

def add_to_dict(president, vocab, sents):
    vocab_, sents_ = pres_dict.get(president, (None, None))
    if (vocab_ == None):
        pres_dict[president] = (vocab, sents)
    else:
        vocab_ += vocab
        sents_ += sents
        pres_dict[president] = (vocab_, sents_)

def print_dict(pres_dict):        
    for key in pres_dict.keys():
        vocab, sents = pres_dict.get(key, (None, None))
        print "%s: vocab length %s, sents length %s" % (key, len(vocab), len(sents))
        print "Samples: ", sents[-4:]
    
for filename in os.listdir(dir):
    arr = filename.split("_")
    president = arr[0]

    try:
        vocab, sents = get_vocab(dir + filename)
        add_to_dict(president, vocab, sents)
    except UnicodeDecodeError as err:
        print filename, ":", err
        
print_dict(pres_dict)

Lincoln_SOTU_1862.txt : 'ascii' codec can't decode byte 0xe2 in position 12: ordinal not in range(128)
Obama_SOTU_2015.txt : 'ascii' codec can't decode byte 0xe2 in position 9: ordinal not in range(128)
Obama_SOTU_2009.txt : 'ascii' codec can't decode byte 0xe2 in position 8: ordinal not in range(128)
Obama_SOTU_2014.txt : 'ascii' codec can't decode byte 0xe2 in position 9: ordinal not in range(128)
Obama_SOTU_2013.txt : 'ascii' codec can't decode byte 0xe2 in position 15: ordinal not in range(128)
Obama_SOTU_2016.txt : 'ascii' codec can't decode byte 0xe2 in position 7: ordinal not in range(128)
Obama_SOTU_2012.txt : 'ascii' codec can't decode byte 0xe2 in position 18: ordinal not in range(128)
Obama_SOTU_2010.txt : 'ascii' codec can't decode byte 0xe2 in position 13: ordinal not in range(128)
Obama_SOTU_2011.txt : 'ascii' codec can't decode byte 0xe2 in position 13: ordinal not in range(128)
Lincoln: vocab length 14005, sents length 1795
Samples:  ['passion has helped us; but can do 

### Parse json data from ../data/unprocessed
#### I don't think we should include this data because they records the conversation between a president and other people so it's very hard to extract just the speech of a president

In [69]:
dir = '../data/unprocessed/'

for json_file in os.listdir(dir):
    json_data=open(dir + json_file)
    data = json.load(json_data)
    json_data.close()
    attrName = 'debate' if 'Debate' in json_file else 'speeches'
    for data2 in data[attrName]:
        if attrName == 'speeches':
            #print data2['date'], data2['name'], data2['speaker']#, debate['text'], debate['url'], 
            vocab, sents = get_vocab2(data2['text'])
            arr = data2['speaker'].split(' ')
            president = arr[len(arr)-1]
            print "%s: %s: new vocab length %s, sents length %s" % (data2['speaker'], president, len(vocab), len(sents))
            # Remove the last few sentences because they are not part of the speech
            #print len(sents)
            sents = sents[:len(sents)-10]
            print sents[-2:]
            #print len(sents)
            add_to_dict(president, vocab, sents)

print_dict(pres_dict)


George W. Bush: Bush: new vocab length 1539, sents length 586
[u'he deserves it, and so does the country.', u'</p><p>it has been a honor to work with you.']
Barack Obama: Obama: new vocab length 1782, sents length 424
[u"without acknowledging that by definition, part of any stimulus package would include spending--that's the point.", u"then what i get a sense of is that there's some ideological blockage there that needs to be cleared up."]
Barack Obama: Obama: new vocab length 1275, sents length 281
[u"there's been extraordinary cooperation, and we expect that that will continue.", u'</p><p>and prime minister harper is right.']
Barack Obama: Obama: new vocab length 1729, sents length 453
[u"we haven't immediately eliminated the influence of lobbyists in washington.", u'we have not immediately eliminated wasteful pork projects.']
Barack Obama: Obama: new vocab length 1517, sents length 338
[u"there's one last thing that i should mention that i love about great britain, and that is the q