In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import utils, vocabulary, tf_embed_viz



### Parse data from ../data/processed

In [2]:
def get_vocab2(raw):
    tokens = nltk.word_tokenize(raw) #.decode("utf8"))
    words = [w.lower() for w in tokens]
    vocab = sorted(set(words))
    sents = nltk.sent_tokenize(raw.lower())
    return vocab, sents

    
def get_vocab(filename):
    raw = open(filename).read().decode("UTF-8")
    return get_vocab2(raw)

dir = '../data/processed/'
pres_dict = {}

def add_to_dict(president, vocab, sents):
    words = list()
    for s in sents:
        words.extend(nltk.word_tokenize(s))

    vocab_, sents_, words_ = pres_dict.get(president, (None, None, None))
    if (vocab_ == None):
        pres_dict[president] = (vocab, sents, words)
    else:
        vocab_ += vocab
        sents_ += sents
        words_.extend(words)
        pres_dict[president] = (vocab_, sents_, words_)

def print_dict(pres_dict):        
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None, None))
        print "%s: vocab length %s, sents length %s, words length %s" % (key, len(vocab), len(sents), len(words))
        #print "Samples: ", sents[-4:]
    
for filename in os.listdir(dir):
    arr = filename.split("_")
    president = arr[0]
    #vocab, sents = get_vocab(dir + filename)
    #add_to_dict(president, vocab, sents)

    try:
        vocab, sents = get_vocab(dir + filename)
        add_to_dict(president, vocab, sents)
    except UnicodeDecodeError as err:
        print filename, ":", err
        

        
print_dict(pres_dict)

Lincoln: vocab length 15975, sents length 2102, words length 66144
Trump: vocab length 544, sents length 90, words length 1679
Obama: vocab length 15305, sents length 3106, words length 64466


### Parse json data from ../data/unprocessed

In [3]:
from bs4 import BeautifulSoup
import re


dir = '../data/unprocessed/'

for json_file in os.listdir(dir):
    json_data=open(dir + json_file)
    data = json.load(json_data)
    json_data.close()
    attrName = 'debate' if 'Debate' in json_file else 'speeches'
    for data2 in data[attrName]:
        raw = BeautifulSoup(data2['text']).get_text()
        # Remove []
        raw = re.sub(' \[.*?\]',' ', raw, flags=re.DOTALL)
        # Remove ()
        raw = re.sub(' \(.*?\)',' ', raw, flags=re.DOTALL)
        if attrName == 'speeches':
            # print data2['date'], data2['name'], data2['speaker']#, debate['text'], debate['url'], 
            # data2['text'] has a lot of htmtl tags in there. We still need to parse it            
            # Removing the questions
            raw = re.sub(' Q\..*? The President\.','',raw, flags=re.DOTALL)
            
            #print data2['text'][0:200]
            #print data2['url']
            #print raw
            #break
            
            vocab, sents = get_vocab2(raw)
            arr = data2['speaker'].split(' ')
            president = arr[len(arr)-1]
            #print "*** %s: %s: new vocab length %s, sents length %s" % (data2['speaker'], president, len(vocab), len(sents))
            # Remove the last few sentences because they are not part of the speech
            #print sents[-10:]
            sents = sents[:len(sents)-10]
            #print sents[-2:]
            #print len(sents)
            add_to_dict(president, vocab, sents)

        ########################################################
        # TODO: extract debate data for TRUMP, OBAMA
        #elif ('OBAMA' in raw): #('TRUMP' in raw or 'OBAMA' in raw)):
        #    print data2['name'] #, raw[0:3000]


print_dict(pres_dict)




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Lincoln: vocab length 15975, sents length 2102, words length 66144
Trump: vocab length 11642, sents length 3657, words length 58709
Bush: vocab length 1322, sents length 401, words length 6840
Obama: vocab length 237934, sents length 46326, words length 1137458


#### Set up train and test data

In [8]:
# Create train and test data set
# Number of words used by 1 president
def get_train_test(pres_dict, num_words_limit):
    print "Max number of words:", num_words_limit
    president_int = {"Lincoln": [1,0,0,0], "Trump": [0,1,0,0], "Bush": [0,0,1,0], "Obama": [0,0,0,1]}

    def reshape_y(y):
        return np.reshape(y,[len(y),len(y[0])])
    
    y_train = list()
    X_train = list()
    y_test = list()
    X_test = list()
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None))
        if (len(words) >= num_words_limit):
            print "Processing data for", key
            y = (num_words_limit * [president_int[key]])
            #print y
            X = words[0:num_words_limit]
            #print len(X),len(y)
            # train = 80%, test = 20%
            train_len = int(num_words_limit * 0.8)
            y_train += y[0:train_len]
            X_train += X[0:train_len]
            y_test += y[train_len:]
            X_test += X[train_len:]
    return reshape_y(y_train), X_train, reshape_y(y_test), X_test

y_train, X_train, y_test, X_test = get_train_test(pres_dict, 58700)

print len(X_train), y_train.shape, len(X_test), y_test.shape
###### Shuffle data?

# We are generate new vocab
vocab = vocabulary.Vocabulary(X_train + X_test)
train_ids = vocab.words_to_ids(X_train)
test_ids = vocab.words_to_ids(X_test)

#print len(train_ids), len(test_ids),

#train_ids = np.array(ids * 10000, dtype=int)
#test_ids = np.array(ids * 100, dtype=int)

Max number of words: 58700
Processing data for Lincoln
Processing data for Trump
Processing data for Obama
140880 (140880, 4) 35220 (35220, 4)


In [5]:
#shuffle = np.random.permutation(np.arange(len(train_ids)))
#x = [train_ids[i] for i in shuffle]