In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import utils, vocabulary, tf_embed_viz

# Parse data from ../data/processed

In [13]:
def get_vocab2(raw):
    tokens = nltk.word_tokenize(raw) #.decode("utf8"))
    words = [w.lower() for w in tokens]
    vocab = sorted(set(words))
    sents = nltk.sent_tokenize(raw.lower())
    return vocab, sents

    
def get_vocab(filename):
    raw = open(filename).read().decode("UTF-8")
    return get_vocab2(raw)

dir = '../data/processed/'
pres_dict = {}

def add_to_dict(president, vocab, sents):
    words = list()
    for s in sents:
        words.extend(nltk.word_tokenize(s))

    vocab_, sents_, words_ = pres_dict.get(president, (None, None, None))
    if (vocab_ == None):
        pres_dict[president] = (vocab, sents, words)
    else:
        vocab_ += vocab
        sents_ += sents
        words_.extend(words)
        pres_dict[president] = (vocab_, sents_, words_)

def print_dict(pres_dict):        
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None, None))
        print "%s: vocab length %s, sents length %s, words length %s" % (key, len(vocab), len(sents), len(words))
        #print "Samples: ", sents[-4:]
    
for filename in os.listdir(dir):
    arr = filename.split("_")
    president = arr[0]
    #vocab, sents = get_vocab(dir + filename)
    #add_to_dict(president, vocab, sents)

    try:
        vocab, sents = get_vocab(dir + filename)
        add_to_dict(president, vocab, sents)
    except UnicodeDecodeError as err:
        print filename, ":", err
        

        
print_dict(pres_dict)

Lincoln: vocab length 15975, sents length 2102, words length 66144
Trump: vocab length 544, sents length 90, words length 1679
Obama: vocab length 15305, sents length 3106, words length 64466


# Parse json data from ../data/unprocessed

In [14]:
from bs4 import BeautifulSoup
import re


dir = '../data/unprocessed/'

for json_file in os.listdir(dir):
    json_data=open(dir + json_file)
    data = json.load(json_data)
    json_data.close()
    attrName = 'debate' if 'Debate' in json_file else 'speeches'
    for data2 in data[attrName]:
        raw = BeautifulSoup(data2['text'], "html.parser").get_text()
        # Remove []
        raw = re.sub(' \[.*?\]',' ', raw, flags=re.DOTALL)
        # Remove ()
        raw = re.sub(' \(.*?\)',' ', raw, flags=re.DOTALL)
        if (attrName == 'speeches' and 'News Conference With' not in data2['name']):
            # print data2['date'], data2['name'], data2['speaker']#, debate['text'], debate['url'], 
            # data2['text'] has a lot of htmtl tags in there. We still need to parse it            
            # Removing the questions
            raw = re.sub('[A-Z,\s,\.]Q\..*? The President\.','\.',raw, flags=re.DOTALL)
            raw = re.sub('^[A-Z,\s]*THE PRESIDENT\.','',raw, flags=re.DOTALL)
            raw = re.sub('[A-Z,\s,\.]Q\..*?THE PRESIDENT\.','\.',raw, flags=re.DOTALL)
            
            #print "\n\n***", data2['name'], '\n', data2['text'][-1000:], "\n***", raw[-1000:]
            #print data2['text'][0:200]
            #print data2['url']
            #print raw
            #break
            
            vocab, sents = get_vocab2(raw)
            arr = data2['speaker'].split(' ')
            president = arr[len(arr)-1]
            #print "*** %s: %s: new vocab length %s, sents length %s" % (data2['speaker'], president, len(vocab), len(sents))
            # Remove the last few sentences because they are not part of the speech
            #print sents[-10:]
            sents = sents[:len(sents)-10]
            #print sents[-2:]
            #print len(sents)
            add_to_dict(president, vocab, sents)

        ########################################################
        # TODO: extract debate data for TRUMP, OBAMA
        #elif ('OBAMA' in raw): #('TRUMP' in raw or 'OBAMA' in raw)):
        #    print data2['name'] #, raw[0:3000]


print_dict(pres_dict)


Lincoln: vocab length 15975, sents length 2102, words length 66144
Hoover: vocab length 74728, sents length 3027, words length 103986
Clinton: vocab length 73996, sents length 16625, words length 359552
Trump: vocab length 6006, sents length 2708, words length 38761
Johnson: vocab length 118961, sents length 22235, words length 445977
Kennedy: vocab length 61138, sents length 11051, words length 258231
Carter: vocab length 61404, sents length 10961, words length 244037
Reagan: vocab length 48017, sents length 8770, words length 192872
Roosevelt: vocab length 123191, sents length 16084, words length 399627
Ford: vocab length 34320, sents length 6283, words length 135621
Bush: vocab length 158682, sents length 42195, words length 747184
Truman: vocab length 139865, sents length 31957, words length 413257
Eisenhower: vocab length 161616, sents length 22835, words length 601774
Nixon: vocab length 40528, sents length 7426, words length 191118
Obama: vocab length 203503, sents length 38956,

# Set up train and test data

In [15]:
# Create train and test data set
# Number of words used by 1 president
#president_int = {"Lincoln": [1,0,0,0], "Trump": [0,1,0,0], "Bush": [0,0,1,0], "Obama": [0,0,0,1]}

def append_matrices(a,b):
    if (a == None):
        return b
    else:
        return np.concatenate((a, b))

def get_train_test(pres_dict, num_words_limit, batch_size=100):
    print "Max number of words:", num_words_limit

    def reshape_y(y):
        return np.reshape(y,[len(y),len(y[0])])
            
    y_train = None
    X_train = None
    y_test = None
    X_test = None
    all_words = list()
    president_int = {}

    # Set up president_int: find out how many meets the word count requirement
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None))
        if (len(words) >= num_words_limit):
            president_int[key] = None # initialize this mapping
    i = 0
    for p in president_int.keys():
        arr = [0]* len(president_int.keys())
        arr[i] = 1
        president_int[p] = arr
        i +=1
    #print president_int
        
    # Then use president_int to build y matrices
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None))
        if (len(words) >= num_words_limit):
            print "Processing data for", key            
            X = words[0:num_words_limit]
            all_words += X
            X = np.reshape(X, [len(X)/batch_size, batch_size])
            #y = (num_words_limit * [president_int[key]])
            y = reshape_y(X.shape[0] * [president_int[key]])
            #print y
            #print X.shape,y.shape
            # train = 80%, test = 20%
            train_len = int(y.shape[0] * 0.8)
            #print train_len
            ## add new rows to y_train
            y_train = append_matrices(y_train, y[:train_len])
            #print y_train.shape
                            
            X_train = append_matrices(X_train, X[:train_len])
            y_test = append_matrices(y_test, y[train_len:])
            X_test = append_matrices(X_test, X[train_len:])
        
    return president_int, vocabulary.Vocabulary(all_words), y_train, X_train, y_test, X_test

# Convert 2d matrix of words into 2d matrix of word ids
def word_matrix_2ids(vocab, word_matrix):
    # convert to 1d
    word_1d = word_matrix.flatten()
    ids = vocab.words_to_ids(word_1d)
    return np.reshape(ids, (-1, word_matrix.shape[1]))
                      
# Now, use the function
batch_size = 100
num_of_words = 100000
president_int, vocab, y_train, X_train, y_test, X_test = get_train_test(pres_dict, num_of_words, batch_size)

#print X_train.shape, y_train.shape, X_test.shape, y_test.shape
#print X_train[1]
###### Shuffle data?

# Convert words to ids
X_train = word_matrix_2ids(vocab, X_train) 
X_test = word_matrix_2ids(vocab, X_test)

# X_train.shape = (1407, 100): the number of original words were 140700. They were broken into batches of 100
# y_train.shape = (1407, 4): each batch is mapped to 1 of the 4 presidents
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

Max number of words: 100000
Processing data for Hoover
Processing data for Clinton
Processing data for Johnson
Processing data for Kennedy
Processing data for Carter
Processing data for Reagan
Processing data for Roosevelt




Processing data for Ford
Processing data for Bush
Processing data for Truman
Processing data for Eisenhower
Processing data for Nixon
Processing data for Obama
(10400, 100) (10400, 13) (2600, 100) (2600, 13)


# Create Keras Model and Training

In [11]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import np_utils


# Instantiate and build model
model = Sequential()
model.add(Dense(units=1000, input_dim=batch_size))
model.add(Activation('sigmoid'))
model.add(Dense(units=1000, input_dim=200))
model.add(Activation('sigmoid'))
model.add(Dense(units=1000, input_dim=200))
model.add(Activation('sigmoid'))
model.add(Dense(units=len(president_int.keys())))
model.add(Activation('softmax'))

# Compile w/ chosen loss, optimization fns; specific output metrics
model.compile(loss='categorical_crossentropy',
              optimizer='Adagrad',
              metrics=['categorical_accuracy','accuracy'])
# Train
print "Training..."
model.fit(X_train, y_train, epochs=80, batch_size=100)


Training...
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


<keras.callbacks.History at 0x7fbf8b35b9d0>

# Test and predict

In [12]:
# Evaluate performance
print "Evaluating test data..."
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)

# Make some predictions
print "\n\nPredicting using test data..."
predictions = model.predict(X_test, batch_size=128)

#print (classes)

print "\n\n",model.metrics_names
print loss_and_metrics


def prediction_to_01(predictions):
    max_p = predictions.max(axis=1)

    pred_int = None
    for i in range(len(max_p)):
        pred_int = append_matrices(pred_int, [(predictions[i] == max_p[i]).astype(int)])
    return pred_int

print prediction_to_01(predictions)
print y_test

Evaluating test data...


Predicting using test data...


['loss', 'categorical_accuracy', 'acc']
[5.7467836438692537, 0.094615384661234342, 0.094615384661234342]
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 1 0]]
[[0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]


