In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# Helper libraries
from shared_lib import utils, vocabulary, tf_embed_viz

# Parse data from ../data/processed

In [2]:
def get_vocab2(raw):
    tokens = nltk.word_tokenize(raw) #.decode("utf8"))
    words = [w.lower() for w in tokens]
    vocab = sorted(set(words))
    sents = nltk.sent_tokenize(raw.lower())
    return vocab, sents

    
def get_vocab(filename):
    raw = open(filename).read().decode("UTF-8")
    return get_vocab2(raw)

dir = '../data/processed/'
pres_dict = {}

def add_to_dict(president, vocab, sents):
    words = list()
    for s in sents:
        words.extend(nltk.word_tokenize(s))

    vocab_, sents_, words_ = pres_dict.get(president, (None, None, None))
    if (vocab_ == None):
        pres_dict[president] = (vocab, sents, words)
    else:
        vocab_ += vocab
        sents_ += sents
        words_.extend(words)
        pres_dict[president] = (vocab_, sents_, words_)

def print_dict(pres_dict):        
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None, None))
        print "%s: vocab count %s, sentence count %s, word count %s" % (key, len(vocab), len(sents), len(words))
        #print "Samples: ", sents[-4:]

full_name = {"Obama" : "Barack Obama", "Lincoln": "Abraham Lincoln", "Trump": "Donald J. Trump"}

for filename in os.listdir(dir):
    arr = filename.split("_")
    president = arr[0]
    
    try:
        vocab, sents = get_vocab(dir + filename)
        add_to_dict(full_name[president], vocab, sents)
    except UnicodeDecodeError as err:
        print filename, ":", err
        
        
print_dict(pres_dict)

Donald J. Trump: vocab count 14307, sentence count 4385, word count 65436
Barack Obama: vocab count 15305, sentence count 3106, word count 64466
Abraham Lincoln: vocab count 15975, sentence count 2102, word count 66144


# Parse json data from ../data/unprocessed

In [3]:
from bs4 import BeautifulSoup
import re

dir = '../data/unprocessed/'

for json_file in os.listdir(dir):
    json_data=open(dir + json_file)
    data = json.load(json_data)
    json_data.close()
    attrName = 'debate' if 'Debate' in json_file else 'speeches'
    for data2 in data[attrName]:
        # data2['text'] has a lot of htmtl tags in there. We still need to parse it            
        raw = BeautifulSoup(data2['text'], "html.parser").get_text()
        # Remove []
        raw = re.sub(' \[.*?\]',' ', raw, flags=re.DOTALL)
        # Remove ()
        raw = re.sub(' \(.*?\)',' ', raw, flags=re.DOTALL)
        if (attrName == 'speeches' and 'News Conference With' not in data2['name']):
            # Cleaning up the data: eemoving the questions
            raw = re.sub('[A-Z,\s,\.]Q\..*? The President\.','\.',raw, flags=re.DOTALL)
            raw = re.sub('^[A-Z,\s]*THE PRESIDENT\.','',raw, flags=re.DOTALL)
            raw = re.sub('[A-Z,\s,\.]Q\..*?THE PRESIDENT\.','\.',raw, flags=re.DOTALL)
            
            vocab, sents = get_vocab2(raw)
            #arr = data2['speaker'].split(' ')
            #president = arr[len(arr)-1]
            president = data2['speaker'] 

            sents = sents[:len(sents)-10]
            #print sents[-2:]
            #print len(sents)
            add_to_dict(president, vocab, sents)

        ########################################################
        # TODO: extract debate data for TRUMP, OBAMA
        #elif ('OBAMA' in raw): #('TRUMP' in raw or 'OBAMA' in raw)):
        #    print data2['name'] #, raw[0:3000]


print_dict(pres_dict)


Lyndon B. Johnson: vocab count 118961, sentence count 22235, word count 445977
Gerald R. Ford: vocab count 34320, sentence count 6283, word count 135621
Richard Nixon: vocab count 40528, sentence count 7426, word count 191118
Franklin D. Roosevelt: vocab count 123191, sentence count 16084, word count 399627
William J. Clinton: vocab count 73996, sentence count 16625, word count 359552
Harry S. Truman: vocab count 139865, sentence count 31957, word count 413257
Abraham Lincoln: vocab count 15975, sentence count 2102, word count 66144
George Bush: vocab count 87076, sentence count 21374, word count 385669
John F. Kennedy: vocab count 61138, sentence count 11051, word count 258231
Dwight D. Eisenhower: vocab count 161616, sentence count 22835, word count 601774
Ronald Reagan: vocab count 48017, sentence count 8770, word count 192872
George W. Bush: vocab count 71606, sentence count 20821, word count 361515
Herbert Hoover: vocab count 74728, sentence count 3027, word count 103986
Barack Ob

# Set up train and test data
This part of code will use num_of_words and the threshold of words to select from each president speeches. If a president doesn't have word counts (less than num_of_words), he will be skipped.

The data will be converted into 2d matrix give a batch size, and then split into 80% for training and 20% for test

In [8]:
# Create train and test data set
# Number of words used by 1 president
#president_int = {"Lincoln": [1,0,0,0], "Trump": [0,1,0,0], "Bush": [0,0,1,0], "Obama": [0,0,0,1]}

def append_matrices(a,b):
    if (a == None):
        return b
    else:
        return np.concatenate((a, b))

def get_train_test(pres_dict, num_words_limit, batch_size=100):
    print "Max number of words:", num_words_limit

    def reshape_y(y):
        return np.reshape(y,[len(y),len(y[0])])
            
    y_train = None
    X_train = None
    y_test = None
    X_test = None
    all_words = list()
    president_int = {}

    # Set up president_int: find out how many meets the word count requirement
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None))
        if (len(words) >= num_words_limit):
            president_int[key] = None # initialize this mapping
    i = 0
    for p in president_int.keys():
        arr = [0]* len(president_int.keys())
        arr[i] = 1
        president_int[p] = arr
        i +=1
    #print president_int
        
    # Then use president_int to build y matrices
    for key in pres_dict.keys():
        vocab, sents, words = pres_dict.get(key, (None, None))
        if (len(words) >= num_words_limit):
            print "Processing data for", key            
            X = words[0:num_words_limit]
            all_words += X
            X = np.reshape(X, [len(X)/batch_size, batch_size])
            #y = (num_words_limit * [president_int[key]])
            y = reshape_y(X.shape[0] * [president_int[key]])
            #print y
            #print X.shape,y.shape
            # train = 80%, test = 20%
            train_len = int(y.shape[0] * 0.8)
            #print train_len
            ## add new rows to y_train
            y_train = append_matrices(y_train, y[:train_len])
            #print y_train.shape
                            
            X_train = append_matrices(X_train, X[:train_len])
            y_test = append_matrices(y_test, y[train_len:])
            X_test = append_matrices(X_test, X[train_len:])
        
    return president_int, vocabulary.Vocabulary(all_words), y_train, X_train, y_test, X_test

# Convert 2d matrix of words into 2d matrix of word ids
def word_matrix_2ids(vocab, word_matrix):
    # convert to 1d
    word_1d = word_matrix.flatten()
    ids = vocab.words_to_ids(word_1d)
    return np.reshape(ids, (-1, word_matrix.shape[1]))
                      
# Now, use the function
batch_size = 100
num_of_words = 100000
president_int, vocab, y_train, X_train, y_test, X_test = get_train_test(pres_dict, num_of_words, batch_size)

#print X_train.shape, y_train.shape, X_test.shape, y_test.shape
#print X_train[1]
###### Shuffle data?

# Convert words to ids
X_train = word_matrix_2ids(vocab, X_train) 
X_test = word_matrix_2ids(vocab, X_test)

# X_train.shape = (1407, 100): the number of original words were 140700. They were broken into batches of 100
# y_train.shape = (1407, 4): each batch is mapped to 1 of the 4 presidents
print X_train.shape, y_train.shape, X_test.shape, y_test.shape

Max number of words: 100000
Processing data for Lyndon B. Johnson
Processing data for Gerald R. Ford
Processing data for Richard Nixon
Processing data for Franklin D. Roosevelt
Processing data for William J. Clinton
Processing data for Harry S. Truman
Processing data for George Bush




Processing data for John F. Kennedy
Processing data for Dwight D. Eisenhower
Processing data for Ronald Reagan
Processing data for George W. Bush
Processing data for Herbert Hoover
Processing data for Barack Obama
Processing data for Donald J. Trump
Processing data for Jimmy Carter
(12000, 100) (12000, 15) (3000, 100) (3000, 15)


# Create Keras Model and Training

In [9]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import np_utils


# Instantiate and build model
model = Sequential()
model.add(Dense(units=400, input_dim=batch_size))
model.add(Activation('tanh'))
model.add(Dense(units=400, input_dim=200))
model.add(Activation('tanh'))
model.add(Dense(units=400, input_dim=200))
model.add(Activation('tanh'))
model.add(Dense(units=len(president_int.keys())))
model.add(Activation('softmax'))

# Compile w/ chosen loss, optimization fns; specific output metrics
model.compile(loss='categorical_crossentropy',
              optimizer='Adagrad',
              metrics=['categorical_accuracy','accuracy'])
# Train
print "Training..."
model.fit(X_train, y_train, epochs=60, batch_size=100)


Training...
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.callbacks.History at 0x7f60a0575110>

# Test and predict

In [10]:
# Evaluate performance
print "Evaluating test data..."
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)

# Make some predictions
print "\n\nPredicting using test data..."
predictions = model.predict(X_test, batch_size=128)

#print (classes)

print "\n\n",model.metrics_names
print loss_and_metrics


def prediction_to_01(predictions):
    max_p = predictions.max(axis=1)

    pred_int = None
    for i in range(len(max_p)):
        pred_int = append_matrices(pred_int, [(predictions[i] == max_p[i]).astype(int)])
    return pred_int

print prediction_to_01(predictions)
print y_test

Evaluating test data...

Predicting using test data...


['loss', 'categorical_accuracy', 'acc']
[4.8289436213175456, 0.073666666726271313, 0.073666666726271313]
[[0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
[[1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


