In [1]:
import pattern
from pattern.web import URL, DOM


# Download the Wikisource portal page and parse it into a DOM tree.
url = URL('https://en.wikisource.org/wiki/Portal:State_of_the_Union_Speeches_by_United_States_Presidents')
html = url.download()
dom = DOM(html)

# Scan the <ul> lists inside the "mw-content-text" container (the first five
# and the last two lists are skipped) and keep every link that has a title
# attribute and whose content contains 'Address', as an absolute URL.
urls = [
    'https://en.wikisource.org' + a.href
    for el in dom.by_id("mw-content-text").by_tag('ul')[5:-2]
    for a in el.by_tag('a')
    if 'title' in a.attributes and 'Address' in a.content
]

In [2]:
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Feed markup with ``feed()``; every text chunk reported by the parser is
    stored in ``self.fed`` and can be read back joined with ``get_data()``.
    """

    def __init__(self):
        # Call the base initializer rather than only reset(): the base class
        # may set up attributes that reset() alone does not create (Python 3's
        # HTMLParser does), and feed() fails without them. The explicit
        # HTMLParser.__init__ form is used because the Python 2 base class is
        # old-style, where super() is unavailable.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Record one chunk of text data delivered by the parser."""
        self.fed.append(d)

    def get_data(self):
        """Return all recorded text chunks concatenated in the order received."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of ``html`` with all markup tags removed.

    A fresh ``MLStripper`` is used for every call, so no text carries over
    between invocations.
    """
    stripper = MLStripper()
    stripper.feed(html)
    plain_text = stripper.get_data()
    return plain_text

In [3]:
import nltk
from nltk import tokenize
from nltk.collocations import *
from bs4 import BeautifulSoup
from collections import Counter
from elasticsearch import Elasticsearch
import re

tokenizer = nltk.WhitespaceTokenizer()

# Number of address pages to download; set to None to process every URL.
MAX_ADDRESSES = 2

allData = {}
# Start from a known state so the year fallback below can never pick up a
# stale `year` left in the kernel by an earlier run of this cell.
year = None
for url in urls[:MAX_ADDRESSES]:
    u = URL(url)
    h = u.download()
    soup = BeautifulSoup(h, 'lxml')

    # Every <p> inside the content container except the last one; looked up
    # once and shared by the plain-text and HTML versions below.
    paragraphs = soup.find("div", {"id": "mw-content-text"}).find_all('p')[:-1]
    text = ''.join([p.text + ' ' for p in paragraphs])
    sentences = [tokenizer.tokenize(sent) for sent in nltk.sent_tokenize(text)]
    # unicode(p) keeps non-ASCII characters (dashes, accents); the previous
    # str() + ASCII decode with errors='ignore' silently dropped them.
    html = u''.join([unicode(p) for p in paragraphs])

    # Presumably the speaker byline, e.g. "by George Washington" -- verify
    # against the page markup.
    headOfState = soup.find("span", {"class" : "fn"}).text
    byline = headOfState.strip()
    if byline.startswith("by "):
        # Remove the prefix by slicing. str.strip("by ") treats its argument
        # as a set of characters and also eats trailing 'b'/'y' letters,
        # turning "Kennedy" into "Kenned".
        headOfState = byline[len("by "):].strip()

    try:
        year = int(re.search(r'\d+', soup.find("div", {"class" : "gen_header_title"}).text).group())
    except AttributeError:
        # Either the header element is missing (find() returned None) or it
        # contains no digits (re.search() returned None). Fall back to the
        # year after the previously processed address.
        if year is None:
            raise ValueError("No year found for %s and no previous address to infer it from" % url)
        year = year + 1
    title = "State of the Union Address " + str(year)

    if headOfState not in allData:
        allData[headOfState] = {}
    documentData = {}
    documentData['full text'] = text
    documentData['html'] = html
    documentData['sentences'] = sentences
    documentData['year'] = year
    allData[headOfState][title] = documentData


[[u'I', u'embrace', u'with', u'great', u'satisfaction', u'the', u'opportunity,', u'which', u'now', u'presents', u'itself,', u'of', u'congratulating', u'you', u'on', u'the', u'present', u'favourable', u'prospects', u'of', u'our', u'public', u'affairs.'], [u'The', u'recent', u'accession', u'of', u'the', u'important', u'state', u'of', u'north', u'Carolina', u'to', u'the', u'Constitution', u'of', u'the', u'United', u'States', u'(of', u'which', u'official', u'information', u'has', u'been', u'received)\u2014', u'the', u'rising', u'credit', u'and', u'respectability', u'of', u'our', u'Country', u'\u2014', u'the', u'general', u'and', u'increasing', u'good', u'will', u'towards', u'the', u'Government', u'of', u'the', u'Union', u'\u2014', u'and', u'the', u'concord,', u'peace', u'and', u'plenty,', u'with', u'which', u'we', u'are', u'blessed,', u'are', u'circumstances,', u'auspicious', u'in', u'an', u'eminent', u'degree', u'to', u'our', u'national', u'prosperity.'], [u'In', u'resuming', u'your', u'c

In [9]:
import pandas as pd
import certifi

# Column layout of the resulting frame, in display order.
COLUMNS = ['full text', 'html', 'year', 'sentences', 'title', 'head of state']

# Flatten {head of state: {title: document}} into one keyed record per
# address. Records are dicts rather than positional lists so each value is
# matched to its column by name; dict.values() ordering is arbitrary and must
# not be relied on to line up with a hard-coded header list.
dataList = []
for headofstate, address in allData.items():
    for key, value in address.items():
        record = dict(value)  # copy so the entries in allData stay untouched
        record['title'] = key
        record['head of state'] = headofstate
        dataList.append(record)


# Passing `columns` fixes the column order and also yields a correctly shaped
# empty frame when no addresses were collected.
df = pd.DataFrame(dataList, columns=COLUMNS)
df = df.sort_values('year').reset_index(drop=True)

es = Elasticsearch()

df.head()

Unnamed: 0,full text,html,year,sentences,title,head of state
0,Fellow-Citizens of the Senate and the House of...,<p><i>Fellow-Citizens of the Senate and the Ho...,1790,"[[Fellow-Citizens, of, the, Senate, and, the, ...",State of the Union Address 1790,George Washington
1,Fellow-Citizens of the Senate and the House of...,<p><i>Fellow-Citizens of the Senate and the Ho...,1791,"[[Fellow-Citizens, of, the, Senate, and, the, ...",State of the Union Address 1791,George Washington
2,Fellow-Citizens of the Senate and of the House...,<p><i>Fellow-Citizens of the Senate and of the...,1792,"[[Fellow-Citizens, of, the, Senate, and, of, t...",State of the Union Address 1792,George Washington
3,Fellow Citizens of the Senate and of the House...,<p><i>Fellow Citizens of the Senate and of the...,1793,"[[Fellow, Citizens, of, the, Senate, and, of, ...",State of the Union Address 1793,George Washington
4,Fellow Citizens of the Senate and of the House...,<p><i>Fellow Citizens of the Senate and of the...,1794,"[[Fellow, Citizens, of, the, Senate, and, of, ...",State of the Union Address 1794,George Washington


In [10]:
# Send each address to the "unions" index. Document ids are the frame's row
# labels shifted by one, so they start at 1 rather than 0.
dfIter = list(df[['full text', 'head of state', 'year', 'sentences', 'title', 'html']].iterrows())
for position, record in dfIter:
    ID = position + 1
    document = {}
    document["speaker"] = record['head of state']
    document["year"] = record['year']
    document["text"] = record['full text']
    document["sentences"] = record['sentences']
    document["title"] = record['title']
    document["html"] = record['html']
    es.index(index="unions", doc_type="union", id=ID, body=document)