In [1]:
import os.path
import zipfile

import nltk
import numpy as np #For arrays
import pandas #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer

from nltk.tag import StanfordNERTagger
from nltk.tag import StanfordPOSTagger
from nltk.parse import stanford
from nltk.tokenize import word_tokenize

To using the [Stanford NLP group](http://nlp.stanford.edu/) programs with nltk requires a bit of setup. We are basing these instructions on those provided by nltk, [here](https://github.com/nltk/nltk/wiki/Installing-Third-Party-Software#stanford-tagger-ner-tokenizer-and-parser), but with a couple of changes for the notebooks.

1. Install [Java 1.8+](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html)
2. Download the following zip files from the Stanford NLP group, where DATE is the release date of the files, this will be the value of `stanfordVersion`
    + `stanford-postagger-full-DATE.zip`
    + `stanford-ner-DATE.zip`
    + `stanford-parser-full-DATE.zip`
3. Unzip the files and place the resulting directories in the same location, this will become `stanfordDir`
4. Lookup the version number used by the parser `stanford-parser-VERSION-models.jar` and set to to be `parserVersion`

In [2]:
#This is the date at the end of each of the zip files, e.g.
#the date in stanford-ner-2016-10-31.zip
stanfordVersion = '2016-10-31'

#This is the version numbers of the parser models, these
#are files in `stanford-parser-full-2016-10-31.zip`, e.g.
#stanford-parser-3.7.0-models.jar
parserVersion = '3.7.0'

#This is where the zip files were unzipped.Make sure to
#unzip into directories named after the zip files
#Don't just put all the files in `stanford-NLP`
stanfordDir = '/Users/Reid/Desktop/stanford-NLP'


Setting up NER tagger

In [3]:
nerClassifierPath = os.path.join(stanfordDir,'stanford-ner-{}'.format(stanfordVersion), 'classifiers/english.all.3class.distsim.crf.ser.gz')

nerJarPath = os.path.join(stanfordDir,'stanford-ner-{}'.format(stanfordVersion), 'stanford-ner.jar')

nerTagger = StanfordNERTagger(nerClassifierPath, nerJarPath)


setting up POS Tagger

In [4]:

postClassifierPath = os.path.join(stanfordDir, 'stanford-postagger-full-{}'.format(stanfordVersion), 'models/english-bidirectional-distsim.tagger')

postJarPath = os.path.join(stanfordDir,'stanford-postagger-full-{}'.format(stanfordVersion), 'stanford-postagger.jar')

postTagger = StanfordPOSTagger(postClassifierPath, postJarPath)

Setting up Parser

In [5]:

modelName = 'englishPCFG.ser.gz'

parserJarPath = os.path.join(stanfordDir, 'stanford-parser-full-{}'.format(stanfordVersion), 'stanford-parser.jar')

parserModelsPath = os.path.join(stanfordDir, 'stanford-parser-full-{}'.format(stanfordVersion), 'stanford-parser-{}-models.jar'.format(parserVersion))

modelPath = os.path.join(stanfordDir, 'stanford-parser-full-{}'.format(stanfordVersion), modelName)

if not os.path.isfile(modelPath):
    with zipfile.ZipFile(parserModelsPath) as zf:
        with open(modelPath, 'wb') as f:
            f.write(zf.read('edu/stanford/nlp/models/lexparser/{}'.format(modelName)))

parser = stanford.StanfordParser(parserJarPath, parserModelsPath, modelPath)

depParser = stanford.StanfordDependencyParser(parserJarPath, parserModelsPath)

Short example sentence

In [6]:
text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

tokenized_text = word_tokenize(text)
classified_text = nerTagger.tag(tokenized_text)

print(classified_text)
print(postTagger.tag(tokenized_text))

[('While', 'O'), ('in', 'O'), ('France', 'LOCATION'), (',', 'O'), ('Christine', 'PERSON'), ('Lagarde', 'PERSON'), ('discussed', 'O'), ('short-term', 'O'), ('stimulus', 'O'), ('efforts', 'O'), ('in', 'O'), ('a', 'O'), ('recent', 'O'), ('interview', 'O'), ('with', 'O'), ('the', 'O'), ('Wall', 'ORGANIZATION'), ('Street', 'ORGANIZATION'), ('Journal', 'ORGANIZATION'), ('.', 'O')]
[('While', 'IN'), ('in', 'IN'), ('France', 'NNP'), (',', ','), ('Christine', 'NNP'), ('Lagarde', 'NNP'), ('discussed', 'VBD'), ('short-term', 'JJ'), ('stimulus', 'NN'), ('efforts', 'NNS'), ('in', 'IN'), ('a', 'DT'), ('recent', 'JJ'), ('interview', 'NN'), ('with', 'IN'), ('the', 'DT'), ('Wall', 'NNP'), ('Street', 'NNP'), ('Journal', 'NNP'), ('.', '.')]


In [7]:
pars = parser.raw_parse(text)
print(list(next(pars)))


result = depParser.raw_parse(text)
print(list(next(result).triples()))

[Tree('S', [Tree('SBAR', [Tree('IN', ['While']), Tree('FRAG', [Tree('PP', [Tree('IN', ['in']), Tree('NP', [Tree('NNP', ['France'])])])])]), Tree(',', [',']), Tree('NP', [Tree('NNP', ['Christine']), Tree('NNP', ['Lagarde'])]), Tree('VP', [Tree('VBD', ['discussed']), Tree('NP', [Tree('JJ', ['short-term']), Tree('NN', ['stimulus']), Tree('NNS', ['efforts'])]), Tree('PP', [Tree('IN', ['in']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['recent']), Tree('NN', ['interview'])])]), Tree('PP', [Tree('IN', ['with']), Tree('NP', [Tree('DT', ['the']), Tree('NNP', ['Wall']), Tree('NNP', ['Street']), Tree('NNP', ['Journal'])])])]), Tree('.', ['.'])])]
[(('discussed', 'VBD'), 'advcl', ('France', 'NNP')), (('France', 'NNP'), 'mark', ('While', 'IN')), (('France', 'NNP'), 'case', ('in', 'IN')), (('discussed', 'VBD'), 'nsubj', ('Lagarde', 'NNP')), (('Lagarde', 'NNP'), 'compound', ('Christine', 'NNP')), (('discussed', 'VBD'), 'dobj', ('efforts', 'NNS')), (('efforts', 'NNS'), 'amod', ('short-term', 'JJ')), 

Lets work with the Grimmer dataset again,

In [8]:
dataDir = 'data/grimmerPressReleases'

senReleasesDF = pandas.DataFrame()

for senatorName in [d for d in os.listdir(dataDir) if d[0] != '.']:
    senPath = os.path.join(dataDir, senatorName)
    senReleasesDF = senReleasesDF.append(loadDir(senPath, senatorName), ignore_index = True)

senReleasesDF[:100:10]

NameError: name 'loadDir' is not defined