# Event Analysis CAMEO Code 145
<br>
> Utilize StanfordNLP to parse sentences to update PETRARCH dictionaries for events with low recall
<br>
<br>

#### Install StanfordNLP and English Dictionary

In [1]:
import stanfordnlp
import pandas as pd

In [24]:
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
Y

Default download directory: /Users/jonathanbonaguro/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:29<00:00, 7.34MB/s] 



Download complete.  Models saved to: /Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [2]:
nlp = stanfordnlp.Pipeline()

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/jonathanbonaguro/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path'

#### Example

In [11]:
ex = nlp("Barack Obama was born in Hawaii.  He was elected president in 2008.")
ex.sentences[1].print_dependencies()

('He', '3', 'nsubj:pass')
('was', '3', 'aux:pass')
('elected', '0', 'root')
('president', '3', 'xcomp')
('in', '6', 'case')
('2008', '3', 'obl')
('.', '3', 'punct')




#### Import Data for Parsing
NYTimes articles parsed by BBN Accent

In [13]:
# Trim down to small batch for test
extract = pd.read_csv('nytextract.csv')
test = extract[extract.code == 145]
small = test[1:6]
list(small)

['aid', 'code', 'text', 'bad']

In [17]:
doc = small['text']
pd.DataFrame(doc)

Unnamed: 0,text
241,"""Thirty-four men arrested late last night were..."
242,"""In the worst outbreak of street violence in 1..."
243,Paris policemen and leftist extremists clash a...
244,"Violence occurred through most of the day, des..."
245,"Violence occurred through most of the day, des..."


In [21]:
# Test parse one sentence of obj "small"
sent = nlp(small.iloc[1,2])
sent.sentences[0].print_dependencies()

('"', '21', 'punct')
('In', '5', 'case')
('the', '5', 'det')
('worst', '5', 'amod')
('outbreak', '21', 'obl')
('of', '8', 'case')
('street', '8', 'compound')
('violence', '5', 'nmod')
('in', '11', 'case')
('14', '11', 'nummod')
('months', '5', 'nmod')
('of', '14', 'case')
('labor', '14', 'compound')
('unrest', '11', 'nmod')
(',', '21', 'punct')
('the', '17', 'det')
('police', '21', 'nsubj')
('in', '19', 'case')
('Katowice', '17', 'nmod')
('today', '21', 'obl:tmod')
('battled', '0', 'root')
('5,000', '23', 'nummod')
('protesters', '21', 'obj')
('who', '27', 'nsubj:pass')
('had', '27', 'aux')
('been', '27', 'aux:pass')
('angered', '23', 'acl:relcl')
('by', '30', 'case')
('the', '30', 'det')
('arrest', '27', 'obl')
('of', '34', 'case')
('three', '34', 'nummod')
('Solidarity', '34', 'compound')
('activists', '30', 'nmod')
('for', '36', 'mark')
('distributing', '34', 'acl')
('leaflets', '36', 'obj')
('.', '21', 'punct')




In [24]:
for i in range(len(small)):
    parse = nlp(small.iloc[i,2])
    parse.sentences[0].print_dependencies()
# Note: Does not parse every sentence



('"', '11', 'punct')
('Thirty', '4', 'compound')
('-', '4', 'punct')
('four', '5', 'nummod')
('men', '11', 'nsubj:pass')
('arrested', '5', 'acl')
('late', '6', 'advmod')
('last', '9', 'amod')
('night', '6', 'obl:tmod')
('were', '11', 'aux:pass')
('released', '0', 'root')
('from', '13', 'case')
('jail', '11', 'obl')
('today', '11', 'obl:tmod')
('on', '17', 'case')
('a', '17', 'det')
('variety', '11', 'obl')
('of', '20', 'case')
('misdemeanor', '20', 'compound')
('charges', '17', 'nmod')
('involving', '20', 'acl')
('the', '24', 'det')
('latest', '24', 'amod')
('protest', '21', 'obj')
('over', '27', 'case')
('the', '27', 'det')
('use', '24', 'nmod')
('of', '30', 'case')
('nonunion', '30', 'amod')
('coal', '27', 'nmod')
('at', '38', 'case')
('the', '35', 'det')
('Tennessee', '35', 'compound')
('Valley', '35', 'compound')
('Authority', '38', 'nmod:poss')
("'s", '35', 'case')
('power', '38', 'compound')
('plant', '27', 'nmod')
('at', '40', 'case')
('Paradise', '38', 'nmod')
(',', '40', 'punc



('"', '21', 'punct')
('In', '5', 'case')
('the', '5', 'det')
('worst', '5', 'amod')
('outbreak', '21', 'obl')
('of', '8', 'case')
('street', '8', 'compound')
('violence', '5', 'nmod')
('in', '11', 'case')
('14', '11', 'nummod')
('months', '5', 'nmod')
('of', '14', 'case')
('labor', '14', 'compound')
('unrest', '11', 'nmod')
(',', '21', 'punct')
('the', '17', 'det')
('police', '21', 'nsubj')
('in', '19', 'case')
('Katowice', '17', 'nmod')
('today', '21', 'obl:tmod')
('battled', '0', 'root')
('5,000', '23', 'nummod')
('protesters', '21', 'obj')
('who', '27', 'nsubj:pass')
('had', '27', 'aux')
('been', '27', 'aux:pass')
('angered', '23', 'acl:relcl')
('by', '30', 'case')
('the', '30', 'det')
('arrest', '27', 'obl')
('of', '34', 'case')
('three', '34', 'nummod')
('Solidarity', '34', 'compound')
('activists', '30', 'nmod')
('for', '36', 'mark')
('distributing', '34', 'acl')
('leaflets', '36', 'obj')
('.', '21', 'punct')
('Paris', '2', 'compound')
('policemen', '6', 'nsubj')
('and', '5', 'cc



('Violence', '2', 'nsubj')
('occurred', '0', 'root')
('through', '4', 'case')
('most', '2', 'obl')
('of', '7', 'case')
('the', '7', 'det')
('day', '4', 'nmod')
(',', '2', 'punct')
('despite', '10', 'case')
('pleas', '2', 'obl')
('from', '12', 'case')
('supporters', '10', 'nmod')
('of', '16', 'case')
('the', '16', 'det')
('hunger', '16', 'compound')
('strikers', '12', 'nmod')
('who', '18', 'nsubj')
('said', '16', 'acl:relcl')
('that', '22', 'mark')
('the', '21', 'det')
('disturbances', '22', 'nsubj')
('eroded', '18', 'ccomp')
('sympathy', '22', 'obj')
('for', '26', 'case')
('imprisoned', '26', 'amod')
('nationalists', '23', 'nmod')
('from', '29', 'case')
('other', '29', 'amod')
('Catholics', '26', 'nmod')
('in', '32', 'case')
('the', '32', 'det')
('province', '29', 'nmod')
('.\\n\\n\\n\\n', '2', 'parataxis')
('Violence', '2', 'nsubj')
('occurred', '0', 'root')
('through', '4', 'case')
('most', '2', 'obl')
('of', '7', 'case')
('the', '7', 'det')
('day', '4', 'nmod')
(',', '2', 'punct')
(



In [26]:
smaller = test[7:15]
smaller

Unnamed: 0,aid,code,text,bad
247,22456191,145,A P.A.P. dispatch said only that ''groups of u...,0
248,22456191,145,Riots Last Over 6 Hours\n\n\n\nOne such decisi...,0
249,22456191,145,"The crowd was as large as 10,000, the sources ...",0
250,22489425,145,"On May 3, the people demonstrated the raw powe...",0
251,22497647,145,said they feared the violence last Sunday cou...,0
252,22502980,145,His activities and those of his followers duri...,0
253,22503890,145,"17, of 2195 Ocean Avenue, Brooklyn, Samuel Wi...",0
254,22503890,145,"Unofficial estimates of the crowd size, which ...",0


In [27]:
for i in range(len(smaller)):
    parse = nlp(smaller.iloc[i,2])
    parse.sentences[0].print_dependencies()
# Note: Still does not parse every sentence



('A', '3', 'det')
('P.A.P.', '3', 'compound')
('dispatch', '4', 'nsubj')
('said', '45', 'ccomp')
('only', '4', 'advmod')
('that', '13', 'mark')
("''", '8', 'punct')
('groups', '13', 'nsubj')
('of', '11', 'case')
('unruly', '11', 'amod')
('youngsters', '8', 'nmod')
("''", '8', 'punct')
('took', '4', 'ccomp')
('to', '16', 'case')
('the', '16', 'det')
('streets', '13', 'obl')
('in', '18', 'case')
('rioting', '13', 'obl')
(',', '20', 'punct')
('shouting', '13', 'advcl')
('antistate', '22', 'amod')
('slogans', '20', 'obj')
('and', '24', 'cc')
('destroying', '20', 'conj')
('public', '26', 'amod')
('facilities', '24', 'obj')
('.', '31', 'punct')
("''", '31', 'punct')
('\\', '31', 'punct')
('n\\n\\n\\', '31', 'compound')
('n', '33', 'compound')
("''", '31', 'punct')
('Order', '35', 'nsubj:pass')
('was', '35', 'aux:pass')
('restored', '45', 'ccomp')
('by', '41', 'case')
('the', '41', 'det')
('law', '40', 'compound')
('-', '40', 'punct')
('enforcement', '41', 'compound')
('organ', '35', 'obl')
(



('Riots', '0', 'root')
('Last', '1', 'amod')
('Over', '5', 'case')
('6', '5', 'nummod')
('Hours\\n\\n\\n\\n', '1', 'nmod')




('The', '2', 'det')
('crowd', '5', 'nsubj')
('was', '5', 'cop')
('as', '5', 'advmod')
('large', '0', 'root')
('as', '7', 'case')
('10,000', '5', 'obl')
(',', '11', 'punct')
('the', '10', 'det')
('sources', '11', 'nsubj')
('said', '5', 'parataxis')
(',', '18', 'punct')
('and', '18', 'cc')
('at', '16', 'case')
('one', '16', 'nummod')
('point', '18', 'obl')
('it', '18', 'nsubj')
('turned', '11', 'conj')
('against', '22', 'case')
('a', '22', 'det')
('police', '22', 'compound')
('headquarters', '18', 'obl')
(',', '18', 'punct')
('setting', '18', 'advcl')
('fire', '24', 'obj')
('to', '29', 'case')
('a', '29', 'det')
('police', '29', 'compound')
('shed', '24', 'obl')
('.', '5', 'punct')
('On', '2', 'case')
('May', '7', 'obl')
('3', '2', 'nummod')
(',', '7', 'punct')
('the', '6', 'det')
('people', '7', 'nsubj')
('demonstrated', '0', 'root')
('the', '10', 'det')
('raw', '10', 'amod')
('power', '7', 'obj')
('of', '13', 'case')
('their', '13', 'nmod:poss')
('discontent', '10', 'nmod')
('in', '16'



('said', '0', 'root')
('they', '3', 'nsubj')
('feared', '1', 'ccomp')
('the', '5', 'det')
('violence', '3', 'obj')
('last', '7', 'amod')
('Sunday', '10', 'obl:tmod')
('could', '10', 'aux')
('be', '10', 'aux:pass')
('used', '3', 'ccomp')
('as', '13', 'case')
('a', '13', 'det')
('pretext', '10', 'obl')
('to', '15', 'mark')
('cancel', '13', 'acl')
('the', '18', 'det')
('August', '18', 'compound')
('trip', '15', 'obj')
('.', '1', 'punct')




('His', '2', 'nmod:poss')
('activities', '14', 'nsubj')
('and', '4', 'cc')
('those', '2', 'conj')
('of', '7', 'case')
('his', '7', 'nmod:poss')
('followers', '4', 'nmod')
('during', '11', 'case')
('the', '11', 'det')
('1967', '11', 'nummod')
('riots', '7', 'nmod')
('in', '13', 'case')
('Newark', '11', 'nmod')
('prompted', '0', 'root')
('former', '16', 'amod')
('Gov.', '14', 'obj')
('Richard', '16', 'flat')
('J.', '17', 'flat')
('Hughes', '17', 'flat')
('to', '21', 'mark')
('label', '14', 'xcomp')
('them', '21', 'obj')
("''", '24', 'punct')
('brownshirts', '21', 'obj')
('.', '29', 'punct')
("''", '29', 'punct')
('\\', '29', 'punct')
('n\\', '29', 'compound')
('n\\n\\n', '24', 'appos')




('17', '0', 'root')
(',', '1', 'punct')
('of', '6', 'case')
('2195', '6', 'nummod')
('Ocean', '6', 'compound')
('Avenue', '1', 'appos')
(',', '8', 'punct')
('Brooklyn', '6', 'conj')
(',', '10', 'punct')
('Samuel', '6', 'conj')
('Wilkenfield', '10', 'flat')
(',', '10', 'punct')
('17', '6', 'conj')
(',', '6', 'punct')
('of', '18', 'case')
('577', '18', 'nummod')
('Grand', '18', 'compound')
('Street', '6', 'nmod')
(',', '20', 'punct')
('Manhattan', '18', 'conj')
(',', '23', 'punct')
('and', '23', 'cc')
('Evelyn', '6', 'conj')
('Hyman', '23', 'flat')
(',', '23', 'punct')
('52', '23', 'amod')
(',', '23', 'punct')
('of', '32', 'case')
('419', '32', 'nummod')
('West', '32', 'compound')
('34th', '32', 'compound')
('Street', '23', 'nmod')
(',', '34', 'punct')
('charging', '23', 'acl')
('them', '34', 'obj')
('with', '38', 'case')
('criminal', '38', 'amod')
('mischief', '34', 'obl')
('and', '41', 'cc')
('disorderly', '38', 'conj')
('conduct', '38', 'conj')
('.\\n\\n', '1', 'punct')
('Unofficial',

