In [29]:
# loading all necessary libraries
import os
import re

import nltk
import requests
from bs4 import BeautifulSoup
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

## Download data from the web page

In [2]:
# address of the NYT article to scrape: page path first, then its query string
url = (
    'http://www.nytimes.com/2016/12/08/us/politics/'
    'andrew-puzder-labor-secretary-trump.html'
    '?hp&action=click&pgtype=Homepage&clickSource=story-heading'
    '&module=first-column-region&region=top-news&WT.nav=top-news&_r=0'
)

In [3]:
# send request to fetch the page
# a timeout keeps the cell from hanging forever on an unresponsive server
page = requests.get(url, timeout=30)
# fail loudly on 4xx/5xx instead of parsing an error page as if it were the article
page.raise_for_status()

In [4]:
# parse the HTML text of the response with the built-in parser
soup = BeautifulSoup(markup=page.text, features='html.parser')
# the article data sits inside <p> (paragraph) tags, so collect all of them
p_tag = soup.find_all(name='p')

In [5]:
# text of every paragraph tag, in the same order as p_tag
txt = [p.text for p in p_tag]


In [6]:
txt

['Advertisement',
 'By NOAM SCHEIBERDEC. 8, 2016\n',
 'President-elect Donald J. Trump on Thursday chose Andrew F. Puzder, chief executive of the company that franchises the fast-food outlets Hardee’s and Carl’s Jr. and an outspoken critic of the worker protections enacted by the Obama administration, to be secretary of labor.',
 '“Andy Puzder has created and boosted the careers of thousands of Americans, and his extensive record fighting for workers makes him the ideal candidate to lead the Department of Labor,” Mr. Trump said in a statement.',
 'Mr. Puzder, 66, fits the profile of some of Mr. Trump’s other domestic cabinet appointments. He is a wealthy businessman and political donor and has a long record of promoting a conservative agenda that takes aim at President Obama’s legacy. And more than the other appointments, he resembles Mr. Trump in style.',
 'He seems to delight in bashing elites — he complained that “big corporate interests” and “globalist companies” were supporting Hi

In [7]:
# clean text
# marker strings: every list element that contains one of them gets removed
remove = [
    'Advertisement',
    'Go to Home Page',
    'feedback on this page',
    'See More',
    'not a robot',
    'New York Times newsletters',
    'Order Reprints',
    'Invalid email address',
    'news updates via Facebook',
    'top posts in the new administration',
]

In [9]:
# keep only the paragraphs that contain none of the marker strings
txt_cleaned = [
    paragraph
    for paragraph in txt
    if all(marker not in paragraph for marker in remove)
]
txt_cleaned

['By NOAM SCHEIBERDEC. 8, 2016\n',
 'President-elect Donald J. Trump on Thursday chose Andrew F. Puzder, chief executive of the company that franchises the fast-food outlets Hardee’s and Carl’s Jr. and an outspoken critic of the worker protections enacted by the Obama administration, to be secretary of labor.',
 '“Andy Puzder has created and boosted the careers of thousands of Americans, and his extensive record fighting for workers makes him the ideal candidate to lead the Department of Labor,” Mr. Trump said in a statement.',
 'Mr. Puzder, 66, fits the profile of some of Mr. Trump’s other domestic cabinet appointments. He is a wealthy businessman and political donor and has a long record of promoting a conservative agenda that takes aim at President Obama’s legacy. And more than the other appointments, he resembles Mr. Trump in style.',
 'He seems to delight in bashing elites — he complained that “big corporate interests” and “globalist companies” were supporting Hillary Clinton in t

In [10]:
txt_cleaned[2]

'“Andy Puzder has created and boosted the careers of thousands of Americans, and his extensive record fighting for workers makes him the ideal candidate to lead the Department of Labor,” Mr. Trump said in a statement.'

## Perform named-entity recognition to find persons

In [11]:
# Root folder of the Stanford NER distribution. Set the STANFORD_NER_HOME
# environment variable to point elsewhere; the default is the original
# machine-specific location.
STANFORD_NER_HOME = os.environ.get('STANFORD_NER_HOME', '/Users/kyritsis/stanford-ner-2015-12-09')
# first argument: the 7-class classifier model, second argument: the tagger jar
st = StanfordNERTagger(
    os.path.join(STANFORD_NER_HOME, 'classifiers', 'english.muc.7class.distsim.crf.ser.gz'),
    os.path.join(STANFORD_NER_HOME, 'stanford-ner.jar'),
)

In [12]:
# tag the second cleaned paragraph; the tokens come from a plain whitespace
# split, so punctuation stays attached to the words (e.g. 'Puzder,')
ans = st.tag(txt_cleaned[1].split())

In [13]:
ans

[('President-elect', 'O'),
 ('Donald', 'PERSON'),
 ('J.', 'PERSON'),
 ('Trump', 'PERSON'),
 ('on', 'O'),
 ('Thursday', 'DATE'),
 ('chose', 'O'),
 ('Andrew', 'PERSON'),
 ('F.', 'PERSON'),
 ('Puzder,', 'PERSON'),
 ('chief', 'O'),
 ('executive', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('company', 'O'),
 ('that', 'O'),
 ('franchises', 'O'),
 ('the', 'O'),
 ('fast-food', 'O'),
 ('outlets', 'O'),
 ('Hardee’s', 'O'),
 ('and', 'O'),
 ('Carl’s', 'PERSON'),
 ('Jr.', 'PERSON'),
 ('and', 'O'),
 ('an', 'O'),
 ('outspoken', 'O'),
 ('critic', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('worker', 'O'),
 ('protections', 'O'),
 ('enacted', 'O'),
 ('by', 'O'),
 ('the', 'O'),
 ('Obama', 'PERSON'),
 ('administration,', 'O'),
 ('to', 'O'),
 ('be', 'O'),
 ('secretary', 'O'),
 ('of', 'O'),
 ('labor.', 'O')]

In [27]:
def find_persons(ans, max_tokens=3):
    """Join consecutive PERSON-tagged tokens into person names.

    Parameters
    ----------
    ans : sequence of (token, tag) pairs
        Tagger output such as the list returned by ``st.tag(...)``.
    max_tokens : int, optional
        Maximum number of consecutive PERSON tokens merged into one name
        (default 3). A longer run is cut into several names, each holding
        at most ``max_tokens`` tokens.

    Returns
    -------
    list of str
        The names in order of appearance, tokens joined by single spaces.
        Tokens are used verbatim, so attached punctuation is kept.

    Raises
    ------
    ValueError
        If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError('max_tokens must be at least 1')

    persons = []
    total = len(ans)
    index = 0
    while index < total:
        if ans[index][1] != 'PERSON':
            index += 1
            continue
        # Extend the name while the following tokens are PERSON too. The
        # bounds check comes first so a name at the very end of the input
        # never reads past the last element.
        end = index + 1
        while end < total and (end - index) < max_tokens and ans[end][1] == 'PERSON':
            end += 1
        persons.append(' '.join(item[0] for item in ans[index:end]))
        index = end

    return persons

In [28]:
find_persons(ans)

['Donald J. Trump', 'Andrew F. Puzder,', 'Carl’s Jr.', 'Obama']

## Example 1

In [31]:
ex1 = txt_cleaned[2]
ex1

'“Andy Puzder has created and boosted the careers of thousands of Americans, and his extensive record fighting for workers makes him the ideal candidate to lead the Department of Labor,” Mr. Trump said in a statement.'

In [32]:
# find persons
persons = find_persons(st.tag(ex1.split()))
persons

['Puzder', 'Trump']

In [None]:
# find string in double quotes

In [36]:
# capture the text between curly double quotes; the character class excludes
# the closing quote (”) so that several quotations in one paragraph come back
# as separate matches instead of one greedy span
quotes = re.findall(r'“([^”]*)”', ex1)
quotes

['Andy Puzder has created and boosted the careers of thousands of Americans, and his extensive record fighting for workers makes him the ideal candidate to lead the Department of Labor,']

In [38]:
# find the persons named inside the quoted phrase, so that these names can be
# excluded later
persons_quotes = find_persons(st.tag(quotes[0].split()))
persons_quotes

['Andy Puzder']

In [45]:
a = ex1.split()

In [48]:
ind = a.index('said')
ind

32

In [55]:
# Check whether the word right before 'said' belongs to one of the detected
# persons; the position is derived from ind instead of being hard-coded
matching = [s for s in persons if a[ind - 1] in s]
matching

['Trump']

In [58]:
# map the first matching person to the first extracted quote
d = {matching[0]: quotes[0]}
d

{'Trump': 'Andy Puzder has created and boosted the careers of thousands of Americans, and his extensive record fighting for workers makes him the ideal candidate to lead the Department of Labor,'}

## Example 2

In [59]:
ex2 = txt_cleaned[9]
ex2

'Richard L. Trumka, president of the A.F.L.-C.I.O., said Mr. Puzder was “a man whose business record is defined by fighting against working people.”'

In [61]:
# find persons
persons = find_persons(st.tag(ex2.split()))
persons

['Richard L. Trumka,', 'Puzder']

In [63]:
a = ex2.split()
a

['Richard',
 'L.',
 'Trumka,',
 'president',
 'of',
 'the',
 'A.F.L.-C.I.O.,',
 'said',
 'Mr.',
 'Puzder',
 'was',
 '“a',
 'man',
 'whose',
 'business',
 'record',
 'is',
 'defined',
 'by',
 'fighting',
 'against',
 'working',
 'people.”']

In [64]:
ind = a.index('said')
ind

7

In [80]:
# Look for a detected person among the words that precede 'said'.
# Start from an empty result so that a value left over from an earlier
# example is never reused when no word matches.
matching = []
for i in range(ind):
    temp = [s for s in persons if a[i] in s]
    if temp:
        # the last word that matches decides the result
        matching = temp

In [76]:
matching

['Richard L. Trumka,']

In [78]:
# extract quote
# the character class excludes the closing curly quote (”) so that several
# quotations in one paragraph are returned separately
quotes = re.findall(r'“([^”]*)”', ex2)
quotes

['a man whose business record is defined by fighting against working people.']

In [79]:
# map the first matching person to the first extracted quote
d = {matching[0]: quotes[0]}
d

{'Richard L. Trumka,': 'a man whose business record is defined by fighting against working people.'}

## Example 3 (Difficult)

In [81]:
ex3 = txt_cleaned[11]
ex3

'Matthew Haller, senior vice president for public affairs and communications at the International Franchise Association, of which Mr. Puzder is a board member, said Mr. Puzder saw “a role for government to provide advice to employers, rather than simply deterrence by ‘gotcha’ enforcement,” an allusion to the Obama Labor Department’s enforcement of laws and regulations in the fast-food industry.'

In [83]:
# find persons
persons = find_persons(st.tag(ex3.split()))
persons

['Matthew Haller,', 'Puzder', 'Puzder']