## Extract OpenSecrets Interest groups

In [1]:
import requests       # send requests to web server
from lxml import html # parse HTML
import json           # store data as json file
import re             # regular expressions
import os             # access directories
os.chdir('../../../Data/OpenSecrets')
from tqdm import tqdm # create progress bar (for i in tqdm(list))

## Sectors

In [2]:
# Extract the webpage to scrape and built an HTML document tree
url = 'http://www.opensecrets.org/federal-lobbying/ranked-sectors'
page = requests.get(url)
tree = html.fromstring(page.content)
tree.make_links_absolute(url) #extracts absolute instead of relative links

In [3]:
# Extract the names of each sector
names = tree.xpath('//td[contains(@class, "color-category")]/a/text()')
print('{} names extracted. The first one is:'.format(len(names)))
print(names[0])
# Extract the links of each sector
URLs = tree.xpath('//td[contains(@class, "color-category")]/a/@href')
print('{} links extracted. The first one is:'.format(len(URLs)))
print(URLs[0])

13 names extracted. The first one is:
Health
13 links extracted. The first one is:
http://www.opensecrets.org/federal-lobbying/sectors/summary?cycle=2020&id=H


In [4]:
# Combine names and links into a list of sectors
# Each sector is saved in a dictionary
sectors = []

for i in range(len(names)):
    sectors.append({'name': names[i],
                       'page_url': URLs[i],
                       'id': re.findall('(?<=id=)\S*',URLs[i])[0]})

print('{} sectors extracted. The first one is:'.format(len(sectors)))

print(sectors[0])

13 sectors extracted. The first one is:
{'name': 'Health', 'page_url': 'http://www.opensecrets.org/federal-lobbying/sectors/summary?cycle=2020&id=H', 'id': 'H'}


## Industries

In [5]:
# Extract all industries for each sector
# Each industry is saved in a dictionary
# 'year' defines the years for which to extract the data
print('Extracting the industries for the {} sectors:\n'.format(len(sectors)))

url_base = 'http://www.opensecrets.org/federal-lobbying/sectors/summary?cycle='

for sector in sectors:
    sector['industries'] = []
    print(sector['name'])
    url_temp = url_base + '2010&id=' +  sector['id']
    page_temp = requests.get(url_temp)
    tree_temp = html.fromstring(page_temp.content)
    tree_temp.make_links_absolute(url_temp) #extracts absolute instead of relative links
    length = len(tree_temp.xpath('//td[contains(@class, "color-category")]/a/text()'))
    for i in range(length):
        #print((tree_temp.xpath('//td[contains(@class, "color-category")]/a/text()')[i]))
        sector['industries'].append({'industry': tree_temp.xpath('//td[contains(@class, "color-category")]/a/text()')[i],
                                     'page_url': tree_temp.xpath('//td[contains(@class, "color-category")]/a/@href')[i],
                                     'id': re.findall('(?<=id=)\S*',tree_temp.xpath('//td[contains(@class, "color-category")]/a/@href')[i])[0],
                                     'year': [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010]})


Extracting the industries for the 13 sectors:

Health
Finance/Insur/RealEst
Misc Business
Communic/Electronics
Energy/Nat Resource
Transportation
Other
Agribusiness
Ideology/Single-Issue
Defense
Construction
Labor
Lawyers & Lobbyists


## Lobbying groups

In [6]:
# Extract all lobbying groups per year
# The lobbying groups are saved in each industry dictionary under the key 'lobbying_groups'
url_base = 'http://www.opensecrets.org/federal-lobbying/industries/summary?cycle='

for sector in tqdm(sectors):
    for industry in sector['industries']:
        industry['lobbying_groups_by_year'] = []
        for year in industry['year']:
            url_temp = url_base + str(year) + '&id=' + industry['id']
            page_temp = requests.get(url_temp)
            tree_temp = html.fromstring(page_temp.content)
            industry['lobbying_groups_by_year'].append(tree_temp.xpath('//td[contains(@class, "color-category")]/a/text()'))
        industry['lobbying_groups'] = list(set(sum(industry['lobbying_groups_by_year'],[])))

print('The lobbying groups were successfully extracted.')

100%|██████████| 13/13 [06:25<00:00, 29.66s/it]

The lobbying groups were successfully extracted.





In [7]:
print('The lobbying groups for the following sectors and industries were extracted:')
for sector in sectors:
    print('\n', sector['name'], '\n')
    for industry in sector['industries']:
        print(industry['industry'])

The lobbying groups for the following sectors and industries were extracted:

 Health 

Pharmaceuticals/Health Products
Hospitals/Nursing Homes
Health Professionals
Health Services/HMOs
Misc Health

 Finance/Insur/RealEst 

Insurance
Securities & Investment
Real Estate
Commercial Banks
Finance/Credit Companies
Misc Finance
Accountants
Credit Unions
Savings & Loans

 Misc Business 

Business Associations
Misc Manufacturing & Distributing
Chemical & Related Manufacturing
Retail Sales
Business Services
Food & Beverage
Casinos/Gambling
Beer, Wine & Liquor
Lodging/Tourism
Steel Production
Recreation/Live Entertainment
Misc Business
Misc Services
Textiles

 Communic/Electronics 

Electronics Mfg & Equip
Telecom Services
TV/Movies/Music
Telephone Utilities
Internet
Printing & Publishing
Misc Communications/Electronics

 Energy/Nat Resource 

Electric Utilities
Oil & Gas
Misc Energy
Mining
Environmental Svcs/Equipment
Waste Management
Fisheries & Wildlife

 Transportation 

Air Transport
Autom

In [8]:
print('''The following {} lobbying groups were extracted for the 
Oil & Gas Industry. Here are the first 10:'''.format(len(sectors[4]['industries'][1]['lobbying_groups'])))
sectors[4]['industries'][1]['lobbying_groups'][0:10]

The following 347 lobbying groups were extracted for the 
Oil & Gas Industry. Here are the first 10:


['Collier Resources',
 'Alaska Natural Gas Development Authority',
 'TOTAL SA',
 'YPF SA',
 'Hornbeck Offshore Services',
 'Eclipse Energy Systems',
 'Noble Corp',
 'Shallow Water Energy Security Coalition',
 'Giant Industries',
 'C&C Technologies']

In [9]:
# Save the data as sectors
with open('sectors.json', 'w') as outfile:
    json.dump(sectors, outfile)