## Extract OpenSecrets Interest groups

In [3]:
import requests       # send requests to web server
from lxml import html # parse HTML
import json           # store data as json file
import re             # regular expressions
import os             # access directories
os.chdir('../../../Data/OpenSecrets')
from tqdm import tqdm # create progress bar (for i in tqdm(list))

## Sectors

In [2]:
# Extract the webpage to scrape and built an HTML document tree
url = 'https://www.opensecrets.org/industries/slist.php'
page = requests.get(url)
tree = html.fromstring(page.content)
tree.make_links_absolute(url) #extracts absolute instead of relative links

In [3]:
# Extract the names of each sector
names = []
for i in range(1,14):
    path = '//*[@id="rightColumn"]/h2['+str(i)+']/a/text()'
    names.append(tree.xpath(path)[0])
print('{} names extracted. The first one is:'.format(len(names)))
print(names[0])
# Extract the URL of each sector
URLs = []
for i in range(1,14):
    path = '//*[@id="rightColumn"]/h2['+str(i)+']/a/@href'
    URLs.append(tree.xpath(path)[0])
print('\n{} URLs extracted. The first one is:'.format(len(URLs)))
print(URLs[0])

13 names extracted. The first one is:
Agribusiness

13 URLs extracted. The first one is:
https://www.opensecrets.org/industries/indus.php?ind=A


In [4]:
# Combine names and links into a list of sectors
# Each sector is saved in a dictionary
sectors = []

for i in range(len(names)):
    sectors.append({'name': names[i],
                    'page_url': URLs[i],
                    'id': re.findall('(?<=ind=)\S*',URLs[i])[0]})

print('{} sectors extracted. The first one is:'.format(len(sectors)))

print(sectors[0])

13 sectors extracted. The first one is:
{'name': 'Agribusiness', 'page_url': 'https://www.opensecrets.org/industries/indus.php?ind=A', 'id': 'A'}


## Industries

In [5]:
# Extract all industries for each sector
# Each industry is saved in a dictionary
# 'year' defines the years for which to extract the data

print('Extracting the industries for the {} sectors:\n'.format(len(sectors)))

url_base = 'https://www.opensecrets.org/industries/indus.php?ind='

for sector in sectors:
    sector['industries'] = []
    print(sector['name'])
    url_temp = url_base + sector['id']
    page_temp = requests.get(url_temp)
    tree_temp = html.fromstring(page_temp.content)
    tree_temp.make_links_absolute(url_temp) #extracts absolute instead of relative links
    length = len([item.text for item in tree_temp.xpath('//*[@id="indcode"]')[0]])
    for i in range(length):
        path_temp = '//*[@id="indcode"]/option[' + str(i+1) + ']' #'/@value'
        #print((tree_temp.xpath('//td[contains(@class, "color-category")]/a/text()')[i]))
        sector['industries'].append({'industry': tree_temp.xpath(path_temp + '/text()')[0],
                                     'id': tree_temp.xpath(path_temp + '/@value')[0].strip(),
                                     'page_url': url_base + tree_temp.xpath(path_temp + '/@value')[0].strip(),
                                     'year': [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010]})


Extracting the industries for the 13 sectors:

Agribusiness
Communications/Electronics
Construction
Defense
Energy & Natural Resources
Finance, Insurance & Real Estate
Health
Ideological/Single-Issue
Labor
Lawyers & Lobbyists
Misc Business
Other
Transportation


## Lobbying groups

In [6]:
# Extract all lobbying groups per year
# The lobbying groups are saved in each industry dictionary under the key 'lobbying_groups'
url_base = 'https://www.opensecrets.org/federal-lobbying/industries/summary?id='

for sector in tqdm(sectors):
    for industry in sector['industries']:
        print(industry['industry'])
        industry['lobbying_groups_by_year'] = []
        for year in industry['year']:
            url_temp = url_base + industry['id'] + '&cycle=' + str(year)
            page_temp = requests.get(url_temp)
            tree_temp = html.fromstring(page_temp.content)
            industry['lobbying_groups_by_year'].append(tree_temp.xpath('//td[contains(@class, "color-category")]/a/text()'))
        industry['lobbying_groups'] = list(set(sum(industry['lobbying_groups_by_year'],[])))

print('The lobbying groups were successfully extracted.')

  0%|          | 0/13 [00:00<?, ?it/s]

Agricultural Services/Products
Crop Production & Basic Processing
Dairy
Farm bureaus
Food and kindred products manufacturing
Food Processing & Sales
Food stores
Forestry & Forest Products
Livestock
Meat processing & products
Poultry & Eggs
Sugar cane & sugar beets
Tobacco
Vegetables, fruits and tree nut


  8%|▊         | 1/13 [04:01<48:21, 241.82s/it]

Book, newspaper & periodical publishing
Cable & satellite TV production
Commercial TV & radio stations
Computer software
Electronics Mfg & Equip
Internet
Motion Picture production & distribution
Printing & Publishing
Recorded Music & music production
Telecom Services
Telephone Utilities
TV production
TV/Movies/Music


 15%|█▌        | 2/13 [07:25<42:15, 230.49s/it]

Architectural services
Building Materials & Equipment
Construction Services
General Contractors
Home Builders
Special Trade Contractors


 23%|██▎       | 3/13 [08:52<31:13, 187.31s/it]

Defense Aerospace
Defense Electronics
Misc Defense


 31%|███       | 4/13 [09:37<21:41, 144.64s/it]

Alternate energy production & services
Coal mining
Electric Utilities
Mining
Natural Gas transmission & distribution
Oil & Gas
Waste Management


 38%|███▊      | 5/13 [11:28<17:55, 134.49s/it]

Accountants
Commercial Banks
Credit Unions
Finance/Credit Companies
Hedge Funds
Insurance
Misc Finance
Mortgage bankers and brokers
Payday lenders
Private Equity & Investment Firms
Real Estate
Savings & Loans
Securities & Investment
Student loan companies
Venture capital


 46%|████▌     | 6/13 [14:59<18:21, 157.38s/it]

Chiropractors
Dentists
Health Professionals
Health Services/HMOs
Hospitals/Nursing Homes
Medical Devices & Supplies
Nurses
Nutritional & dietary supplements
Pharmaceutical manufacturing
Pharmaceuticals/Health Products


 54%|█████▍    | 7/13 [17:45<16:01, 160.18s/it]

Abortion Policy/Anti-Abortion
Abortion Policy/Pro-Abortion Rights
Candidate Committees
Democratic Candidate Committees
Democratic leadership PAC
Democratic/Liberal
Environment
Foreign & Defense Policy
Gay & lesbian rights & issues
Gun Control
Gun Rights
Human Rights
Leadership PACs
Pro-Israel
Republican Candidate Committees
Republican leadership PAC
Republican/Conservative
Women's Issues


 62%|██████▏   | 8/13 [22:00<15:42, 188.48s/it]

Air transport unions
Building Trade Unions
Industrial Unions
Misc Unions
Public Sector Unions
Teachers unions
Transportation Unions
US Postal Service unions & associations


 69%|██████▉   | 9/13 [23:39<10:46, 161.72s/it]

Lawyers/Law Firms
Lobbyists


 77%|███████▋  | 10/13 [24:04<06:01, 120.62s/it]

Advertising & public relations services
Beer, Wine & Liquor
Business Associations
Business Services
Casinos/Gambling
Chemical & Related Manufacturing
Clothing & accessories
Correctional facilities constr & mgmt/for-profit
Food & Beverage
Funeral services
Indian Gaming
Lodging/Tourism
Marijuana
Marijuana
Misc Manufacturing & Distributing
Misc Services
Professional sports, arenas & related equip & svcs
Recreation/Live Entertainment
Restaurants & drinking establishments
Retail Sales
Steel Production
Textiles


 85%|████████▍ | 11/13 [29:11<05:53, 176.64s/it]

Civil Servants/Public Officials
Clergy & Religious Organizations
Education
For-profit Education
Non-Profit Institutions
Retired


 92%|█████████▏| 12/13 [30:37<02:29, 149.47s/it]

Air Transport
Airlines
Auto dealers, foreign imports
Auto dealers, new & used
Auto manufacturers
Automotive
Cruise ships & lines
Railroads
Sea Transport
Trucking


100%|██████████| 13/13 [32:32<00:00, 150.17s/it]

The lobbying groups were successfully extracted.





In [7]:
print('The lobbying groups for the following sectors and industries were extracted:')
for sector in sectors:
    print('\n', sector['name'], '\n')
    for industry in sector['industries']:
        print(industry['industry'])

The lobbying groups for the following sectors and industries were extracted:

 Agribusiness 

Agricultural Services/Products
Crop Production & Basic Processing
Dairy
Farm bureaus
Food and kindred products manufacturing
Food Processing & Sales
Food stores
Forestry & Forest Products
Livestock
Meat processing & products
Poultry & Eggs
Sugar cane & sugar beets
Tobacco
Vegetables, fruits and tree nut

 Communications/Electronics 

Book, newspaper & periodical publishing
Cable & satellite TV production
Commercial TV & radio stations
Computer software
Electronics Mfg & Equip
Internet
Motion Picture production & distribution
Printing & Publishing
Recorded Music & music production
Telecom Services
Telephone Utilities
TV production
TV/Movies/Music

 Construction 

Architectural services
Building Materials & Equipment
Construction Services
General Contractors
Home Builders
Special Trade Contractors

 Defense 

Defense Aerospace
Defense Electronics
Misc Defense

 Energy & Natural Resources 

Alter

In [8]:
print('''The following {} lobbying groups were extracted for the 
Oil & Gas Industry. Here are the first 10:'''.format(len(sectors[4]['industries'][5]['lobbying_groups'])))
sectors[4]['industries'][5]['lobbying_groups'][0:10]

The following 347 lobbying groups were extracted for the 
Oil & Gas Industry. Here are the first 10:


['Equitable Production',
 'Ferrell Companies',
 'Mariner Energy',
 'Penn Octane Corp',
 'Oilfield Services & Drilling Indus Cltn',
 'Ergon Inc',
 'Intl Assn of Drilling Contractors',
 'Lion Oil',
 'Texakoma Oil & Gas',
 'BHP Billiton']

In [9]:
print('''The following {} lobbying groups were extracted for the 
Alternate energy production Industry. Here are the first 10:'''.format(len(sectors[4]['industries'][0]['lobbying_groups'])))
sectors[4]['industries'][0]['lobbying_groups'][0:10]

The following 329 lobbying groups were extracted for the 
Alternate energy production Industry. Here are the first 10:


['Renewable Energy Group',
 'Blue Sun Energy',
 'Range Fuels',
 'Konarka Technologies',
 'Ternion Bio Industries',
 'Cottonwood Services',
 'Nanosolar',
 'Clean Economy Network',
 'Futuregen Industrial Alliance',
 'Envires LLC']

In [10]:
# Extract all top 20 contributors per election-cycle
# The contributors are saved in each industry dictionary under the key 'contributors'
url_base = 'https://www.opensecrets.org/industries/contrib.php?cycle='

for sector in tqdm(sectors):
    for industry in sector['industries']:
        print(industry['industry'])
        industry['contributors_by_year'] = []
        for election_cycle in [2004, 2006, 2008, 2010]:
            url_temp = url_base + str(election_cycle) + '&id=' + industry['id']
            page_temp = requests.get(url_temp)
            tree_temp = html.fromstring(page_temp.content)
            election_cycle_contributors = []
            for i in range(20):
                path_temp = '//*[@id="datatable"]/tbody/tr[' + str(i+1) + ']/td[2]//text()'
                election_cycle_contributors.append(tree_temp.xpath(path_temp)[0].strip())
            industry['contributors_by_year'].append(election_cycle_contributors)
        industry['contributors'] = list(set(sum(industry['contributors_by_year'],[])))

print('The top contributors groups were successfully extracted.')

  0%|          | 0/13 [00:00<?, ?it/s]

Agricultural Services/Products
Crop Production & Basic Processing
Dairy
Farm bureaus
Food and kindred products manufacturing
Food Processing & Sales
Food stores
Forestry & Forest Products
Livestock
Meat processing & products
Poultry & Eggs
Sugar cane & sugar beets
Tobacco
Vegetables, fruits and tree nut


  8%|▊         | 1/13 [00:41<08:20, 41.70s/it]

Book, newspaper & periodical publishing
Cable & satellite TV production
Commercial TV & radio stations
Computer software
Electronics Mfg & Equip
Internet
Motion Picture production & distribution
Printing & Publishing
Recorded Music & music production
Telecom Services
Telephone Utilities
TV production
TV/Movies/Music


 15%|█▌        | 2/13 [01:15<07:13, 39.44s/it]

Architectural services
Building Materials & Equipment
Construction Services
General Contractors
Home Builders
Special Trade Contractors


 23%|██▎       | 3/13 [01:31<05:23, 32.35s/it]

Defense Aerospace
Defense Electronics
Misc Defense


 31%|███       | 4/13 [01:38<03:42, 24.71s/it]

Alternate energy production & services
Coal mining
Electric Utilities
Mining
Natural Gas transmission & distribution
Oil & Gas
Waste Management


 38%|███▊      | 5/13 [01:55<02:58, 22.32s/it]

Accountants
Commercial Banks
Credit Unions
Finance/Credit Companies
Hedge Funds
Insurance
Misc Finance
Mortgage bankers and brokers
Payday lenders
Private Equity & Investment Firms
Real Estate
Savings & Loans
Securities & Investment
Student loan companies
Venture capital


 46%|████▌     | 6/13 [02:35<03:13, 27.57s/it]

Chiropractors
Dentists
Health Professionals
Health Services/HMOs
Hospitals/Nursing Homes
Medical Devices & Supplies
Nurses
Nutritional & dietary supplements
Pharmaceutical manufacturing
Pharmaceuticals/Health Products


 54%|█████▍    | 7/13 [03:00<02:40, 26.79s/it]

Abortion Policy/Anti-Abortion
Abortion Policy/Pro-Abortion Rights
Candidate Committees
Democratic Candidate Committees
Democratic leadership PAC
Democratic/Liberal
Environment
Foreign & Defense Policy
Gay & lesbian rights & issues
Gun Control
Gun Rights
Human Rights
Leadership PACs
Pro-Israel
Republican Candidate Committees
Republican leadership PAC
Republican/Conservative
Women's Issues


 62%|██████▏   | 8/13 [03:45<02:42, 32.41s/it]

Air transport unions
Building Trade Unions
Industrial Unions
Misc Unions
Public Sector Unions
Teachers unions
Transportation Unions
US Postal Service unions & associations


 69%|██████▉   | 9/13 [04:06<01:56, 29.03s/it]

Lawyers/Law Firms
Lobbyists


 77%|███████▋  | 10/13 [04:13<01:06, 22.19s/it]

Advertising & public relations services
Beer, Wine & Liquor
Business Associations
Business Services
Casinos/Gambling
Chemical & Related Manufacturing
Clothing & accessories
Correctional facilities constr & mgmt/for-profit
Food & Beverage
Funeral services
Indian Gaming
Lodging/Tourism
Marijuana
Marijuana
Misc Manufacturing & Distributing
Misc Services
Professional sports, arenas & related equip & svcs
Recreation/Live Entertainment
Restaurants & drinking establishments
Retail Sales
Steel Production
Textiles


 85%|████████▍ | 11/13 [05:12<01:06, 33.41s/it]

Civil Servants/Public Officials
Clergy & Religious Organizations
Education
For-profit Education
Non-Profit Institutions
Retired


 92%|█████████▏| 12/13 [05:30<00:28, 28.72s/it]

Air Transport
Airlines
Auto dealers, foreign imports
Auto dealers, new & used
Auto manufacturers
Automotive
Cruise ships & lines
Railroads
Sea Transport
Trucking


100%|██████████| 13/13 [05:55<00:00, 27.31s/it]

The top contributors groups were successfully extracted.





In [11]:
# Save the data as sectors
with open('sectors_industries_contributors.json', 'w') as outfile:
    json.dump(sectors, outfile)