# Manually match the remaining witnesses into *Open Secrets* lobbying categories

#### This script classifies all witnesses according to the *Open Secrets* lobbying categories and subcategories

In [1]:
from CommitteeHearingsFunctions import *

# Change directory
os.chdir('../../Data/')

In [2]:
# Load the data
with open('CommitteeHearings/hearings_witnesses_contrarians_classified_partial.json', 'r') as file:
    df = json.load(file)

<br>

## 1) Preparation: Loading, correcting and inspecting the data

In [3]:
# Load the OpenSectrets Lobbying sectors and industries
with open('OpenSecrets/sectors_industries_contributors.json', 'r') as jfile:
    sectors = json.load(jfile)
print('We imported {} sectors of lobbying organisations for the years {} to {}.\n'.format(len(sectors), sectors[0]['industries'][0]['year'][0], 
                                                                                                        sectors[0]['industries'][0]['year'][-1]))

We imported 13 sectors of lobbying organisations for the years 2003 to 2010.



In [4]:
# Add a new category 'Nuclear Energy' to sector 4 'Energy & Natural Resources'
sectors[4]['industries'].append(
    {'industry': 'Nuclear Energy',
     'id': None,
     'page_url': None,
     'year': None,
     'lobbying_groups_by_year': None,
     'lobbying_groups': None,
     'contributors_by_year': None,
     'contributors': None})

In [5]:
# Add a new category 'International/Intergovernmental' to sector 11 'Other'
sectors[11]['industries'].append(
    {'industry': 'International/Intergovernmental',
     'id': None,
     'page_url': None,
     'year': None,
     'lobbying_groups_by_year': None,
     'lobbying_groups': None,
     'contributors_by_year': None,
     'contributors': None})

In [6]:
# Add a new category 'Individuals' to sector 11 'Other'
sectors[11]['industries'].append(
    {'industry': 'Individuals',
     'id': None,
     'page_url': None,
     'year': None,
     'lobbying_groups_by_year': None,
     'lobbying_groups': None,
     'contributors_by_year': None,
     'contributors': None})

In [7]:
# print('Each of these sectors is grouped into multiple industries. These are the sectors with their respective industries:\n')
# for i, sector in enumerate(sectors):
#     print(i, sector['name'].upper(), '({})'.format(sector['id']))
#     for j, industry in enumerate(sector['industries']):
#         print('\t', j, industry['industry'])
#     print('\n')    

<br>

## 2) Matching: match the witnesses according to the OpenSecrets lobbing groups

### 11 OTHER

In [8]:
# 11 OTHER (W)
# 	 4 Non-Profit Institutions
# 	 0 Civil Servants/Public Officials
# 	 2 Education
#    	 3 For-profit Education
# 	 1 Clergy & Religious Organizations
# 	 5 Retired

In [9]:
# 11.1 OTHER - Clergy & Religious Organizations: 
sector, industry = select_industry(sectors, 11, 1)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['\w+\s?\w+ church',
            'southern baptist theological seminary',
            'religious action center of reform judaism',
            'institute on religion and democracy',
            'interfaith stewardship alliance',
            'virginia interfaith center for public policy',
            'cornwall alliance for the stewardship of creation',
            'author and historian'] # Barton, David, author and historian: representing evangelicals
match_witnesses(df, keywords, sector=sector, industry=industry)

11.1 OTHER - Clergy & Religious Organizations:

 
>>> There are 3 Clergy & Religious Organizations witnesses.

>>> 8 more Clergy & Religious Organizations witnesses have been matched, resulting in a total of 11 witnesses.


In [10]:
# 11.2 OTHER: Education
sector, industry = select_industry(sectors, 11, 2)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['national academy of sciences', 'national research council',
            '\w+\s?\w+\s?\w+\suniversity',  'university\s\w+\s?\w+',
            '\w+\s?\w+\s?\w+\scollege', 'college\s\w+\s?\w+\s?\w+', 
            '\w*\s?school of\s\w+\s?\w+\s?\w+\s?\w+\s?\w+', '\w+\s?\w+\s?\w+\sschool',
            '\w+ institution of \w+', 'institute for coastal research',
            'graduate institute of international and development studies', 
            'yale', 'institut pasteur', 'american meteorological society',
            'arizona water institute']
antikeywords = ['in-q-tel', 'copenhagen consensus center']

match_witnesses(df, keywords, antikeywords, sector=sector, industry=industry)

11.2 OTHER - Education:

 
>>> There are 76 Education witnesses.

>>> 29 more Education witnesses have been matched, resulting in a total of 105 witnesses.


In [11]:
# 11.0 OTHER: Civil Servants/Public Officials
sector, industry = select_industry(sectors, 11,0)

count_witnesses(df, 'witness_industry', industry)

# United States research and development agencies
# https://en.wikipedia.org/wiki/List_of_United_States_research_and_development_agencies
# https://en.wikipedia.org/wiki/Federally_funded_research_and_development_centers

# Match federal research and development agencies
keywords = [# Independent Agencies
            'national science foundation', ' nsf',
            'national aeronautics and space administration', ' nasa', 
            'environmental protection agency office of research and development', 
            'intelligence advanced research projects activity', ' iarpa',
            'smithsonian',
             # Department of Agriculture
            'agricultural research service', ' ars',
            'national institute of food and agriculture', ' nifa'
            'economic research service', ' ers',
            'united states forest service research and development', 'united states forest service r&d',
            'rocky mountain research station, forest service',
             # Department of Commerce
            'national institute of standards and technology', ' nist',
            'national oceanic and atmospheric administration', ' noaa',
            # Department of Education
            'institute of education sciences', ' ies',
            'national institute on disability and rehabilitation research', ' nidrr',
            # Department of Energy
            'department of energy office of science', 'doe office of science', ' doe sc',
            'advanced research projects agency-energy', ' arpa-e',
            # National laboratories
            'national \w*\s*\w*\s*\w*\s*laboratory', 'national \w*\s*\w*\s*\w*\s*lab',
            # Department of health and human services
            'national institutes of health', ' nih',
            'national institute for occupational safety and health', ' niosh',
            'food and drug administration science and research programs',
            'agency for healthcare research and quality', ' ahrq',
            'biomedical advanced research and development authority', ' barda',
            # Department of homeland security
            'directorate for science and technology', ' s&t',
            'coast guard research & development center', ' cg rdc',
            # Department of the interior
            'united states geological survey', ' usgs', 'geological survey',
            # Department of justice
            'national institute of justice', ' nij',
            # Department of transportation
            'research and innovative technology administration',
            'federal aviation administration research, engineering, and development',
            'federal highway administration research and technology',
            'scientist(?:.+?)environmental protection agency',
            'researcher(?:.+?)environmental protection agency',
            'research and development, united states environmental protection agency',
            # Veterans affairs
            'veterans health administration office of research and development', ' ord',
            # Multi-agency initiatives
            'office of science and technology', ' ostp',
            'u.s. global change research program', ' usgcrp', 
            'networking and information technology research and development program', ' nitrd',
            'national nanotechnology initiative', ' nni',
            # Judicial branch
            'federal judicial center',
            # Legislative branch
            'house committee on science, space and technology',
            'senate committee on commerce, science, and transportation',
            'office of technology assessment', ' ota',
            # Joint programs
            'carbon cycle scientific steering group',
            # Federally funded research and development centers
            'institute for defense analyses',
            'center for naval (analyses|analysis){1}', 'cna military advisory board', 'military advisory board',
            'national center for atmospheric research',
            # Other
            'congressional research service',
            'lawrence berkeley laboratory']

match_witnesses(df, keywords, sector=sector, industry=industry)

count = 0
for i, text in enumerate(df):
    text['state_research'] = []
    for j, witness in enumerate(text['witnesses']):
        text['state_research'].append(0)
        if len(first_match(keywords, witness.lower())) > 0:
                count += 1
                text['state_research'][j] = 1
                
print('\n >>> A total of {} state research witnesses were matched.\n\n'.format(count))

# Match other governmental witnesses
keywords = ['department of the \w+', 'department of \w+', '\w+ \w+ agency', '\w+ \w+ administration',
            'administrator, office of \w+', 'commodity futures trading commission',
            '\w+ \w+ commission', 'commission on \w+', 'committee on \w+',
            '\w+ and \w+ \w+ committee',
            'under secretary for democracy and global affairs',
            'government accountability office', 'general accounting office',
            'mayor\,?\s?\w+ \w+ \w+', 'mayor of \w+', 'mayor, \w+',
            'western governors\' wildlife council',
            'governor\,?\s?\w+\s?\w*', 'house of representatives',
            'congressional budget office', 'united states congress',
            'representative from \w+', '\w+ \w+ senate',
            'office of the consumer advocate',
            'council on environmental quality',
            'california air resources board', 'board of public utilities',
            'northeast states for coordinated air use management','\w+ state \w+',
            '(council o(n|f) \w+ \w+)', 'adirondack council',
            '((u.s.|united states) (army|navy))', 'senator', 'county executive',
            'centers for disease control and prevention']

antikeywords = ['national association of regulatory utility commissioners', 
                'inc\.|inc$', 'incorporated', 'international energy agency', 
                'former', 'retired', '(ret\.)', 'european union',
                'resources for the future']

match_witnesses(df, keywords, antikeywords, sector=sector, industry=industry)

11.0 OTHER - Civil Servants/Public Officials:

 
>>> There are 27 Civil Servants/Public Officials witnesses.

>>> 38 more Civil Servants/Public Officials witnesses have been matched, resulting in a total of 65 witnesses.

 >>> A total of 40 state research witnesses were matched.


>>> 131 more Civil Servants/Public Officials witnesses have been matched, resulting in a total of 196 witnesses.


In [12]:
# 11.4 OTHER - Non-Profit Institutions
sector, industry = select_industry(sectors, 11, 4)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['league of american bicyclists', 'appliance standards awareness project',
            'empower consumers', 'freedom to roam', 'regulatory assistance project',
            'carnegie endowment for international peace',
            'peterson institute for international economics',
            'brookings institution', 'national black chamber of commerce',
            'center on budget policies and priorities',
            'center for budget and policy priorities',
            'corporate leaders\' groups on climate change',
            'society of human resource management',
            'bipartisan policy center',
            # Denialist
            'copenhagen consensus center', 'science and public policy institute']

match_witnesses(df, keywords, sector=sector, industry=industry)

11.4 OTHER - Non-Profit Institutions:

 
>>> There are 5 Non-Profit Institutions witnesses.

>>> 20 more Non-Profit Institutions witnesses have been matched, resulting in a total of 25 witnesses.


In [13]:
# 11.1 OTHER - Retired: 
sector, industry = select_industry(sectors, 11, 5)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['major general, u.s. army \(retired\)', 'u.s. senator \(retired\)',
            'former.+']
antikeywords = ['fellow', 'steering committee', 'vantagepoint']

match_witnesses(df, keywords, antikeywords, sector=sector, industry=industry)

11.5 OTHER - Retired:

 
>>> There are 0 Retired witnesses.

>>> 10 more Retired witnesses have been matched, resulting in a total of 10 witnesses.


In [14]:
# 11.6 OTHER - International/Intergovernmental
sector, industry = select_industry(sectors, 11, 6)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['ipcc', 'intergovernmental panel on climate change', 
            'international energy agency',
            '((house of lords|government economic service), united kingdom)',
            'u.k. department for environment, food, and rural affairs',
            'european commission', 'european union commission',
            'minister for environment, nature conservation and nuclear safety, federal republic of germany']
          
match_witnesses(df, keywords, sector=sector, industry=industry)

11.6 OTHER - International/Intergovernmental:

 
>>> There are 0 International/Intergovernmental witnesses.

>>> 12 more International/Intergovernmental witnesses have been matched, resulting in a total of 12 witnesses.


In [15]:
# 11.6 OTHER - Individuals
sector, industry = select_industry(sectors, 11, 7)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['mcintyre, stephen', 'third viscount monckton of brenchley', 'senior meteorologist']

match_witnesses(df, keywords, sector=sector, industry=industry)

11.7 OTHER - Individuals:

 
>>> There are 0 Individuals witnesses.

>>> 3 more Individuals witnesses have been matched, resulting in a total of 3 witnesses.


### 7 IDEOLOGY/SINGLE-ISSUE

In [16]:
# 7 IDEOLOGICAL/SINGLE-ISSUE (Q)
# 	 16 Republican/Conservative
# 	 5 Democratic/Liberal
# 	 12 Leadership PACs
#    	 4 Democratic leadership PAC
#    	 15 Republican leadership PAC
# 	 7 Foreign & Defense Policy
# 	 13 Pro-Israel
# 	 17 Women's Issues
# 	 11 Human Rights
#    	 8 Gay & lesbian rights & issues
# 	 6 Environment
# 	 9 Gun Control
# 	 10 Gun Rights
# 	 0 Abortion Policy/Anti-Abortion
# 	 1 Abortion Policy/Pro-Abortion Rights
# 	 2 Candidate Committees
#    	 14 Republican Candidate Committees
#    	 3 Democratic Candidate Committees

In [17]:
# 7.6 IDEOLOGY/SINGLE-ISSUE: Environment
sector, industry = select_industry(sectors, 7,6)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['resources for the future', 'world resources institute', 
            'pew center on global climate change', 'pew center', 
            'american council for an energy efficient economy',
            'environmental resources trust',
            'environmental defense', 'energy future coalition',
            'oxfam america', 'climate central', 'winter wildlands alliance',
            'society for conservation biology', 'the climate registry',
            'forest climate working group', 'institute for applied ecology',
            'climate action reserve', 'apollo alliance',
            'national religious partnership for the environment',
            'evangelical climate initiative', 'watershed research and training center',
            'tomales bay institute', 'polar oceans research group',
            'center for biological diversity', 'natural resources defense council',
            'theodore roosevelt conservation partnership', 'blue green alliance',
            'friends of the earth',
            # individuals
            'al gore', 'marshall herskovitz']

match_witnesses(df, keywords, sector=sector, industry=industry)

7.6 IDEOLOGICAL/SINGLE-ISSUE - Environment:

 
>>> There are 58 Environment witnesses.

>>> 71 more Environment witnesses have been matched, resulting in a total of 129 witnesses.


In [18]:
# 7.7 IDEOLOGICAL/SINGLE-ISSUE - Foreign & Defense Policy
sector, industry = select_industry(sectors, 7,7)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['association of the united states army', 'truman national security project']

match_witnesses(df, keywords, sector=sector, industry=industry)

7.7 IDEOLOGICAL/SINGLE-ISSUE - Foreign & Defense Policy:

 
>>> There are 2 Foreign & Defense Policy witnesses.

>>> 3 more Foreign & Defense Policy witnesses have been matched, resulting in a total of 5 witnesses.


In [19]:
# 7.16 IDEOLOGICAL/SINGLE-ISSUE - Republican/Conservative
sector, industry = select_industry(sectors, 7,16)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['american enterprise institute', 'competitive enterprise institute',
           'cato institute', 'business & media institute']

match_witnesses(df, keywords, sector=sector, industry=industry)

7.16 IDEOLOGICAL/SINGLE-ISSUE - Republican/Conservative:

 
>>> There are 6 Republican/Conservative witnesses.

>>> 14 more Republican/Conservative witnesses have been matched, resulting in a total of 20 witnesses.


### 8 LABOR

In [20]:
# 8 LABOR (P)
# 	 0 Air transport unions
# 	 1 Building Trade Unions
# 	 2 Industrial Unions
# 	 3 Misc Unions
# 	 4 Public Sector Unions
# 	 5 Teachers unions
# 	 6 Transportation Unions
# 	 7 US Postal Service unions & associations

In [21]:
# 8.2 LABOR: Industrial Unions
sector, industry = select_industry(sectors, 8, 2)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['(united automobile, aerospace (and|&) agricultural implement workers of america)',
            'united steel workers', 'international brotherhood of boilermakers', 'afl- cio',
            'laborers\' international union of north america']

match_witnesses(df, keywords, sector=sector, industry=industry)

8.2 LABOR - Industrial Unions:

 
>>> There are 5 Industrial Unions witnesses.

>>> 7 more Industrial Unions witnesses have been matched, resulting in a total of 12 witnesses.


### 4 ENERGY/NAT RESOURCE

In [22]:
# 4.0 ENERGY/NAT RESOURCE: Alternate energy production & services
sector, industry = select_industry(sectors, 4, 0)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['energy ventures', 'hydrogen energy',
            'sunedison', 'gore and associates',
            'nrg systems', 'powerspan corp']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.0 ENERGY & NATURAL RESOURCES - Alternate energy production & services:

 
>>> There are 11 Alternate energy production & services witnesses.

>>> 7 more Alternate energy production & services witnesses have been matched, resulting in a total of 18 witnesses.


In [23]:
# 4.1 ENERGY/NAT RESOURCE: Coal Mining
sector, industry = select_industry(sectors, 4, 1)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['\w*\s?\w*coal \w*', 'evergreen energy inc']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.1 ENERGY & NATURAL RESOURCES - Coal mining:

 
>>> There are 1 Coal mining witnesses.

>>> 3 more Coal mining witnesses have been matched, resulting in a total of 4 witnesses.


In [24]:
# 4.2 ENERGY/NAT RESOURCE: Electric Utilities
sector, industry = select_industry(sectors, 4, 2)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['((\w+\s?\w+\s?\w+ )?authority( \w+\s?\w+)?)',
            '((\w+ )?\w*power (\w+\s?\w+)?)',
            '((mid\s?american|cps|og&e|txu|shell)\s?energy)',
            'electric cooperative', '\w+\s?\w+\s?\w+ fuels \w+',
            'electricite de france', 'pacific gas and electric \w+',
            'old dominion cooperative', 'tenaska', 'eon energie',
            'midamerican corporation', 'pseg service corporation',
            'generators for clear air', 'national grid',
            'edison electric institute']

antikeywords = ['small power consultants', 'empower', 'air power systems']

match_witnesses(df, keywords, sector=sector, industry=industry)
 
# >>> There are 56 Electric Utilities witnesses
# >>> 22 more Electric Utilities witnesses have been matched, resulting in a total of 78 witnesses.

4.2 ENERGY & NATURAL RESOURCES - Electric Utilities:

 
>>> There are 59 Electric Utilities witnesses.

>>> 24 more Electric Utilities witnesses have been matched, resulting in a total of 83 witnesses.


In [25]:
# 4.3 ENERGY/NAT RESOURCE: Mining
sector, industry = select_industry(sectors, 4, 3)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['rio tinto', 'alcoa \w+\s?\w+', 'quaterra corporation']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.3 ENERGY & NATURAL RESOURCES - Mining:

 
>>> There are 3 Mining witnesses.

>>> 5 more Mining witnesses have been matched, resulting in a total of 8 witnesses.


In [26]:
# 4.4 ENERGY/NAT RESOURCE: Natural Gas transmission & distribution
sector, industry = select_industry(sectors, 4, 4)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['transcanada pipeline']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.4 ENERGY & NATURAL RESOURCES - Natural Gas transmission & distribution:

 
>>> There are 3 Natural Gas transmission & distribution witnesses.

>>> 1 more Natural Gas transmission & distribution witnesses have been matched, resulting in a total of 4 witnesses.


In [27]:
# 4.5 ENERGY/NAT RESOURCE: Oil & Gas
sector, industry = select_industry(sectors, 4, 5)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = [', (shell(\s|$)\w*\s?\w*)', 'continental resources', 
            'conoco- phillips', 'countrymark','unimark']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.5 ENERGY & NATURAL RESOURCES - Oil & Gas:

 
>>> There are 5 Oil & Gas witnesses.

>>> 7 more Oil & Gas witnesses have been matched, resulting in a total of 12 witnesses.


In [28]:
# 4.6 ENERGY/NAT RESOURCE: Waste Management
sector, industry = select_industry(sectors, 4, 6)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['advanced waste management systems', 'waste management, inc']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.6 ENERGY & NATURAL RESOURCES - Waste Management:

 
>>> There are 1 Waste Management witnesses.

>>> 2 more Waste Management witnesses have been matched, resulting in a total of 3 witnesses.


In [29]:
# 4.7 ENERGY/NAT RESOURCE: Nuclear Energy
sector, industry = select_industry(sectors, 4, 7)

count_witnesses(df, 'witness_industry', industry)

# # Match witnesses
keywords = ['babcock and wilcox', 'ge hitachi nuclear energy']

match_witnesses(df, keywords, sector=sector, industry=industry)

4.7 ENERGY & NATURAL RESOURCES - Nuclear Energy:

 
>>> There are 0 Nuclear Energy witnesses.

>>> 2 more Nuclear Energy witnesses have been matched, resulting in a total of 2 witnesses.


### 0 AGRIBUSINESS

In [30]:
# 0 AGRIBUSINESS (A)
# 	 1 Crop Production & Basic Processing
#    	 13 Vegetables, fruits and tree nut
#    	 11 Sugar cane & sugar beets
# 	 12 Tobacco
# 	 8 Livestock
#    	 2 Dairy
#    	 10 Poultry & Eggs
# 	 0 Agricultural Services/Products
#    	 3 Farm bureaus
# 	 5 Food Processing & Sales
#    	 4 Food and kindred products manufacturing
#    	 6 Food stores
#    	 9 Meat processing & products
# 	 7 Forestry & Forest Products

In [31]:
# 0.0 AGRIBUSINESS - Agricultural Services/Products
sector, industry = select_industry(sectors, 0, 0)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['lange-segmann company']

match_witnesses(df, keywords, sector=sector, industry=industry)

0.0 AGRIBUSINESS - Agricultural Services/Products:

 
>>> There are 13 Agricultural Services/Products witnesses.

>>> 1 more Agricultural Services/Products witnesses have been matched, resulting in a total of 14 witnesses.


In [32]:
# 0.1 AGRIBUSINESS - Crop Production & Basic Processing
sector, industry = select_industry(sectors, 0, 1)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['pacific northwest direct seed association',
            'national milk producers federation', 'usa rice producers\' group',
            'grain growers association', 'vineyard', 'corn growers association']

match_witnesses(df, keywords, sector=sector, industry=industry)

0.1 AGRIBUSINESS - Crop Production & Basic Processing:

 
>>> There are 2 Crop Production & Basic Processing witnesses.

>>> 6 more Crop Production & Basic Processing witnesses have been matched, resulting in a total of 8 witnesses.


In [33]:
# 0.7 AGRIBUSINESS - Forestry & Forest Products
sector, industry = select_industry(sectors, 0, 7)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['the forest carbon dialogue']

match_witnesses(df, keywords, sector=sector, industry=industry)

0.7 AGRIBUSINESS - Forestry & Forest Products:

 
>>> There are 1 Forestry & Forest Products witnesses.

>>> 1 more Forestry & Forest Products witnesses have been matched, resulting in a total of 2 witnesses.


In [34]:
# 0.8 AGRIBUSINESS - Livestock
sector, industry = select_industry(sectors, 0, 8)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['brubaker farms']

match_witnesses(df, keywords, sector=sector, industry=industry)

0.8 AGRIBUSINESS - Livestock:

 
>>> There are 0 Livestock witnesses.

>>> 1 more Livestock witnesses have been matched, resulting in a total of 1 witnesses.


### 12 TRANSPORTATION 

In [35]:
# 12 TRANSPORTATION (M)
# 	 0 Air Transport (11)
#    	 1 Airlines (0)
# 	 5 Automotive (0)
#    	 4 Auto manufacturers (9)
#    	 3 Auto dealers, new & used (0)
#    	 2 Auto dealers, foreign imports (0)
# 	 9 Trucking (1)
# 	 7 Railroads (5)
# 	 8 Sea Transport (2)
#    	 6 Cruise ships & lines (0)

In [36]:
# 12.0 TRANSPORTATION - Air Transport
sector, industry = select_industry(sectors, 12,0)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['air transport association', 'airport council international', 'cargo airline association']

match_witnesses(df, keywords, sector=sector, industry=industry)

12.0 TRANSPORTATION - Air Transport:

 
>>> There are 2 Air Transport witnesses.

>>> 3 more Air Transport witnesses have been matched, resulting in a total of 5 witnesses.


In [37]:
# 12.5 TRANSPORTATION - Automotive
sector, industry = select_industry(sectors, 12,5)

# Merge subindustry 'Auto manufacturers' into parent category 'Automotive'
for i, text in enumerate(df):
    for j, witness in enumerate(text['witnesses']):
        if df[i]['witness_industry'][j] in ['Auto manufacturers']:
            df[i]['witness_industry'][j] = 'Automotive'

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['pridgeon & clay', 'american highway users alliance',
            'toyota', 'better place', 'ford motor company', 
            'alliance for automobile manufacturers']

match_witnesses(df, keywords, sector=sector, industry=industry)



12.5 TRANSPORTATION - Automotive:

 
>>> There are 5 Automotive witnesses.

>>> 5 more Automotive witnesses have been matched, resulting in a total of 10 witnesses.


In [38]:
# 12.7 TRANSPORTATION - Railroad
sector, industry = select_industry(sectors, 12,7)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['american public transportation association', 'colorado railcar']

match_witnesses(df, keywords, sector=sector, industry=industry)

12.7 TRANSPORTATION - Railroads:

 
>>> There are 1 Railroads witnesses.

>>> 4 more Railroads witnesses have been matched, resulting in a total of 5 witnesses.


In [39]:
# 112.9 TRANSPORTATION - Trucking
sector, industry = select_industry(sectors, 12,9)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['hahn transportation', 'con-way', 'american trucking association']

match_witnesses(df, keywords, sector=sector, industry=industry)

12.9 TRANSPORTATION - Trucking:

 
>>> There are 0 Trucking witnesses.

>>> 3 more Trucking witnesses have been matched, resulting in a total of 3 witnesses.


### 2 CONSTRUCTION

In [40]:
# 2 CONSTRUCTION (C)
# 	 2 Construction Services
#    	 0 Architectural services
# 	 1 Building Materials & Equipment
# 	 3 General Contractors
# 	 4 Home Builders
# 	 5 Special Trade Contractors

In [41]:
# 2.1 CONSTRUCTION - Building Materials & Equipment
sector, industry = select_industry(sectors, 2,1)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['holcim cement']

match_witnesses(df, keywords, sector=sector, industry=industry)

2.1 CONSTRUCTION - Building Materials & Equipment:

 
>>> There are 0 Building Materials & Equipment witnesses.

>>> 1 more Building Materials & Equipment witnesses have been matched, resulting in a total of 1 witnesses.


In [42]:
# 2.2 CONSTRUCTION - Construction Services
sector, industry = select_industry(sectors, 2, 2)

# Merge Architectural services into parent category Construction Services
for i, text in enumerate(df):
    for j, witness in enumerate(text['witnesses']):
        if df[i]['witness_industry'][j] in ['Architectural services']:
            df[i]['witness_industry'][j] = 'Construction Services'
            
count_witnesses(df, 'witness_industry', industry)

2.2 CONSTRUCTION - Construction Services:

 
>>> There are 1 Construction Services witnesses.



In [43]:
# 2.1 CONSTRUCTION - Home Builders
sector, industry = select_industry(sectors, 2, 4)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['national association of homebuilders']

match_witnesses(df, keywords, sector=sector, industry=industry)

2.4 CONSTRUCTION - Home Builders:

 
>>> There are 0 Home Builders witnesses.

>>> 1 more Home Builders witnesses have been matched, resulting in a total of 1 witnesses.


### 5 FINANCE, INSURANCE & REAL ESTATE

In [44]:
# 5 FINANCE, INSURANCE & REAL ESTATE (F)
# 	 1 Commercial Banks
# 	 11 Savings & Loans
# 	 2 Credit Unions
# 	 3 Finance/Credit Companies
#    	 13 Student loan companies
#    	 8 Payday lenders
# 	 12 Securities & Investment
#    	 14 Venture capital
#    	 4 Hedge Funds
#    	 9 Private Equity & Investment Firms
# 	 5 Insurance
# 	 10 Real Estate
#    	 7 Mortgage bankers and brokers
# 	 0 Accountants
# 	 6 Misc Finance

In [45]:
# 5.5 FINANCE, INSURANCE & REAL ESTATE - Insurance
sector, industry = select_industry(sectors, 5,5)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['swiss re']

match_witnesses(df, keywords, sector=sector, industry=industry)

5.5 FINANCE, INSURANCE & REAL ESTATE - Insurance:

 
>>> There are 1 Insurance witnesses.

>>> 1 more Insurance witnesses have been matched, resulting in a total of 2 witnesses.


In [46]:
# 5.12 FINANCE, INSURANCE & REAL ESTATE - Securities & Investment
sector, industry = select_industry(sectors, 5,12)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['friedman billings ramsey and company', 'missionpoint capital partners',
            'natsource', 'investor responsibility research center','new energy finance',
            'mayfield fund', 'free enterprise action fund', 'jp morgan securities',
            'friedman, billings, ramsey and company', 'tgp capital', 'td bank']

match_witnesses(df, keywords, sector=sector, industry=industry)

5.12 FINANCE, INSURANCE & REAL ESTATE - Securities & Investment:

 
>>> There are 6 Securities & Investment witnesses.

>>> 13 more Securities & Investment witnesses have been matched, resulting in a total of 19 witnesses.


In [47]:
# 5.14 FINANCE, INSURANCE & REAL ESTATE - Venture capital
sector, industry = select_industry(sectors, 5,14)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['vantagepoint venture partners',  '\w+ capital partners',
            'kleiner perkins caufield & byers']

match_witnesses(df, keywords, sector=sector, industry=industry)

5.14 FINANCE, INSURANCE & REAL ESTATE - Venture capital:

 
>>> There are 0 Venture capital witnesses.

>>> 5 more Venture capital witnesses have been matched, resulting in a total of 5 witnesses.


### 10 MISC BUSINESS

In [48]:
# 10 MISC BUSINESS (N)

# 	 2 Business Associations
# 	 8 Food & Beverage
#    	 18 Restaurants & drinking establishments
# 	 1 Beer, Wine & Liquor
# 	 19 Retail Sales
# 	 15 Misc Services
#    	 9 Funeral services
# 	 3 Business Services
#    	 0 Advertising & public relations services
# 	 17 Recreation/Live Entertainment
#    	 16 Professional sports, arenas & related equip & svcs
# 	 4 Casinos/Gambling
#    	 10 Indian Gaming
# 	 11 Lodging/Tourism
# 	 12 Marijuana
# 	 13 Marijuana
# 	 7 Correctional facilities constr & mgmt/for-profit
# 	 5 Chemical & Related Manufacturing
# 	 20 Steel Production
# 	 14 Misc Manufacturing & Distributing
#    	 6 Clothing & accessories
# 	 21 Textiles

In [49]:
# 10.2 MISC BUSINESS - Business Association
sector, industry = select_industry(sectors, 10, 2)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['national association of \w+ \w+\s?\w+',  'u.s. chamber of commerce', 'chamber of commerce']

match_witnesses(df, keywords, sector=sector, industry=industry)

10.2 MISC BUSINESS - Business Associations:

 
>>> There are 10 Business Associations witnesses.

>>> 10 more Business Associations witnesses have been matched, resulting in a total of 20 witnesses.


In [50]:
# 10.3 MISC BUSINESS - Business Services

sector, industry = select_industry(sectors, 10, 3)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = [# Consultancies
            'energy and environmental analysis', 'climate advisers', ', (pace)$',
            'orbis energy advisors', 'arduin, laffer and moore econometrics', 'point carbon', 
            'cra international', 'charles river associates', 'c-lock technology', 
            'doane advisory services', 'first environment', 'coulomb technologies',
            'imbue technology solutions', 'independent consultant']

match_witnesses(df, keywords, sector=sector, industry=industry)

10.3 MISC BUSINESS - Business Services:

 
>>> There are 2 Business Services witnesses.

>>> 20 more Business Services witnesses have been matched, resulting in a total of 22 witnesses.


In [51]:
# 10.5 MISC BUSINESS - Chemical & Related Manufacturing
sector, industry = select_industry(sectors, 10, 5)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['zaclon chemical', 'dupont', 'basf']

match_witnesses(df, keywords, sector=sector, industry=industry)

10.5 MISC BUSINESS - Chemical & Related Manufacturing:

 
>>> There are 5 Chemical & Related Manufacturing witnesses.

>>> 3 more Chemical & Related Manufacturing witnesses have been matched, resulting in a total of 8 witnesses.


In [52]:
# 10.14 MISC BUSINESS - Misc Manufacturing & Distributing
sector, industry = select_industry(sectors, 10, 14)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['energy[\s-]{1}intensive manufacturers', 'manufacturers association', 'corning incorporated',
            'general electric', 'ge global research', 'ge energy', 'siemens', 'aircuity', 'applied materials']

match_witnesses(df, keywords, sector=sector, industry=industry)

10.14 MISC BUSINESS - Misc Manufacturing & Distributing:

 
>>> There are 11 Misc Manufacturing & Distributing witnesses.

>>> 16 more Misc Manufacturing & Distributing witnesses have been matched, resulting in a total of 27 witnesses.


In [53]:
# 10.20 MISC BUSINESS - Steel Production
sector, industry = select_industry(sectors, 10, 20)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['nucor steel corporation']

match_witnesses(df, keywords, sector=sector, industry=industry)

10.20 MISC BUSINESS - Steel Production:

 
>>> There are 3 Steel Production witnesses.

>>> 1 more Steel Production witnesses have been matched, resulting in a total of 4 witnesses.


### 9 LAWYERS & LOBBYISTS

In [54]:
# 9 LAWYERS & LOBBYISTS (K)
# 	 0 Lawyers/Law Firms
# 	 1 Lobbyists

In [55]:
# 9.0 LAWYERS & LOBBYISTS - Lawyers/Law Firms
sector, industry = select_industry(sectors, 9, 0)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['latham & watkins', 'crowell and moring', 'meagher & flom']

match_witnesses(df, keywords, sector=sector, industry=industry)

9.0 LAWYERS & LOBBYISTS - Lawyers/Law Firms:

 
>>> There are 6 Lawyers/Law Firms witnesses.

>>> 2 more Lawyers/Law Firms witnesses have been matched, resulting in a total of 8 witnesses.


### 1 COMMUNICATIONS/ELECTRONICS

In [56]:
# 1 COMMUNICATIONS/ELECTRONICS (B)
# 	 0 Book, newspaper & periodical publishing
# 	 1 Cable & satellite TV production
# 	 2 Commercial TV & radio stations
# 	 3 Computer software
# 	 4 Electronics Mfg & Equip
# 	 5 Internet
# 	 6 Motion Picture production & distribution
# 	 7 Printing & Publishing
# 	 8 Recorded Music & music production
# 	 9 Telecom Services
# 	 10 Telephone Utilities
# 	 11 TV production
# 	 12 TV/Movies/Music

In [57]:
# 1.t COMMUNICATIONS/ELECTRONICS - Internet
sector, industry = select_industry(sectors, 1, 5)

count_witnesses(df, 'witness_industry', industry)

# Match witnesses
keywords = ['google']

match_witnesses(df, keywords, sector=sector, industry=industry)

1.5 COMMUNICATIONS/ELECTRONICS - Internet:

 
>>> There are 0 Internet witnesses.

>>> 3 more Internet witnesses have been matched, resulting in a total of 3 witnesses.


In [58]:
# Search for further matches
keywords = [' ']
antikeywords = []

match_witnesses(df, keywords, antikeywords, search = True, sector=sector, industry=industry)

### CORRECTIONS

In [59]:
# # Reclassify wrongly classified witnesses (the automatic matching process matches the FIRST match, sometimes resulting with a fringe aspect of a business operation in multiple sectors being the first match)

for i, text in enumerate(df):
    for j, witness in enumerate(text['witnesses']):
# United Technologies to Air Transport
        if len(re.findall('United Technologies', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Air Transport'
            df[i]['witness_sector'][j] = 'Transportation' 
# DuPont and the Fertilizer Institute to Chemical Manufacturing
        if len(re.findall('DuPont|Dupont|Fertilizer Institute', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Chemical & Related Manufacturing'
            df[i]['witness_sector'][j] = 'Misc Business'
# Baxter International to Misc Manufacturing:
        if len(re.findall('Baxter International', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Misc Manufacturing & Distributing'
            df[i]['witness_sector'][j] = 'Misc Business'  
# United Nations Foundation and Center on Budget and Policy Priorities to Non-Profit Institutions
        if len(re.findall('United Nations Foundation|Center on Budget and Policy Priorities', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Non-Profit Institutions'
            df[i]['witness_sector'][j] = 'Other'
# Infinia Corporation to Alternate energy production & services
        if len(re.findall('Infinia Corporation', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Alternate energy production & services'
            df[i]['witness_sector'][j] = 'Energy & Natural Resources'
# Nuclear Energy and Duke Nuclear to Nuclear Energy
        if len(re.findall('Nuclear Energy|Duke Nuclear', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Nuclear Energy'
            df[i]['witness_sector'][j] = 'Energy & Natural Resources'
# TransCanada, ConocoPhillips AGL Resources, and American Gas Association to Oil & Gas
        if len(re.findall('TransCanada|ConocoPhillips|AGL Resources|American Gas Association', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Oil & Gas'
            df[i]['witness_sector'][j] = 'Energy & Natural Resources' 
# AFL-CIO and United Auto Workers to Industrial Unions
        if len(re.findall('AFL-CIO', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Industrial Unions'
            df[i]['witness_sector'][j] = 'Labor'
# Natsource to Securities & Investment
        if len(re.findall('Natsource|Stark Investments', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Securities & Investment'
            df[i]['witness_sector'][j] = 'Finance, Insurance & Real Estate'
# Quinault Indian Nation to Civil Servants/Public Officials
        if len(re.findall('Quinault Indian Nation|State of Michigan', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'Civil Servants/Public Officials'
            df[i]['witness_sector'][j] = 'Other'
# Intergovernmental Panel on Climate Change to International/Intergovernmental
        if len(re.findall('Intergovernmental Panel on Climate Change', witness)):
            print(witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])
            df[i]['witness_industry'][j] = 'International/Intergovernmental'
            df[i]['witness_sector'][j] = 'Other'

Cogen, Jack, President, Natsource : Securities & Investment -- Finance, Insurance & Real Estate
Rosenzweig, Richard, Chief Operating Officer, Member of International Climate Change Partner, Natsource : Securities & Investment -- Finance, Insurance & Real Estate
Shaw, Ruth, Group Executive for Public Policy and President for Duke Nuclear, Duke Energy Corporation : Electric Utilities -- Energy & Natural Resources
Houghton, Sir John, Co-Chairman, Scientific Assessment Working Group, Intergovernmental Panel on Climate Change, London, England : International/Intergovernmental -- Other
Dr. Mack McFarland, Environmental Manager, Fluorochemicals Business, E.I. DuPont De Nemours and Company : Hospitals/Nursing Homes -- Health
Mr. Ronald E. Meissen, Senior Director, Engineering, Environment, Health & Safety, Baxter International, Inc. : Medical Devices & Supplies -- Health
Dr. Robert H. Hobbs, Director of Operations, United Technologies Research Center, United Technologies Corporation : Electric

In [60]:
# Summary:
matched = 0
witnesses = 0

for text in df:
    for i, witness in enumerate(text['witnesses']):
        witnesses += 1
        if text['witness_affiliation'][i] != None:
            matched += 1              

print('The affiliations of {} out of {} witnesses were successfully matched.\n\n'.format(matched, witnesses))

The affiliations of 855 out of 855 witnesses were successfully matched.




In [61]:
# Print witnesses per sector
for sector in sectors:
    count_witnesses(df, 'witness_sector',  sector['name'], print_witnesses = False)

 
>>> There are 20 Agribusiness witnesses.

 
>>> There are 3 Communications/Electronics witnesses.

 
>>> There are 4 Construction witnesses.

 
>>> There are 0 Defense witnesses.

 
>>> There are 132 Energy & Natural Resources witnesses.

 
>>> There are 28 Finance, Insurance & Real Estate witnesses.

 
>>> There are 0 Health witnesses.

 
>>> There are 158 Ideological/Single-Issue witnesses.

 
>>> There are 14 Labor witnesses.

 
>>> There are 11 Lawyers & Lobbyists witnesses.

 
>>> There are 89 Misc Business witnesses.

 
>>> There are 370 Other witnesses.

 
>>> There are 26 Transportation witnesses.



In [62]:
# # Print witnesses per industry
# for sector in sectors:
#     for industry in sector['industries']:
#         count_witnesses(df, 'witness_industry',  industry['industry'], print_witnesses = False)

In [63]:
# Save the data
with open('CommitteeHearings/hearings_witnesses_contrarians_classified.json', 'w') as file:
    json.dump(df, file)

In [64]:
# # Investigate industries per sector
# for industry in sectors[2]['industries']: #jump
#     count_witnesses(df, 'witness_industry',  industry['industry'], print_witnesses = False)

In [65]:
# # Search industries
# for sector in sectors:
#     for industry in sector['industries']:
#         if industry['lobbying_groups']:
#             for firm in industry['lobbying_groups']:
#                 if 'dominion'.lower() in firm.lower():
#                     print(firm.lower(), ':', industry['industry'])

In [66]:
# # Search witnesses
# for i, text in enumerate(df):
#     for j, witness in enumerate(text['witnesses']):
#         if len(re.findall(' ge '.lower(), witness.lower())):
#             print(df[i]['identifier'], witness, ':', df[i]['witness_affiliation'][j], '--', df[i]['witness_industry'][j])
# #             print(df[i]['identifier'], witness, ':', df[i]['witness_industry'][j], '--', df[i]['witness_sector'][j])