In [1]:
from CommitteeHearingsFunctions import *

# Change the working directory to the data folder, so that the relative
# 'CommitteeHearings/...' paths opened in the cells below resolve against it.
# NOTE: the path is relative to the *current* working directory, so running this
# cell a second time in the same kernel changes directory again from the new location.
# NOTE(review): `os` is not imported explicitly in this notebook; presumably it is
# brought into scope by the star import above - confirm.
os.chdir('../../Data/')

In [2]:
# Read the raw hearings from disk.
# `df` holds whatever the JSON file decodes to; the cells below iterate over it
# and index each element by string keys, i.e. treat it as a list of dicts.
with open('CommitteeHearings/hearings_raw.json', 'r') as raw_file:
    df = json.load(raw_file)

## Subsetting Part I: Quantitative pre-selection (keyword mentions)

In [3]:
# Match climate change / global warming keywords in the full text.
# The matches are read back below from each hearing under 'keywords_climatechange'.
keywords = {'keywords_climatechange': ['climate.?change', 'global.?warming']}
get_keywords(df, 'htm_stripped', keywords)

# Keep only the hearings with at least one match and report how many were discarded
n_before = len(df)
df = [hearing for hearing in df if len(hearing['keywords_climatechange']) > 0]
print(n_before - len(df), "climate change unrelated hearings are dropped.")
print('\n >> The data now contains {} hearings.'.format(len(df)))

  0%|          | 0/12090 [00:00<?, ?it/s]

9904 climate change unrelated hearings are dropped.

 >> The data now contains 2186 hearings.


In [4]:
# Title patterns that indicate a hearing is focussed on climate change
climatefocus_patterns = [
    'climate.?change',
    'global.?warming',
    'climate.?science',
    'greenhouse.?gas',
]
keywords = {'keywords_climatefocus': climatefocus_patterns}
get_keywords(df, 'title', keywords)

# Full-text patterns for cap-and-trade (and closely related) policy instruments
capandtrade_patterns = [
    'cap.?and.?trade',
    'cap.?and.?auction',
    'cap.?and.?dividend',
    'cap.?and.?tax',
    'cap.?on.?carbon',
    'cap.?on.?co2',
    'cap.?on.?emissions',
    'cap.?on.?the.?amount.?of.?carbon',
    'cap.?on.?the.?amount.?of.?co2',
    'carbon.?limit[s]?',
    'climate.?income',
    'emission[s]?.?cap[s]?',
    'emission[s]?.?limit[s]?',
    'emission[s]?.?restriction[s]?',
    'emission[s]?.?standard[s]?',
    'emission[s]?.?trading',
]
# `keywords` is left bound to the cap-and-trade dict; the next cell displays it
keywords = {'keywords_capandtrade': capandtrade_patterns}
get_keywords(df, 'htm_stripped', keywords)

  0%|          | 0/2186 [00:00<?, ?it/s]

  0%|          | 0/2186 [00:00<?, ?it/s]

In [5]:
# Display the distinct cap-and-trade patterns; converting to a set removes any
# accidental duplicates from the list. Relies on `keywords` still being bound to
# the cap-and-trade dict from the previous cell.
set(keywords['keywords_capandtrade'])

{'cap.?and.?auction',
 'cap.?and.?dividend',
 'cap.?and.?tax',
 'cap.?and.?trade',
 'cap.?on.?carbon',
 'cap.?on.?co2',
 'cap.?on.?emissions',
 'cap.?on.?the.?amount.?of.?carbon',
 'cap.?on.?the.?amount.?of.?co2',
 'carbon.?limit[s]?',
 'climate.?income',
 'emission[s]?.?cap[s]?',
 'emission[s]?.?limit[s]?',
 'emission[s]?.?restriction[s]?',
 'emission[s]?.?standard[s]?',
 'emission[s]?.?trading'}

In [6]:
def _is_preselected(hearing):
    """Return True if the hearing has a climate focus keyword in its title
    OR more than 10 cap-and-trade keyword matches in its text."""
    return (len(hearing['keywords_climatefocus']) >= 1) or (len(hearing['keywords_capandtrade']) > 10)


# Subset the hearings to those passing the quantitative pre-selection
df_sub = list(filter(_is_preselected, df))

# Short identifier: the packageId without its first five characters
# (presumably a fixed prefix such as 'CHRG-' - confirm against the raw data)
for hearing in df_sub:
    hearing['identifier'] = hearing['packageId'][5:]

len(df_sub)

257

In [7]:
# Drop hearings without (external) witnesses or held by a joint committee.
# The title pattern is compiled once and used for both the listing and the filter.
no_witness_title = re.compile('''errata|the economic outlook''')

print('No (external) witnesses / Joint committee:')
flagged = [(len(hearing['keywords_capandtrade']), hearing['title'], hearing['identifier'])
           for hearing in df_sub
           if no_witness_title.search(hearing["title"].lower())]
print(*flagged, sep='\n'); print('\n')

df_sub = [hearing for hearing in df_sub
          if not no_witness_title.search(hearing["title"].lower())]
len(df_sub)

No (external) witnesses / Joint committee:
(0, '[ERRATA] Green Jobs Created by Global Warming Initiatives', '110shrg80068')
(19, 'The Economic Outlook', '111shrg58197')




255

## Subsetting Part II: Manual fine-tuning of the selection

In [8]:
# Drop all budget, appropriations, nominations and annual reports hearings.
# The title pattern is defined exactly once so that the reported count, the optional
# listing and the actual filter can never drift apart (the copy-pasted versions had
# already diverged: one carried a duplicated 'fiscal year' alternative).
admin_title = re.compile("budget|appropriations|fiscal year|nomination|annual report")
dropped_titles = [text['title'] for text in df_sub if admin_title.search(text["title"].lower())]
print(len(dropped_titles), "budget hearings, nominations and annual reports are dropped.")
# Uncomment below to see the titles of the hearings that will be dropped
# print(*dropped_titles, sep='\n'); print('\n')
df_sub = [text for text in df_sub if not admin_title.search(text["title"].lower())]
print('\n >> The data now contains {} hearings.'.format(len(df_sub)))

20 budget hearings, nominations and annual reports are dropped.

 >> The data now contains 235 hearings.


In [9]:
def _drop_by_title(hearings, label, alternatives):
    """Print and remove the hearings whose lower-cased title matches any alternative.

    Parameters
    ----------
    hearings : list of dict
        Each hearing must provide 'title', 'identifier' and 'keywords_capandtrade'.
    label : str
        Heading printed above the list of dropped hearings.
    alternatives : list of str
        Regular-expression fragments; they are joined with '|' into one pattern.

    Returns
    -------
    list of dict
        The hearings whose title does not match, in their original order.

    Notes
    -----
    The printout and the filter use the *same* compiled pattern, so what is listed
    is exactly what is dropped. Building the pattern from a list (instead of a
    triple-quoted string broken across lines) keeps literal newlines out of the
    regex: the multi-line strings used previously contained '|<newline>|', which
    silently added "title contains a line break" as a drop criterion, and in one
    printout a missing '|' fused a newline onto 'research and information program'.
    """
    pattern = re.compile('|'.join(alternatives))
    dropped, kept = [], []
    for hearing in hearings:
        bucket = dropped if pattern.search(hearing['title'].lower()) else kept
        bucket.append(hearing)
    print(label)
    print(*[(len(h['keywords_capandtrade']), h['title'], h['identifier']) for h in dropped], sep='\n'); print('\n')
    return kept


# Regulation of (other) pollutants
df_sub = _drop_by_title(df_sub, 'Regulation of other pollutants:', [
    'mercury', 'particulate matter and ozone', 'cair', 'diesel', 'montreal',
    'black carbon', 'water quality',
])

# Foreign policy, international agreements, and global and national security
# ('national security' is kept when directly followed by a comma)
df_sub = _drop_by_title(df_sub, 'Foreign policy, international agreements, and global and national security:', [
    'vulnerable nations', 'vulnerable societies', 'vulnerable countries', 'island',
    'arctic', 'tropical forests', 'global security', 'global economic recovery',
    'global effort', 'asia', 'china', 'africa', 'geopolitical implications',
    'national security(?!,)', 'bali', 'poznan', 'copenhagen',
])

# Energy/technology hearings not focussed on setting a price on carbon
# ('coal' is kept when followed by ' under')
df_sub = _drop_by_title(df_sub, 'Energy/technology hearings not focussed on setting a price on carbon:', [
    'climate change technology', 'efficiency', 'energy-efficient', 'electric',
    'renewable', 'agriculture policy', 'cafe', 'new power plants',
    'sequestration', 'capture and storage', 'massachusetts', 'coal(?! under)',
    'aviation', 'transportation', 'cars', 'oil recovery', 'oil demand',
    'universities', 'science program', 'research and information program',
    'voluntary carbon offsets', 'clean technology', 'energy tax incentives',
    'small business solutions', 'with gas', 'research and applications investments',
    'resilience', 'derivatives',
])

# Regional impacts and regulations of climate change
df_sub = _drop_by_title(df_sub, 'Regional impacts and regulations of climate change:', [
    'chesapeake', 'new england', 'coastal', 'vulnerable communities',
    'california', 'colorado', 'public lands', 'green cities',
])

# Other impacts of climate change
# ('public health' only matches at the very end of the title)
df_sub = _drop_by_title(df_sub, 'Other impacts of climate change:', [
    'impacts on agriculture', 'insurance', 'financial risks',
    'economic impacts of global', 'potential impacts of', 'water supply', 'ocean',
    'climate change to national park', 'public health$', "nation's forests",
    'federal forests', 'extreme weather', 'wildfire', 'human health', 'wildlife',
])

# Interference with climate legislation
df_sub = _drop_by_title(df_sub, 'Interference with climate legislation:', [
    'interference', 'integrity',
])

# No witnesses / Joint committee
# (the same two title patterns are already filtered in an earlier cell, so this is
# a safety net and is expected to list nothing)
df_sub = _drop_by_title(df_sub, 'No (external) witnesses / Joint committee:', [
    'errata', 'the economic outlook',
])

# Inspection only: the hearings listed below are NOT dropped. 'senators' and 'local'
# titles used to be printed under the "No (external) witnesses" heading although the
# filter never removed them; they are now listed separately to avoid that confusion.
print('Listed for manual review (not dropped):')
print(*[(len(text['keywords_capandtrade']), text['title'], text['identifier'])
        for text in df_sub if re.search('senators|local', text["title"].lower())], sep='\n'); print('\n')
print(*[(len(text['keywords_capandtrade']), text['title'], text['packageId'])
        for text in df_sub if re.search('state', text["title"].lower())], sep='\n'); print('\n')

# NOTE(review): removing the accidental newline alternative can only keep *more*
# hearings than before (titles containing a line break) - re-check the final count.
len(df_sub)

Regulation of other pollutants:
(19, 'Implementation of the New Air Quality Standards for Particulate Matter and Ozone', '108shrg94602')
(20, 'Implementation of the Existing Particulate Matter and Ozone Air Quality Standards', '109shrg39523')
(11, 'S. 1265, the Diesel Emissions Reduction Act of 2005', '109shrg37294')
(31, 'Hearing on Mercury Legislation', '110shrg85534')
(57, 'The State of Mercury Regulation, Science and Technology', '110shrg61965')
(0, 'The Montreal Protocol and Global Warming', '110hhrg44428')
(0, 'Epa Black Carbon and Global Warming', '110hhrg45164')
(15, 'Nonpoint Source Pollution: Atmospheric Deposition and Water Quality', '110hhrg34796')
(17, "Epa's Clean Air Interstate Rule (Cair): Recent Court Decision and Its Implications", '110shrg88904')
(39, "Oversight: Environmental Protection Agency's Clean Air Regulations--One Year After the Cair and Camr Federal Court Decisions", '111shrg95160')
(14, "Oversight: Epa's Proposal for Federal Implementation Plans to Reduce 

117

In [10]:
# Write the final selection of hearings to disk as JSON
# (overwrites any existing 'hearings.json' in the CommitteeHearings folder)
with open('CommitteeHearings/hearings.json', 'w') as out_file:
    json.dump(df_sub, out_file)