# Extract hearings for the correct time-span mentioning climate change and a carbon pricing keyword

In [1]:
from TextCollection import *

In [2]:
# # Reinstantiate class after changing the Textcollection.py script
# # Do not run this in the last run! Leads to a conflict with pickle.

#from importlib import reload

# os.chdir('/home/mirjam/OneDrive/congress_committees/ArticleOne/Article_Scripts/Hearings')
# import TextCollection; reload(TextCollection)

# # Reinstantiate class
# t.__class__ = HearingsCollection

In [3]:
# Change directory
os.chdir('../../Data/')

<br>

### Import all hearings as t.texts.

In [4]:
path = '../../../../congress_committees/Boussalis & Coan/data/texts/'
basename = 'text_CHRG-'
filetype = '.htm'

In [5]:
tic = time.time()
t = HearingsCollection(path, basename, filetype)
toc = time.time()
print('It took approximately {} minutes to load the hearings.'.format(round((toc-tic)/60)))

21729 hearings were loaded.
It took approximately 12 minutes to load the hearings.


In [6]:
t.texts[0].keys()

dict_keys(['filename', 'identifier', 'content_raw', 'content', 'content_stripped', 'congress'])

##### <br>

### Climate related?

In [7]:
keywords = {'climate_related': ['climate.?change', 'global.?warming']}

In [8]:
tic = time.time()
t.get_keywords(keywords)
toc = time.time()
print('The keywords of {} hearings were extracted in {} minutes.'.format(len(t.texts), round((toc-tic)/60,2)))

The keywords of 21729 hearings were extracted in 1.19 minutes.


In [9]:
t.texts[0].keys()

dict_keys(['filename', 'identifier', 'content_raw', 'content', 'content_stripped', 'congress', 'climate_related'])

In [10]:
count = 0
for text in t.texts:
    if len(text['climate_related']) > 0:
        count += 1
print('{} climate related hearings were found.'.format(count))

3529 climate related hearings were found.


<br>

### Subset the correct years (i.e. congress number).

In [11]:
congress_numbers = ['108', '109', '110', '111']
t.subset('congress', congress_numbers)
print('{} hearings from the {}th to the {}th congress were found.'.format(len(t.texts), congress_numbers[0], congress_numbers[-1]))

The data now contains 9948 hearings.
9948 hearings from the 108th to the 111th congress were found.


<br>

### Extract the relevant keywords for further subsetting the data

In [12]:
keywords = {'keywords_climatechange': ['climate.?change', 'global.?warming'],
            'keywords_climate': ['greenhouse.?effect', 'greenhouse.?gas', 'ghg',
                                 'carbon.?dioxide', 'co2',
                                 'carbon.?emission',
                                 'methane', 'ch4',
                                 'nitruous.?oxide', 'n2o', 
                                 'ozone', 'o3',
                                 'chlorofluorocarbon', 'cfc', 
                                 'hydrofluorocarbon', 'hcfc', 'hfc'],
            'keywords_carbonpricing': ['emission[s]?.?trading', 'cap.?and.?trade',
                                     '(?:american)?.?clean.?energy.?and.?security.?act', 
                                     'climate.?security', 'waxman.?markey',
                                     'climate.?stewardship', 'carbon.?tax', 'carbon.?fee',
                                     'price.?on.?carbon',  'emission[s]?.?tax', 'emission[s]?.?fee',
                                      'climate.?policy', 'climate.?change.?policy']}

In [13]:
tic = time.time()
t.get_keywords(keywords)
toc = time.time()
print('The keywords of {} hearings were extracted in {} minutes.'.format(len(t.texts), round((toc-tic)/60,2)))

The keywords of 9948 hearings were extracted in 2.9 minutes.


In [14]:
t.print_key_i('keywords_climatechange', 0, 100, return_set = 1)

5 {'climate change', 'global warming'}
20 {'climate change'}
27 {'global warming'}
28 {'climate change', 'global warming'}
29 {'climate change', 'global warming'}
41 {'climate change', 'global warming'}
44 {'climate change', 'global warming'}
50 {'climate change', 'global warming'}
56 {'climate change'}
57 {'globalwarming', 'climate change', 'global warming'}
62 {'climate change'}
64 {'globalwarming', 'climate-change', 'climate change', 'global warming'}
66 {'climate change', 'global warming'}
72 {'globalwarming', 'climate change', 'global warming'}
78 {'climate change'}
94 {'climate change', 'global warming'}


In [15]:
t.print_key_i('keywords_carbonpricing', 0, 100, return_set = 1)

41 {'carbon tax'}
44 {'price on carbon', 'cap-and trade', 'climate change policy', 'climate policy', 'cap-and-trade', 'carbon tax'}
64 {'cap and trade', 'climate security', 'climate change policy', 'climate policy', 'cap-and-trade', 'carbon tax'}
72 {'climate policy', 'carbon tax', 'cap and trade'}
92 {'waxman-markey', 'cap-and-trade'}


In [16]:
count = 0
cutoff = 1
for text in t.texts:
    if len(text['keywords_climatechange']) >= cutoff:
        count += 1  
print('There are {} hearings mentioning climate change or global warming at least {} time.'.format(count, cutoff))

There are 1775 hearings mentioning climate change or global warming at least 1 time.


In [17]:
count = 0
cutoff = 1
for text in t.texts:
    if len(text['keywords_carbonpricing']) >= cutoff:
        count += 1  
print('There are {} hearings with at least {} carbon pricing related keywords in the remaining hearings.'.format(count, cutoff))

There are 628 hearings with at least 1 carbon pricing related keywords in the remaining hearings.


<br>

### Subset to hearings containg at least one climate change and one carbon pricing keyword.

In [18]:
t.texts = [text for text in t.texts if len(text['keywords_climatechange']) > 0 and len(text['keywords_carbonpricing']) > 0]
print('The data now contains {} hearings.'.format(len(t.texts)))
#497

The data now contains 545 hearings.


In [19]:
count = 0
cutoff = 10
for text in t.texts:
    if len(text['keywords_carbonpricing']) > cutoff:
        count += 1  
print('There are {} hearings with more than {} climate related keywords in the remaining hearings.'.format(count, cutoff))

There are 175 hearings with more than 10 climate related keywords in the remaining hearings.


<br>

### Extract the titles

In [20]:
tic = time.time()
t.get_titles()
toc = time.time()
print('The titles of {} hearings were extracted in {} seconds.'.format(len(t.texts), round(toc-tic,2)))

The titles of 545 hearings were extracted in 9.65 seconds.


In [21]:
t.print_key_i('title', missing = True) 
# No missing titles

<br>

### Extract the dates

In [22]:
tic = time.time()
t.get_dates()
toc = time.time()
print('The dates of {} hearings were extracted in {} seconds.'.format(len(t.texts), round(toc-tic,2)))

The dates of 545 hearings were extracted in 4.48 seconds.


In [23]:
t.print_key_i('year', missing = True) 
# No missing dates/years

<br>

### Extract the committees

In [24]:
tic = time.time()
t.get_committees()
toc = time.time()
print('The committees of {} hearings were extracted in {} seconds.'.format(len(t.texts), round(toc-tic,2)))

The committees of 545 hearings were extracted in 0.12 seconds.


In [25]:
t.print_key_i('committee', 0, 5)

0 ['energy and resources', 'government reform']
1 ['energy and commerce', 'energy and environment']
2 ['intelligence', 'energy independence and global warming', 'energy independence', 'intelligence community management permanent select']
3 ['energy independence and global warming', 'energy independence']
4 ['joint economic committee']


In [26]:
t.print_key_i('committee', missing = True) 
# All but one (134) of these hearings with missing committee are hearings before a comission.

57 []
71 []
74 []
92 []
152 []
372 []
510 []


In [27]:
# Hearing 134 is before the Committee on Appropriations, which was not noted in the header of the transcript (see https://www.govinfo.gov/app/details/CHRG-111hhrg56343/CHRG-111hhrg56343)

t.print_key_head('content', 134)

# Replace and echeckthe missing committee
t.texts[134]['committee'] = 'appropriations'
t.print_key('committee', 134)



 - energy and water development appropriations for fiscal year 2009
[senate hearing 110-840]
[from the u.s. government printing office]




                                                        s. hrg. 110-840

                                                        senate hearings

                                 before the committee on appropriations

_______________________________________________________________________


                                                       energy and 
appropriations


## Save data

In [38]:
save_as(t, 'Hearings/00_hearings_carbon_pricing.pkl') # Last completed on Sep 15, 2020