## Extract the witnesses giving testimony

#### This script extracts information about the hearings as well as the witnesses giving testimony – from the metadata files where available, otherwise manually from the transcripts.

In [1]:
from CommitteeHearingsFunctions import *

# Change the working directory to the data folder; the file paths used
# below ('CommitteeHearings/...') are relative to it.
# NOTE(review): `os` is not imported explicitly in this notebook; presumably
# it is re-exported by the star import above - confirm.
os.chdir('../../Data/')

In [2]:
# Read the hearings from disk; `df` holds the parsed JSON document
# (iterated below as a sequence of per-hearing dicts).
hearings_path = 'CommitteeHearings/hearings.json'
with open(hearings_path) as infile:
    df = json.load(infile)

### Extract hearings information

In [3]:
# Pull the chamber and the committee names (standard and short form) out of
# each hearing's MODS metadata. Maps the target key to its MODS pattern.
mods_patterns = {
    'chamber': '<chamber>(.+?)</chamber>',
    'committee': '<name type="authority-standard">(.+?)</name>',
    'committee_short': '<name type="authority-short">(.+?)</name>',
}
for text in df:
    # Stringify the metadata once and reuse it for every pattern
    mods = str(text['mods'])
    for field, pattern in mods_patterns.items():
        text[field] = first_match([pattern], mods)

### Add witnesses from the metadata files (mods)

In [4]:
# Extract the witnesses listed in each hearing's MODS metadata and remove
# wrongly parsed information from the witness entries.
#
# Each entry is (pattern, replacement) and the entries are applied in order.
# The patterns are raw strings: the regex escapes (\d, \s, \.) are invalid
# *string* escapes in a normal literal and trigger a SyntaxWarning on recent
# Python versions. The compiled patterns are unchanged.
mods_cleanup = [
    (r"Answers(.+?)\\\d\\[\.]{0,}\s?", ""),
    (r"\s?\\\d\\(.+?)Prepared statement", ""),
    (r", prepared statement", ""),
    (r"\s?\\\d\\(.+?)submitted questions", ""),
    (r"; Accompanied by(.+)$", ""),
    # Misspelled organisation name in the metadata
    (r"National Resources Defense Council", "Natural Resources Defense Council"),
]

count_MODS = 0
for text in df:
    cleaned = []
    # Extract the witnesses
    for witness in all_matches(['<witness>(.+?)</witness>'], str(text['mods'])):
        # Replace wrongly parsed information
        for pattern, replacement in mods_cleanup:
            witness = re.sub(pattern, replacement, witness)
        cleaned.append(witness)
    text['witnesses_mods'] = cleaned
    # Count hearings with successfully extracted witness information
    if len(cleaned) > 0:
        count_MODS += 1
print('Witnesses were found for {} of the {} hearing.'.format(count_MODS, len(df)))

Witnesses were found for 83 of the 117 hearing.


In [5]:
# Clean up the witness lists taken from the mods files
non_witness_prefixes = ('analysis', 'article', 'clean', 'excerpt', 'letter',
                        'newspaper', 'polar bear', 'position paper',
                        'prepared statement', 'report', 'supplement to')
for text in df:
    # Remove entries that the mods file lists as witnesses although they are
    # not (matched case-insensitively on the start of the entry)
    text['witnesses_mods'] = [
        w for w in text['witnesses_mods']
        if not w.lower().startswith(non_witness_prefixes)
    ]
    if text['identifier'] == '108shrg91748':
        # For this hearing the mods file wrongly lists opening statements as
        # witnesses: remove every entry mentioning a senator
        text['witnesses_mods'] = [
            w for w in text['witnesses_mods'] if 'senator' not in w.lower()
        ]
# Note: There are still several Senators in the mods witnesses. However, manual checking revealed that all of these
# were not committee members giving opening statements but rather invited to testify as witnesses.

In [6]:
# # Print witnesses extracted from mods and the urls to the respective htm and mods
# for i, text in enumerate(df):
#     if text['witnesses_mods'] != []:
#         print(i, text['identifier'])
#         for witness in text['witnesses_mods']:
#             print(witness)
#         get_htm(text['identifier'])
#         get_mods(text['identifier'])
#         print('\n\n')

### Manually add witnesses for hearings with missing witness metadata

In [7]:
# Regular expressions for witness extraction (need to be run in order).
# Each name holds a list of patterns as expected by all_matches().
# The patterns are raw strings so that regex escapes such as \s, \w, \d, \.
# and \( are not treated as (invalid) string escapes; `\n` is then interpreted
# by the regex engine and still matches a newline character.
regex1 = [r'STATEMENTS\n+\s*Page\n+(.+?)\n+\s*(?:APPENDIX|CLIMATE CHANGE)']
regex2 = [r'Testimony of:\n+\s*(.+?)\n+\s*Addit[i]?onal material submitted for the record:']
regex3 = [r'TESTIMONY\n+\s*(.+?)\n+\s*PREPARED']
regex4 = [r'Witnesses:?\n+\s*(.+?)\n\nDiscussion', r'Panel I+:\n+\s*(.+?)Discussion']
regex5 = [r'Witnesses:?\n+\s*(.+?)\n\n']
regex6 = [r'Statement of:\n+\s*(.+?)\n+\s*(?:Letters|COUNTING)']
# Two capture groups: one block of statements per hearing date
regex7 = [r'STATEMENTS\n+\s*\w+\s\d{2},\s\d{4}\n+(.+?)\n+\s*(?:\w+\s\d{2},\s\d{4})(.+?)\n+\s*APPENDIX']
regex8 = [r'Page\s+(.+?)\s+(?:Additional Statement|SUBMISSIONS|Additional Material|\(iii\)|POLICY OPTIONS)']
# The negative lookahead skips statements whose name starts with "Sen"
regex9 = [r'Statement of (?!Sen)(.+?)\.+\s*\d+\n+']
regex10 = [r'Witness:\s+(.+?)Oral Statement']
regex11 = [r'Panel I+\n+\s*(.+?)\n{2,}']
# regex12 = ['STATEMENT[S]* OF\s*(\\b[^a-z]+[a-z]?[^a-z]+\\b)'] 

In [8]:
# For every hearing without witnesses from the mods file, try the transcript
# patterns one after another; the first pattern that yields a match wins.
transcript_patterns = (regex1, regex2, regex3, regex4, regex5, regex6,
                       regex7, regex8, regex9, regex10, regex11)
count = 0
for i, text in enumerate(df):
    text['witnesses_htm'] = []
    if len(text['witnesses_mods']) != 0:
        # Witnesses already available from the mods file
        continue
    for step, regex in enumerate(transcript_patterns, start=1):
        if step == 3:
            # regex3 is only applied to the slice [0:20000] of the transcript
            found = all_matches(regex, str(text['htm'][0:20000]))
        elif step == 7:
            # regex7 matches are joined piecewise into one string each
            found = [''.join(item) for item in all_matches(regex, str(text['htm']))]
        else:
            found = all_matches(regex, str(text['htm']))
        text['witnesses_htm'] = found
        if len(found) > 0:
            break
    if len(text['witnesses_htm']) > 0:
        count += 1
#         Uncomment below to print the matches
#         print(i, text['identifier'], '\n', text['witnesses_htm'])
#         get_htm(text['identifier'])
#         print('\n\n')
    else:
        # No pattern matched: report the index of the hearing
        print(i)
print('Witnesses were found for {} of the {} hearings missing witness information.'.format(count, len(df) - count_MODS))

Witnesses were found for 34 of the 34 hearings missing witness information.


In [9]:
# Post-process the witness information from the transcripts: split the matched
# blocks into individual witnesses, normalise whitespace, drop non-witness
# entries, opening statements and duplicates. The result is stored under
# 'witnesses' (an empty list for hearings without transcript matches).

# Separators between witnesses inside a matched block
witness_separator = re.compile(r'\.*\s{3,}\d+\n*|\.{5,}\n|;\s+(?:and )?')
# Entries starting with one of these are not witnesses (case-sensitive)
non_witness_starts = ('accompanied by', 'Article', 'Biography', 'Prepared',
                      'Paper', 'Responses', 'Written', '---')
# Patterns identifying opening statements (applied to the lower-cased entry);
# raw string so that \w and \s are not invalid string escapes
opening_statement_patterns = ['chairman, house', 'representative in congress',
                              r'ranking \w*\s*member', 'senator']

for text in df:
    text['witnesses'] = []
    if not text['witnesses_htm']:
        continue
    # Split the multi-witness blocks into individual witnesses (flattened)
    pieces = [piece
              for block in text['witnesses_htm']
              for piece in witness_separator.split(block)]
    # Collapse runs of whitespace (including newlines) into single spaces
    witnesses = [re.sub(r'\s+', ' ', piece.strip()) for piece in pieces]
    # Drop empty entries
    witnesses = [w for w in witnesses if w != '']
    # Drop non-witness entries
    witnesses = [w for w in witnesses if not w.startswith(non_witness_starts)]
    # Drop opening statements
    witnesses = [w for w in witnesses
                 if not all_matches(opening_statement_patterns, w.lower())]
    # Drop exact duplicates. dict.fromkeys keeps the first occurrence and the
    # original order, so the result is reproducible across runs (set() would
    # return the entries in an arbitrary order).
    witnesses = list(dict.fromkeys(witnesses))
    # Drop partial duplicates () for hearing '109hhrg29932'
    if text['identifier'] == '109hhrg29932':
        witnesses = [w for w in witnesses if len(w.split()) > 5]
    text['witnesses'] = witnesses

In [10]:
# # Print witnesses extracted from the htm and the urls to the respective htm and mods
# for i, text in enumerate(df):
#     if text['witnesses'] != []:
#         print(i, text['identifier'])
#         for witness in text['witnesses']:
#             print(witness)
#         get_htm(text['identifier'])
#         get_mods(text['identifier'])
#         print('\n\n')

### Join the witness information from both sources

In [11]:
# Combine both sources: hearings without transcript-derived witnesses fall
# back to the witnesses taken from the mods file.
for text in df:
    if text['witnesses'] == []:
        text['witnesses'] = text['witnesses_mods']
# Number of hearings that end up with at least one witness
matched = sum(1 for text in df if text['witnesses'] != [])
print('Witnesses for {} out of {} hearings have been matched.'.format(matched, len(df)))


Witnesses for 117 out of 117 hearings have been matched.


In [12]:
# Correct misspellings in the witness entries and print every corrected entry.
# Each fix is (regex that identifies the entry, wrong text, corrected text).
spelling_fixes = [
    ('Holmstead, Hon. Jeffery', 'Jeffery', 'Jeffrey'),
    ('(American|Business|Defense) Counsel', 'Counsel', 'Council'),
]
for text in df:
    for j, w in enumerate(text['witnesses']):
        for pattern, wrong, right in spelling_fixes:
            if re.search(pattern, w):
                # Build on the already corrected entry so that a later fix
                # does not overwrite an earlier one
                w = w.replace(wrong, right)
                print(w)
                text['witnesses'][j] = w

Holmstead, Hon. Jeffrey, Assistant Administrator for Air and Radiation, Environmental Protection Agency
Thorning, Margo, Ph.D., senior vice president and chief economist, American Council for Capital Formation
Lisa Jacobson, Executive Director, Business Council for Sustainable Energy
Doniger, David, Policy Director, Climate Center, Natural Resources Defense Council


In [13]:
# Add missing witness information: the Brownstein entry of hearing df[3]
# lacks the organisation. The entry is located by its content rather than by
# a fixed list position, so a change in the order or number of witnesses
# cannot cause a different witness to be overwritten.
incomplete_entry = 'Brownstein, Mark S., director, Enterprise Strategy'
completed_entry = 'Brownstein, Mark S., director, Enterprise Strategy, PSEG Service Corporation'
for k, w in enumerate(df[3]['witnesses']):
    if w == incomplete_entry:
        print(w)
        df[3]['witnesses'][k] = completed_entry
        print('>>', df[3]['witnesses'][k], '\n')
        break
else:
    # Nothing to complete (e.g. the cell has already been run)
    print('No incomplete Brownstein entry found in df[3]; nothing changed.')

Brownstein, Mark S., director, Enterprise Strategy
>> Brownstein, Mark S., director, Enterprise Strategy, PSEG Service Corporation 



In [14]:
# Drop the temporary witness variables 'witnesses_mods' and 'witnesses_htm';
# the joined witness list is kept under 'witnesses'.
# NOTE(review): drop_variables is not defined in this notebook (presumably it
# comes from CommitteeHearingsFunctions and removes the listed keys from
# every hearing) - confirm.
df = drop_variables(df, ['witnesses_mods', 'witnesses_htm'])

In [15]:
# Write the hearings, including the extracted witnesses, to a new JSON file
output_path = 'CommitteeHearings/hearings_witnesses.json'
with open(output_path, 'w') as outfile:
    json.dump(df, outfile)