# Extract witness information

#### This script extracts the witnesses present at the hearings from the mod-files and the transcripts

In [1]:
from TextCollection import *

In [2]:
# # Reinstantiate class after changing the Textcollection.py script
# # Do not run this in the last run! Leads to a conflict with pickle.

# from importlib import reload

# os.chdir('/home/mirjam/OneDrive/congress_committees/ArticleOne/Article_Scripts/Hearings')
# import TextCollection; reload(TextCollection)

# # Reinstantiate class
# t.__class__ = HearingsCollection

In [3]:
# Change directory
os.chdir('../../Data/')

<br>

## 1) Preparation: Loading and matching data

<br>

### Load the data

In [4]:
t = load('Hearings/01_hearings_carbon_pricing.pkl')
print('{} hearings were loaded.'.format(len(t)))

265 hearings were loaded.


<br>

### Match and append the MODS (Metadata Object Description Schema) 



In [5]:
source = '../../../../congress_committees/Boussalis & Coan/data/mods/'

t.get_mods(source = source)

In [6]:
files = os.listdir(source)

for text in t.texts:
    for file in files:
        if file.find(text['identifier']) != -1:
            f = codecs.open(source + file, 'r', 'utf-8')
            text['MODS'] = BeautifulSoup(f.read(), 'html.parser')

In [7]:
print('The MODS files of the {} hearings were extracted. Here is the first MODS file:'.format(len(t)))
t.get_key('MODS', 0)

The MODS files of the 265 hearings were extracted. Here is the first MODS file:


<mods id="P0b002ee182a214ff" version="3.3" xmlns="http://www.loc.gov/mods/v3" xmlns:exslt="http://exslt.org/common" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-3.xsd">
<name type="corporate">
<namepart>United States Government Publishing Office</namepart>
<role>
<roleterm authority="marcrelator" type="text">publisher</roleterm>
<roleterm authority="marcrelator" type="code">pbl</roleterm>
</role>
<role>
<roleterm authority="marcrelator" type="text">distributor</roleterm>
<roleterm authority="marcrelator" type="code">dst</roleterm>
</role>
</name>
<name type="corporate">
<namepart>United States</namepart>
<namepart>Congress</namepart>
<namepart>House of Representatives</namepart>
<role>
<roleterm authority="marcrelator" type="text">author</roleterm>
<roleterm authority="marcrelator" type="code">aut</roleterm>
</role>
<description>Government Organ

<br>

## 2) Extract the witness information

<br>

### Extract witness information from the MODS files



In [8]:
count_MODS = 0
for text in t.texts:
    text['witnesses_MODS'] = []
    #Extract the witnesses
    text['witnesses_MODS'] = find_string.all_matches(['<witness>(.+?)</witness>'], str(text['MODS']))
    #Replace wrongly parsed ampersands
    for i, witness in enumerate(text['witnesses_MODS']):
        text['witnesses_MODS'][i] = witness.replace('&amp;','&')
    # Count hearings with sucessfully extracted witness information
    if len(text['witnesses_MODS']) > 0:
        count_MODS += 1
print('Witnesses were found for {} of the {} hearing.'.format(count_MODS, len(t)))

Witnesses were found for 185 of the 265 hearing.


In [9]:
# Correct parsing errors
t.texts[29]['witnesses_MODS'][2] = 'Doniger, David D., policy director, Climate Center, Natural Resources Defense Council'
t.texts[29]['witnesses_MODS'][3] = 'Lomborg, Bjorn, adjunct professor, Copenhagen Consensus Center'
t.texts[47]['witnesses_MODS'][2] = 'Scott, Douglas P., director, Illinois Environmental Protection Agency'
t.texts[125]['witnesses_MODS'][16] = 'Brossy, Fred, organic wheat, bean, potato, and hay producer, Shoshone, ID'

In [10]:
# Remove reports, letters, articles and other submitted material from the witness list and split witnesses
count = 0
for i, text in enumerate(t.texts):
    # Remove submitted material
    for j, witness in enumerate(text['witnesses_MODS']):
        if len(re.findall('submitted|letter|report|article|document|transcript|^prepared statement$', witness.lower())) > 0:
            count+=1
            # Uncomment to display entries to be deleted
#             print('Hearing {}: {}'.format(i, witness))
            t.texts[i]['witnesses_MODS'][j] = None
    # Delete empty list entries
    t.texts[i]['witnesses_MODS'] = [witness for witness in t.texts[i]['witnesses_MODS'] if witness]
    # Split accompanying witnesses
    for j, witness in enumerate(text['witnesses_MODS']):
        if len(re.findall('accompanied', witness.lower())) > 0:
            # Uncomment to display entries to be split
#             print('Hearing {}: {}'.format(i, witness))
            text['witnesses_MODS'][j] = re.split(r';?\s?accompanied by\:?\s?', witness.lower())
    #Flatten the list
    text['witnesses_MODS'] = flatten(text['witnesses_MODS'])
    # Split witnesses with page numbers
    for j, witness in enumerate(text['witnesses_MODS']):
        if len(re.findall('\d+(?:,\s\d+\s?)+\s', witness.lower())) > 0:
            # Uncomment to display entries to be split
#             print('Hearing {}: {}'.format(i, witness))
            text['witnesses_MODS'][j] = re.split('\d+(?:,\s\d+\s?)+\s', witness.lower())
    #Flatten the list
    text['witnesses_MODS'] = flatten(text['witnesses_MODS'])   
    
print('{} non-witness entries were removed.'.format(count))

86 non-witness entries were removed.


In [11]:
t.texts[40]['witnesses_MODS']

['tim adams, managing director, the lindsey group',
 'richard berner, phd, managing director and chief us economist, morgan stanley',
 'honorable steven chu, secretary, us department of energy',
 'robert a. dennis, assistant director for macroeconomics analysis',
 'douglas w. elemendorf, phd, director, congr',
 'honorable timothy f. geithner, secretary, us department of the treasury',
 'douglas holtz-eakin, phd, president, dhe consulting, llc',
 'simon johnson, phd, ronald kurtz professor of enterpreneurship, mit sloan school of management and senior fellow, peterson institute for international economics',
 'honorable peter r. orszag, director, us office of management and budget',
 'rudolph g. penner, phd,',
 'Ranking Member Gregg',
 'senator bunning',
 'senator crapo',
 'senator ensign',
 'senator enzi',
 'senator feingold',
 'senator graham',
 'senator murray',
 'senator nelson',
 'senator warner']

In [12]:
# Summary:
witnesses = 0
hearings = 0
for text in t.texts:
    if text['witnesses_MODS'] != []:
        hearings += 1
        for witness in text['witnesses_MODS']:
            witnesses += 1
        
print('{} witnesses were found at {} of the {} hearings.\n\n'.format(witnesses, hearings, len(t)))
print('Here is an example:')
[witness for witness in t.texts[0]['witnesses_MODS']]

1358 witnesses were found at 185 of the 265 hearings.


Here is an example:


['John McMackin, Jr., Williams and Jenson, PLLC, On Behalf Of The Energy Intensive Manufacturers Working Group on Greenhouse Gas Regulations',
 'Martin McBroom, Director, Federal Environmental Affairs, American Electric Power',
 'Paul Cicio, Industrial Energy Consumers of America',
 'Margo Thorning, Ph.D., Senior Vice President and Chief Economist, American Council for Capital Formation',
 'Richard D. Morgenstern, Senior Fellow, Resources for the Future',
 'Eileen Claussen, President, Pew Center on Global Climate Change']

<br>

### Extract remaining witness information from the transcripts

In [13]:
# Resular expressions for witness extraction (need to be run in order)
regex1 = ['STATEMENTS\n+\s*Page\n+(.+?)\n+\s*APPENDIX']
regex2 = ['TESTIMONY\n+\s*(.+?)\n+\s*PREPARED']
regex3 = ['Witnesses:?\n+\s*(.+?)\n\nDiscussion', 'Panel I+:\n+\s*(.+?)Discussion']
regex4 = ['Witnesses:?\n+\s*(.+?)\n\n']
regex5 = ['STATEMENT[S]* OF\s*(\\b[^a-z]+[a-z]?[^a-z]+\\b)']
regex6 = ['\n(\w.+?)\.{3,}\s+\d+'] # For last hearing (33)

In [14]:
# Extract witnesses consecutively with the regular expressions defined above
missing = len(t) - count_MODS
count = 0
for i, text in enumerate(t.texts):
    text['witnesses_transcript'] = []
    if len(text['witnesses_MODS']) == 0:
        #Extract the witnesses
        text['witnesses_transcript'] = find_string.all_matches(regex1, str(text['content_raw']))
        if len(text['witnesses_transcript']) == 0:
            text['witnesses_transcript'] = find_string.all_matches(regex2, str(text['content_raw']))
        if len(text['witnesses_transcript']) == 0:
            text['witnesses_transcript'] = find_string.all_matches(regex3, str(text['content_raw'][0:20000]))
        if len(text['witnesses_transcript']) == 0:
            text['witnesses_transcript'] = find_string.all_matches(regex4, str(text['content_raw']))
        if len(text['witnesses_transcript']) == 0:
            text['witnesses_transcript'] = find_string.all_matches(regex5, str(text['content_raw']))
        if len(text['witnesses_transcript']) == 0:
            text['witnesses_transcript'] = find_string.all_matches(regex6, str(text['content_raw'][3000:5000]))
        if len(text['witnesses_transcript']) > 0:
                count += 1
        else:
            print(i)
print('Witnesses were found for {} of the {} hearings missing witness information.'.format(count, missing))

Witnesses were found for 80 of the 80 hearings missing witness information.


In [15]:
t.print_key_i('witnesses_transcript', 39)

['HON. JOHN F. KERRY,\n                U.S. SENATOR FROM MASSACHUSETTS\n\n    ', 'HON. RICHARD G. LUGAR,\n                   U.S. SENATOR FROM INDIANA\n\n    ', 'LORD NICHOLAS STERN, CHAIR OF THE GRANTHAM \n   RESEARCH INSTITUTE ON CLIMATE CHANGE AND THE ENVIRONMENT, \n   LONDON SCHOOL OF ECONOMICS AND POLITICAL SCIENCE, LONDON, \n                         UNITED KINGDOM\n\n    ', 'JAMES E. ROGERS, PRESIDENT AND CHIEF EXECUTIVE \n              OFFICER, DUKE ENERGY, CHARLOTTE, NC\n\n    Mr. ']


In [16]:
# Post-process the witness information from the transcripts
for i, text in enumerate(t.texts):
    if text['witnesses_transcript'] != []:
        for j, witnesses in enumerate(text['witnesses_transcript']):
            # Make the witnesses lower case with leading capital letters
            text['witnesses_transcript'][j] = witnesses.title()
            # Split the multi-witness blocks into individual witnesses
            text['witnesses_transcript'][j] = re.split(r'\.*\s{3,}\d+\n*|\.{5,}\n', text['witnesses_transcript'][j])
        # Flatten the list while removing empty entries
        text['witnesses_transcript'] = [item for sublist in text['witnesses_transcript'] for item in sublist if item]
        for j, witnesses in enumerate(text['witnesses_transcript']):     
            # Split accompanying witnesses (split with semicolon when '; And', '; John' or 'Accompanied By' is present)
            if re.findall(r'; And|; John|Accompanied By', text['witnesses_transcript'][j]):
                text['witnesses_transcript'][j] = re.split(r';|\s?Accompanied By\s', text['witnesses_transcript'][j])
        # Flatten the list while removing empty entries
        text['witnesses_transcript'] = flatten(text['witnesses_transcript'])
        for j, witnesses in enumerate(text['witnesses_transcript']):
            # Remove unnecessary whitespace
            temp = re.sub('[\n\s]+', ' ', witnesses.strip())
            # Remove leading 'And' and trailing salutations
            text['witnesses_transcript'][j] = re.sub('\AAnd\s|(\sIii)?\s(Mr.|Ms.|Mrs.|Dr.)$|\sIii\s?$', '', temp)
        # Uncomment the following line to print the witnesses    
        print(i, text['witnesses_transcript'], '\n')

11 ['Bingaman, Hon. Jeff, U.S. Senator From New Mexico', 'Chavez, Martin J., U.S. Conference Of Mayors, And Mayor, Albuquerque, New Mexico', 'Collier, Alicia, Director, Global Energy Policy, Honeywell Building Solutions, Honeywell International, On Behalf Of Federal Performance Contracting Coalition (Fpcc)', 'Domenici, Hon. Pete V., U.S. Senator From New Mexico', 'Kerr, James Y., Ii, Commissioner, North Carolina Public Utilities Commission, And President, National Association Of Regulatory Utility Commissioners (Naruc)', 'Mizroch, John, Principal Deputy Assistant Secretary For Energy Efficiency And Renewable Energy, Department Of Energy', 'Pitsor, Kyle, Vice President, Government Relations, National Electrical Manufacturers Association (Nema)', 'Prindle, William, Acting Executive Director, American Council For An Energy Efficient Economy (Aceee)', 'Schjerven, Robert E., Chief Executive Officer Emeritus, Lennox International, Inc., On Behalf Of The Gas Appliance Manufacturers Associatio

In [17]:
# Remove written/prepared statements and biography entries from the witness list
count = 0
for i, text in enumerate(t.texts):
    for j, witness in enumerate(text['witnesses_transcript']):
#         if len(witness.split()) <= 4:
        if re.findall('written statement|biography|prepared statement', witness.lower()) != []:
            count+=1
            # Uncomment to display entries to be deleted
            #print('Hearing {}: {}'.format(i, witness))
            t.texts[i]['witnesses_transcript'][j] = None
    # Delete empty list entries
    t.texts[i]['witnesses_transcript'] = [witness for witness in t.texts[i]['witnesses_transcript'] if witness] 
        
print('{} non-witness entries were removed.'.format(count))

120 non-witness entries were removed.


In [18]:
[witness for witness in t.texts[110]['witnesses_transcript']]

['Philip Cooney, Former Chief Of Staff Of The White House Council On Environmental Quality',
 'James Hansen, Director, Nasa Goddard Institute For Space Studies',
 'George Deutsch, Former Nasa Public Affairs Officer Statement Of Philip A. Cooney',
 'James E. Hansen',
 'George C. Deutsch',
 'James L. Connaughton, Chairman, White House Council On Environmental Quality',
 'Roy Spencer, University Of Alabama, Huntsville']

In [19]:
# Remove trailing 'statement of ...' text snippets (wrongly parsed)'
count = 0
for i, text in enumerate(t.texts):
    for j, witness in enumerate(text['witnesses_transcript']):
        if len(re.findall('^(.+?)\s[Ss]tatement [Oo]f', witness)) != 0:
            print(witness)
            t.texts[i]['witnesses_transcript'][j] = re.findall('^(.+?)\s[Ss]tatement [Oo]f', witness)[0]
            count += 1
print('\n >>> {} trailing "statement of ..." text snippets were removed.'.format(count))

Thomas Karl, Director, National Climatic Data Center, National Oceanic And Atmospheric Administration Statement Of Jim Connaughton
Jay Gulledge, Senior Research Fellow For Science & Impacts, Pew Center On Global Climate Change Statement Of Judith Curry
Marshall Herskovitz, Producer/Director/Writer, Television And Films Statement Of Theodore Roosevelt Iv
George Deutsch, Former Nasa Public Affairs Officer Statement Of Philip A. Cooney
Leslie Kass, Senior Director Of Business Policy And Programs, Nuclear Energy Institute Statement Of Peter Bradford
Christopher Guith, Vice President Of Public Policy, U.S. Chamber Of Commerce Statement Of Mark Cooper
John Cline, Partner, Troutman Sanders Llp Statement Of Ron Curry

 >>> 7 trailing "statement of ..." text snippets were removed.


In [20]:
# Summary:
witnesses = 0
hearings = 0
for text in t.texts:
    if text['witnesses_transcript']:
        hearings += 1
        for witness in text['witnesses_transcript']:
            witnesses += 1
        
print('{} witnesses were found at {} of the {} hearings.\n\n'.format(witnesses, hearings, len(t)))
print('Here is an example:')
[witness for witness in t.texts[11]['witnesses_transcript']]

733 witnesses were found at 80 of the 265 hearings.


Here is an example:


['Bingaman, Hon. Jeff, U.S. Senator From New Mexico',
 'Chavez, Martin J., U.S. Conference Of Mayors, And Mayor, Albuquerque, New Mexico',
 'Collier, Alicia, Director, Global Energy Policy, Honeywell Building Solutions, Honeywell International, On Behalf Of Federal Performance Contracting Coalition (Fpcc)',
 'Domenici, Hon. Pete V., U.S. Senator From New Mexico',
 'Kerr, James Y., Ii, Commissioner, North Carolina Public Utilities Commission, And President, National Association Of Regulatory Utility Commissioners (Naruc)',
 'Mizroch, John, Principal Deputy Assistant Secretary For Energy Efficiency And Renewable Energy, Department Of Energy',
 'Pitsor, Kyle, Vice President, Government Relations, National Electrical Manufacturers Association (Nema)',
 'Prindle, William, Acting Executive Director, American Council For An Energy Efficient Economy (Aceee)',
 'Schjerven, Robert E., Chief Executive Officer Emeritus, Lennox International, Inc., On Behalf Of The Gas Appliance Manufacturers Assoc

<br>

### Merge the witnesses

In [21]:
# Merge witnesses
for i, text in enumerate(t.texts):
    if text['witnesses_MODS'] != []:
        t.texts[i]['witnesses'] = text['witnesses_MODS'].copy()
    else:
        t.texts[i]['witnesses'] = text['witnesses_transcript'].copy()

In [22]:
# Summary:
witnesses = 0
hearings = 0
for text in t.texts:
    if text['witnesses'] != []:
        hearings += 1
        for witness in text['witnesses']:
            witnesses += 1
        
print('{} witnesses were found at {} of the {} hearings.'.format(witnesses, hearings, len(t)))

2091 witnesses were found at 265 of the 265 hearings.


<br>

## 3) Post-process the witnesses

<br>

### Remove Members of Congress

In [23]:
# Remove all Members of Congress from the witness list
count = 0

for i, text in enumerate(t.texts):
    for j, witness in enumerate(text['witnesses']):
        if (not find_string.all_matches(['former', 'retired', 'state senate'], str(witness.lower()))
        and find_string.all_matches(['senator', 'representative in congress', 'representative from the state', 'opening statement', 'ranking member'], str(witness.lower()))):
#             print(i, j, witness)
            count += 1
            text['witnesses'][j] = None
    # Delete empty list entries
    text['witnesses'] = [i for i in text['witnesses'] if i]

print('{} politicians were removed from the witness list.'.format(count))

273 politicians were removed from the witness list.


<br>

### Remove Duplicates

In [24]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

#### Fuzzywuzzy Partial Ratio

In [25]:
# Create base name for better matching (remove initials, nicknames and suffixes))
for i, text in enumerate(t.texts):
    t.texts[i]['witnesses_base'] = text['witnesses'].copy() 
    for j, witness in enumerate(text['witnesses_base']):
    # Remove leading inititals
        try:
            t.texts[i]['witnesses_base'][j] = re.findall('^(?:\s?[A-Z]{1}\.{1}\s)(.+)', witness)[0]
        except:
            pass
    # Remove middle inititals
        try:
            t.texts[i]['witnesses_base'][j] = ' '.join(re.findall('(.+?)(?:\s?[A-Z]{1}\.{1}\s?|\s[A-Z]{2}\s){1,2}(.*)', witness)[0])
        except:
            pass
    # Remove nicknames
        try:
            t.texts[i]['witnesses_base'][j] = ' '.join(re.findall('(.+?)\s(?:\(.+?\))\s?(.+)?', witness)[0])
        except:
            pass
    # Remove suffixes (except for Pielke Sr and Jr as both are sceptics)
        try:
            t.texts[i]['witnesses_base'][j] = re.findall('(.+?)(?<!Pielke)(?:,?\s(Jr|Sr))', witness)[0][0]
        except:
            pass
    # Remove special characters
        t.texts[i]['witnesses_base'][j] = unidecode.unidecode(t.texts[i]['witnesses_base'][j])

In [26]:
# Create fuzz.partial_ratio for all witnesses per text using the witnesses without initials
for i, text in enumerate(t.texts):
    text['duplicates_partial'] = []
    for j, witness in enumerate(text['witnesses_base']):
        text['duplicates_partial'].append([])
        count = 1
        while True:
            try:
                text['duplicates_partial'][j].append(fuzz.partial_ratio(text['witnesses_base'][j], 
                                                  text['witnesses_base'][j + count]))
                count += 1
            except:
                break

In [27]:
# Print partial duplicates
for i, text in enumerate(t.texts):
    if text['duplicates_partial'] is not None:
        for j, witness in enumerate(text['duplicates_partial']):
            for k, ratio in enumerate(witness):
                if text['duplicates_partial'][j][k] >= 95:
                    print(witness[k], '% similarity',  '\n\n',
                          i, j, '\t--> \t', text['witnesses'][j], '\n',
                          i, j+k+1,'\t--> \t', text['witnesses'][k+j+1], '\n\n')

100 % similarity 

 38 2 	--> 	 David Owens, Executive Vice President, Edison Electric Institute 
 38 24 	--> 	 David Owens, Executive Vice President, Edison Electric Institute 


100 % similarity 

 38 5 	--> 	 David Hawkins, Natural Resources Defense Council 
 38 14 	--> 	 David Hawkins, Natural Resources Defense Council 


100 % similarity 

 45 6 	--> 	 Smith, Matt, on behalf of the Society of American Foresters, Falconer, New York 
 45 7 	--> 	 Smith, Matt, on behalf of the Society of American Foresters, Falconer, New York 


100 % similarity 

 104 1 	--> 	 Thomas Karl, Director, National Climatic Data Center, National Oceanic And Atmospheric Administration 
 104 2 	--> 	 Thomas Karl 


100 % similarity 

 104 4 	--> 	 John R. Christy, Professor And Director, Earth System Science Center, Nsstc, University Of Alabama In Huntsville 
 104 7 	--> 	 John R. Christy 


100 % similarity 

 104 5 	--> 	 Roger A. Pielke, Jr., Center For Science And Technology Policy Research, University O

In [28]:
# Delete the shorter version of the partial duplicates 
count = 0

for i, text in enumerate(t.texts):
    if text['duplicates_partial'] is not None:
        for j, witness in enumerate(text['duplicates_partial']):
            for k, ratio in enumerate(witness):
                if text['duplicates_partial'][j][k] == 100:
                    if text['witnesses'][j] is not None and text['witnesses'][j+k+1] is not None:
                        count += 1
                        if len(text['witnesses'][j+k+1]) >= len(text['witnesses'][j]):
                            text['witnesses'][j] = None
                            text['witnesses_base'][j] = None
                        else:
                            text['witnesses'][j+k+1] = None
                            text['witnesses_base'][j+k+1] = None
    # Delete empty list entries
        text['witnesses'] = [i for i in text['witnesses'] if i]
        text['witnesses_base'] = [i for i in text['witnesses_base'] if i]

print('{} partial duplicates were removed'.format(count))

29 partial duplicates were removed


In [29]:
t.remove_key('duplicates_partial')
t.remove_key('witnesses_base')

<br>

### Summary

In [30]:
# Summary:
witnesses = 0
hearings = 0
nowitnesses = 0

for text in t.texts:
    if text['witnesses'] != []:
        hearings += 1
        for witness in text['witnesses']:
            witnesses += 1
    else:
        nowitnesses +=1

print('{} witnesses were found at {} of the {} hearings.\n\n'.format(witnesses, hearings, len(t)))

t.texts = [text for text in t.texts if text['witnesses'] != []]
        
print('The {} hearings without witnesses (only MoCs present) were removed from the dataset. The new dataset contains {} hearings.'.format(nowitnesses, hearings))

1789 witnesses were found at 263 of the 265 hearings.


The 2 hearings without witnesses (only MoCs present) were removed from the dataset. The new dataset contains 263 hearings.


<br>

### Save data


In [31]:
# Run for maximum recursion error during saving

import resource
import sys

print(resource.getrlimit(resource.RLIMIT_STACK))
print(sys.getrecursionlimit())

# May segfault without this line. 0x100 is a guess at the size of each stack frame.
max_rec = 0x100000
resource.setrlimit(resource.RLIMIT_STACK, [0x100 * max_rec, resource.RLIM_INFINITY])
sys.setrecursionlimit(max_rec)

print(sys.getrecursionlimit())

(8388608, -1)
3000
1048576


In [32]:
save_as(t, 'Hearings/02_witnesses.pkl') # Last completed on Oct 30, 2020