## Collecting the Congressional Hearings

### Before you start
Make sure the Python modules used by this program are available: <i>os</i> and <i>urllib</i> ship with Python, so only <i>selenium</i> has to be installed separately.<br>
If you are setting up selenium for the first time, you will also need to install <a href="https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path">Geckodriver</a> and make sure its executable is on your PATH, so that selenium can control Firefox.<br>

### Basic program setup
There are two basic steps in the setup:<br>
(1) load the Python modules that will be used in the program. This program uses the Operating System module ("os") which allows the program to work on your hard drive, the urllib module ("urllib") to collect text from a webpage, and the Selenium module ("selenium") to mimic the user's click routine on the page.<br>
(2) set the working directory to where the text files should be stored.

In [None]:
"""
Created on Fri Oct 23 13:23:47 2015
@author: Jeff and Johannes for CapitolQuery
"""

import os
import re
import urllib
import urllib.request

from selenium import webdriver


## SET your storing directory here: replace the placeholder string below with
## the path of an existing folder. The rest of the program works relative to
## the current working directory (the browser download folder is taken from
## os.getcwd() and the downloaded files are opened by bare file name).
os.chdir("your storing directory")


### Set up the browser


In [None]:
# Firefox profile configured so that downloads never trigger a dialog.
profile = webdriver.FirefoxProfile()

# Preference name -> value, applied in the order listed.
download_preferences = {
    'browser.download.folderList': 2,  # custom location
    'browser.download.manager.showWhenStarting': False,
    'browser.download.dir': os.getcwd(),  # use current working directory
    'browser.helperApps.neverAsk.saveToDisk': 'text/csv',
}
for preference_name, preference_value in download_preferences.items():
    profile.set_preference(preference_name, preference_value)



### Start the browser

In [None]:
# Landing page of the Congressional Hearings collection.
HEARINGS_COLLECTION_URL = "http://www.gpo.gov/fdsys/browse/collection.action?collectionCode=CHRG"
# Implicit wait handed to the driver for element lookups.
ELEMENT_WAIT_SECONDS = 20

# Launch Firefox with the prepared profile and open the collection page.
browser = webdriver.Firefox(profile)
browser.implicitly_wait(ELEMENT_WAIT_SECONDS)
browser.get(HEARINGS_COLLECTION_URL)



### Loop through all sessions of Congress

In [None]:
# User-Agent header sent with every direct (non-browser) download request.
USER_AGENT = "Mozilla/56.0.1 (Windows 10; Win64; x64)"

# Locators for the successive levels of the browse tree.
CONGRESS_XPATH = ".//div[@class='level1 browse-level']/a"
CHAMBER_XPATH = ".//div[@class='level2 browse-level']/a"
HEARING_XPATH = ".//tr[td[span[@class='results-line2']]]"


def _download(url, file_name):
    """Fetch *url* and write the raw response bytes to *file_name*.

    The whole response is read before the file is opened, so a failed
    request does not leave an empty file behind. An existing file with the
    same name is overwritten.
    """
    request = urllib.request.Request(url, headers={'User-Agent': USER_AGENT})
    # Context managers close the connection and the file even on errors.
    with urllib.request.urlopen(request) as response:
        payload = response.read()
    with open(file_name, 'wb') as out_file:
        out_file.write(payload)


def _find_committee_links():
    """Return the committee links of the current page.

    The first match is dropped because it is an erroneous committee name
    picked up from a sidebar. Slicing (instead of pop(0)) keeps this safe
    when the page has no matching link at all.
    """
    return browser.find_elements_by_partial_link_text("Committee")[1:]


# NOTE: all loops below iterate by position and look the elements up again at
# the top of every iteration instead of iterating over the element objects --
# presumably because clicking/navigating makes earlier references stale.

#find the elements by congress
congressElements = browser.find_elements_by_xpath(CONGRESS_XPATH)

#loop through congresses
for counter in range(len(congressElements)):

    #re-establish the element locations
    congressElements = browser.find_elements_by_xpath(CONGRESS_XPATH)

    #save the name of the congress, then open it
    congress_name = congressElements[counter].text
    congressElements[counter].click()

    #record chamber element locations
    chamberElements = browser.find_elements_by_xpath(CHAMBER_XPATH)

    #loop through chambers
    for chamber_counter in range(len(chamberElements)):

        #re-establish the chamber element locations
        chamberElements = browser.find_elements_by_xpath(CHAMBER_XPATH)

        #save the name of the chamber, then open it
        chamber_name = chamberElements[chamber_counter].text
        chamberElements[chamber_counter].click()

        #record committee element locations
        committeeElements = _find_committee_links()

        #loop through committees
        for committee_counter in range(len(committeeElements)):

            #re-establish the committee element locations
            committeeElements = _find_committee_links()

            #save the name of the committee, then open it
            committee_name = committeeElements[committee_counter].text
            committeeElements[committee_counter].click()

            #record hearing element locations
            hearingElements = browser.find_elements_by_xpath(HEARING_XPATH)

            #loop through hearings
            for hearing_counter in range(len(hearingElements)):

                #re-establish hearing element locations
                hearingElements = browser.find_elements_by_xpath(HEARING_XPATH)

                #navigate to the line with the hearing elements
                hearing_line = hearingElements[hearing_counter]

                #save the hearing date & clean it slightly: every dot is
                #removed (the date text ends with one)
                hearing_date = hearing_line.find_element_by_xpath(".//span[@class='results-line2']").text
                hearing_date = hearing_date.replace('.', '')

                #collect both links before leaving the page
                #"Text" link -> hearing transcript, "More" link -> details page
                hearing_text_link = hearing_line.find_element_by_partial_link_text("Text").get_attribute('href')
                hearing_MODS_page_link = hearing_line.find_element_by_partial_link_text("More").get_attribute('href')

                #Storing the hearing text
                # NOTE(review): the name is built only from congress, chamber,
                # committee and date, so two hearings of one committee on the
                # same date end up in the same file (the later one wins).
                file_name_elements = [congress_name, chamber_name, committee_name, hearing_date]
                file_name = "_".join(file_name_elements) + ".txt"
                _download(hearing_text_link, file_name)

                #Storing the hearing metadata (MODS)
                #go to the details page and take the MODS link from there
                browser.get(hearing_MODS_page_link)

                file_name_elements = [congress_name, chamber_name, committee_name, hearing_date, 'MODS']
                file_name = "_".join(file_name_elements) + ".xml"

                hearing_MODS_file_link = browser.find_element_by_partial_link_text("MODS").get_attribute('href')
                _download(hearing_MODS_file_link, file_name)

                #return from the details page to the hearing list
                browser.back()

            #undo the committee click
            browser.back()

        #undo the chamber click
        browser.back()

    #undo the congress click
    browser.back()