## XML Web Scraping Roll Call Data

In [471]:
#import libs
import re
import urllib
import urllib.request
import xml.etree.ElementTree as ET
from time import sleep

import numpy as np
import pandas as pd
from sklearn.externals import joblib

In [272]:
# Smoke test: download one roll-call vote XML document from senate.gov.
# `the_page` holds whatever response.read() returned and is parsed in a later cell.
url = "https://www.senate.gov/legislative/LIS/roll_call_votes/vote1142/vote_114_2_00081.xml"
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    the_page = response.read()

In [457]:
# Parse the downloaded XML held in `the_page`; `root` is the top-level element.
root = ET.fromstring(the_page)

In [454]:
# Print the tag and text of every direct child of the root element.
# NOTE(review): `children` is initialised here but never filled or read in this cell.
children = {}
for child in root:
    print(child.tag, ":", child.text)

congress : 114
session : 2
congress_year : 2016
vote_number : 163
vote_date : December 10, 2016,  12:57 AM
modify_date : December 10, 2016,  01:26 AM
vote_question_text : On the Motion (Motion to Concur in the House Amendment to S. 612)
vote_document_text : A bill to designate the Federal building and United States courthouse located at 1300 Victoria Street in Laredo, Texas, as the "George P. Kazen Federal Building and United States Courthouse".
vote_result_text : Motion Agreed to (78-21)
question : On the Motion
vote_title : Motion to Concur in the House Amendment to S. 612
majority_requirement : 1/2
vote_result : Motion Agreed to
document : 
    
amendment : 
    
count : 
    
tie_breaker : 
    
members : 
    


In [463]:
# Bill info for session 1: map each vote key to a {tag: text} dict built from
# the direct children of that vote's root element.
# master_roots_1 must already be populated (it is built in the "session 1" cell).
info = {}
for vote_key, vote_root in master_roots_1.items():
    info[vote_key] = {field.tag: field.text for field in vote_root}
    

In [468]:
# Bill info for session 2, built the same way as `info`. Keys are shifted by 339
# (and lose their zero padding) so they continue after the session-1 numbers.
info_2 = {}
for vote_key, vote_root in master_roots_2.items():
    shifted_key = str(int(vote_key) + 339)
    info_2[shifted_key] = {field.tag: field.text for field in vote_root}

In [177]:
# Inspect the children of root[-5]; in the child listing printed earlier the
# fifth-from-last child is 'document'.
for element in root[-5]:
    print (element)

<Element 'document_congress' at 0x000001F540702B88>
<Element 'document_type' at 0x000001F540702BD8>
<Element 'document_number' at 0x000001F540702C28>
<Element 'document_name' at 0x000001F540702C78>
<Element 'document_title' at 0x000001F540702CC8>
<Element 'document_short_title' at 0x000001F540702D18>


In [178]:
# Inspect the children of root[-4]; in the child listing printed earlier the
# fourth-from-last child is 'amendment'.
for element in root[-4]:
    print (element)

<Element 'amendment_number' at 0x000001F540702DB8>
<Element 'amendment_to_amendment_number' at 0x000001F540702E58>
<Element 'amendment_to_amendment_to_amendment_number' at 0x000001F540702EF8>
<Element 'amendment_to_document_number' at 0x000001F540702F98>
<Element 'amendment_to_document_short_title' at 0x000001F540703048>
<Element 'amendment_purpose' at 0x000001F540703098>


In [179]:
# Inspect the children of root[-3]; in the child listing printed earlier the
# third-from-last child is 'count'.
for element in root[-3]:
    print (element)

<Element 'yeas' at 0x000001F540703188>
<Element 'nays' at 0x000001F5407031D8>
<Element 'present' at 0x000001F540703228>
<Element 'absent' at 0x000001F540703278>


In [180]:
# Inspect the children of root[-2]; in the child listing printed earlier the
# second-from-last child is 'tie_breaker'.
for i in root[-2]:
    print (i)

<Element 'by_whom' at 0x000001F540703318>
<Element 'tie_breaker_vote' at 0x000001F540703368>


In [181]:
# root[-1] is the last child of the root ('members' in the listing printed
# earlier); print the text of each field of its first entry.
for element in root[-1][0]:
    print (element.text)

Alexander (R-TN)
Alexander
Lamar
R
TN
Nay
S289


In [182]:
# Print the child elements (rather than their text) of the first entry of
# root[-1], to see which tag sits at which position.
for i in root[-1][0]:
    print (i)

<Element 'member_full' at 0x000001F540703458>
<Element 'last_name' at 0x000001F5407034A8>
<Element 'first_name' at 0x000001F5407034F8>
<Element 'party' at 0x000001F540703548>
<Element 'state' at 0x000001F540703598>
<Element 'vote_cast' at 0x000001F5407035E8>
<Element 'lis_member_id' at 0x000001F540703638>


In [183]:
# Parse members: pull the per-member fields out of root[-1] by position.
# Positions used: 1 and 2 -> name parts, 3 -> party, 4 -> state,
# 5 -> vote cast, 6 -> member id.
member_names = [member[1].text + ' ' + member[2].text for member in root[-1]]
parties = [member[3].text for member in root[-1]]
states = [member[4].text for member in root[-1]]
votes = [member[5].text for member in root[-1]]
ids = [member[6].text for member in root[-1]]

In [184]:
# Assemble the parsed member fields into a frame with one row per member.
member_columns = {
    'member': member_names,
    'party': parties,
    'state': states,
    'vote': votes,
    'idx': ids,
}
member_data = pd.DataFrame(member_columns)

In [185]:
# Display the frame transposed, so each member becomes a column.
member_data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
idx,S289,S340,S354,S317,S330,S341,S342,S370,S343,S223,...,S303,S384,S351,S326,S299,S327,S366,S316,S318,S247
member,Alexander Lamar,Ayotte Kelly,Baldwin Tammy,Barrasso John,Bennet Michael,Blumenthal Richard,Blunt Roy,Booker Cory,Boozman John,Boxer Barbara,...,Thune John,Tillis Thomas,Toomey Pat,Udall Tom,Vitter David,Warner Mark,Warren Elizabeth,Whitehouse Sheldon,Wicker Roger,Wyden Ron
party,R,R,D,R,D,D,R,D,R,D,...,R,R,R,D,R,D,D,D,R,D
state,TN,NH,WI,WY,CO,CT,MO,NJ,AR,CA,...,SD,NC,PA,NM,LA,VA,MA,RI,MS,OR
vote,Nay,Nay,Yea,Nay,Nay,Yea,Nay,Yea,Nay,Not Voting,...,Nay,Nay,Nay,Yea,Nay,Yea,Yea,Yea,Nay,Yea


In [189]:
def get_votes(root):
    """Return the vote cast by each member of a parsed roll-call document.

    The members are the children of the last child of ``root``; the vote is
    read from position 5 of each member element.
    """
    return [member[5].text for member in root[-1]]
def get_member_id(root):
    """Return the member id of each member of a parsed roll-call document.

    The members are the children of the last child of ``root``; the id is
    read from position 6 of each member element.

    Returns:
        list of str, one entry per member, in document order (empty when the
        document lists no members).
    """
    # The original built this list but never returned it, so callers got None.
    return [member[6].text for member in root[-1]]
    

In [196]:
def padded(num):
    """Return ``str(num)`` left-padded with zeros to a width of five.

    Values whose string form is already five or more characters long are
    returned unchanged (as a string).
    """
    return str(num).rjust(5, "0")

In [None]:
#create master dict of roots for each bill

In [205]:
# Same fetch-and-parse steps as the single-file test, collected into a dict
# keyed by the zero-padded vote number (only votes 1 and 2 here).
master_roots = {}
for vote_number in (1, 2):
    n = padded(vote_number)
    url = "https://www.senate.gov/legislative/LIS/roll_call_votes/vote1141/vote_114_1_{}.xml".format(n)
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        the_page = response.read()
    root = master_roots[n] = ET.fromstring(the_page)

In [206]:
# Display the dict to confirm both vote numbers were fetched and parsed.
master_roots

{'00001': <Element 'roll_call_vote' at 0x000001F541D0F7C8>,
 '00002': <Element 'roll_call_vote' at 0x000001F541D49548>}

In [230]:
# Leftover check: walks the padded keys for votes 1-163. With the print below
# commented out, the loop only rebinds `i` and `n` and produces no output.
for i in range(1,164):
    n = padded(i)
    #print (master_roots[n][-1][0][0].text)

### Create a dict of all the roll call votes

In [301]:
# Session 1: fetch and parse votes 1-339, keyed by zero-padded vote number.
# Any download or parse error stops the loop (there is no error handling here).
master_roots_1 = {}
for vote_number in range(1, 340):
    n = padded(vote_number)
    url = "https://www.senate.gov/legislative/LIS/roll_call_votes/vote1141/vote_114_1_{}.xml".format(n)
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        the_page = response.read()
    root = master_roots_1[n] = ET.fromstring(the_page)

In [519]:
# Session 2: fetch and parse votes 1-163, keyed by zero-padded vote number.
# A vote that cannot be downloaded or parsed is still skipped (best effort),
# but the failure is recorded in `failed_votes_2` and printed instead of being
# silently dropped, so missing votes can be retried or accounted for.
master_roots_2 = {}
failed_votes_2 = {}
for i in range(1,164):
    n = padded(i)
    url = "https://www.senate.gov/legislative/LIS/roll_call_votes/vote1142/vote_114_2_"+ n + ".xml"
    req = urllib.request.Request(url)
    try:
        with urllib.request.urlopen(req) as response:
            the_page = response.read()
        root = ET.fromstring(the_page)
    except (OSError, ET.ParseError) as err:
        # OSError covers urllib's URLError/HTTPError as well as timeouts and
        # connection resets; ParseError covers responses that are not valid XML.
        # Unlike a bare `except:`, KeyboardInterrupt and programming errors
        # still propagate.
        failed_votes_2[n] = repr(err)
        print("skipping vote", n, ":", repr(err))
        continue
    master_roots_2[n] = root
    # Pause between successful requests to avoid hammering the server.
    sleep(2)

In [524]:
# Spot-check that vote '00121' of session 2 was fetched (raises KeyError if not).
master_roots_2['00121']

<Element 'roll_call_vote' at 0x000001F5511DEF48>

In [404]:
# Column labels: the first field of each of the first 100 member entries of
# vote '00001' in `master_roots` (e.g. 'Alexander (R-TN)').
first_vote_members = master_roots['00001'][-1]
senators = [first_vote_members[position][0].text for position in range(100)]

In [405]:
# Preview the first ten labels.
senators[:10]

['Alexander (R-TN)',
 'Ayotte (R-NH)',
 'Baldwin (D-WI)',
 'Barrasso (R-WY)',
 'Bennet (D-CO)',
 'Blumenthal (D-CT)',
 'Blunt (R-MO)',
 'Booker (D-NJ)',
 'Boozman (R-AR)',
 'Boxer (D-CA)']

### Create votes table

In [406]:
# Session 1: map each vote key to the list of votes cast, in member order.
votes_dict_1 = {
    vote_key: get_votes(vote_root)
    for vote_key, vote_root in master_roots_1.items()
}

In [407]:
# Session 2: same as votes_dict_1, with keys shifted by 339 (and no zero
# padding) so they continue after the session-1 vote numbers.
votes_dict_2 = {
    str(int(vote_key) + 339): get_votes(vote_root)
    for vote_key, vote_root in master_roots_2.items()
}

In [408]:
# Build one frame per session (one column per vote), join them on their shared
# positional index, then transpose so that each row is a vote.
session_1_frame = pd.DataFrame.from_dict(votes_dict_1)
session_2_frame = pd.DataFrame.from_dict(votes_dict_2)
bills_df = session_1_frame.join(session_2_frame).T

In [409]:
# Label the columns with the member names taken from vote '00001'.
# NOTE(review): this assumes every vote lists its members in the same order as
# that vote - confirm before relying on per-senator columns.
bills_df.columns = senators

In [410]:
# Name the row index so it is labelled when displayed or exported.
bills_df.index.name = 'vote_number'

In [411]:
# Sanity check: (number of votes, number of member columns).
bills_df.shape

(438, 100)

In [607]:
# Inspect the vote-number index rather than dumping the whole frame; session-1
# keys are zero-padded strings while session-2 keys are unpadded shifted numbers.
bills_df.index

Index(['00001', '00002', '00003', '00004', '00005', '00006', '00007', '00008',
       '00009', '00010',
       ...
       '462', '478', '487', '492', '494', '496', '497', '500', '501', '502'],
      dtype='object', name='vote_number', length=438)

#### Get outcomes

In [413]:
def get_outcome(root):
    """Return the text of the sixth-from-last child of ``root``.

    In the sample document listed earlier in this notebook that child is
    ``vote_result``.
    """
    outcome_element = root[-6]
    return outcome_element.text

In [495]:
# Outcome text per vote. Session-2 keys are shifted by 339 (and unpadded) so
# they line up with the keys used for the session-2 votes.
outcomes_dict_1 = {
    vote_key: get_outcome(vote_root)
    for vote_key, vote_root in master_roots_1.items()
}
outcomes_dict_2 = {
    str(int(vote_key) + 339): get_outcome(vote_root)
    for vote_key, vote_root in master_roots_2.items()
}

In [496]:
# Fold the session-2 outcomes into outcomes_dict_1 (in place), so that dict
# then covers both sessions. `total_outcomes` is left as an empty list.
total_outcomes = []
outcomes_dict_1.update(outcomes_dict_2)

In [611]:
# Series of outcome text indexed by vote key; the name becomes the column
# label when it is joined onto the votes table.
outcomes = pd.Series(outcomes_dict_1, name='outcome')

In [613]:
# Attach the outcome column to the per-member votes; the join matches on the
# index labels (the vote keys).
votes = bills_df.join(outcomes)

In [615]:
# Normalise the index labels by round-tripping through int, which strips the
# zero padding (e.g. '00001' -> '1').
new_index = [str(int(x)) for x in votes.index.values]

In [617]:
# Replace the index with the normalised labels (mutates `votes` in place).
votes.index = new_index

In [618]:
# Display the combined table: one row per vote, one column per member, plus 'outcome'.
votes

Unnamed: 0,Alexander (R-TN),Ayotte (R-NH),Baldwin (D-WI),Barrasso (R-WY),Bennet (D-CO),Blumenthal (D-CT),Blunt (R-MO),Booker (D-NJ),Boozman (R-AR),Boxer (D-CA),...,Tillis (R-NC),Toomey (R-PA),Udall (D-NM),Vitter (R-LA),Warner (D-VA),Warren (D-MA),Whitehouse (D-RI),Wicker (R-MS),Wyden (D-OR),outcome
1,Nay,Nay,Yea,Nay,Nay,Yea,Nay,Yea,Nay,Not Voting,...,Nay,Nay,Yea,Nay,Yea,Yea,Yea,Nay,Yea,Amendment Rejected
2,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Not Voting,...,Yea,Yea,Yea,Yea,Yea,Nay,Yea,Yea,Yea,Bill Passed
3,Yea,Yea,Nay,Yea,Yea,Nay,Yea,Nay,Yea,Nay,...,Yea,Yea,Yea,Yea,Yea,Nay,Nay,Yea,Not Voting,Cloture on the Motion to Proceed Agreed to
4,Yea,Yea,Nay,Yea,Nay,Nay,Yea,Nay,Yea,Nay,...,Yea,Yea,Nay,Yea,Yea,Nay,Nay,Yea,Nay,Motion to Table Agreed to
5,Yea,Yea,Nay,Yea,Nay,Nay,Yea,Nay,Yea,Nay,...,Yea,Yea,Nay,Yea,Nay,Nay,Nay,Yea,Nay,Motion to Table Agreed to
6,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,...,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Amendment Agreed to
7,Yea,Yea,Nay,Yea,Nay,Nay,Yea,Nay,Yea,Nay,...,Yea,Yea,Nay,Yea,Nay,Nay,Nay,Yea,Nay,Amendment Rejected
8,Nay,Nay,Yea,Nay,Yea,Yea,Nay,Yea,Nay,Yea,...,Nay,Nay,Yea,Nay,Yea,Yea,Yea,Nay,Yea,Amendment Rejected
9,Nay,Nay,Nay,Yea,Nay,Nay,Yea,Nay,Yea,Nay,...,Yea,Yea,Nay,Yea,Nay,Nay,Nay,Yea,Nay,Amendment Rejected
10,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Yea,...,Yea,Yea,Yea,Yea,Yea,Yea,Yea,Nay,Yea,Amendment Agreed to


In [502]:
#export the bills to .csv and save for later

In [503]:
# Sanity check before export: (number of votes, member columns + 'outcome').
votes.shape

(438, 101)

In [504]:
# Save the raw (text-valued) votes table, index included, to the working directory.
votes.to_csv('votes.csv')

In [505]:
# Reload the saved table, using the first CSV column as the index.
# This rebinds `votes` to the copy read from disk.
votes = pd.read_csv('votes.csv', index_col=0)

In [547]:
# Encode individual votes numerically: 'Yea' and 'Present' become 1,
# 'Nay' and 'Not Voting' become 0. Any other cell value is left unchanged.
# NOTE(review): 'Present' is counted the same as 'Yea' here - confirm that is intended.
df = votes.replace({'Nay': 0, 'Yea': 1, 'Not Voting': 0, 'Present': 1})

In [548]:
# Distinct outcome labels. Note that this rebinds `outcomes`, which earlier
# held the outcome Series joined onto the votes table.
outcomes = df.outcome.unique()

In [549]:
# Show the labels as a plain list so they can be copied into a mapping.
list(outcomes)

['Amendment Rejected',
 'Bill Passed',
 'Cloture on the Motion to Proceed Agreed to',
 'Motion to Table Agreed to',
 'Amendment Agreed to',
 'Motion for Attendance Agreed to',
 'Cloture Motion Rejected',
 'Cloture Motion Agreed to',
 'Cloture on the Motion to Proceed Rejected',
 'Nomination Confirmed',
 'Motion to Table Failed',
 'Motion to Proceed Agreed to',
 'Joint Resolution Passed',
 'Veto Sustained',
 'Motion Rejected',
 'Concurrent Resolution Agreed to',
 'Motion Agreed to',
 'Conference Report Agreed to',
 'Resolution Agreed to',
 'Veto Overridden']

In [550]:
# Binary encoding of the outcome labels: labels ending in 'Agreed to', 'Passed',
# 'Confirmed' or 'Overridden' map to 1; labels ending in 'Rejected', 'Failed'
# or 'Sustained' map to 0.
# NOTE(review): 1 means "the question carried", not "the bill advanced" -
# e.g. 'Motion to Table Agreed to' is 1. Confirm this is the intended target.
outcome_replacements = {'Amendment Rejected': 0,
 'Bill Passed': 1,
 'Cloture on the Motion to Proceed Agreed to': 1,
 'Motion to Table Agreed to': 1,
 'Amendment Agreed to': 1,
 'Motion for Attendance Agreed to': 1,
 'Cloture Motion Rejected': 0,
 'Cloture Motion Agreed to': 1,
 'Cloture on the Motion to Proceed Rejected': 0,
 'Nomination Confirmed': 1,
 'Motion to Table Failed': 0,
 'Motion to Proceed Agreed to': 1,
 'Joint Resolution Passed': 1,
 'Veto Sustained': 0,
 'Motion Rejected': 0,
 'Concurrent Resolution Agreed to': 1,
 'Motion Agreed to': 1,
 'Conference Report Agreed to': 1,
 'Resolution Agreed to': 1,
 'Veto Overridden': 1}

In [551]:
# Apply the outcome encoding. replace() is applied to the whole frame, not just
# the 'outcome' column, so any cell equal to one of the labels is converted.
df = df.replace(outcome_replacements)

In [552]:
# Save the numerically encoded table.
# NOTE(review): index=False drops the index, which holds the vote numbers -
# confirm the exported rows do not need to be traced back to a vote.
df.to_csv('cleaned_votes.csv', index=False)

In [556]:
# Columns that are still non-numeric after the replacements, i.e. columns
# containing values not covered by either mapping.
fixers = df.select_dtypes(exclude=[np.number])

#### Bill info

In [558]:
# Download and parse the vote menu (summary list) for the 114th Congress, session 2.
# Note the naming: `root1` holds session 2; session 1 is stored in `root2`.
url = "https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_114_2.xml"
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
root1 = ET.fromstring(the_page)

In [560]:
# Download and parse the vote menu (summary list) for the 114th Congress, session 1.
# Note the naming: `root2` holds session 1; session 2 is stored in `root1`.
url = "https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_114_1.xml"
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
root2 = ET.fromstring(the_page)

In [566]:
# Print the tag and text of each direct child of the session-2 vote menu root.
for child in root1:
    print(child.tag, ":", child.text)

congress : 114
session : 2
congress_year : 2016
votes : 
    


In [588]:
def _parse_vote_menu(menu_root):
    """Turn a parsed vote-menu document into a dict of per-vote records.

    The votes are the children of the last child of ``menu_root``. Each record
    maps every direct child tag of a vote to its text, plus 'yea' and 'nay'
    read from the first two children of the vote's second-to-last element
    (presumably the vote tally - confirm against the XML).

    Returns:
        dict keyed by the text of each vote's first child (presumably the vote
        number - confirm), mapping to that vote's record dict.
    """
    parsed = {}
    for vote in menu_root[-1]:
        pieces = {element.tag: element.text for element in vote}
        tally = vote[-2]
        pieces['yea'] = tally[0].text
        pieces['nay'] = tally[1].text
        parsed[vote[0].text] = pieces
    return parsed


# Note the naming: root1 is the session-2 menu and root2 the session-1 menu,
# so `bills` holds session 2 and `bills2` holds session 1.
bills = _parse_vote_menu(root1)
bills2 = _parse_vote_menu(root2)
        

In [591]:
# One row per vote. The 'vote_tally' column is dropped because its contents
# were already copied into the 'yea' and 'nay' entries.
# `bills` was built from root1 (session 2) and `bills2` from root2 (session 1),
# hence the crossed-looking names.
bills_114_2 = pd.DataFrame(bills).T.drop('vote_tally', axis=1)
bills_114_1 = pd.DataFrame(bills2).T.drop('vote_tally', axis=1)

In [592]:
# Display the session-1 bill summary table.
bills_114_1

Unnamed: 0,issue,nay,question,result,title,vote_date,vote_number,yea
00001,H.R. 26,66,On the Amendment,Rejected,Warren Amdt. No. 1; In the nature of a substit...,08-Jan,00001,31
00002,H.R. 26,4,On Passage of the Bill,Passed,H.R. 26; A bill to extend the termination date...,08-Jan,00002,93
00003,S. 1,32,On Cloture on the Motion to Proceed,Agreed to,Motion to Invoke Cloture on the Motion to Proc...,12-Jan,00003,63
00004,S. 1,42,On the Motion to Table,Agreed to,Motion to Table Markey Amdt. No. 13; To ensure...,20-Jan,00004,57
00005,S. 1,46,On the Motion to Table,Agreed to,Motion to Table Franken Amdt. No. 17; To requi...,20-Jan,00005,53
00006,S. 1,5,On the Amendment,Agreed to,Portman Amdt. No. 3 As Modified; To promote en...,20-Jan,00006,94
00007,S. 1,45,On the Amendment,Rejected,Lee Amdt. No. 33; To conform citizen suits und...,21-Jan,00007,54
00008,S. 1,58,On the Amendment,Rejected,Durbin Amdt. No. 69; To ensure that the storag...,21-Jan,00008,41
00009,S. 1,45,On the Amendment,Rejected,Toomey Amdt. No. 41; To continue cleaning up f...,21-Jan,00009,54
00010,S. 1,1,On the Amendment,Agreed to,Whitehouse Amdt. No. 29; To express the sense ...,21-Jan,00010,98


In [596]:
# Shift the session-2 vote numbers by 339 (as integers) so they continue
# after the session-1 numbers.
new_index = [int(label) + 339 for label in bills_114_2.index.values]

In [598]:
# The bare `new_index` expression has no effect because it is not the last
# statement of the cell. The assignment replaces the session-2 index in place.
new_index
bills_114_2.index = new_index

In [600]:
# Preview the re-indexed session-2 table.
bills_114_2.head()

Unnamed: 0,issue,nay,question,result,title,vote_date,vote_number,yea
340,PN11,6,On the Nomination,Confirmed,"Nomination Luis Felipe Restrepo, of Pennsylvan...",11-Jan,1,82
341,S. 2232,44,On Cloture on the Motion to Proceed,Rejected,Motion to Invoke Cloture on the Motion to Proc...,12-Jan,2,53
342,PN367,36,On the Nomination,Confirmed,"Confirmation Wilhelmina Marie Wright, of Minne...",19-Jan,3,58
343,H.R. 4038,43,On the Cloture Motion,Rejected,Motion to Invoke Cloture on the Motion to Proc...,20-Jan,4,55
344,S.J.Res. 22,40,On the Cloture Motion,Rejected,Motion to Invoke Cloture on the Veto Message t...,21-Jan,5,52


In [604]:
# Stack the session-1 and session-2 summaries into one table, keeping each
# frame's own index labels. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
all_bills = pd.concat([bills_114_1, bills_114_2])

In [605]:
# Spot-check ten random rows; no seed is set, so the rows differ on each run.
all_bills.sample(10)

Unnamed: 0,issue,nay,question,result,title,vote_date,vote_number,yea
265,H.J.Res. 61,42,On the Cloture Motion,Rejected,Motion to Invoke Cloture on McConnell Amdt. No...,15-Sep,265,56
308,S. 1177,6,On the Cloture Motion,Agreed to,Motion to Invoke Cloture on the Motion to Disa...,18-Nov,308,91
446,H.R. 2578,42,On the Motion to Table,Agreed to,Motion to Table the Motion to Commit with Inst...,20-Jun,107,56
235,S. 1177,64,On the Amendment,Rejected,Lee Amdt No. 2162; To amend the Elementary and...,14-Jul,235,32
342,PN367,36,On the Nomination,Confirmed,"Confirmation Wilhelmina Marie Wright, of Minne...",19-Jan,3,58
162,S. 178,2,On the Amendment,Agreed to,Kirk Amdt. No. 273 As Modified; To amend title...,22-Apr,162,97
258,H.R. 22,38,On the Amendment,Agreed to,McConnell Amdt. No. 2266 As Amended; In the na...,29-Jul,258,62
176,H.R. 1314,45,On Cloture on the Motion to Proceed,Rejected,Motion to Invoke Cloture on the Motion to Proc...,12-May,176,52
415,H.R. 2577,28,On the Motion,Agreed to,Motion to Waive All Applicable Budgetary Disci...,19-May,76,70
185,PN19,0,On the Nomination,Confirmed,"Confirmation Jose Rolando Olvera, Jr., of Texa...",21-May,185,100


In [606]:
# Save the combined bill summary table, index included, to the working directory.
all_bills.to_csv('all_bills.csv')