In [1]:
# beautiful soup
import requests
from bs4 import BeautifulSoup

In [2]:
# let's look at some XML
# download the House member roster from the Clerk's site.
# timeout (in seconds) keeps this cell from hanging forever if the server never answers;
# raise_for_status() stops us right here on a 4xx/5xx response instead of letting
# an error page get parsed later as if it were the roster.
houseResponse = requests.get("http://clerk.house.gov/xml/lists/MemberData.xml", timeout=30)
houseResponse.raise_for_status()
houseXML = houseResponse.text
type(houseXML)

str

In [3]:
# .text hands the response body back as one long string,
# so ordinary string slicing lets us peek at the first 1000 characters
houseXML[:1000]

'<?xml version="1.0" encoding="UTF-8"?><MemberData publish-date="October 5, 2020"><title-info><congress-num>116</congress-num><congress-text>One Hundred Sixteenth Congress</congress-text><session>2</session><majority>D</majority><minority>R</minority><clerk>Cheryl L. Johnson</clerk><weburl>http://clerk.house.gov</weburl></title-info><members><member><statedistrict>AK00</statedistrict><member-info><namelist>Young, Don</namelist><bioguideID>Y000033</bioguideID><lastname>Young</lastname><firstname>Don</firstname><middlename/><sort-name>YOUNG,DON</sort-name><suffix/><courtesy>Mr.</courtesy><prior-congress>115</prior-congress><official-name>Don Young</official-name><formal-name>Mr. Young</formal-name><party>R</party><caucus>R</caucus><state postal-code="AK"><state-fullname>Alaska</state-fullname></state><district>At Large</district><townname>Fort Yukon</townname><office-building>RHOB</office-building><office-room>2314</office-room><office-zip>20515</office-zip><office-zip-suffix>0200</offic

In [4]:
# look at all those tags!  let's get some help with them.
houseSoup = BeautifulSoup(houseXML,"lxml")
# houseXML: data source (string)
# lxml: parser that beautiful soup will use
# the parser tells python how to interpret the source
# the result (i.e., houseSoup) is structured data that we'll be able to use more easily
# than if it were just the plain text we see above
#
# NOTE(review): "lxml" selects an HTML-oriented parser, and it changes the document a
# little: the raw string has <bioguideID> and <middlename/>, while the parsed tree
# shows <bioguideid> and <middlename></middlename>.  tag names we search for later
# must therefore be written in lowercase.
# bs4 also offers an XML parser ("xml") that should keep the original casing, but
# switching would change the outputs recorded in this notebook -- confirm that no
# later cell relies on the lowercased names before changing it.

In [5]:
type(houseSoup)
# it's a BeautifulSoup object

bs4.BeautifulSoup

In [6]:
# in the XML we saw a members tag.  let's see what's going on in there
# (writing a tag name after the dot asks the soup for the tag with that name)
houseSoup.members
# this gives us everything in the members tag (including the members tag)

<members><member><statedistrict>AK00</statedistrict><member-info><namelist>Young, Don</namelist><bioguideid>Y000033</bioguideid><lastname>Young</lastname><firstname>Don</firstname><middlename></middlename><sort-name>YOUNG,DON</sort-name><suffix></suffix><courtesy>Mr.</courtesy><prior-congress>115</prior-congress><official-name>Don Young</official-name><formal-name>Mr. Young</formal-name><party>R</party><caucus>R</caucus><state postal-code="AK"><state-fullname>Alaska</state-fullname></state><district>At Large</district><townname>Fort Yukon</townname><office-building>RHOB</office-building><office-room>2314</office-room><office-zip>20515</office-zip><office-zip-suffix>0200</office-zip-suffix><phone>(202) 225-5765</phone><elected-date date="20181106">November  6, 2018</elected-date><sworn-date date="20190103">January  3, 2019</sworn-date></member-info><committee-assignments><committee comcode="II00" rank="2"></committee><committee comcode="PW00" rank="2"></committee><subcommittee rank="1" 

In [7]:
# what kind of object did we get back for a single tag?
type(houseSoup.members)

bs4.element.Tag

In [8]:
houseSoup.members.contents
# this gives us a list of the tags inside the members tag
# (here: one <member> tag after another)

[<member><statedistrict>AK00</statedistrict><member-info><namelist>Young, Don</namelist><bioguideid>Y000033</bioguideid><lastname>Young</lastname><firstname>Don</firstname><middlename></middlename><sort-name>YOUNG,DON</sort-name><suffix></suffix><courtesy>Mr.</courtesy><prior-congress>115</prior-congress><official-name>Don Young</official-name><formal-name>Mr. Young</formal-name><party>R</party><caucus>R</caucus><state postal-code="AK"><state-fullname>Alaska</state-fullname></state><district>At Large</district><townname>Fort Yukon</townname><office-building>RHOB</office-building><office-room>2314</office-room><office-zip>20515</office-zip><office-zip-suffix>0200</office-zip-suffix><phone>(202) 225-5765</phone><elected-date date="20181106">November  6, 2018</elected-date><sworn-date date="20190103">January  3, 2019</sworn-date></member-info><committee-assignments><committee comcode="II00" rank="2"></committee><committee comcode="PW00" rank="2"></committee><subcommittee rank="1" subcomco

In [9]:
# .contents is an ordinary python list, so all the usual list tools apply
type(houseSoup.members.contents)

list

In [10]:
# each item in that list is itself a tag object that we can dig into further
type(houseSoup.members.contents[0])

bs4.element.Tag

In [11]:
# what can we do with lists?
# all sorts of things -- for example, count the items with len()
# how many people are in the house of representatives?
# (strictly: how many tags sit directly inside the members tag)
len(houseSoup.members.contents)

441

In [12]:
# but there are 435 members of the house. hmm.  we'll get back to this.
# there are 441 member tags inside the members tag. each member tag is
# structured the same way as all the other member tags.

In [13]:
# let's get all the states for all the members to find out why we got 441 members instead
# of the expected 435 members
# each member's state name is inside that state-fullname tag
# to get that tag we can use the .find method that BS provides us
# (it's a method that you can run on BS tag objects)
# start with just the first member to see what .find hands back
houseSoup.members.contents[0].find("state-fullname")

<state-fullname>Alaska</state-fullname>

In [14]:
# that finds the tag we're looking for
# how do we get the stuff between the open/close tags?
# .text gives us just the string inside the tag, without the tag itself
houseSoup.members.contents[0].find("state-fullname").text

'Alaska'

In [15]:
# now we are getting somewhere!
# we know how to get one rep's state, so we know how to get every rep's state.
# let's go through all of the member tags and collect every state name in a list
# (one entry per member, so states will contain duplicates).
# a list comprehension builds the whole list in one expression -- it does the same
# job as starting with an empty list and calling .append() inside a for loop.
states = [
    member.find("state-fullname").text
    for member in houseSoup.members.contents
]

In [16]:
# one state name per member tag, so this should match the member count from before
len(states)

441

In [17]:
# the full list, duplicates included (a state shows up once for each of its members)
print(states)

['Alaska', 'Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alabama', 'Alabama', 'Arkansas', 'Arkansas', 'Arkansas', 'Arkansas', 'American Samoa', 'Arizona', 'Arizona', 'Arizona', 'Arizona', 'Arizona', 'Arizona', 'Arizona', 'Arizona', 'Arizona', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'California', 'Colo

In [18]:
# let's create a set to store the unique states
# (a set keeps only one copy of each value, so the duplicates collapse)
statesSet = set(states)

In [19]:
# how many distinct names did we collect?
len(statesSet)

56

In [20]:
# 56? there are 50 states.
# look through the names below: besides the 50 states the set also contains
# American Samoa, District of Columbia, Guam, Northern Mariana Islands,
# Puerto Rico and Virgin Islands.
# those 6 extra entries line up with the 6 extra member tags we saw earlier
# (441 found vs. the 435 we expected).
statesSet

{'Alabama',
 'Alaska',
 'American Samoa',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Guam',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Northern Mariana Islands',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Puerto Rico',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virgin Islands',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming'}

In [21]:
# today we'll look at the committees tag in the xml
# how many committees are there?
# first, let's see what is sitting inside the committees tag
houseSoup.committees.contents # this gives us a list of tags that are in the committees tag

[<committee com-building-code="LHOB" com-header-text="The chairman and ranking minority member are ex officio members of all subcommittees." com-phone="225-2171" com-room="1301" com-zip="20515" com-zip-suffix="6001" comcode="AG00" type="standing"><committee-fullname>Committee on Agriculture</committee-fullname><ratio><majority>26</majority><minority>22</minority></ratio><subcommittee subcom-building-code="LHOB" subcom-phone="225-2171" subcom-room="1301" subcom-zip="20515" subcom-zip-suffix="0" subcomcode="AG03"><subcommittee-fullname>Nutrition, Oversight, and Department Operations</subcommittee-fullname><ratio><majority>7</majority><minority>6</minority></ratio></subcommittee><subcommittee subcom-building-code="LHOB" subcom-phone="225-2171" subcom-room="1301" subcom-zip="20515" subcom-zip-suffix="0" subcomcode="AG14"><subcommittee-fullname>Biotechnology, Horticulture, and Research</subcommittee-fullname><ratio><majority>12</majority><minority>9</minority></ratio></subcommittee><subcomm

In [23]:
len(houseSoup.committees.contents)
# there are 28 committee tags in the file
# (as the names printed further down show, that count includes joint committees
# and select committees, not only the standing "Committee on ..." entries)

28

In [24]:
# let's get the name of the first committee
# same recipe as for the states: pick a tag, .find the child we want, read its .text
houseSoup.committees.contents[0].find("committee-fullname").text

'Committee on Agriculture'

In [25]:
# one committee is nice, but we want them all:
# walk the list of committee tags, grab each committee-fullname tag, and print its text
for committee in houseSoup.committees.contents:
    fullnameTag = committee.find("committee-fullname")
    print(fullnameTag.text)

Committee on Agriculture
Committee on Appropriations
Committee on Armed Services
Committee on Financial Services
Committee on the Budget
Committee on Education and Labor
Committee on Foreign Affairs
Committee on Oversight and Reform
Committee on House Administration
Committee on Homeland Security
Committee on Energy and Commerce
Committee on Natural Resources
Committee on the Judiciary
Committee on Transportation and Infrastructure
Committee on Rules
Committee on Small Business
Committee on Ethics
Committee on Science, Space, and Technology
Select Subcommittee on the Coronavirus Crisis
Committee on Veterans' Affairs
Committee on Ways and Means
Permanent Select Committee on Intelligence
Select Committee on the Climate Crisis
Select Committee on the Modernization of Congress
Joint Economic Committee
Joint Committee on Taxation
Joint Committee on the Library
Joint Committee on Printing


In [26]:
# it would be nice if BS gave us a way to find all the tags that we're looking for
# of course it does!
# note that we search the whole soup here, not just inside the members tag
allMembers = houseSoup.find_all("member")
allMembers
# find_all gives us a list-like collection of all of the matching tags and
# everything under each of those tags

[<member><statedistrict>AK00</statedistrict><member-info><namelist>Young, Don</namelist><bioguideid>Y000033</bioguideid><lastname>Young</lastname><firstname>Don</firstname><middlename></middlename><sort-name>YOUNG,DON</sort-name><suffix></suffix><courtesy>Mr.</courtesy><prior-congress>115</prior-congress><official-name>Don Young</official-name><formal-name>Mr. Young</formal-name><party>R</party><caucus>R</caucus><state postal-code="AK"><state-fullname>Alaska</state-fullname></state><district>At Large</district><townname>Fort Yukon</townname><office-building>RHOB</office-building><office-room>2314</office-room><office-zip>20515</office-zip><office-zip-suffix>0200</office-zip-suffix><phone>(202) 225-5765</phone><elected-date date="20181106">November  6, 2018</elected-date><sworn-date date="20190103">January  3, 2019</sworn-date></member-info><committee-assignments><committee comcode="II00" rank="2"></committee><committee comcode="PW00" rank="2"></committee><subcommittee rank="1" subcomco

In [27]:
# same count as len(houseSoup.members.contents) gave us earlier
len(allMembers)

441

In [28]:
type(allMembers)
# find_all does not return a plain list but a ResultSet.
# for now (at least), just know that a ResultSet behaves like a list: it is numerically
# indexed and you can iterate through it

bs4.element.ResultSet

In [29]:
# numeric indexing in action: position 0 holds the first member tag
allMembers[0]

<member><statedistrict>AK00</statedistrict><member-info><namelist>Young, Don</namelist><bioguideid>Y000033</bioguideid><lastname>Young</lastname><firstname>Don</firstname><middlename></middlename><sort-name>YOUNG,DON</sort-name><suffix></suffix><courtesy>Mr.</courtesy><prior-congress>115</prior-congress><official-name>Don Young</official-name><formal-name>Mr. Young</formal-name><party>R</party><caucus>R</caucus><state postal-code="AK"><state-fullname>Alaska</state-fullname></state><district>At Large</district><townname>Fort Yukon</townname><office-building>RHOB</office-building><office-room>2314</office-room><office-zip>20515</office-zip><office-zip-suffix>0200</office-zip-suffix><phone>(202) 225-5765</phone><elected-date date="20181106">November  6, 2018</elected-date><sworn-date date="20190103">January  3, 2019</sworn-date></member-info><committee-assignments><committee comcode="II00" rank="2"></committee><committee comcode="PW00" rank="2"></committee><subcommittee rank="1" subcomcod

In [30]:
# each member of the house has a sworn-in date
# find_all works for any tag name, so we can collect every sworn-date tag in one go
swornDates = houseSoup.find_all("sworn-date")
swornDates # this is a ResultSet

[<sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3, 2019</sworn-date>,
 <sworn-date date="20190103">January  3,

In [31]:
# look at a single sworn-date tag up close
swornDates[0]

<sworn-date date="20190103">January  3, 2019</sworn-date>

In [32]:
# notice the date= in the sworn-date tag
# this is an attribute of the tag
# it provides additional information about a tag
# to get an attribute, we treat the tag like a dictionary and use the attribute
# name as the key, in square brackets.
# the value comes back as a string
swornDates[0]["date"]

'20190103'

In [33]:
# we can use find_all to search within a subtree rather than the entire tree:
# call it on a tag instead of on the soup
houseSoup.committees.contents[0].find_all("subcommittee")
# this gives us only the subcommittees within the first committee

[<subcommittee subcom-building-code="LHOB" subcom-phone="225-2171" subcom-room="1301" subcom-zip="20515" subcom-zip-suffix="0" subcomcode="AG03"><subcommittee-fullname>Nutrition, Oversight, and Department Operations</subcommittee-fullname><ratio><majority>7</majority><minority>6</minority></ratio></subcommittee>,
 <subcommittee subcom-building-code="LHOB" subcom-phone="225-2171" subcom-room="1301" subcom-zip="20515" subcom-zip-suffix="0" subcomcode="AG14"><subcommittee-fullname>Biotechnology, Horticulture, and Research</subcommittee-fullname><ratio><majority>12</majority><minority>9</minority></ratio></subcommittee>,
 <subcommittee subcom-building-code="LHOB" subcom-phone="225-2171" subcom-room="1301" subcom-zip="20515" subcom-zip-suffix="0" subcomcode="AG15"><subcommittee-fullname>Conservation and Forestry</subcommittee-fullname><ratio><majority>6</majority><minority>5</minority></ratio></subcommittee>,
 <subcommittee subcom-building-code="LHOB" subcom-phone="225-2171" subcom-room="13

In [34]:
# let's get the full name of each subcommittee here
# (still only inside the first committee; the result is a collection of tags, not strings)
houseSoup.committees.contents[0].find_all("subcommittee-fullname")

[<subcommittee-fullname>Nutrition, Oversight, and Department Operations</subcommittee-fullname>,
 <subcommittee-fullname>Biotechnology, Horticulture, and Research</subcommittee-fullname>,
 <subcommittee-fullname>Conservation and Forestry</subcommittee-fullname>,
 <subcommittee-fullname>General Farm Commodities and Risk Management</subcommittee-fullname>,
 <subcommittee-fullname>Commodity Exchanges, Energy, and Credit</subcommittee-fullname>,
 <subcommittee-fullname>Livestock and Foreign Agriculture</subcommittee-fullname>]

In [35]:
# print just the text name, not the tags of each of these
# step 1: pick out the first committee tag
firstCommittee = houseSoup.committees.contents[0]
# step 2: loop over its subcommittee-fullname tags and print the text of each one
for subcommittee in firstCommittee.find_all("subcommittee-fullname"):
    print(subcommittee.text)

Nutrition, Oversight, and Department Operations
Biotechnology, Horticulture, and Research
Conservation and Forestry
General Farm Commodities and Risk Management
Commodity Exchanges, Energy, and Credit
Livestock and Foreign Agriculture


In [39]:
# find_all gives back a ResultSet here too, even though we called it on a single tag
type(houseSoup.committees.contents[0].find_all("subcommittee-fullname"))

bs4.element.ResultSet

In [40]:
# put it all together: for every committee print its name, followed by the
# names of its subcommittees (indented with a tab so the nesting is visible)
for committee in houseSoup.committees.contents:
    # outer loop: one pass per committee tag
    committeeName = committee.find("committee-fullname").text
    print("Committee Name: {0}".format(committeeName))
    # inner loop: only the subcommittee names that live inside this committee
    subcomnames = committee.find_all("subcommittee-fullname")
    for subcomname in subcomnames:
        print("\tSubcommittee Name: {0}".format(subcomname.text))

Committee Name: Committee on Agriculture
	Subcommittee Name: Nutrition, Oversight, and Department Operations
	Subcommittee Name: Biotechnology, Horticulture, and Research
	Subcommittee Name: Conservation and Forestry
	Subcommittee Name: General Farm Commodities and Risk Management
	Subcommittee Name: Commodity Exchanges, Energy, and Credit
	Subcommittee Name: Livestock and Foreign Agriculture
Committee Name: Committee on Appropriations
	Subcommittee Name: Agriculture, Rural Development, Food and Drug Administration, and Related Agencies
	Subcommittee Name: Defense
	Subcommittee Name: State, Foreign Operations, and Related Programs
	Subcommittee Name: Interior, Environment, and Related Agencies
	Subcommittee Name: Labor, Health and Human Services, Education, and Related Agencies
	Subcommittee Name: Energy and Water Development, and Related Agencies
	Subcommittee Name: Homeland Security
	Subcommittee Name: Military Construction, Veterans Affairs, and Related Agencies
	Subcommittee Name: 