In [1]:
# import the following libraries for our web scraping project

import requests # to make https requests
from bs4 import BeautifulSoup # our web scraping library

import pandas as pd # for making dataframe later

In [3]:
# save the data from the website as a "soup" object

site = requests.get('https://translegislation.com/bills/2024/US') # gets the URL
html_code = site.content # saves the HTML code
soup = BeautifulSoup(html_code, 'lxml') # creates a soup object

In [5]:
# runs the loop on the bill cards
bill_cards = soup.find_all('div', class_ ='css-4rck61')

# creating empty lists to hold all of our data
titles = []
captions = []
categories = []
descriptions = []

# extracting the data from the bill cards
for item in bill_cards:
    title = item.h3.text
    category = item.find('span').text
    caption = item.h2.text
    if item.h2.text is not None:
        description = item.h2.text
    else:
        description = 'No bill description'
    
    # adding the items to the empty lists
    titles.append(title)
    categories.append(category)
    captions.append(caption)
    descriptions.append(description)
    # remember that "legiscan_links" is already saved as a list, so we don't have to create it here

In [7]:
urls = []
for item in bill_cards:
    extension = 'https://translegislation.com/' + item.a['href']
    urls.append(extension)

# making a soup object of *every* page that is linked
# this may take several seconds
soups = []
for item in urls:
    site = requests.get(item)
    html_code = site.content
    soup = BeautifulSoup(html_code, 'lxml')
    soups.append(soup)


legiscan_links = []
congress_links = []
for item in soups:
    # we are getting two links here, one to legiscan and one to the congress website
    links = item.find_all('a', class_='chakra-link css-oga2ct')
    anchor1 = links[0]['href'] # link to legiscan
    legiscan_links.append(anchor1)
    anchor2 = links[1]['href'] # link to congress
    congress_links.append(anchor2)

In [9]:
# creating a dataframe, with separate columns to hold each of our lists
df = pd.DataFrame(
    {'title': titles,
     'caption': captions,
     'category': categories,
     'description': descriptions,
     'url': urls,
     'legiscan': legiscan_links,
     'congress': congress_links
    })

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        87 non-null     object
 1   caption      87 non-null     object
 2   category     87 non-null     object
 3   description  87 non-null     object
 4   url          87 non-null     object
 5   legiscan     87 non-null     object
 6   congress     87 non-null     object
dtypes: object(7)
memory usage: 4.9+ KB


In [None]:
df

In [13]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /Users/michaelajackson/Desktop/Antitrans/GA-Trans-Leg


In [15]:
df.to_csv('/Users/michaelajackson/Desktop/Antitrans/GA-Trans-Leg/2024CSV')

In [17]:
#restarting the process for 2025
site25 = requests.get('https://translegislation.com/bills/2025/US') # gets the URL
html_code25 = site25.content # saves the HTML code
soup25 = BeautifulSoup(html_code25, 'lxml') # creates a soup object

In [19]:
# runs the loop on the bill cards
bill_cards25 = soup25.find_all('div', class_ ='css-4rck61')

# creating empty lists to hold all of our data
titles25 = []
captions25 = []
categories25 = []
descriptions25 = []

# extracting the data from the bill cards
for item in bill_cards25:
    title25 = item.h3.text
    category25 = item.find('span').text
    caption25 = item.h2.text
    if item.h2.text is not None:
        description25 = item.h2.text
    else:
        description25 = 'No bill description'
    
    # adding the items to the empty lists
    titles25.append(title25)
    categories25.append(category25)
    captions25.append(caption25)
    descriptions25.append(description25)
    # remember that "legiscan_links" is already saved as a list, so we don't have to create it here

In [21]:
urls25 = []
for item in bill_cards25:
    extension25 = 'https://translegislation.com/' + item.a['href']
    urls25.append(extension25)

# making a soup object of *every* page that is linked
# this may take several seconds
soups25 = []
for item in urls25:
    site25 = requests.get(item)
    html_code25 = site25.content
    soup25 = BeautifulSoup(html_code25, 'lxml')
    soups25.append(soup25)


legiscan_links25 = []
congress_links25 = []
for item in soups25:
    # we are getting two links here, one to legiscan and one to the congress website
    links25 = item.find_all('a', class_='chakra-link css-oga2ct')
    anchor25 = links[0]['href'] # link to legiscan
    legiscan_links25.append(anchor25)
    anchor25 = links[1]['href'] # link to congress
    congress_links25.append(anchor25)

In [22]:
# creating a dataframe, with separate columns to hold each of our lists
df25 = pd.DataFrame(
    {'title': titles25,
     'caption': captions25,
     'category': categories25,
     'description': descriptions25,
     'url': urls25,
     'legiscan': legiscan_links25,
     'congress': congress_links25
    })

In [25]:
df25.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        21 non-null     object
 1   caption      21 non-null     object
 2   category     21 non-null     object
 3   description  21 non-null     object
 4   url          21 non-null     object
 5   legiscan     21 non-null     object
 6   congress     21 non-null     object
dtypes: object(7)
memory usage: 1.3+ KB


In [None]:
df25

In [27]:
#combining both dfs into 1
df_combined = pd.concat([df, df25], ignore_index=True)

In [31]:
df_combined

Unnamed: 0,title,caption,category,description,url,legiscan,congress
0,US HB10075,Stopping the Mutilation of Children Act of 2024,HEALTHCARE,Stopping the Mutilation of Children Act of 2024,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10075/id/3025659,https://www.congress.gov/bill/118th-congress/h...
1,US HB10186,Protecting Women’s Private Spaces Act,BIRTH CERTIFICATES,Protecting Women’s Private Spaces Act,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10186/id/3029142,https://www.congress.gov/bill/118th-congress/h...
2,US HB10290,Stop the Invasion of Women’s Spaces Act,INCARCERATION,Stop the Invasion of Women’s Spaces Act,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10290/id/3038024,https://www.congress.gov/bill/118th-congress/h...
3,US HB1064,Ensuring Military Readiness Act of 2023,MILITARY,Ensuring Military Readiness Act of 2023,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB1064/id/2737306,https://www.congress.gov/bill/118th-congress/h...
4,US HB1112,Ensuring Military Readiness Act of 2023,MILITARY,Ensuring Military Readiness Act of 2023,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB1112/id/2742708,https://www.congress.gov/bill/118th-congress/h...
...,...,...,...,...,...,...,...
103,US SB312,A bill to establish a Federal tort against ped...,HEALTHCARE,A bill to establish a Federal tort against ped...,https://translegislation.com//bills/2025/US/SB312,https://legiscan.com/US/text/SR669/id/2998369,https://www.congress.gov/bill/118th-congress/s...
104,US SB74,Fair Play for Girls Act,SPORTS,Fair Play for Girls Act,https://translegislation.com//bills/2025/US/SB74,https://legiscan.com/US/text/SR669/id/2998369,https://www.congress.gov/bill/118th-congress/s...
105,US SB9,Protection of Women and Girls in Sports Act of...,SPORTS,Protection of Women and Girls in Sports Act of...,https://translegislation.com//bills/2025/US/SB9,https://legiscan.com/US/text/SR669/id/2998369,https://www.congress.gov/bill/118th-congress/s...
106,US SR21,"A resolution designating October 10, 2025, as ...",SPORTS,"A resolution designating October 10, 2025, as ...",https://translegislation.com//bills/2025/US/SR21,https://legiscan.com/US/text/SR669/id/2998369,https://www.congress.gov/bill/118th-congress/s...


In [33]:
print(df_combined['congress'][0])

https://www.congress.gov/bill/118th-congress/house-bill/10075/all-info


In [35]:
# Create a new column with "/all-info" replaced by "/text"
df_combined['text'] = df_combined['congress'].str.replace('/all-info', '/text/is?format=txt', regex=False)

In [57]:
print(df_combined['text'][30])

https://www.congress.gov/bill/118th-congress/house-bill/6658/text/is?format=txt


In [39]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        108 non-null    object
 1   caption      108 non-null    object
 2   category     108 non-null    object
 3   description  108 non-null    object
 4   url          108 non-null    object
 5   legiscan     108 non-null    object
 6   congress     108 non-null    object
 7   text         108 non-null    object
dtypes: object(8)
memory usage: 6.9+ KB


In [45]:
#creating function for extracting text from links in text cloumn

def extract_text(url):
    try:
        sitetext = requests.get(url) # gets the URL
        html_codetext = sitetext.content # saves the HTML code
        souptext = BeautifulSoup(html_codetext, 'lxml') # creates a soup object

# Find the <pre> tag containing the bill text
        bill_text = souptext.find('pre', {'id': 'billTextContainer'})
        
        return bill_text.get_text(strip=True) if bill_text else "Bill text not found"
    except requests.RequestException as e:
        return f"Error: {e}"  # Handle request errors for sites with pdfs instead of html

In [47]:
df_combined['bill_text'] = df_combined['text'].apply(extract_text)

In [49]:
df_combined.head()

Unnamed: 0,title,caption,category,description,url,legiscan,congress,text,bill_text
0,US HB10075,Stopping the Mutilation of Children Act of 2024,HEALTHCARE,Stopping the Mutilation of Children Act of 2024,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10075/id/3025659,https://www.congress.gov/bill/118th-congress/h...,https://www.congress.gov/bill/118th-congress/h...,Bill text not found
1,US HB10186,Protecting Women’s Private Spaces Act,BIRTH CERTIFICATES,Protecting Women’s Private Spaces Act,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10186/id/3029142,https://www.congress.gov/bill/118th-congress/h...,https://www.congress.gov/bill/118th-congress/h...,Bill text not found
2,US HB10290,Stop the Invasion of Women’s Spaces Act,INCARCERATION,Stop the Invasion of Women’s Spaces Act,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10290/id/3038024,https://www.congress.gov/bill/118th-congress/h...,https://www.congress.gov/bill/118th-congress/h...,Bill text not found
3,US HB1064,Ensuring Military Readiness Act of 2023,MILITARY,Ensuring Military Readiness Act of 2023,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB1064/id/2737306,https://www.congress.gov/bill/118th-congress/h...,https://www.congress.gov/bill/118th-congress/h...,Bill text not found
4,US HB1112,Ensuring Military Readiness Act of 2023,MILITARY,Ensuring Military Readiness Act of 2023,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB1112/id/2742708,https://www.congress.gov/bill/118th-congress/h...,https://www.congress.gov/bill/118th-congress/h...,Bill text not found


In [55]:
print(df_combined[['text', 'bill_text']].tail(50))

                                                  text            bill_text
58   https://www.congress.gov/bill/118th-congress/h...  Bill text not found
59   https://www.congress.gov/bill/118th-congress/h...  Bill text not found
60   https://www.congress.gov/bill/118th-congress/h...  Bill text not found
61   https://www.congress.gov/bill/118th-congress/h...  Bill text not found
62   https://www.congress.gov/bill/118th-congress/h...  Bill text not found
63   https://www.congress.gov/bill/118th-congress/h...  Bill text not found
64   https://www.congress.gov/bill/118th-congress/s...  Bill text not found
65   https://www.congress.gov/bill/118th-congress/s...  Bill text not found
66   https://www.congress.gov/bill/118th-congress/s...  Bill text not found
67   https://www.congress.gov/bill/118th-congress/s...  Bill text not found
68   https://www.congress.gov/bill/118th-congress/s...  Bill text not found
69   https://www.congress.gov/bill/118th-congress/s...  Bill text not found
70   https:/