In [1]:
# import the following libraries for our web scraping project

import requests # to make https requests
from bs4 import BeautifulSoup # our web scraping library

import pandas as pd # for making dataframe later

In [2]:
# save the data from the website as a "soup" object

site = requests.get('https://translegislation.com/bills/2024/US') # gets the URL
html_code = site.content # saves the HTML code
soup = BeautifulSoup(html_code, 'lxml') # creates a soup object

In [3]:
# runs the loop on the bill cards
bill_cards = soup.find_all('div', class_ ='css-4rck61')

# creating empty lists to hold all of our data
titles = []
captions = []
categories = []
descriptions = []

# extracting the data from the bill cards
for item in bill_cards:
    title = item.h3.text
    category = item.find('span').text
    caption = item.h2.text
    if item.h2.text is not None:
        description = item.h2.text
    else:
        description = 'No bill description'
    
    # adding the items to the empty lists
    titles.append(title)
    categories.append(category)
    captions.append(caption)
    descriptions.append(description)
    # remember that "legiscan_links" is already saved as a list, so we don't have to create it here

In [4]:
urls = []
for item in bill_cards:
    extension = 'https://translegislation.com/' + item.a['href']
    urls.append(extension)

# making a soup object of *every* page that is linked
# this may take several seconds
soups = []
for item in urls:
    site = requests.get(item)
    html_code = site.content
    soup = BeautifulSoup(html_code, 'lxml')
    soups.append(soup)


legiscan_links = []
congress_links = []
for item in soups:
    # we are getting two links here, one to legiscan and one to the congress website
    links = item.find_all('a', class_='chakra-link css-oga2ct')
    anchor1 = links[0]['href'] # link to legiscan
    legiscan_links.append(anchor1)
    anchor2 = links[1]['href'] # link to congress
    congress_links.append(anchor2)

In [5]:
# creating a dataframe, with separate columns to hold each of our lists
df24 = pd.DataFrame(
    {'title': titles,
     'caption': captions,
     'category': categories,
     'description': descriptions,
     'url': urls,
     'legiscan': legiscan_links,
     'congress': congress_links
    })

In [6]:
df24.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        87 non-null     object
 1   caption      87 non-null     object
 2   category     87 non-null     object
 3   description  87 non-null     object
 4   url          87 non-null     object
 5   legiscan     87 non-null     object
 6   congress     87 non-null     object
dtypes: object(7)
memory usage: 4.9+ KB


In [7]:
df24

Unnamed: 0,title,caption,category,description,url,legiscan,congress
0,US HB10075,Stopping the Mutilation of Children Act of 2024,HEALTHCARE,Stopping the Mutilation of Children Act of 2024,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10075/id/3025659,https://www.congress.gov/bill/118th-congress/h...
1,US HB10186,Protecting Women’s Private Spaces Act,BIRTH CERTIFICATES,Protecting Women’s Private Spaces Act,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10186/id/3029142,https://www.congress.gov/bill/118th-congress/h...
2,US HB10290,Stop the Invasion of Women’s Spaces Act,INCARCERATION,Stop the Invasion of Women’s Spaces Act,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB10290/id/3038024,https://www.congress.gov/bill/118th-congress/h...
3,US HB1064,Ensuring Military Readiness Act of 2023,MILITARY,Ensuring Military Readiness Act of 2023,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB1064/id/2737306,https://www.congress.gov/bill/118th-congress/h...
4,US HB1112,Ensuring Military Readiness Act of 2023,MILITARY,Ensuring Military Readiness Act of 2023,https://translegislation.com//bills/2024/US/HB...,https://legiscan.com/US/text/HB1112/id/2742708,https://www.congress.gov/bill/118th-congress/h...
...,...,...,...,...,...,...,...
82,US SJR90,A joint resolution providing for congressional...,HEALTHCARE,A joint resolution providing for congressional...,https://translegislation.com//bills/2024/US/SJR90,https://legiscan.com/US/text/SJR90/id/3003899,https://www.congress.gov/bill/118th-congress/s...
83,US SJR96,A joint resolution providing for congressional...,EDUCATION,A joint resolution providing for congressional...,https://translegislation.com//bills/2024/US/SJR96,https://legiscan.com/US/text/SJR96/id/3009679,https://www.congress.gov/bill/118th-congress/s...
84,US SR267,A resolution supporting the designation of the...,SPORTS,A resolution supporting the designation of the...,https://translegislation.com//bills/2024/US/SR267,https://legiscan.com/US/text/SR267/id/2831179,https://www.congress.gov/bill/118th-congress/s...
85,US SR53,A resolution establishing a Women's Bill of Ri...,CIVIL RIGHTS,A resolution establishing a Women's Bill of Ri...,https://translegislation.com//bills/2024/US/SR53,https://legiscan.com/US/text/SR53/id/2696872,https://www.congress.gov/bill/118th-congress/s...


In [8]:
import os

# Get the current working directory
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: /Users/michaelajackson/Desktop/Antitrans/GA-Trans-Leg


In [19]:
df24.to_csv('/Users/michaelajackson/Desktop/Antitrans/GA-Trans-Leg/2024CSV')

In [21]:
#restarting the process for 2025
site25 = requests.get('https://translegislation.com/bills/2025/US') # gets the URL
html_code25 = site25.content # saves the HTML code
soup25 = BeautifulSoup(html_code25, 'lxml') # creates a soup object

In [23]:
# runs the loop on the bill cards
bill_cards25 = soup25.find_all('div', class_ ='css-4rck61')

# creating empty lists to hold all of our data
titles25 = []
captions25 = []
categories25 = []
descriptions25 = []

# extracting the data from the bill cards
for item in bill_cards25:
    title25 = item.h3.text
    category25 = item.find('span').text
    caption25 = item.h2.text
    if item.h2.text is not None:
        description25 = item.h2.text
    else:
        description25 = 'No bill description'
    
    # adding the items to the empty lists
    titles25.append(title25)
    categories25.append(category25)
    captions25.append(caption25)
    descriptions25.append(description25)
    # remember that "legiscan_links" is already saved as a list, so we don't have to create it here

In [39]:
urls25 = []
for item in bill_cards25:
    extension25 = 'https://translegislation.com/' + item.a['href']
    urls25.append(extension25)

# making a soup object of *every* page that is linked
# this may take several seconds
soups25 = []
for item in urls25:
    site25 = requests.get(item)
    html_code25 = site25.content
    soup25 = BeautifulSoup(html_code25, 'lxml')
    soups25.append(soup25)

legiscan_links25 = []
congress_links25 = []
for item in soups25:
    # we are getting two links here, one to legiscan and one to the congress website and adding if statement for bills with no links
    links25 = item.find_all('a', class_='chakra-link css-oga2ct')
    # Default values
    legiscan_link = "Not available"
    congress_link = "Not available"
    
    # Check each link
    for link in links25:
        href = link.get('href', '')
        # If it’s a Legiscan link
        if "legiscan.com" in href:
            legiscan_link = href
        # If it’s a Congress link
        elif "congress.gov" in href:
            congress_link = href

    legiscan_links.append(legiscan_link)
    congress_links.append(congress_link)

In [41]:
print("Titles:", len(titles))
print("Captions:", len(captions))
print("Categories:", len(categories))
print("Descriptions:", len(descriptions))
print("URLs:", len(urls))
print("Legiscan links:", len(legiscan_links))
print("Congress links:", len(congress_links))

Titles: 87
Captions: 87
Categories: 87
Descriptions: 87
URLs: 87
Legiscan links: 150
Congress links: 150


In [31]:
# creating a dataframe, with separate columns to hold each of our lists
df25 = pd.DataFrame(
    {'title': titles25,
     'caption': captions25,
     'category': categories25,
     'description': descriptions25,
     'url': urls25,
     'legiscan': legiscan_links25,
     'congress': congress_links25
    })

ValueError: All arrays must be of the same length

In [None]:
df25.info()

In [None]:
df25

In [None]:
df24.to_csv('/Users/michaelajackson/Desktop/Antitrans/GA-Trans-Leg/2024CSV')

In [None]:
#combining both dfs into 1
df_combined = pd.concat([df, df25], ignore_index=True)

In [None]:
df_combined

print(df_combined['congress'][12])

In [None]:
#sorting values by session number in congress url
df_combined.sort_values(by='session', ascending=False, inplace=True)
df_combined.tail()

In [None]:
#Government website has bot blocks to the code in the block does not work

# Create a new column with "/all-info" replaced by "/text"
#df_combined['text'] = df_combined['congress'].str.replace('/all-info', '/text/is?format=txt', regex=False)

#print(df_combined['text'][30])

#df_combined.info()

#df_combined.head()

#creating function for extracting text from links in text cloumn

#def extract_text(url):
   # try:
   #     sitetext = requests.get(url) # gets the URL
   #     html_codetext = sitetext.content # saves the HTML code
   #     souptext = BeautifulSoup(html_codetext, 'lxml') # creates a soup object

# Find the <pre> tag containing the bill text
     #   bill_text = souptext.find('pre', {'id': 'billTextContainer'})
        
      #  return bill_text.get_text(strip=True) if bill_text else "Bill text not found"
  #  except requests.RequestException as e:
      #  return f"Error: {e}"  # Handle request errors for sites with pdfs instead of html

#df_combined['bill_text'] = df_combined['text'].apply(extract_text)

In [None]:
#creating new column named "session" that pulls the congress session number (either 117 or 118) from the urls in congress column
import re

# Function to extract the Congress session from a URL
def extract_session(url):
    match = re.search(r'/(\d+)(?:th)?-congress/', url)
    return int(match.group(1)) if match else None

In [None]:
df_combined['session'] = df_combined['congress'].apply(extract_session)

In [None]:
print(df_combined['session'].unique())

In [None]:
df_combined.sample(20)

In [None]:
#renaming columns

In [None]:
#Creating new urls that link to the bills xml site

# function that contains a loop to insert bill numbers
# into the URL, then to grab the content and add to a new list



In [None]:
#checking if function works


In [None]:
#Cleaning full text 

In [None]:
#applying to combined_df and creating a new full text column