In [1]:
!pip install pdfplumber pandas python-dotenv google-cloud-aiplatform

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import pandas as pd
import pdfplumber
import glob
import requests
import json
import re
from dotenv import load_dotenv

load_dotenv()

False

In [4]:
#meant to be outside of notebook. unsure how to do that
os.environ['GOOGLE_API_KEY'] = 'AIzaSyBoVVnSkze0Ypu8C0bZmwPk--j4aEFjYtI'  # replace with actual key

In [5]:
#pdf folder, output, and api key locations. load api key as env var
PDF_FOLDER = os.getenv('PDF_FOLDER', '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills')
OUTPUT_CSV = '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bill_summary.csv'

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

# ingest team-descriptions document

In [6]:
#check and see if all pdf files are found
pdf_paths = sorted(glob.glob(os.path.join(PDF_FOLDER, "*.pdf")))
pdf_paths

['/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills/20210SB156_96.pdf',
 '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills/20250AB1018_97.pdf',
 '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills/20250SB17_97.pdf',
 '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills/20250SJR3_99.pdf',
 '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills/Bill Text - AB-1049 California Food Assistance Program_ sponsor deeming rules..pdf',
 '/content/drive/MyDrive/OfficeInnovation_BillAnalysis/bills/California-2023-AB904-Chaptered.pdf']

#Model and needed functions

In [7]:
#create REST url
GEMINI_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-1.5-flash:generateContent"
    f"?key={GOOGLE_API_KEY}"
)

#call LLM function
def call_llm(prompt: str, max_output_tokens: int = 512, temperature: float = 0.2) -> str:

    body = {
        "contents": [{"parts": [{"text": prompt}]}], # gemini expected format
        "generationConfig": {
            "temperature": temperature,       #controls the randomness from 0-1
            "maxOutputTokens": max_output_tokens, #limits response length
            "topP": 0.8,            #nucleus sampling, considers only top 80% of tokens
            "topK": 10              #only consider the top 10 likely tokens
        }
    }

    #send request to gemini api with json body
    resp = requests.post(GEMINI_URL, json=body, headers={"Content-Type": "application/json"})
    resp.raise_for_status() #keep here in case of http request fails

    #parse json response and extract generated text
    response_data = resp.json()
    return response_data["candidates"][0]["content"]["parts"][0]["text"]

In [8]:
def extract_metadata(text: str) -> dict:
    #prompt, must be in json structure
    prompt = (
        "Extract the bill number and sponsors from this bill. "
        "Return only JSON in this format: "
        '{"bill_number":"SB 156", "sponsors":["Name1", "Name2"]}\n\n' + text
    )
    raw = call_llm(prompt, max_output_tokens=256)

    #was running into issue where this would return as a markdown so remove the block formatting
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.split('\n', 1)[1] #removes first line of json
    if cleaned.endswith("```"):
        cleaned = cleaned.rsplit('\n', 1)[0] #removes last

    #put cleaned json string into python dict
    data = json.loads(cleaned)
    return {
        "bill_number": data.get("bill_number"), #gets the bill num
        "sponsors": data.get("sponsors", [])  #gets sponsors list (was running into issues where the sponsors list never appears)
    }

In [9]:
def extract_text_from_pdf(path: str) -> str:
    texts = []
    with pdfplumber.open(path) as pdf:  #open pdf
        for page in pdf.pages:        #loop each page
            text = page.extract_text()  #extract text from current page
            if text:            #only add non empty text (had issue where some text was empty?)
                texts.append(text)
    return "\n\n".join(texts) #combine all pages with double line breaks

def summarize_bill(text: str) -> str:
    prompt = "Summarize this bill in 3-5 sentences:\n\n" + text #prompt llm to create summary of the pdf
    return call_llm(prompt, max_output_tokens=256)

def extract_keywords(text: str) -> str:
    prompt = "List the main topics and keywords from this bill, comma-separated:\n\n" + text  #prompt llm to create keywords
    return call_llm(prompt, max_output_tokens=200)

#team names and descriptions of team
TEAM_DESCRIPTIONS = {
    "Data + Policy": "Data handling, analysis, dashboards, AI policy, digital services, technology policy, accessibility, public notices",
    "Direct File": "State tax filing, IRS Direct File program, Treasury department, tax-related legislation",
    "Doula Medicaid Enrollment": "Doula care under Medicaid, managed care organizations (MCOs), Department of Human Services",
    "Food Security": "Nutrition benefits, SNAP, school lunch programs, EBT, Summer EBT, WIC, USDA programs, food assistance"
}


def match_teams(text: str) -> str:
  #context string with all team descriptions
    context = "\n".join(f"{team}: {desc}" for team, desc in TEAM_DESCRIPTIONS.items())
    #ask the llm to match the bill to one of the teams we have in our list
    prompt = f"Based on these teams:\n{context}\n\nBill:\n{text}\n\nWhich teams are relevant? Return comma-separated team names."
    return call_llm(prompt, max_output_tokens=128)

# Process pdf function

In [10]:
print(f"Processing {len(pdf_paths)} bills...")

results = []
for i, pdf_path in enumerate(pdf_paths, 1):#loop each pdf file
    print(f"Processing {i}/{len(pdf_paths)}: {os.path.basename(pdf_path)}")#show progress

    bill_text = extract_text_from_pdf(pdf_path) #extract text from pdf
    metadata = extract_metadata(bill_text)  #get bill num and sponsors from llm

    #creates a new row for each pdf
    results.append({
        "Bill number": metadata.get("bill_number"),
        "Sponsors/Co-sponsors": ", ".join(metadata.get("sponsors", [])),
        "Short bill summary": summarize_bill(bill_text),
        "Topics and Keywords": extract_keywords(bill_text),
        "Relevant Team Names": match_teams(bill_text)
    })
  #convert results list to df
df = pd.DataFrame(results)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Results saved to: {OUTPUT_CSV}")
df.head()

Processing 6 bills...
Processing 1/6: 20210SB156_96.pdf
Processing 2/6: 20250AB1018_97.pdf
Processing 3/6: 20250SB17_97.pdf
Processing 4/6: 20250SJR3_99.pdf
Processing 5/6: Bill Text - AB-1049 California Food Assistance Program_ sponsor deeming rules..pdf
Processing 6/6: California-2023-AB904-Chaptered.pdf
Results saved to: /content/drive/MyDrive/OfficeInnovation_BillAnalysis/bill_summary.csv


Unnamed: 0,Bill number,Sponsors/Co-sponsors,Short bill summary,Topics and Keywords,Relevant Team Names
0,SB 156,Committee on Budget and Fiscal Review,"Senate Bill 156, enacted in July 2021, address...","Communications, broadband, broadband infrastru...",Data + Policy\n
1,AB 1018,"Assembly Member Bauer-Kahan, Assembly Members ...","Assembly Bill 1018, the Automated Decisions Sa...","Automated decision systems, artificial intelli...",Data + Policy\n
2,SB 17,"Ochoa Bogh, Grove, Valladares, Hurtado, Seyarto",Senate Bill 17 amends California's Revenue and...,"Personal income taxes, deductions, tips, tax l...","Data + Policy, Direct File\n"
3,SJR 3,"Arreguín, Weber Pierson",Senate Joint Resolution 3 urges the U.S. Congr...,"Food assistance, Supplemental Nutrition Assist...",Food Security\n
4,AB-1049,"Celeste Rodriguez, Lee",AB-1049 amends California's Food Assistance Pr...,"California Food Assistance Program (CFAP), Cal...","Food Security, Data + Policy\n"
