# test
\
While updating the DPA repo and looking for new reports, I noticed that the reference table coming out of the `segment` task has no rows.

Upon further inspection with a fresh clone of the repo, the DPA link used in the `scrape` task is still working, but the webpage looks slightly different. I think the yearly links should still be working, but something is going wrong from the get-go.



In [1]:
# dependencies
import re
import hashlib
from random import randint
import yaml
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def read_yaml(yaml_file):
    with open(yaml_file, 'r') as f:
        data = yaml.safe_load(f)
        f.close()
    return data


def get_user_agent():
    idx = randint(0, len(users)-1)
    return users[idx]


# should play nice with any webpage we need step through
def parse_link(url):
    # i had the "allow_redirects" arg set to False but it didnt work?
    res = requests.get(url, headers={
        "User-Agent" : get_user_agent()
})
    html = res.content
    parsed = BeautifulSoup(html, "html.parser")
    print(f"page url:\t{url}")
    try:
        print(f"page title:\t{parsed.title.string}\n")
    except:
        return parsed
    return parsed

# bc we want links, this by default only looks for that type attribute
# flexible enough for the first few rounds of html sifting
def find_content(parsed, tagtype, classname, kw):
    # there are other sections on the page outside this scope
    if classname:
        tags = parsed.find_all(tagtype, class_=classname)
    else:
        tags = parsed.find_all(tagtype)
    if kw:
        return [link['href'] for tag in tags for link in tag.find_all("a")
                 if kw in tag.text.lower()]
    return [link['href'] for tag in tags for link in tag.find_all("a")]


# the initial links are separated by years but all appear under the same section name
# the focus of this work is complaints so that is the default kw used to filter results
def find_complaint_years(parsed, tagtype="div", classname="sfgov-section__content", kw="complaints"):
    return find_content(parsed, tagtype, classname, kw)


# make initial links more organized
# initial links can very in source / host platform, so this separation helps organizes the solution
def sort_links_by_year(links):
    """2025 link is picked up as a partial url missing domain, 
    but other years still work as expected."""
    links = {link[link.find("20"):link.find("20")+4]: link
            for link in links}
    domain = "https://www.sf.gov"
    links = {year: link if ('sf.gov' in link) | ('wayback.archive-it.org' in link)
             else f"{domain}{link}" for year, link in links.items()}
    return links


# this function can be passed to `soup.find_all()` via the `href` arg
# then it filters the soup for links matching the kw
def openness_files(href, kw="openness|CSR"):
    return href and re.compile(kw, flags=re.IGNORECASE).search(href)


# assumes we want to download the full content without doing any filtering
# p sure this would work for not-pdfs but that is the type of file we expect here
def download_file(pdf_url, filename):
    response = requests.get(pdf_url, headers={
        "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
})
    if os.path.exists(filename):
        print(f"*** WARNING: filename '{filename}' already exists. Skipping write. ***\n")
        return 0
    pdf = open(filename, 'wb')
    pdf.write(response.content)
    pdf.close()
    print(f"successful download:\t{pdf_url}\n")
    return 1


def download_pdfs(pdf_links, output_dir):
    print("begin downloading pdfs from list of urls")
    curr = 1
    successes = {pdf_link:None for pdf_link in pdf_links}
    for pdf_link in pdf_links:
        print("------------------------------------")
        print(f"link:\t{curr} of {len(pdf_links)}")
        print("------------------------------------")
        # assumes our save location/env might change
        # + we want to preserve the uploaded pdf's filename
        output_stub = f"{output_dir}"
        corrfname = f"{pdf_link[pdf_link.rfind('files/')+6:]}".replace("/", "-")
        filename = f"{output_stub}/{corrfname}"
        curr += 1
        try:
            print(f"\nattempting to download:\t{pdf_link}")
            attempt = download_file(pdf_link, filename)
            if attempt == 1:
                print(f"\nsaved content as:\n\t\t{filename}\n")
            successes[pdf_link] = filename
        except:
            print(f"\t\t!!\t\tERROR WHILE DOWNLOADING:\t{pdf_link}\n")
            continue
    print()
    print("--->   end of download list   <---")
    return successes


# maybe I'm doing the link scraping wrong but the links need to be prefixed with the domain to work
def add_domain(pdf_links, domain):
    return [f"{domain}{link}" if domain not in link else link for link in pdf_links]


def download_yearly_pdfs(yearly_pdfs, output_dir):
    downloaded = {year:{} for year in yearly_pdfs.keys()}
    print(downloaded.keys())
    for year, pdf_list in yearly_pdfs.items():
        print(f"attempting to download files from {year}")
        downloaded[year] = download_pdfs(pdf_list, output_dir)
        # TODO: check again for 2025 reports; none posted as of 14-JAN-25
        if year != '2025': assert downloaded[year]
    doc_data = [(url, filename) for year, file_status in downloaded.items()
                for url, filename in file_status.items()]
    doc_df = pd.DataFrame(doc_data, columns=['pdf_url', 'pdf_file'])
    return doc_df


def hashid(fname):
    if pd.isna(fname): return None
    with open(fname, 'rb') as f:
        digest = hashlib.file_digest(f, "sha1")
    return digest.hexdigest()[:8]

In [3]:
# main
output_dir = 'output/pdfs'
users = read_yaml("../hand/useragents.yml")

# read data, initial verification
expected = [str(year) for year in range(2020, 2026, 1)]
main_parsed = parse_link("https://sf.gov/information/reports-policing-complaints")

# find_content() and find_complaint_years() take us from 1 home page link
# with other stuff on the page we don't want to scrape
# to a dict of years and the associated urls for that year's complaint reports
yearly_links = find_complaint_years(main_parsed)
yearly_links = sort_links_by_year(yearly_links)

# initial parsing of the links for complaint reports
years_parsed = {year: parse_link(link)
                for year, link in yearly_links.items()}

# since the 2020 link goes to the wayback page
# and the wayback page just organizes the years (1998, 2021) by anchors
# we can just read 2020 and collect all the pdfs
yearly_pdf_links = {year: [tag["href"]
                           for tag in parsed.find_all(href=openness_files)]
                    for year, parsed in years_parsed.items()
                    if year in expected}
yearly_pdf_links = {year:
                    add_domain(parsed, domain="https://sf.gov")
                    if year in ("2025", "2024", "2023", "2022", "2021")
                    else add_domain(parsed, domain="https://wayback.archive-it.org")
                    for year, parsed in yearly_pdf_links.items()}

# we don't need to be downloading the reports if the debugging works, we'll put changes in the script and run from there
#ref_table = download_yearly_pdfs(yearly_pdf_links, output_dir)
#ref_table['fileid'] = ref_table.pdf_file.apply(hashid)

page url:	https://sf.gov/information/reports-policing-complaints
page title:	Reports on policing complaints | SF.gov



### what does the first pass at processing the data result in?

In [4]:
yearly_links

{}

### what are we getting from reading the page?

In [5]:
str(main_parsed)[:1000]

'<!DOCTYPE html>\n<html lang="en"><head><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><title>Reports on policing complaints | SF.gov</title><meta content="3" name="next-head-count"/><link href="/icon.ico" rel="icon"/><link as="font" crossorigin="anonymous" data-next-font="size-adjust" href="/_next/static/media/b5c95a22b4347906-s.p.woff2" rel="preload" type="font/woff2"/><link as="font" crossorigin="anonymous" data-next-font="size-adjust" href="/_next/static/media/d3d85d86ad5c431a-s.p.woff2" rel="preload" type="font/woff2"/><link as="font" crossorigin="anonymous" data-next-font="size-adjust" href="/_next/static/media/12d86e8d7e1c2769-s.p.woff2" rel="preload" type="font/woff2"/><link as="style" href="/_next/static/css/9a18325f5710f76e.css" rel="preload"/><link data-n-g="" href="/_next/static/css/9a18325f5710f76e.css" rel="stylesheet"/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/_next/static/chunks/polyfills-42372ed130431b0a.js"></scr

### okay, so we are finding HTML on the page but our processing is broken.

I made changes ~3 weeks ago to support scraping the new 2025 tab, but there were no reports for 2025 added and there still aren't as of Tue 4 Feb 2025.
- I can confirm I updated the [complaints.xlsx](https://docs.google.com/spreadsheets/d/1-yldlizXlRBGhu2RJEP4ogj6ZCT6QSR7/edit?usp=drive_link&ouid=108469006105963119869&rtpof=true&sd=true) file we share with Zac + team on 15 Jan 2025, so I was able to run the repo at that time and I know since I was fiddling with the scrape task, I would have been cleaning and re-rebuilding the PDF collection from scratch.

So, something changed on the website in the last 3 weeks and completely broke our processing.

### the first step in processing the extracted HTML document is identifying the year tabs with `find_complaint_years()`

In [6]:
def find_complaint_years(parsed, tagtype="div", classname="sfgov-section__content", kw="complaints"):
    return find_content(parsed, tagtype, classname, kw)

In [7]:
find_complaint_years(main_parsed)

[]

In [8]:
str_mainp = str(main_parsed)

In [9]:
'sfgov-section__content' in str_mainp

False

In [10]:
'complaints' in str_mainp

True

### allrrrigght. So the class name changed.

I went and used the Inspect Element tool on the DPA reports page and checked the class name of the 2024 and 2023 links.

As found on the page:
> <a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a>

As code:
`<a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a>`

In [11]:
newclass = 'text-primary500 cursor-pointer'

In [12]:
find_complaint_years(main_parsed, tagtype="div", classname=newclass, kw="complaints")

[]

### Hmm. That didn't work as expected.

In [13]:
newclass in str_mainp

True

In [14]:
str_mainp[str_mainp.find(newclass):str_mainp.find(newclass) + 120]

'text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><'

### Okay. So we're confirming that we can scrape the links from the DPA reports page but there's more to fix than just the classname.

The `find_complaint_years()` method calls `find_content()` on the HTML, so let's look closer at that.

In [15]:
def find_content(parsed, tagtype, classname, kw):
    # there are other sections on the page outside this scope
    if classname:
        tags = parsed.find_all(tagtype, class_=classname)
    else:
        tags = parsed.find_all(tagtype)
    if kw:
        return [link['href'] for tag in tags for link in tag.find_all("a")
                 if kw in tag.text.lower()]
    return [link['href'] for tag in tags for link in tag.find_all("a")]

In [16]:
main_parsed.find_all(tagtype="div", class_=newclass)

[]

More sifted through the HTML to find what the expected section of the page is going by these days.

The full outer section, I think, is this:
> <div class="flex flex-col gap-[36px] md:gap-28 lg:gap-40 col-span-full lg:col-span-7"><span class="flex flex-col space-y-40" as="main"><section class="flex flex-col gap-y-12" data-testid="block-901d7f67-7fb4-4474-bc9a-9161f38c12d4"><p class="mb-16">The Department of Public Accountability (DPA) reports on the complaints we receive about police officers.&nbsp;</p><p class="mb-16">These reports summarize DPA activities. They do not include any identifying information about the people involved.&nbsp;</p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/reports-policing-complaints-2023">2023</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/reports-policing-complaints-2022">2022</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2021/2021-reports-complaints-against-police">2021</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2020">2020</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2019">2019</a>&nbsp;</p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2018">2018</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2017">2017</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2016">2016</a></p></section><section class="flex flex-col gap-y-12" data-testid="block-947fa67b-2c83-4072-b2ac-7d21cee38c33"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body" id="">Audit Reports</h2><p class="mb-16">The DPA Audit Division conducts performance audits of the San Francisco Police Department.</p><p class=""><a class="text-primary500 cursor-pointer" href="https://sf.gov/departments/department-police-accountability/audit-division">Audit Reports</a></p></section><section class="flex flex-col gap-y-12" data-testid="block-9f7d1c6d-5902-49c6-a3f6-2e4e16448a4e"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body" id="">SB 1421 Records</h2><p class="mb-16">The DPA Public Records division is responsible for releasing Senate Bill 1421 cases.</p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/records-of-released-officer-involved-shooting-case-files">SB1421 Officer Involved Shooting Records</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2021/dpa-released-sb-1421-records-0">SB1421 Great Bodily Injury Records</a></p></section><section class="flex flex-col gap-y-12" data-testid="block-492bcfb7-c760-4d0b-a6e9-6883bcd42122"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body" id="">SB 16 Records</h2><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/records-released-unlawful-arrest-or-search-case">SB16 Unlawful Arrest or Search Records</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/biased-policing">SB16 Biased Policing</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/sb-16-excessive-or-unnecessary-force">SB 16 Excessive or Unnecessary Force</a></p></section></span><div id="divisions"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body !mb-20">Departments</h2><div class="flex flex-col md:grid gap-x-28 md:grid-cols-3 "><div class=" pb-20 md:[&amp;:nth-child(3n+1)]:border-r-1  md:[&amp;:nth-child(3n+1)]:pr-28 md:[&amp;:nth-child(3n)]:border-l-1  md:[&amp;:only-child]:border-r-0 md:border-neutral200 md:[&amp;:not(:nth-child(3n+1))]:pl-28  "><a class="text-primary500" href="/departments--department-police-accountability">Department of Police Accountability</a></div><div class=" pb-20 md:[&amp;:nth-child(3n+1)]:border-r-1  md:[&amp;:nth-child(3n+1)]:pr-28 md:[&amp;:nth-child(3n)]:border-l-1  md:[&amp;:only-child]:border-r-0 md:border-neutral200 md:[&amp;:not(:nth-child(3n+1))]:pl-28  "><a class="text-primary500" href="/departments--police-department">Police Department</a></div><div class=" pb-20 md:[&amp;:nth-child(3n+1)]:border-r-1  md:[&amp;:nth-child(3n+1)]:pr-28 md:[&amp;:nth-child(3n)]:border-l-1  md:[&amp;:only-child]:border-r-0 md:border-neutral200 md:[&amp;:not(:nth-child(3n+1))]:pl-28  "><a class="text-primary500" href="/departments--sheriffs-office">Sheriff's Office</a></div></div></div></div>

The specific section we want, as an element copied from the webpage:
> <section class="flex flex-col gap-y-12" data-testid="block-901d7f67-7fb4-4474-bc9a-9161f38c12d4"><p class="mb-16">The Department of Public Accountability (DPA) reports on the complaints we receive about police officers.&nbsp;</p><p class="mb-16">These reports summarize DPA activities. They do not include any identifying information about the people involved.&nbsp;</p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/reports-policing-complaints-2023">2023</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/reports-policing-complaints-2022">2022</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2021/2021-reports-complaints-against-police">2021</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2020">2020</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2019">2019</a>&nbsp;</p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2018">2018</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2017">2017</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2016">2016</a></p></section>

In [17]:
main_parsed.find_all(tagtype="div", class_='flex flex-col gap-[36px] md:gap-28 lg:gap-40 col-span-full lg:col-span-7')

[]

### Okay, this filtering is still not capturing the expected section of the page. Let's review the `find_all()` method and see if we can drop the tagtype parameter.

In [18]:
main_parsed.find_all?

[0;31mSignature:[0m
[0mmain_parsed[0m[0;34m.[0m[0mfind_all[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mattrs[0m[0;34m=[0m[0;34m{[0m[0;34m}[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrecursive[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstring[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlimit[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Look in the children of this PageElement and find all
PageElements that match the given criteria.

All find_* methods take a common set of arguments. See the online
documentation for detailed explanations.

:param name: A filter on tag name.
:param attrs: A dictionary of filters on attribute values.
:param recursive: If this is True, find_all() will perform a

In [19]:
main_parsed.find_all(class_='flex flex-col gap-[36px] md:gap-28 lg:gap-40 col-span-full lg:col-span-7')

[<div class="flex flex-col gap-[36px] md:gap-28 lg:gap-40 col-span-full lg:col-span-7"><span as="main" class="flex flex-col space-y-40"><section class="flex flex-col gap-y-12" data-testid="block-901d7f67-7fb4-4474-bc9a-9161f38c12d4"><p class="mb-16">The Department of Public Accountability (DPA) reports on the complaints we receive about police officers. </p><p class="mb-16">These reports summarize DPA activities. They do not include any identifying information about the people involved. </p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/reports-policing-complaints-2023">2023</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://s

### the section we're extracting with the above function call, as an element

><div class="flex flex-col gap-[36px] md:gap-28 lg:gap-40 col-span-full lg:col-span-7"><span as="main" class="flex flex-col space-y-40"><section class="flex flex-col gap-y-12" data-testid="block-901d7f67-7fb4-4474-bc9a-9161f38c12d4"><p class="mb-16">The Department of Public Accountability (DPA) reports on the complaints we receive about police officers. </p><p class="mb-16">These reports summarize DPA activities. They do not include any identifying information about the people involved. </p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/reports-policing-complaints-2023">2023</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/reports-policing-complaints-2022">2022</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2021/2021-reports-complaints-against-police">2021</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2020">2020</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2019">2019</a> </p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2018">2018</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2017">2017</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2016">2016</a></p></section><section class="flex flex-col gap-y-12" data-testid="block-947fa67b-2c83-4072-b2ac-7d21cee38c33"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body" id="">Audit Reports</h2><p class="mb-16">The DPA Audit Division conducts performance audits of the San Francisco Police Department.</p><p class=""><a class="text-primary500 cursor-pointer" href="https://sf.gov/departments/department-police-accountability/audit-division">Audit Reports</a></p></section><section class="flex flex-col gap-y-12" data-testid="block-9f7d1c6d-5902-49c6-a3f6-2e4e16448a4e"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body" id="">SB 1421 Records</h2><p class="mb-16">The DPA Public Records division is responsible for releasing Senate Bill 1421 cases.</p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/records-of-released-officer-involved-shooting-case-files">SB1421 Officer Involved Shooting Records</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2021/dpa-released-sb-1421-records-0">SB1421 Great Bodily Injury Records</a></p></section><section class="flex flex-col gap-y-12" data-testid="block-492bcfb7-c760-4d0b-a6e9-6883bcd42122"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body" id="">SB 16 Records</h2><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/records-released-unlawful-arrest-or-search-case">SB16 Unlawful Arrest or Search Records</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/biased-policing">SB16 Biased Policing</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/sb-16-excessive-or-unnecessary-force">SB 16 Excessive or Unnecessary Force</a></p></section></span><div id="divisions"><h2 class="font-slab text-heading-xxl lg:text-desktop-heading-xxl text-neutral900 mb-space-body !mb-20">Departments</h2><div class="flex flex-col md:grid gap-x-28 md:grid-cols-3"><div class="pb-20 md:[&amp;:nth-child(3n+1)]:border-r-1 md:[&amp;:nth-child(3n+1)]:pr-28 md:[&amp;:nth-child(3n)]:border-l-1 md:[&amp;:only-child]:border-r-0 md:border-neutral200 md:[&amp;:not(:nth-child(3n+1))]:pl-28"><a class="text-primary500" href="/departments--department-police-accountability">Department of Police Accountability</a></div><div class="pb-20 md:[&amp;:nth-child(3n+1)]:border-r-1 md:[&amp;:nth-child(3n+1)]:pr-28 md:[&amp;:nth-child(3n)]:border-l-1 md:[&amp;:only-child]:border-r-0 md:border-neutral200 md:[&amp;:not(:nth-child(3n+1))]:pl-28"><a class="text-primary500" href="/departments--police-department">Police Department</a></div><div class="pb-20 md:[&amp;:nth-child(3n+1)]:border-r-1 md:[&amp;:nth-child(3n+1)]:pr-28 md:[&amp;:nth-child(3n)]:border-l-1 md:[&amp;:only-child]:border-r-0 md:border-neutral200 md:[&amp;:not(:nth-child(3n+1))]:pl-28"><a class="text-primary500" href="/departments--sheriffs-office">Sheriff's Office</a></div></div></div></div>

### Okay, we're back in business.

Still, we'd like to just capture the section with the info we want, so we don't burden the downstream steps with unexpected report types.

### Let's try with the classname from the specific section's element.

In [20]:
main_parsed.find_all(class_='flex flex-col gap-y-12')[0]

<section class="flex flex-col gap-y-12" data-testid="block-901d7f67-7fb4-4474-bc9a-9161f38c12d4"><p class="mb-16">The Department of Public Accountability (DPA) reports on the complaints we receive about police officers. </p><p class="mb-16">These reports summarize DPA activities. They do not include any identifying information about the people involved. </p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/reports-policing-complaints-2023">2023</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/reports-policing-complaints-2022">2022</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https

### The first item found when used the narrower classname, as an element

><section class="flex flex-col gap-y-12" data-testid="block-901d7f67-7fb4-4474-bc9a-9161f38c12d4"><p class="mb-16">The Department of Public Accountability (DPA) reports on the complaints we receive about police officers. </p><p class="mb-16">These reports summarize DPA activities. They do not include any identifying information about the people involved. </p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource--2025--reports-policing-complaints-2025">2025</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://www.sf.gov/resource/2024/reports-policing-complaints-2024">2024</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2023/reports-policing-complaints-2023">2023</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2022/reports-policing-complaints-2022">2022</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://sf.gov/resource/2021/2021-reports-complaints-against-police">2021</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2020">2020</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2019">2019</a> </p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2018">2018</a></p><p class="mb-16"><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2017">2017</a></p><p class=""><a class="text-primary500 cursor-pointer" href="https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2016">2016</a></p></section>

### Perfect. Okay, time to update the methods in the scrape script.

In [21]:
def find_content(parsed, classname, kw):
    # there are other sections on the page outside this scope
    if classname:
        tags = parsed.find_all(class_=classname)
    else:
        tags = parsed.find_all()
    if kw:
        return [link['href'] for tag in tags for link in tag.find_all("a")
                 if kw in tag.text.lower()]
    return [link['href'] for tag in tags for link in tag.find_all("a")]


def find_complaint_years(parsed, classname="flex flex-col gap-y-12", kw="complaints"):
    return find_content(parsed, classname, kw)

In [22]:
# read data, initial verification
expected = [str(year) for year in range(2020, 2026, 1)]
main_parsed = parse_link("https://sf.gov/information/reports-policing-complaints")

# find_content() and find_complaint_years() take us from 1 home page link
# with other stuff on the page we don't want to scrape
# to a dict of years and the associated urls for that year's complaint reports
yearly_links = find_complaint_years(main_parsed)
yearly_links = sort_links_by_year(yearly_links)

# initial parsing of the links for complaint reports
years_parsed = {year: parse_link(link)
                for year, link in yearly_links.items()}

# since the 2020 link goes to the wayback page
# and the wayback page just organizes the years (1998, 2021) by anchors
# we can just read 2020 and collect all the pdfs
yearly_pdf_links = {year: [tag["href"]
                           for tag in parsed.find_all(href=openness_files)]
                    for year, parsed in years_parsed.items()
                    if year in expected}
yearly_pdf_links = {year:
                    add_domain(parsed, domain="https://sf.gov")
                    if year in ("2025", "2024", "2023", "2022", "2021")
                    else add_domain(parsed, domain="https://wayback.archive-it.org")
                    for year, parsed in yearly_pdf_links.items()}
#ref_table = download_yearly_pdfs(yearly_pdf_links, output_dir)
#ref_table['fileid'] = ref_table.pdf_file.apply(hashid)

page url:	https://sf.gov/information/reports-policing-complaints
page title:	Reports on policing complaints | SF.gov

page url:	https://www.sf.gov/resource--2025--reports-policing-complaints-2025
page title:	Reports on policing complaints in 2025 | SF.gov

page url:	https://www.sf.gov/resource/2024/reports-policing-complaints-2024
page title:	Reports on policing complaints in 2024 | SF.gov

page url:	https://sf.gov/resource/2023/reports-policing-complaints-2023
page title:	Reports on policing complaints in 2023 | SF.gov

page url:	https://sf.gov/resource/2022/reports-policing-complaints-2022
page title:	Reports on policing complaints in 2022 | SF.gov

page url:	https://sf.gov/resource/2021/2021-reports-complaints-against-police
page title:	Reports on policing complaints in 2021 | SF.gov

page url:	https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2020
page title:	Reports & Statistics | Department of Police Accountability

page url:	https://wayback.archiv

In [23]:
yearly_links

{'2025': 'https://www.sf.gov/resource--2025--reports-policing-complaints-2025',
 '2024': 'https://www.sf.gov/resource/2024/reports-policing-complaints-2024',
 '2023': 'https://sf.gov/resource/2023/reports-policing-complaints-2023',
 '2022': 'https://sf.gov/resource/2022/reports-policing-complaints-2022',
 '2021': 'https://sf.gov/resource/2021/2021-reports-complaints-against-police',
 '2020': 'https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2020',
 '2019': 'https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2019',
 '2018': 'https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2018',
 '2017': 'https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2017',
 '2016': 'https://wayback.archive-it.org/org-571/3/https://sfgov.org/dpa/reports-statistics#2016'}

In [24]:
yearly_pdf_links

{'2025': [],
 '2024': ['https://sf.govhttps://media.api.sf.gov/documents/Jan_Openness_24.xlsx.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/Feb_Openness.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/CSR_03-24_Redact_0.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/CSR_Redacted_4.24_0.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/CSR_Redacted_05_-2024.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/Readacted_CSR_6-2024.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/CSR_Redacted_07.2024.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/August_2024_openness_cases.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/Sept_Openness_2024.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/Oct_Openness_2024.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/Nov_openness_2024_SDYnXrN.pdf',
  'https://sf.govhttps://media.api.sf.gov/documents/Dec_Openness_2024.pdf'],
 '2023': ['https://sf.govhttps://medi

# Review

- [x] updated initial classname from 'sfgov-section__content' to 'flex flex-col gap-y-12'
- [x] dropped the `tagtype` parameter from filtering
- [x] applied fixes to scrape script and confirmed we're back to collecting the yearly PDFs as expected

# Comments

After implementing these changes in the script, I found PDF URL values that were missing a corresponding PDF file path in the repo, implying that there was a problem with downloading.

I reviewed the log outputs during a scrape call and noticed the naming convention of the links had changed for the more recent years (the ones I was prefixing with the "sf.gov" domain rather than the wayback machine domain). Ex) "https://media.api.sf.gov/documents/Jan_Openness_24.xlsx.pdf".

These more recent years now come with a domain/host already included, so I dropped the domain prefix in `add_domain()` by changing the domain arg to an empty str.

By the time we download the file locally, we need to clean the link by removing: ':--media.api.sf.gov-documents-'.

Then things are fully back to normal.