In [None]:
import re
from io import BytesIO

import pandas as pd
import requests

In [1]:
# Define a function that downloads Excel files and combines them into a single DataFrame.

# Takes as input a list of URLs and the rows to skip (a count or a list of row indices), which vary by year.

def _year_from_url(url):
    """Return the last two digits of the first number in the URL's file name, or None."""
    filename = url.rsplit('/', 1)[-1]
    match = re.search(r'\d+', filename)
    return match.group()[-2:] if match else None


def download_excel_files(urls, rows_to_skip, sheet_number=1, timeout=60):
    """Download Excel workbooks and stack one sheet from each into a single DataFrame.

    Parameters
    ----------
    urls : str or iterable of str
        URL(s) of the workbooks to download. A single string is treated as a
        one-element list instead of being iterated character by character.
    rows_to_skip : int or list of int
        Forwarded unchanged to ``pd.read_excel(skiprows=...)``.
    sheet_number : int or str, default 1
        Forwarded unchanged to ``pd.read_excel(sheet_name=...)``. Integers are
        zero-based sheet positions, so the default reads the second sheet.
    timeout : float, default 60
        Seconds passed to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        The row-wise concatenation of every successfully downloaded sheet, with
        two extra columns: ``source_file`` (``"file_<n>"``, where ``n`` is the
        1-based position of the URL in ``urls``) and ``year`` (two-digit string
        taken from the file name, or None when the name contains no digits).
        Returns None when nothing could be downloaded.
    """
    # Accept a lone URL so callers do not have to wrap it in a list themselves.
    if isinstance(urls, str):
        urls = [urls]

    all_data = []

    for i, url in enumerate(urls, 1):
        response = requests.get(url, timeout=timeout)

        # A failed download is reported and skipped so the remaining files still load.
        if response.status_code != 200:
            print(f"Failed to download file from {url}. Status code: {response.status_code}")
            continue

        # Parse the workbook straight from memory; nothing is written to disk.
        df = pd.read_excel(
            BytesIO(response.content),
            skiprows=rows_to_skip,
            sheet_name=sheet_number,
        )

        # Tag every row with its origin so files remain distinguishable after concatenation.
        df['source_file'] = f"file_{i}"
        # File names come both with and without an underscore after the year
        # (e.g. "spcl13utildatafinal.xlsx" and "spcl17_util_data_final.xlsx"),
        # so the year is located by its digits rather than by splitting on "_".
        df['year'] = _year_from_url(url)

        all_data.append(df)
        print(f"Downloaded and processed file {i}")

    if not all_data:
        print("No data was successfully downloaded.")
        return None

    # Combine all DataFrames into one with a fresh 0..n-1 index.
    return pd.concat(all_data, ignore_index=True)

In [2]:
# I attempted to use BeautifulSoup to scrape the specialty care clinic file URLs, but neither XPath nor CSS selectors worked reliably.

# I split the urls into 2 lists, pre_2018_urls and post_2018_urls, on account of inconsistent data structure.

# I also define a data dictionary url to create a dataframe that can be used to standardize column names between the newer and older datasets. 

# The plan is to create two dataframes for pre- and post-2018, modify the former's structure to match the latter, and combine.

# Define a list of urls for files pertaining to 2013–2017

# Every workbook lives under the same CHHS dataset; only the resource id and file name differ.
_CHHS_RESOURCE_ROOT = "https://data.chhs.ca.gov/dataset/17bbc0b0-869e-4168-b03b-48fa60c78577/resource/"

# Workbooks for 2013–2017, oldest first.
pre_2018_urls = [
    _CHHS_RESOURCE_ROOT + "896c699c-07fc-4049-bda0-ff98ac8e3913/download/spcl13utildatafinal.xlsx",
    _CHHS_RESOURCE_ROOT + "91fa31b7-8f40-47f1-8bca-bbc063221993/download/spcl14utildatafinal.xlsx",
    _CHHS_RESOURCE_ROOT + "171f7631-4cb2-4b20-b238-d5ab3512ae10/download/spcl15utildatafinal.xlsx",
    _CHHS_RESOURCE_ROOT + "c6a99713-427a-44df-947d-d46c3402a4d6/download/spcl16_util_data_final-ver2.xlsx",
    _CHHS_RESOURCE_ROOT + "e7a2def1-c0dd-41af-a283-46e095bc0af2/download/spcl17_util_data_final.xlsx",
]

# Workbooks for 2018–2023, newest first.
post_2018_urls = [
    _CHHS_RESOURCE_ROOT + "8ad9b464-cbbd-4ad5-b37d-d2daa924768b/download/spcl23_util_data_prelim.xlsx",
    _CHHS_RESOURCE_ROOT + "00a9d637-d75a-4ba5-9ed5-87bb01f3a6e3/download/spcl22_util_data_final.xlsx",
    _CHHS_RESOURCE_ROOT + "f6339c46-8e35-4466-b972-ce132c43cbf4/download/spcl21_util_data_final.xlsx",
    _CHHS_RESOURCE_ROOT + "9c883633-b661-4da3-b39f-50536f60e573/download/spcl20_util_data_final.xlsx",
    _CHHS_RESOURCE_ROOT + "188b31e3-2307-479e-9bee-632083f902ba/download/spcl19_util_data_final.xlsx",
    _CHHS_RESOURCE_ROOT + "e891cdff-6092-4316-b406-dcbcf4a9c016/download/spcl18_util_data_final.xlsx",
]

# The data dictionary URL points at the same workbook as the 2019 entry above.
data_dictionary_url = (
    _CHHS_RESOURCE_ROOT + "188b31e3-2307-479e-9bee-632083f902ba/download/spcl19_util_data_final.xlsx"
)

In [3]:
# 2013–2017 workbooks: rows_to_skip is handed to pandas as a list of 0-indexed row numbers to drop.
pre_2018_df = download_excel_files(pre_2018_urls, rows_to_skip = [1,2,3])

# 2018–2023 workbooks: one more row is dropped than for the older files.
post_2018_df = download_excel_files(post_2018_urls, rows_to_skip = [1,2,3,4])

# Pass the single data-dictionary URL as a one-element list, matching the
# list-of-URLs argument used by the two calls above. sheet_number = 3 selects
# the fourth sheet (pandas sheet positions are zero-based).
data_dictionary = download_excel_files([data_dictionary_url], rows_to_skip = 0, sheet_number = 3)

In [None]:
# Preview the first five rows of the combined 2013–2017 data.
pre_2018_df.head(n=5)

In [None]:
# Preview the first five rows of the combined 2018–2023 data instead of rendering the whole frame.
post_2018_df.head()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9a5367f9-570e-4fa7-91cf-2de38c70a230' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>