In [210]:
import polars as pl
from datetime import datetime
import pyarrow.parquet as pq


In [240]:
# URL of the Excel file
url_2022 = "https://elections.cdn.sos.ca.gov/sov/2022-general/ssov/ballot-measures-political-districts.xlsx"

url_2020 = "https://elections.cdn.sos.ca.gov/sov/2020-general/ssov/ballot-measure-all.xlsx"

url_2018 = "https://elections.cdn.sos.ca.gov/sov/2018-general/ssov/ballot-measures-pol-districts.xls"

# Function processing Excel files 
# Returns a combined DataFrame

def process_excel_files(urls_and_years):
    combined_df = pl.DataFrame()
    
    for url, year in urls_and_years:
        # Read the Excel file
        df = pl.read_excel(url)
        
        # Get the first two columns
        # This works because the first two columns are always the county and sub_county
        first_two_cols = df.columns[:2]
        
        # The below code chunk finds columns with "Kidney" or "Dialysis" (yes votes) and their following columns
        # This works because the yes vote/no vote columns appear consecutively in the original data
        
        # Initialize an empty list to store columns related to kidney/dialysis propositions
        kidney_dialysis_cols = []

        # Iterate through all the columns in the DataFrame
        for i, col in enumerate(df.columns):
            # Checking if the current column contains "Kidney" or "Dialysis" in any of its values
            if df[col].str.contains("Kidney|Dialysis").any():
                # Adding matching columns to the list
                kidney_dialysis_cols.extend([
                    col,  # yes vote column
                    df.columns[i+1] if i+1 < len(df.columns) else None  # no vote column
                ])
        
        # Combining all required columns
        selected_cols = first_two_cols + kidney_dialysis_cols
        
        # Select only the required columns
        df_processed = df.select(selected_cols)
        
        # Renaming columns
        new_names = {
            df_processed.columns[0]: "county",
            df_processed.columns[1]: "sub_county"
        }
        for col in df_processed.columns[2:]:
            if "Proposition" in col:
                new_names[col] = "yes"
            else:
                new_names[col] = "no"
        
        df_processed = df_processed.rename(new_names)
        
        # Add the year column
        df_processed = df_processed.with_columns(pl.lit(year).alias("year"))
        
        # Append to the combined DataFrame
        combined_df = pl.concat([combined_df, df_processed])
    
    return combined_df

# URLs and corresponding years
urls_and_years = [
    (url_2022, 2022),
    (url_2020, 2020),
    (url_2018, 2018)
]

# Process all files and store the combined DataFrame as df
df = process_excel_files(urls_and_years)

# Display the first few rows of the combined DataFrame
print("Combined Data:")
print(combined_df.head())

# Displaying some basic information about the combined DataFrame
# This should help with data validation
print("\nDataFrame Info:")
print(f"Shape: {combined_df.shape}")
print(f"Columns: {combined_df.columns}")
print("\nYear distribution:")
print(combined_df.group_by("year").agg(pl.count()).sort("year"))

Combined Data:
shape: (5, 5)
┌────────────────┬───────────────┬─────────────────────┬────────────────────┬──────┐
│ county         ┆ sub_county    ┆ yes                 ┆ no                 ┆ year │
│ ---            ┆ ---           ┆ ---                 ┆ ---                ┆ ---  │
│ str            ┆ str           ┆ str                 ┆ str                ┆ i32  │
╞════════════════╪═══════════════╪═════════════════════╪════════════════════╪══════╡
   ┆ null               ┆ 2022 │ ┆ Regulates Kidney
│                ┆               ┆ Dialysis Cli…       ┆                    ┆      │
│ null           ┆ null          ┆ YES                 ┆ NO                 ┆ 2022 │
│ Alameda County ┆ null          ┆ null                ┆ null               ┆ 2022 │
│ null           ┆ County Totals ┆ 182697              ┆ 290746             ┆ 2022 │
│ null           ┆ Percent       ┆ 0.38589017051683094 ┆ 0.6141098294831691 ┆ 2022 │
└────────────────┴───────────────┴─────────────────────┴─────────────

  print(combined_df.group_by("year").agg(pl.count()).sort("year"))


In [241]:
# Forward filling county names

df = df.with_columns(pl.col("county").forward_fill())


In [242]:

# Creating a column to identify the geopolitical entity of observations

df = df.with_columns(
    pl.when(pl.col("sub_county") == "Cities")
    .then(pl.lit("city"))
    .when(pl.col("sub_county") == "County Totals")
    .then(pl.lit("city_end"))
    .when(pl.col("sub_county").str.contains("State Senate"))
    .then(pl.lit("state_senate_district"))
    .when(pl.col("sub_county").str.contains("State Assembly"))
    .then(pl.lit("state_assembly_district"))
    .when(pl.col("sub_county").str.contains("Congressional"))
    .then(pl.lit("federal_congressional_district"))
    .when(pl.col("sub_county").str.contains("County Supervisorial"))
    .then(pl.lit("county_supervisorial_district"))
    .otherwise(None)
    .alias("geo_type")
).with_columns(pl.col("geo_type").forward_fill())

df = (df.with_columns(
    pl.col("sub_county").str.extract(r"\d+", 0).alias("sub_county_id"))
) 

# Stripping the "County" suffix from the county names
df = (df.with_columns(
    pl.col("county").str.strip_suffix(" County").alias("county"))
)

# Stripping the "County1" suffix from the county names
# This is necessary because County1 is an erroneous suffix in the original data

df = (df.with_columns(
    pl.col("county").str.strip_suffix(" County1").alias("county"))
)



In [243]:
df

county,sub_county,yes,no,year,geo_type,sub_county_id
str,str,str,str,i32,str,str
,,"""Regulates Kidney Dialysis Cli…",,2022,,
,,"""YES""","""NO""",2022,,
"""Alameda""",,,,2022,,
"""Alameda""","""County Totals""","""182697""","""290746""",2022,"""city_end""",
"""Alameda""","""Percent""","""0.38589017051683094""","""0.6141098294831691""",2022,"""city_end""",
"""Alameda""","""County Supervisorial 1""","""31161""","""65845""",2022,"""county_supervisorial_district""","""1"""
"""Alameda""","""County Supervisorial 2""","""28778""","""44258""",2022,"""county_supervisorial_district""","""2"""
"""Alameda""","""County Supervisorial 3""","""34338""","""48590""",2022,"""county_supervisorial_district""","""3"""
"""Alameda""","""County Supervisorial 4""","""33897""","""60759""",2022,"""county_supervisorial_district""","""4"""
"""Alameda""","""County Supervisorial 5""","""54523""","""71294""",2022,"""county_supervisorial_district""","""5"""


In [244]:
# Get the original number of rows for data validation
original_row_count = df.shape[0]

original_geo_type_count = df.select("geo_type").unique().count()

original_sub_county_count = df.select("sub_county").unique().count()

# Drop rows where 'geo_type' is null or 'city_end', and where 'sub_county' is 'Cities'
df_cleaned = (df
    .drop_nulls(subset=['geo_type'])
    .filter((pl.col('geo_type') != 'city_end') & (pl.col('sub_county') != 'Cities'))
)

# Get the new number of rows
new_row_count = df_cleaned.shape[0]

new_geo_type_count = df_cleaned.select("geo_type").unique().count()

new_sub_county_count = df_cleaned.select("sub_county").unique().count()

# Calculate the number of dropped rows
dropped_row_count = original_row_count - new_row_count
dropped_geo_type_count = original_geo_type_count - new_geo_type_count
dropped_sub_county_count = original_sub_county_count - new_sub_county_count

# Create a DataFrame to show the results
result_df = pl.DataFrame({
    'Original Rows': [original_row_count],
    'Rows After Cleaning': [new_row_count],
    'Rows Dropped': [dropped_row_count],
    'Geo Types After Cleaning': [new_geo_type_count],
    'Geo Types Dropped': [dropped_geo_type_count],
    'Sub Counties After Cleaning': [new_sub_county_count],
    'Sub Counties Dropped': [dropped_sub_county_count]
})

# Calculate and display the percentage of rows dropped
percentage_dropped = (dropped_row_count / original_row_count) * 100
print(f"Percentage of rows dropped: {percentage_dropped:.2f}%")


# Display the result
result_df


Percentage of rows dropped: 21.91%


Original Rows,Rows After Cleaning,Rows Dropped,Geo Types After Cleaning,Geo Types Dropped,Sub Counties After Cleaning,Sub Counties Dropped
i64,i64,i64,object,object,object,object
4560,3561,999,"shape: (1, 1) ┌──────────┐ │ geo_type │ │ --- │ │ u32 │ ╞══════════╡ │ 5 │ └──────────┘","shape: (1, 1) ┌──────────┐ │ geo_type │ │ --- │ │ u32 │ ╞══════════╡ │ 1 │ └──────────┘","shape: (1, 1) ┌────────────┐ │ sub_county │ │ --- │ │ u32 │ ╞════════════╡ │ 912 │ └────────────┘","shape: (1, 1) ┌────────────┐ │ sub_county │ │ --- │ │ u32 │ ╞════════════╡ │ 15 │ └────────────┘"


In [245]:
df_cleaned

county,sub_county,yes,no,year,geo_type,sub_county_id
str,str,str,str,i32,str,str
"""Alameda""","""County Supervisorial 1""","""31161""","""65845""",2022,"""county_supervisorial_district""","""1"""
"""Alameda""","""County Supervisorial 2""","""28778""","""44258""",2022,"""county_supervisorial_district""","""2"""
"""Alameda""","""County Supervisorial 3""","""34338""","""48590""",2022,"""county_supervisorial_district""","""3"""
"""Alameda""","""County Supervisorial 4""","""33897""","""60759""",2022,"""county_supervisorial_district""","""4"""
"""Alameda""","""County Supervisorial 5""","""54523""","""71294""",2022,"""county_supervisorial_district""","""5"""
"""Alameda""","""US Congressional 10""","""4465""","""7055""",2022,"""federal_congressional_district""","""10"""
"""Alameda""","""US Congressional 12""","""100680""","""135563""",2022,"""federal_congressional_district""","""12"""
"""Alameda""","""US Congressional 14""","""67996""","""131310""",2022,"""federal_congressional_district""","""14"""
"""Alameda""","""US Congressional 17""","""9556""","""16818""",2022,"""federal_congressional_district""","""17"""
"""Alameda""","""State Senate 5""","""23169""","""56283""",2022,"""state_senate_district""","""5"""


In [246]:
# Unpivoting the data so it's in "tidy" format
# Tidy essentially means  each variable has its own column, and each observation has its own row
# This makes it easier to merge, analyze, model, and visualize data
# Also in this step we rename columns to be more informative

df_cleaned = (
    df_cleaned
    .unpivot(
        index = ['county', 'sub_county', 'geo_type', 'sub_county_id', 'year'],
        on = ['yes', 'no'],
        value_name = "vote_count"
    )
    .with_columns(
        pl.col("variable")
    .alias("vote_type"))
    .with_columns(
        pl.col("sub_county_id")
        .alias("district_id"))
    .drop(pl.col(["variable", "sub_county_id"]))
)

In [247]:

# Reordering the columns for clarity/readability

df_cleaned = df_cleaned.select(["year", "county", "sub_county", "district_id", "geo_type", "vote_type", "vote_count"])

df_cleaned

year,county,sub_county,district_id,geo_type,vote_type,vote_count
i32,str,str,str,str,str,str
2022,"""Alameda""","""County Supervisorial 1""","""1""","""county_supervisorial_district""","""yes""","""31161"""
2022,"""Alameda""","""County Supervisorial 2""","""2""","""county_supervisorial_district""","""yes""","""28778"""
2022,"""Alameda""","""County Supervisorial 3""","""3""","""county_supervisorial_district""","""yes""","""34338"""
2022,"""Alameda""","""County Supervisorial 4""","""4""","""county_supervisorial_district""","""yes""","""33897"""
2022,"""Alameda""","""County Supervisorial 5""","""5""","""county_supervisorial_district""","""yes""","""54523"""
2022,"""Alameda""","""US Congressional 10""","""10""","""federal_congressional_district""","""yes""","""4465"""
2022,"""Alameda""","""US Congressional 12""","""12""","""federal_congressional_district""","""yes""","""100680"""
2022,"""Alameda""","""US Congressional 14""","""14""","""federal_congressional_district""","""yes""","""67996"""
2022,"""Alameda""","""US Congressional 17""","""17""","""federal_congressional_district""","""yes""","""9556"""
2022,"""Alameda""","""State Senate 5""","""5""","""state_senate_district""","""yes""","""23169"""


In [248]:

# Generating a timestamp for the files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Defining output paths without timestamp to replace the prior version
output_path = '../../../003_data/001_raw-data/2018-2022_ballot-measure_sub-county_data.parquet'

# Function to save Polars DataFrame as Parquet with metadata
def save_polars_parquet_with_metadata(df, output_path, description):
    try:
        # Convert Polars DataFrame to Arrow Table
        arrow_table = df.to_arrow()

        # Get existing metadata
        metadata = arrow_table.schema.metadata if arrow_table.schema.metadata else {}

        # Update metadata
        metadata.update({
            b'created_at': str(datetime.now()).encode('utf-8'),
            b'description': description.encode('utf-8'),
            b'version': b'1.0',
            b'cleaning_steps': b'''
                1. Downloaded Excel files from California Secretary of State website for 2018, 2020, and 2022 elections.
                2. Processed each Excel file:
                   a. Selected first two columns (county and sub_county).
                   b. Identified and selected columns related to kidney/dialysis propositions and their corresponding No vote columns.
                   c. Renamed columns: first two as 'county' and 'sub_county', proposition column as 'yes', following column as 'no'.
                   d. Added 'year' column to each dataset.
                3. Combined data from all years into a single DataFrame.
                4. Unnested sub-county data into geo_type and sub_county_id.
                5. Reshaped data:
                   a. Unpivoted 'yes' and 'no' columns.
                6. Renamed columns to make them easier to interpret :
                   a. yes/no vote column as 'vote_type'
                   b. sub_county_id as 'district_id'
                7. Reordered columns for clarity/readability
            '''
        })

        # Creating a new Arrow Table with updated metadata
        updated_table = arrow_table.replace_schema_metadata(metadata)

        # Writing the updated table to a Parquet file with Snappy compression to reduce file size
        pq.write_table(updated_table, output_path, compression='snappy')

        print(f"Data saved to {output_path}")
    except Exception as e:
        print(f"Error saving data: {e}")
        raise

# Saving the merged dataframes as parquet files
save_polars_parquet_with_metadata(df_cleaned, output_path, "Merged 2018-2022 ballot measure sub-county data")

Data saved to ../../../003_data/001_raw-data/2018-2022_ballot-measure_sub-county_data.parquet
