Requirements Setting and Import Packages

In [None]:
%%writefile requirements.txt
pandas==2.1.4

In [None]:
!pip install -r requirements.txt

In [2]:
import pandas as pd

Import Data into Pandas Dataframes

In [3]:
ballot_measures_2018_df = pd.read_csv('Ballot Measures/California Ballot Measures Nov 2018.csv')
ballot_measures_2020_df = pd.read_csv('Ballot Measures/California Ballot Measures Nov 2020.csv')
ballot_measures_2022_df = pd.read_csv('Ballot Measures/California Ballot Measures Nov 2022.csv')

In [9]:
voter_reg_2018_df = pd.read_csv('Voter Registration/Voter Registration by County 2018.csv')
voter_reg_2020_df = pd.read_csv('Voter Registration/Voter Registration by County 2020.csv')
voter_reg_2022_df = pd.read_csv('Voter Registration/Voter Registration by County 2022.csv')

Preprocessing Functions - Ballot Measures

In [5]:
''' 
Ballot Measures Functions

    Cleaning
        rename_ballot_columns: lower cases column names, replaces underscores with spaces, replaces 'count' with 'vote
        select_relevant_columns selects columns from the full datafrom
        select_prop_data: selects rows of data relevant to the Dialysis propositions
        str_to_int: replaces comma and converts the string to int
        convert_vote_counts: applies the str_to_int function to the vote count columns
        get_vote_percent: calculates the total vote count, percent yes vote, and percent no vote for each county
        spec_year_cols: adds in the year to relevant columns for later query use

    Calling
        preprocess_ballot_data: aggregation of the function calls for preprocessing of the ballot measures data

    Merging
        merge_ballot_data: merges the ballot measures data from 3 years on the county name columns
'''

def rename_ballot_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        lower cases column names, replaces underscores with spaces, replaces 'count' with 'vote
        arguments: ballot measures dataframe raw data
        returns: ballot measures dataframe with clean column names
    '''
    df_in.columns = map(str.lower, df_in.columns)
    df_in.columns = [x.replace('_', ' ').replace('votes', 'count') for x in df_in.columns]
    return df_in

def select_relevant_columns(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        selects columns from the full datafrom
        arguments: ballot measures dataframe data
        returns: ballot measures dataframe only relevant columns
    '''
    return df_in[['election date', 'county name', 'ballot measure name', 'yes count', 'no count']].copy()

def select_prop_data(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        selects rows of data relevant to the Dialysis propositions
        arguments: ballot measures dataframe data
        returns: ballot measures dataframe with relevant proposition rows
    '''
    # checks the year in the date column and applies the relevant proposition number
    if '2018' in df_in['election date'].unique()[0]:
        return df_in[df_in['ballot measure name']=='Proposition 8'].copy()
    elif '2020' in df_in['election date'].unique()[0]:
        return df_in[df_in['ballot measure name']=='Proposition 23'].copy()
    elif '2022' in df_in['election date'].unique()[0]:
        return df_in[df_in['ballot measure name']=='Proposition 29'].copy()
    
def str_to_int(row):
    '''
        replaces comma and converts the string to int
        arguments: row of dataframe
        returns: row of dataframe
    '''
    return int(row.replace(',',''))
    
def convert_vote_counts(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        applies the str_to_int function to the vote count columns
        arguments: ballot measures dataframe
        returns: ballot measures dataframe
    '''
    df_in['yes count'] = df_in['yes count'].apply(str_to_int)
    df_in['no count'] = df_in['no count'].apply(str_to_int)
    return df_in
    
def get_vote_percent(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        calculates the total vote count, percent yes vote, and percent no vote for each county
        arguments: ballot measures dataframe
        returns: ballot measures dataframe
    '''
    df_in['total count'] = df_in['yes count'] + df_in['no count']
    df_in['yes perc'] = df_in['yes count']/df_in['total count']
    df_in['no perc'] = df_in['no count']/df_in['total count']
    return df_in

def spec_year_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        adds in the year to relevant columns for later query use
        arguments: ballot measures dataframe
        returns: ballot measures dataframe
    '''
    if '2018' in df_in['election date'].unique()[0]:
        df_in.columns = [x+' 2018' if (x!= 'county name') and (x != 'ballot measure name') and (x != 'election date') 
                         else x for x in df_in.columns]
    elif '2020' in df_in['election date'].unique()[0]:
        df_in.columns = [x+' 2020' if (x!= 'county name') and (x != 'ballot measure name') and (x != 'election date') 
                         else x for x in df_in.columns]
    elif '2022' in df_in['election date'].unique()[0]:
        df_in.columns = [x+' 2022' if (x!= 'county name') and (x != 'ballot measure name') and (x != 'election date') 
                         else x for x in df_in.columns]
    df_in.drop(columns=['election date', 'ballot measure name'], inplace=True)
    return df_in

def preprocess_ballot_data(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        aggregates the function calls for preprocessing of the ballot measures data
        arguments: ballot measures dataframe
        returns: ballot measures dataframe
    '''
    df_in = select_prop_data(select_relevant_columns(rename_ballot_columns(df_in)))
    df_in = get_vote_percent(convert_vote_counts(df_in))
    return spec_year_cols(df_in)

def merge_ballot_data(df_list_in: list[pd.DataFrame]) -> pd.DataFrame:
    '''
        merges the ballot measures data from 3 years on the county name columns
        arguments: list of ballot measure dataframes
        returns: merged ballot measure dataframe
    '''
    #print(len(df_list_in[0]), len(df_list_in[1]), len(df_list_in[2]))
    df_merged = df_list_in[0].merge(df_list_in[1], on='county name', how='outer')
    df_merged = df_merged.merge(df_list_in[2], on='county name', how='outer')
    return df_merged.dropna().copy()

Preprocessing Functions - Voter Registration

In [6]:
''' 
Voter Registration Functions

    Cleaning
        drop_na_percent: drops fully empty rows and the summary stat rows called percent and state total
        rename_clean_cols: lower cases column names, renames the county column to be county name
        assign_year_cols: adds in the year marker on each column for easier queries later on
        get_percentage_total: calculates the percentage of each registration part from the total number of registrations
        convert_reg_counts: applies the str_to_int function to the all voter registration count columns

    Calling
        preprocess_voter_reg: applies preprocessing functions to the voter registration data
'''

def drop_na_percent(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        drops fully empty rows and the summary stat rows called percent and state total
        arguments: voter registration dataframe
        returns: voter registration dataframe
    '''
    df_in = df_in.dropna(axis=0, how='all')
    df_in = df_in[(df_in['County']!='Percent')&(df_in['County']!='State Total')].copy()
    return df_in.reset_index(drop=True)

def rename_clean_cols(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        lower cases column names, renames the county column to be county name
        arguments: voter registration dataframe
        returns: voter registration dataframe
    '''
    df_in.columns = map(str.lower, df_in.columns)
    df_in.rename(columns={'county':'county name'}, inplace=True)
    return df_in

def assign_year_cols(df_in: pd.DataFrame, year_in: str) -> pd.DataFrame:
    '''
        adds in the year marker on each column for easier queries later on
        arguments: voter registration dataframe, year to be added to col names
        returns: voter registration dataframe
    '''
    if year_in == '2018':
        df_in.columns = [x+' 2018' if (x!= 'county name') else x for x in df_in.columns]
        return df_in
    elif year_in == '2020':
        df_in.columns = [x+' 2020' if (x!= 'county name') else x for x in df_in.columns]
        return df_in
    elif year_in == '2022':
        df_in.columns = [x+' 2022' if (x!= 'county name') else x for x in df_in.columns]
        return df_in
    
def get_percentage_total(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        calculates the percentage of each registration part from the total number of registrations
        arguments: voter registration dataframe
        returns: voter registration dataframe
    '''
    df_in[df_in.columns[3:]] = df_in[df_in.columns[3:]].div(df_in['total registered'], axis=0)
    return df_in

def convert_reg_counts(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        applies the str_to_int function to the all voter registration count columns
        arguments: voter registration dataframe
        returns: voter registration dataframe
    '''
    df_in['Eligible'] = df_in['Eligible'].apply(str_to_int)
    df_in['Total Registered'] = df_in['Total Registered'].apply(str_to_int)
    df_in['Democratic'] = df_in['Democratic'].apply(str_to_int)
    df_in['Republican'] = df_in['Republican'].apply(str_to_int)
    df_in['American Independent'] = df_in['American Independent'].apply(str_to_int)
    df_in['Green'] = df_in['Green'].apply(str_to_int)
    df_in['Libertarian'] = df_in['Libertarian'].apply(str_to_int)
    df_in['Peace and Freedom'] = df_in['Peace and Freedom'].apply(str_to_int)
    df_in['Unknown'] = df_in['Unknown'].apply(str_to_int)
    df_in['Other'] = df_in['Other'].apply(str_to_int)
    df_in['No Party Preference'] = df_in['No Party Preference'].apply(str_to_int)
    return df_in

def preprocess_voter_reg(df_in: pd.DataFrame) -> pd.DataFrame:
    '''
        applies preprocessing functions to the voter registration data
        arguments: voter registration dataframe raw
        returns: voter registration dataframe clean
    '''
    return get_percentage_total(rename_clean_cols(convert_reg_counts(drop_na_percent(voter_reg_2018_df))))

Function Calls - Voter Registration

In [13]:
voter_reg_2018_df= assign_year_cols(preprocess_voter_reg(voter_reg_2018_df), '2018')
voter_reg_2018_df.head()

Index(['county name', 'eligible', 'total registered', 'democratic',
       'republican', 'american independent', 'green', 'libertarian',
       'peace and freedom', 'unknown', 'other', 'no party preference'],
      dtype='object')


Unnamed: 0,county name,eligible 2018,total registered 2018,democratic 2018,republican 2018,american independent 2018,green 2018,libertarian 2018,peace and freedom 2018,unknown 2018,other 2018,no party preference 2018
0,Alameda,1089154,881491,0.556651,0.110469,0.01862,0.007602,0.005286,0.003049,3.4e-05,0.006646,0.291643
1,Alpine,939,758,0.411609,0.270449,0.032982,0.006596,0.007916,0.002639,0.0,0.003958,0.263852
2,Amador,27117,22305,0.287962,0.439901,0.042233,0.004573,0.013226,0.002735,0.000269,0.002286,0.206815
3,Butte,171771,122741,0.349052,0.341817,0.034552,0.007626,0.011259,0.00312,0.002623,0.009752,0.240197
4,Calaveras,36101,29591,0.273698,0.41445,0.045453,0.006353,0.015106,0.003177,0.00294,0.007908,0.230915


In [10]:
voter_reg_2020_df= assign_year_cols(preprocess_voter_reg(voter_reg_2020_df), '2020')
voter_reg_2020_df.head()

Index(['county name', 'eligible', 'total registered', 'democratic',
       'republican', 'american independent', 'green', 'libertarian',
       'peace and freedom', 'unknown', 'other', 'no party preference'],
      dtype='object')


Unnamed: 0,county name,eligible 2020,total registered 2020,democratic 2020,republican 2020,american independent 2020,green 2020,libertarian 2020,peace and freedom 2020,unknown 2020,other 2020,no party preference 2020
0,Alameda,1089154,881491,0.556651,0.110469,0.01862,0.007602,0.005286,0.003049,3.4e-05,0.006646,0.291643
1,Alpine,939,758,0.411609,0.270449,0.032982,0.006596,0.007916,0.002639,0.0,0.003958,0.263852
2,Amador,27117,22305,0.287962,0.439901,0.042233,0.004573,0.013226,0.002735,0.000269,0.002286,0.206815
3,Butte,171771,122741,0.349052,0.341817,0.034552,0.007626,0.011259,0.00312,0.002623,0.009752,0.240197
4,Calaveras,36101,29591,0.273698,0.41445,0.045453,0.006353,0.015106,0.003177,0.00294,0.007908,0.230915


In [11]:
voter_reg_2022_df= assign_year_cols(preprocess_voter_reg(voter_reg_2022_df), '2022')
voter_reg_2022_df.head()

Index(['county name', 'eligible', 'total registered', 'democratic',
       'republican', 'american independent', 'green', 'libertarian',
       'peace and freedom', 'unknown', 'other', 'no party preference'],
      dtype='object')


Unnamed: 0,county name,eligible 2022,total registered 2022,democratic 2022,republican 2022,american independent 2022,green 2022,libertarian 2022,peace and freedom 2022,unknown 2022,other 2022,no party preference 2022
0,Alameda,1089154,881491,0.556651,0.110469,0.01862,0.007602,0.005286,0.003049,3.4e-05,0.006646,0.291643
1,Alpine,939,758,0.411609,0.270449,0.032982,0.006596,0.007916,0.002639,0.0,0.003958,0.263852
2,Amador,27117,22305,0.287962,0.439901,0.042233,0.004573,0.013226,0.002735,0.000269,0.002286,0.206815
3,Butte,171771,122741,0.349052,0.341817,0.034552,0.007626,0.011259,0.00312,0.002623,0.009752,0.240197
4,Calaveras,36101,29591,0.273698,0.41445,0.045453,0.006353,0.015106,0.003177,0.00294,0.007908,0.230915


In [15]:
clean_voter_data = merge_ballot_data([voter_reg_2018_df, voter_reg_2020_df, voter_reg_2022_df])#.to_csv('Voter Registration Data Clean.csv')

In [None]:
clean_voter_data.head()

Cleaning and Merging Function Calls - Ballot Measures

In [None]:
clean_ballot_data = merge_ballot_data([preprocess_ballot_data(ballot_measures_2020_df), preprocess_ballot_data(ballot_measures_2018_df), preprocess_ballot_data(ballot_measures_2022_df)])

In [None]:
clean_ballot_data.head()