In [2]:
import pandas as pd
import re

# Load the ballot-measure table; each input row carries a 'county name'
# plus one set of result columns per election year.
df = pd.read_csv('../002_clean-data/Ballot Measures Data Clean.csv')

# Years that appear as a suffix on the result columns.
ballot_years = (2018, 2020, 2022)

# Output field name -> prefix of the matching '<prefix> <year>' input column.
field_prefixes = {
    'yes_count': 'yes count',
    'no_count': 'no count',
    'total_count': 'total count',
    'yes_perc': 'yes perc',
    'no_perc': 'no perc',
}

# Build one record per (input row, year), walking rows in file order.
restructured_data = [
    {
        'year': year,
        'county_name': county_row['county name'],
        **{
            field: county_row[f'{prefix} {year}']
            for field, prefix in field_prefixes.items()
        },
    }
    for _, county_row in df.iterrows()
    for year in ballot_years
]

# Assemble the records, order them by year then county name, and
# renumber the rows from zero.
new_df = (
    pd.DataFrame(restructured_data)
    .sort_values(['year', 'county_name'])
    .reset_index(drop=True)
)

new_df.head()

Unnamed: 0,year,county_name,yes_count,no_count,total_count,yes_perc,no_perc
0,2018,Alameda,275550.0,280735.0,556285.0,0.49534,0.50466
1,2018,Alpine,298.0,281.0,579.0,0.51468,0.48532
2,2018,Amador,5256.0,11775.0,17031.0,0.308614,0.691386
3,2018,Butte,30908.0,55394.0,86302.0,0.358138,0.641862
4,2018,Calaveras,6688.0,14224.0,20912.0,0.319816,0.680184


In [6]:
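# Save the new dataframe to a CSV file (disabled; uncomment the last line to write it)
# NOTE(review): 'BALLOT_CEANED.csv' looks like a typo for 'BALLOT_CLEANED.csv'
# (the other save cells use VOTER_CLEANED / DEMO_CLEANED / CENSUS_CLEANED).
# Confirm nothing downstream reads the misspelled name before renaming it.
# new_df.to_csv('BALLOT_CEANED.csv', index=False)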

In [3]:
import pandas as pd

# Load the voter-registration table; each input row carries a 'county name'
# plus '<field> <year>' columns for every year handled below.
df = pd.read_csv('../002_clean-data/Voter Registration Data Clean.csv')

# Party labels exactly as they appear in the '<party> <year>' input columns.
parties = ['democratic', 'republican', 'american independent', 'green', 'libertarian',
           'peace and freedom', 'unknown', 'other', 'no party preference']

# Years that appear as a suffix on the input columns.
registration_years = [2018, 2020, 2022]

# Accumulate one record per (input row, year).
restructured_data = []
for _, county_row in df.iterrows():
    for year in registration_years:
        record = {'year': year, 'county_name': county_row['county name']}
        record['eligible'] = county_row[f'eligible {year}']
        record['total_registered'] = county_row[f'total registered {year}']
        # Each party's value for this year is stored under '<party>_perc'.
        record.update(
            {f'{party}_perc': county_row[f'{party} {year}'] for party in parties}
        )
        restructured_data.append(record)

# Assemble the records and order them by year, then county name.
new_df = pd.DataFrame(restructured_data).sort_values(by=['year', 'county_name'])

# Renumber the rows from zero after sorting.
new_df = new_df.reset_index(drop=True)

new_df.head()

Unnamed: 0,year,county_name,eligible,total_registered,democratic_perc,republican_perc,american independent_perc,green_perc,libertarian_perc,peace and freedom_perc,unknown_perc,other_perc,no party preference_perc
0,2018,Alameda,1089154,881491,0.556651,0.110469,0.01862,0.007602,0.005286,0.003049,3.4e-05,0.006646,0.291643
1,2018,Alpine,939,758,0.411609,0.270449,0.032982,0.006596,0.007916,0.002639,0.0,0.003958,0.263852
2,2018,Amador,27117,22305,0.287962,0.439901,0.042233,0.004573,0.013226,0.002735,0.000269,0.002286,0.206815
3,2018,Butte,171771,122741,0.349052,0.341817,0.034552,0.007626,0.011259,0.00312,0.002623,0.009752,0.240197
4,2018,Calaveras,36101,29591,0.273698,0.41445,0.045453,0.006353,0.015106,0.003177,0.00294,0.007908,0.230915


In [4]:

# Save the new dataframe to a CSV file
#new_df.to_csv('VOTER_CLEANED.csv', index=False)


In [6]:
import pandas as pd

# Load the additional-demographics table.
df = pd.read_csv('../002_clean-data/Additional Demographics Data Clean.csv')

# Verbose source header -> snake_case name ending in the data year.
column_renames = {
    'Population (January 2023)': 'population_2023',
    'Median Household Income (2021)': 'median_household_income_2021',
    'Race/Ethnicity: American Indian (2023)': 'race_american_indian_2023',
    'Race/Ethnicity: Asian (2023)': 'race_asian_2023',
    'Race/Ethnicity: Black (2023)': 'race_black_2023',
    'Race/Ethnicity: Hispanic (2023)': 'race_hispanic_2023',
    'Race/Ethnicity: Multi-Racial/Ethnic (2023)': 'race_multi_racial_2023',
    'Race/Ethnicity: Hawaiian/ Pacific Island (2023)': 'race_hawaiian_pacific_2023',
    'Race/Ethnicity: White (2023)': 'race_white_2023',
    'Age: 0-5 (2023)': 'age_0_5_2023',
    'Age: 6-17 (2023)': 'age_6_17_2023',
    'Age: 18-64 (2023)': 'age_18_64_2023',
    'Age: 65+ (2023)': 'age_65_plus_2023',
}
df = df.rename(columns=column_renames)

# Years that can appear as a suffix on the renamed columns.
years = [2021, 2023]

# Year-less field names; '<prefix>_<year>' is the column looked up per year.
prefixes = ['population', 'median_household_income', 'race_american_indian', 'race_asian',
            'race_black', 'race_hispanic', 'race_multi_racial', 'race_hawaiian_pacific',
            'race_white', 'age_0_5', 'age_6_17', 'age_18_64', 'age_65_plus']

# Decide once per year which prefixes actually have a '<prefix>_<year>' column.
# Among the renamed columns only median household income is suffixed 2021 and
# everything else is suffixed 2023, so each year's rows fill in only their own
# fields; the rest come out as NaN when the records are assembled.
prefixes_by_year = {
    year: [prefix for prefix in prefixes if f'{prefix}_{year}' in df.columns]
    for year in years
}

# Accumulate one record per (input row, year), skipping absent columns.
restructured_data = []
for _, county_row in df.iterrows():
    for year in years:
        record = {'year': year, 'county_name': county_row['county name']}
        for prefix in prefixes_by_year[year]:
            record[prefix] = county_row[f'{prefix}_{year}']
        restructured_data.append(record)

# Assemble the records, order them by year then county name, and
# renumber the rows from zero.
new_df = (
    pd.DataFrame(restructured_data)
    .sort_values(['year', 'county_name'])
    .reset_index(drop=True)
)
new_df.head()

Unnamed: 0,year,county_name,median_household_income,population,race_american_indian,race_asian,race_black,race_hispanic,race_multi_racial,race_hawaiian_pacific,race_white,age_0_5,age_6_17,age_18_64,age_65_plus
0,2021,Alameda,108971.0,,,,,,,,,,,,
1,2021,Alpine,87570.0,,,,,,,,,,,,
2,2021,Amador,68159.0,,,,,,,,,,,,
3,2021,Butte,62982.0,,,,,,,,,,,,
4,2021,Calaveras,68298.0,,,,,,,,,,,,


In [10]:
# Save the new dataframe to a CSV file
#new_df.to_csv('DEMO_CLEANED.csv', index=False)


In [8]:
# Load the 2020 census demographics table and preview its first five rows.
census_csv_path = '../002_clean-data/Census Demographics 2020 Clean.csv'
df = pd.read_csv(census_csv_path)
df.head(n=5)

Unnamed: 0,county name,Under 5 years %,5 to 9 years %,10 to 14 years %,15 to 19 years %,20 to 24 years %,25 to 29 years %,30 to 34 years %,35 to 39 years %,40 to 44 years %,...,"Rented, not occupied %",For sale only %,"Sold, not occupied %","For seasonal, recreational, or occasional use %",All other vacants %,Homeowner vacancy rate (percent) [4] %,Rental vacancy rate (percent) [5] %,Occupied housing units %.1,Owner-occupied housing units %,Renter-occupied housing units %
0,Alameda,5.3,5.7,5.9,6.1,6.5,7.4,8.5,8.1,7.2,...,0.2,0.4,0.2,0.3,1.4,(X),(X),100.0,51.8,48.2
1,Alpine,4.3,4.3,4.0,5.5,4.5,6.6,5.6,5.1,4.4,...,0.2,0.7,0.3,60.5,0.8,(X),(X),100.0,70.4,29.6
2,Amador,4.1,4.3,4.7,4.4,3.7,4.6,5.6,5.8,5.4,...,0.2,1.5,0.4,10.8,2.4,(X),(X),100.0,75.7,24.3
3,Butte,5.3,5.7,6.0,7.0,9.0,6.8,6.5,6.3,5.5,...,0.4,0.8,0.3,1.8,2.3,(X),(X),100.0,56.2,43.8
4,Calaveras,4.3,4.8,5.3,5.0,3.9,4.4,5.3,5.2,4.9,...,0.3,1.6,0.7,25.3,2.7,(X),(X),100.0,79.0,21.0


In [9]:
# Reload the 2020 census demographics table so this cell starts from the file
# contents each time it runs.
census_csv_path = '../002_clean-data/Census Demographics 2020 Clean.csv'
df = pd.read_csv(census_csv_path)

# Prepend a constant 'year' column (2020) as the first column of the frame.
df.insert(loc=0, column='year', value=2020)

df.head(n=5)

# # Save the new dataframe to a CSV file
# df.to_csv('CENSUS_CLEANED.csv', index=False)


Unnamed: 0,year,county name,Under 5 years %,5 to 9 years %,10 to 14 years %,15 to 19 years %,20 to 24 years %,25 to 29 years %,30 to 34 years %,35 to 39 years %,...,"Rented, not occupied %",For sale only %,"Sold, not occupied %","For seasonal, recreational, or occasional use %",All other vacants %,Homeowner vacancy rate (percent) [4] %,Rental vacancy rate (percent) [5] %,Occupied housing units %.1,Owner-occupied housing units %,Renter-occupied housing units %
0,2020,Alameda,5.3,5.7,5.9,6.1,6.5,7.4,8.5,8.1,...,0.2,0.4,0.2,0.3,1.4,(X),(X),100.0,51.8,48.2
1,2020,Alpine,4.3,4.3,4.0,5.5,4.5,6.6,5.6,5.1,...,0.2,0.7,0.3,60.5,0.8,(X),(X),100.0,70.4,29.6
2,2020,Amador,4.1,4.3,4.7,4.4,3.7,4.6,5.6,5.8,...,0.2,1.5,0.4,10.8,2.4,(X),(X),100.0,75.7,24.3
3,2020,Butte,5.3,5.7,6.0,7.0,9.0,6.8,6.5,6.3,...,0.4,0.8,0.3,1.8,2.3,(X),(X),100.0,56.2,43.8
4,2020,Calaveras,4.3,4.8,5.3,5.0,3.9,4.4,5.3,5.2,...,0.3,1.6,0.7,25.3,2.7,(X),(X),100.0,79.0,21.0
