In [33]:
import pandas as pd
import json

# Raw inputs: presidential results per state/candidate and electoral votes per state
us_elections_data = pd.read_csv('data/1976-2020-president.csv')
states_electoral_votes = pd.read_csv('data/Electoral_College.csv')

# Read the state -> region mapping inside a context manager so the file
# handle is closed as soon as the JSON has been parsed.
with open('data/StateAugmentationData/state_by_region.json') as region_file:
    state_by_region = json.load(region_file)

# The first year we consider in our charts
FIRST_YEAR = 2014

In [34]:
# Give the raw columns the display names used in the rest of the notebook
election_column_names = {
    'year': 'Year',
    'state': 'State',
    'party_simplified': 'Party',
    'candidatevotes': 'Candidate votes',
    'totalvotes': 'Total votes',
}
us_elections_data = us_elections_data.rename(columns=election_column_names)

electoral_column_names = {'Votes': 'Total electoral votes'}
states_electoral_votes = states_electoral_votes.rename(columns=electoral_column_names)

We aggregate the votes by state and year for the Democratic and Republican parties, pooling every other party into "Other".

In [35]:
# Keep only the relevant rows and columns in one step.
# Rows: start one election cycle (4 years) before FIRST_YEAR so that we have
# data for the previous election.
# .copy() makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
us_elections_data = us_elections_data.loc[
    us_elections_data['Year'] >= FIRST_YEAR - 4,
    ['Year', 'State', 'Party', 'Candidate votes', 'Total votes'],
].copy()

# ensure the state names coincide with the state_data
# (str.title() yields 'District Of Columbia', so that one name is patched by hand)
us_elections_data['State'] = (
    us_elections_data['State']
    .str.title()
    .replace('District Of Columbia', 'District of Columbia')
)
states_electoral_votes = states_electoral_votes.replace('D.C.', 'District of Columbia')

# Title-case the party labels and fold 'Libertarian' into 'Other'
us_elections_data['Party'] = (
    us_elections_data['Party']
    .str.title()
    .replace('Libertarian', 'Other')
)

# One row per (Year, State, Party): candidate votes are summed across the
# rows of a party, while 'Total votes' is taken from the first row of the group.
state_election_df = us_elections_data.groupby(['Year', 'State', 'Party']).agg({
    'Candidate votes': 'sum',
    'Total votes': 'first'
}).reset_index()

state_election_df['Vote share'] = state_election_df['Candidate votes'] / state_election_df['Total votes']

state_election_df

Unnamed: 0,Year,State,Party,Candidate votes,Total votes,Vote share
0,2012,Alabama,Democrat,795696,2074338,0.383590
1,2012,Alabama,Other,22717,2074338,0.010951
2,2012,Alabama,Republican,1255925,2074338,0.605458
3,2012,Alaska,Democrat,122640,300495,0.408127
4,2012,Alaska,Other,13179,300495,0.043858
...,...,...,...,...,...,...
453,2020,Wisconsin,Other,56991,3298041,0.017280
454,2020,Wisconsin,Republican,1610184,3298041,0.488224
455,2020,Wyoming,Democrat,73491,278503,0.263879
456,2020,Wyoming,Other,11453,278503,0.041123


We aggregate the votes over each area we consider (individual states, regions and the United States as a whole) and compute each party's vote share in that area.

In [36]:

# One row per state with the region it belongs to
state_data = pd.DataFrame(list(state_by_region.items()), columns=['State', 'region'])
region_election_df = state_election_df.merge(state_data, on='State')

# The inner merge drops states that have no region; fail loudly if that happens
assert len(region_election_df) == len(state_election_df), 'some states have no region'


def _area_vote_shares(df, area_keys):
    """Aggregate per-state party results over the areas defined by ``area_keys``.

    ``df`` holds one row per (Year, State, Party) with 'Candidate votes' and the
    state's 'Total votes'. Candidate votes are summed per area and party. The
    area's total votes are summed from exactly one row per (Year, State), so a
    state that has no row for some party still counts towards that party's
    denominator (summing 'Total votes' over the party rows would leave it out
    and inflate the share).

    Returns a frame with columns ``area_keys`` + ['Party', 'Candidate votes',
    'Total votes', 'Vote share'], sorted by ``area_keys`` and 'Party'.
    """
    candidate_votes = df.groupby(area_keys + ['Party'], as_index=False)['Candidate votes'].sum()
    total_votes = (
        df.drop_duplicates(['Year', 'State'])
        .groupby(area_keys, as_index=False)['Total votes']
        .sum()
    )
    shares = candidate_votes.merge(total_votes, on=area_keys, how='left')
    shares['Vote share'] = shares['Candidate votes'] / shares['Total votes']
    return shares


region_election_df = _area_vote_shares(region_election_df, ['Year', 'region'])

# add the United States
us_election_df = _area_vote_shares(state_election_df, ['Year'])

us_election_df['Area'] = 'United States'

# Stack states, regions and the country under a common 'Area' column
area_election_df = pd.concat([
    state_election_df.rename(columns={'State': 'Area'}),
    region_election_df.rename(columns={'region': 'Area'}),
    us_election_df
]).reset_index(drop=True)

# Function to expand rows to cover four years in office
def expand_years(df, n_years=4):
    """Repeat every row once per year of a term in office.

    Each input row becomes ``n_years`` consecutive rows whose 'Year' is the
    original year plus 0, 1, ..., ``n_years - 1``; every other column is copied
    unchanged. Rows that come from the same input row stay adjacent, in
    ascending year order, and input order is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Year' column. It is not modified.
    n_years : int, default 4
        Number of rows produced per input row.

    Returns
    -------
    pandas.DataFrame
        ``len(df) * n_years`` rows with the same columns as ``df`` and a fresh
        0..n-1 index. An empty input gives an empty frame that keeps the columns.
    """
    # Work on a positional index so the repeat below is safe even when the
    # caller's index contains duplicate labels.
    base = df.reset_index(drop=True)
    repeated = base.loc[base.index.repeat(n_years)]

    # 0, 1, ..., n_years - 1 within each run of rows copied from one input row
    year_offset = repeated.groupby(level=0).cumcount()

    return repeated.assign(Year=repeated['Year'] + year_offset).reset_index(drop=True)

# Spread every election result over the years of the term that follows it
expanded_area_df = expand_years(area_election_df)

# Keep only the years shown in the charts and renumber the rows
in_chart_range = expanded_area_df['Year'] >= FIRST_YEAR
area_election_df = expanded_area_df.loc[in_chart_range].reset_index(drop=True)

area_election_df.to_csv('data/area_election_data.csv', index=False)

area_election_df

Unnamed: 0,Year,Area,Party,Candidate votes,Total votes,Vote share
0,2014,Alabama,Democrat,795696,2074338,0.383590
1,2015,Alabama,Democrat,795696,2074338,0.383590
2,2014,Alabama,Other,22717,2074338,0.010951
3,2015,Alabama,Other,22717,2074338,0.010951
4,2014,Alabama,Republican,1255925,2074338,0.605458
...,...,...,...,...,...,...
1703,2023,United States,Other,3043449,158528503,0.019198
1704,2020,United States,Republican,74216146,158528503,0.468156
1705,2021,United States,Republican,74216146,158528503,0.468156
1706,2022,United States,Republican,74216146,158528503,0.468156


We assign each state's electoral votes to the party that won the most votes there, then aggregate the electoral votes by area.

In [37]:
# add the electoral votes to the state_election_df, merge using the state and year
# (how='left' keeps every result row; an unmatched state/year shows up as NaN)
state_election_df = state_election_df.merge(states_electoral_votes, on=['State', 'Year'], how='left')

# check that there are no nans after the merge
assert state_election_df.isnull().sum().sum() == 0

state_election_df['Total electoral votes'] = state_election_df['Total electoral votes'].astype(int)

# Highest candidate-vote count within each (Year, State), broadcast back onto
# every row of that group. This replaces a row-wise apply that re-filtered the
# whole frame for each row.
winning_votes = state_election_df.groupby(['Year', 'State'])['Candidate votes'].transform('max')

# add a column Electoral votes that is the total of the electoral votes if the party won
# (rows whose votes equal the state's maximum get the full count, all others 0;
# parties tied for the maximum each get the full count)
# NOTE(review): this treats every state as winner-take-all; states that split
# their electoral votes are not modelled.
state_election_df['Electoral votes'] = state_election_df['Total electoral votes'].where(
    state_election_df['Candidate votes'] == winning_votes, 0
)

state_election_df

Unnamed: 0,Year,State,Party,Candidate votes,Total votes,Vote share,Total electoral votes,Electoral votes
0,2012,Alabama,Democrat,795696,2074338,0.383590,9,0
1,2012,Alabama,Other,22717,2074338,0.010951,9,0
2,2012,Alabama,Republican,1255925,2074338,0.605458,9,9
3,2012,Alaska,Democrat,122640,300495,0.408127,3,0
4,2012,Alaska,Other,13179,300495,0.043858,3,0
...,...,...,...,...,...,...,...,...
453,2020,Wisconsin,Other,56991,3298041,0.017280,10,0
454,2020,Wisconsin,Republican,1610184,3298041,0.488224,10,0
455,2020,Wyoming,Democrat,73491,278503,0.263879,3,0
456,2020,Wyoming,Other,11453,278503,0.041123,3,0


In [38]:
# Attach each state's region; the length check catches states the inner merge would drop
electoral_with_region = state_election_df.merge(state_data, on='State')
assert len(electoral_with_region) == len(state_election_df)

# Electoral votes won by each party per region and year
region_electoral_df = (
    electoral_with_region
    .groupby(['Year', 'region', 'Party'], as_index=False)['Electoral votes']
    .sum()
)

# Same aggregation for the country as a whole
us_electoral_df = (
    state_election_df
    .groupby(['Year', 'Party'], as_index=False)['Electoral votes']
    .sum()
    .assign(Area='United States')
)

# Stack states, regions and the United States under a common 'Area' column
electoral_columns = ['Year', 'Area', 'Party', 'Electoral votes']
area_electoral_df = pd.concat(
    [
        state_election_df.rename(columns={'State': 'Area'})[electoral_columns],
        region_electoral_df.rename(columns={'region': 'Area'}),
        us_electoral_df,
    ],
    ignore_index=True,
)

# One row per year in office, restricted to the years shown in the charts
area_electoral_df = expand_years(area_electoral_df)
from_first_year = area_electoral_df['Year'] >= FIRST_YEAR
area_electoral_df = area_electoral_df.loc[from_first_year].reset_index(drop=True)

area_electoral_df.to_csv('data/area_electoral_df.csv', index=False)

area_electoral_df

Unnamed: 0,Year,Area,Party,Electoral votes
0,2014,Alabama,Democrat,0
1,2015,Alabama,Democrat,0
2,2014,Alabama,Other,0
3,2015,Alabama,Other,0
4,2014,Alabama,Republican,9
...,...,...,...,...
1703,2023,United States,Other,0
1704,2020,United States,Republican,232
1705,2021,United States,Republican,232
1706,2022,United States,Republican,232
