In [1]:
import pandas as pd

# Load in 2012 US Voter Data

This is 2012 population and voter registration data 
from the US Census Bureau, 
broken down by state and by age group.

In [2]:
df = pd.read_csv("2012_US_Voter_data.csv")

# Initial EDA

In [3]:
df.head()

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters
0,Alabama,18 to 24,439000,428000,212000,155000
1,Alabama,25 to 34,576000,535000,359000,271000
2,Alabama,35 to 44,615000,582000,410000,330000
3,Alabama,45 to 64,1297000,1275000,1051000,939000
4,Alabama,65+,667000,660000,523000,459000


In [4]:
df.tail()

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters
250,Wyoming,18 to 24,56000,55000,21000,18000
251,Wyoming,25 to 34,73000,71000,44000,39000
252,Wyoming,35 to 44,68000,66000,41000,36000
253,Wyoming,45 to 64,155000,154000,101000,95000
254,Wyoming,65+,74000,73000,61000,59000


In [5]:
df.sample(10)

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters
60,Idaho,18 to 24,130000,121000,51000,39000
186,Oregon,25 to 34,461000,399000,256000,215000
99,Maine,65+,219000,215000,187000,170000
107,Massachusetts,35 to 44,888000,783000,580000,530000
41,District Of Columbia,25 to 34,154000,133000,114000,102000
189,Oregon,65+,616000,604000,513000,491000
8,Alaska,45 to 64,186000,182000,144000,122000
81,Kansas,25 to 34,406000,359000,236000,172000
78,Iowa,45 to 64,828000,812000,652000,586000
23,California,45 to 64,9356000,7827000,5459000,4926000


In [6]:
df.columns

Index(['State', 'Age', 'Total Population', 'Citizen Population',
       'Registered Voters', 'Confirmed Voters'],
      dtype='object')

In [7]:
df. describe()

Unnamed: 0,Total Population,Citizen Population,Registered Voters,Confirmed Voters
count,255.0,255.0,255.0,255.0
mean,922545.1,843470.6,600600.0,521380.4
std,1157013.0,1007374.0,720314.5,641386.9
min,56000.0,55000.0,21000.0,18000.0
25%,238500.0,224000.0,146500.0,128500.0
50%,568000.0,540000.0,371000.0,303000.0
75%,1159000.0,1029000.0,765500.0,664000.0
max,9356000.0,7827000.0,5459000.0,4926000.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   State               255 non-null    object
 1   Age                 255 non-null    object
 2   Total Population    255 non-null    int64 
 3   Citizen Population  255 non-null    int64 
 4   Registered Voters   255 non-null    int64 
 5   Confirmed Voters    255 non-null    int64 
dtypes: int64(4), object(2)
memory usage: 12.1+ KB


# Questions to think about for the US_Voter_2012 dataset

some of the questions that I'd like to
explore here are things like 

	• how does voter turnout compare across states?

	• Are certain age ranges more or less likely to vote in general?

	• And do these trends hold true nationwide, 

or are there state specific insights that we can uncover here?

# Lets look into the "Total Population", "Citizen Population", "Registered Voters", "Confirmed Voters"

Let's cut our data by state as rows 

and pull in some of the metrics like 'Citizen Population' and 'Confirmed Voters'.

For now, let's use citizen population since only citizens can vote and confirmed voters and start there
and we'll go ahead and format both those as numbers with 1000 separator and no decimal points.
There we go.
And finally, I'll sort my states descending by the citizen population.

So as expected, we see big states like California, Texas, Florida and New York at the top of our
list.

And then down at the bottom, obviously, are less densely populated states like the Dakotas and Alaska,
Vermont and Wyoming.

In [9]:
df.groupby("State")['Citizen Population','Confirmed Voters'].sum().reset_index().sort_values('Confirmed Voters', ascending=False)

  df.groupby("State")['Citizen Population','Confirmed Voters'].sum().reset_index().sort_values('Confirmed Voters', ascending=False)


Unnamed: 0,State,Citizen Population,Confirmed Voters
4,California,23418000,13463000
43,Texas,16062000,8643000
9,Florida,13326000,8107000
32,New York,13081000,7676000
38,Pennsylvania,9451000,5824000
13,Illinois,8831000,5428000
35,Ohio,8550000,5395000
22,Michigan,7228000,4831000
33,North Carolina,6711000,4624000
10,Georgia,6738000,4168000



# Create a calculated field '% Voter Population'

From here, I want to add a calculated field, because what I'm interested in looking at isn't just

the number of citizens and the number of confirmed voters, which is interesting but not entirely surprising.

What I want to do is actually calculate a ratio so that I can see out of the citizen population what

percentage of those actually became confirmed voters.

${\% Voter Population = \frac{(Confirmed Voters)}{Citizen Population}}$ 

## Lets find the state with the highest '% Voter Population'

at the state level

In [10]:
df_state_level = df.groupby("State")['Citizen Population','Confirmed Voters'].sum().reset_index()

df_state_level.head(5)

  df_state_level = df.groupby("State")['Citizen Population','Confirmed Voters'].sum().reset_index()


Unnamed: 0,State,Citizen Population,Confirmed Voters
0,Alabama,3480000,2154000
1,Alaska,495000,289000
2,Arizona,4315000,2412000
3,Arkansas,2110000,1124000
4,California,23418000,13463000


In [11]:
df_state_level['% Voter Population - State_Level'] = round((df_state_level['Confirmed Voters'] / df_state_level['Citizen Population'] * 100), 2)

In [12]:
df_state_level = df_state_level.sort_values(by=['% Voter Population - State_Level'], ascending=False)

## the Top 5 states with highest '% Voter Population'

We see District of Columbia or D.C. at the top of the list with a voter population percentage of 76.1%.

That makes sense.

It's the nation's capital.

It's the political heart of the country.

But then you also have states like Mississippi and Wisconsin, which, you know, may come as a surprise
to some.

In [13]:
df_state_level.head()

Unnamed: 0,State,Citizen Population,Confirmed Voters,% Voter Population - State_Level
8,District Of Columbia,461000,351000,76.14
24,Mississippi,2130000,1588000,74.55
49,Wisconsin,4247000,3128000,73.65
23,Minnesota,3903000,2859000,73.25
21,Massachusetts,4773000,3382000,70.86


## the Bottom 5 states with lowest '% Voter Population'

Now, scrolling down on the other end of the spectrum, we see West Virginia, Hawaii, Oklahoma and
Arkansas.

Now, this isn't necessarily the time or place to dig much deeper into why these trends are the way
they are.

So what actually causes the West Virginia population to be less likely to convert into voters compared
to Massachusetts or Colorado, for instance?

In [14]:
df_state_level.tail()

Unnamed: 0,State,Citizen Population,Confirmed Voters,% Voter Population - State_Level
43,Texas,16062000,8643000,53.81
3,Arkansas,2110000,1124000,53.27
36,Oklahoma,2734000,1431000,52.34
11,Hawaii,930000,481000,51.72
48,West Virginia,1443000,689000,47.75


So maybe things like 

	• income per capita 

	• or education level 

	• or additional demographics 

might help to provide some additional context 

and give you tools to better understand the why or the driving forces behind

these voter rates that we're seeing here.

In [15]:
df_state_level

Unnamed: 0,State,Citizen Population,Confirmed Voters,% Voter Population - State_Level
8,District Of Columbia,461000,351000,76.14
24,Mississippi,2130000,1588000,74.55
49,Wisconsin,4247000,3128000,73.65
23,Minnesota,3903000,2859000,73.25
21,Massachusetts,4773000,3382000,70.86
5,Colorado,3543000,2495000,70.42
15,Iowa,2232000,1548000,69.35
29,New Hampshire,992000,688000,69.35
33,North Carolina,6711000,4624000,68.9
19,Maine,1020000,699000,68.53


# Lets look at the age bucket distribution overall

Now, the next thing that 
I'd like to explore a bit more here to kick things off is 
the composition of the voter population.

In [29]:
## Let's look at Voting per Age bucket
df_age_level = df.groupby("Age")['Citizen Population','Confirmed Voters'].sum().reset_index()
# new_df = new_df.sort_values(by=['Citizen Population'], ascending=False)
df_age_level


  df_age_level = df.groupby("Age")['Citizen Population','Confirmed Voters'].sum().reset_index()


Unnamed: 0,Age,Citizen Population,Confirmed Voters
0,18 to 24,27537000,11351000
1,25 to 34,35474000,18978000
2,35 to 44,34266000,20965000
3,45 to 64,76641000,52013000
4,65+,41167000,29645000


### lets' add calculated field-  '% Voter Population - Age_Level'

This '% Voter Population - Age_Level' is calculated per Age Bucket over all 50 states

Now, what this tells me is that among all confirmed voters in this data set, 
so in the year 2012,

I know that the largest share of those voters fall into the 45 to 64 range, 
followed by the 65 plus range at 22%.

And on the other end, 
you've got your smallest proportion or share of voters which fall into 18 to 24.

In [30]:

df_age_level['% Voter Population - Age_Level_overall'] = round((df_age_level['Confirmed Voters'] / df_age_level['Citizen Population'] * 100),2)
df_age_level

Unnamed: 0,Age,Citizen Population,Confirmed Voters,% Voter Population - Age_Level_overall
0,18 to 24,27537000,11351000,41.22
1,25 to 34,35474000,18978000,53.5
2,35 to 44,34266000,20965000,61.18
3,45 to 64,76641000,52013000,67.87
4,65+,41167000,29645000,72.01


In [31]:
# Let's add another calculated field which a % of column total

In [32]:
total_confirmed_voters = df_age_level['Confirmed Voters'].sum()
total_confirmed_voters

132952000

In [33]:
df_age_level['% of Total Confirmed Voters'] = round((df_age_level['Confirmed Voters'] / total_confirmed_voters *100), 2)

In [34]:
df_age_level.columns

Index(['Age', 'Citizen Population', 'Confirmed Voters',
       '% Voter Population - Age_Level_overall',
       '% of Total Confirmed Voters'],
      dtype='object')

In [37]:
df_age_level = df_age_level[['Age', 
                             'Citizen Population', 
                             'Confirmed Voters',
                             '% of Total Confirmed Voters',
                             '% Voter Population - Age_Level_overall',
       ]]
df_age_level

Unnamed: 0,Age,Citizen Population,Confirmed Voters,% of Total Confirmed Voters,% Voter Population - Age_Level_overall
0,18 to 24,27537000,11351000,8.54,41.22
1,25 to 34,35474000,18978000,14.27,53.5
2,35 to 44,34266000,20965000,15.77,61.18
3,45 to 64,76641000,52013000,39.12,67.87
4,65+,41167000,29645000,22.3,72.01


Now, what this tells me is that among all confirmed voters in this data set, 
so in the year 2012,

I know that the largest share of those voters fall into the 45 to 64 range, 
followed by the 65 plus range at 22%.

And on the other end, 

you've got your smallest proportion or share of voters which fall into 18 to 24.

So the youngest voters make up only 8.54% of the total number of confirmed voters nationwide.

# Lets calculate the '% of Total Confirmed Voters per State'

### lets' add calculated field-  '% Voter Population - Age_Level per State'

This '% Voter Population - Age_Level' is calculated per Age Bucket over all 50 states

In [38]:
df['State'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District Of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [39]:
# for Alabama

test_state = df['State'].unique()[0]
test_state

'Alabama'

In [40]:
test_state_df = df[df['State'] == test_state]
test_state_df

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters
0,Alabama,18 to 24,439000,428000,212000,155000
1,Alabama,25 to 34,576000,535000,359000,271000
2,Alabama,35 to 44,615000,582000,410000,330000
3,Alabama,45 to 64,1297000,1275000,1051000,939000
4,Alabama,65+,667000,660000,523000,459000


In [41]:
test_state_df['% of Total Confirmed Voters per State'] = round((test_state_df['Confirmed Voters']/test_state_df['Confirmed Voters'].sum()*100),2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_state_df['% of Total Confirmed Voters per State'] = round((test_state_df['Confirmed Voters']/test_state_df['Confirmed Voters'].sum()*100),2)


In [42]:
test_state_df

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State
0,Alabama,18 to 24,439000,428000,212000,155000,7.2
1,Alabama,25 to 34,576000,535000,359000,271000,12.58
2,Alabama,35 to 44,615000,582000,410000,330000,15.32
3,Alabama,45 to 64,1297000,1275000,1051000,939000,43.59
4,Alabama,65+,667000,660000,523000,459000,21.31


In [54]:
# state_list = df['State'].unique()
# state_list[1:]

array(['Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
       'Connecticut', 'Delaware', 'District Of Columbia', 'Florida',
       'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
       'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [59]:
pd.set_option('display.max_columns', None)

In [65]:
state_list = df['State'].unique()
df_list = []

for state in state_list:
    # print(state)
    a = df[df['State'] == state]
    a['% of Total Confirmed Voters per State'] = round((a['Confirmed Voters']/a['Confirmed Voters'].sum()*100),2)
    # print(a.head())
    df_list.append(a)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['% of Total Confirmed Voters per State'] = round((a['Confirmed Voters']/a['Confirmed Voters'].sum()*100),2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  a['% of Total Confirmed Voters per State'] = round((a['Confirmed Voters']/a['Confirmed Voters'].sum()*100),2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

In [66]:
df_list[6]

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State
30,Connecticut,18 to 24,333000,308000,145000,123000,7.84
31,Connecticut,25 to 34,402000,331000,210000,177000,11.28
32,Connecticut,35 to 44,441000,382000,255000,223000,14.21
33,Connecticut,45 to 64,1047000,993000,768000,695000,44.3
34,Connecticut,65+,503000,486000,383000,351000,22.37


In [77]:
# next concat list of df into one big dataframe
df_age_level_concat = pd.concat(df_list)

In [78]:
df_age_level_concat

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State
0,Alabama,18 to 24,439000,428000,212000,155000,7.20
1,Alabama,25 to 34,576000,535000,359000,271000,12.58
2,Alabama,35 to 44,615000,582000,410000,330000,15.32
3,Alabama,45 to 64,1297000,1275000,1051000,939000,43.59
4,Alabama,65+,667000,660000,523000,459000,21.31
...,...,...,...,...,...,...,...
250,Wyoming,18 to 24,56000,55000,21000,18000,7.29
251,Wyoming,25 to 34,73000,71000,44000,39000,15.79
252,Wyoming,35 to 44,68000,66000,41000,36000,14.57
253,Wyoming,45 to 64,155000,154000,101000,95000,38.46


In [70]:
# now merge with state level % vote calculations
df_state_level.head(10)

Unnamed: 0,State,Citizen Population,Confirmed Voters,% Voter Population - State_Level
8,District Of Columbia,461000,351000,76.14
24,Mississippi,2130000,1588000,74.55
49,Wisconsin,4247000,3128000,73.65
23,Minnesota,3903000,2859000,73.25
21,Massachusetts,4773000,3382000,70.86
5,Colorado,3543000,2495000,70.42
15,Iowa,2232000,1548000,69.35
29,New Hampshire,992000,688000,69.35
33,North Carolina,6711000,4624000,68.9
19,Maine,1020000,699000,68.53


In [72]:
df_state_level_premerge = df_state_level[['State', '% Voter Population - State_Level']]

In [73]:
df_state_level_premerge.head(10)

Unnamed: 0,State,% Voter Population - State_Level
8,District Of Columbia,76.14
24,Mississippi,74.55
49,Wisconsin,73.65
23,Minnesota,73.25
21,Massachusetts,70.86
5,Colorado,70.42
15,Iowa,69.35
29,New Hampshire,69.35
33,North Carolina,68.9
19,Maine,68.53


In [80]:
# merge on 'State' Column

merged_df = pd.merge(df_age_level_concat, df_state_level_premerge, on='State')
merged_df

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level
0,Alabama,18 to 24,439000,428000,212000,155000,7.20,61.90
1,Alabama,25 to 34,576000,535000,359000,271000,12.58,61.90
2,Alabama,35 to 44,615000,582000,410000,330000,15.32,61.90
3,Alabama,45 to 64,1297000,1275000,1051000,939000,43.59,61.90
4,Alabama,65+,667000,660000,523000,459000,21.31,61.90
...,...,...,...,...,...,...,...,...
250,Wyoming,18 to 24,56000,55000,21000,18000,7.29,58.95
251,Wyoming,25 to 34,73000,71000,44000,39000,15.79,58.95
252,Wyoming,35 to 44,68000,66000,41000,36000,14.57,58.95
253,Wyoming,45 to 64,155000,154000,101000,95000,38.46,58.95


In [85]:
merged_df.sort_values(['% Voter Population - State_Level', 'Age'], ascending=[False,True])[:15]

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level
40,District Of Columbia,18 to 24,71000,61000,43000,38000,10.83,76.14
41,District Of Columbia,25 to 34,154000,133000,114000,102000,29.06,76.14
42,District Of Columbia,35 to 44,79000,67000,59000,55000,15.67,76.14
43,District Of Columbia,45 to 64,140000,129000,109000,99000,28.21,76.14
44,District Of Columbia,65+,73000,71000,60000,57000,16.24,76.14
120,Mississippi,18 to 24,300000,294000,222000,187000,11.78,74.55
121,Mississippi,25 to 34,397000,390000,333000,279000,17.57,74.55
122,Mississippi,35 to 44,320000,309000,255000,222000,13.98,74.55
123,Mississippi,45 to 64,734000,727000,614000,563000,35.45,74.55
124,Mississippi,65+,416000,410000,371000,337000,21.22,74.55


In [86]:
merged_df.sort_values(['% Voter Population - State_Level', 'Age'], ascending=[False,True])[-15:]

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level
180,Oklahoma,18 to 24,325000,323000,130000,88000,6.15,52.34
181,Oklahoma,25 to 34,567000,540000,296000,184000,12.86,52.34
182,Oklahoma,35 to 44,433000,418000,281000,212000,14.81,52.34
183,Oklahoma,45 to 64,1001000,980000,720000,602000,42.07,52.34
184,Oklahoma,65+,483000,473000,378000,345000,24.11,52.34
55,Hawaii,18 to 24,107000,91000,33000,24000,4.99,51.72
56,Hawaii,25 to 34,172000,154000,65000,55000,11.43,51.72
57,Hawaii,35 to 44,166000,153000,83000,69000,14.35,51.72
58,Hawaii,45 to 64,343000,317000,217000,196000,40.75,51.72
59,Hawaii,65+,224000,215000,149000,137000,28.48,51.72


And now we have an entirely new set of insights at our fingertips.

So we can drill in state by state and see how these proportions by age ranges differ from place to place.

So DC, for instance, is kind of an interesting story, 

which is that the largest share of confirmed

voters within D.C. actually come from the 25 to 34 bucket, 

whereas most other states like Mississippi,

Wisconsin, Minnesota show the largest share of voters coming from the 45 to 64 bucket.

So for whatever reason, the voter population skews a little bit younger in D.C. than it does in many
of these other states.

And then scrolling down, another interesting insight to see here is that for these low voter population

states like Oklahoma, Hawaii and West Virginia, the smallest proportions are particularly for the

youngest age group, 18 to 24.

So less than 5% of voters in Hawaii fell into the 18 to 24 bucket and only 6% in Oklahoma fell into

that range.

So for whatever reason, the younger populations in these states really don't seem to inclined to get

out there and vote.

So definitely another interesting story, really just starting to scratch the surface into some of the

interesting angles that we can take with this data set.

So that's our kickoff for the US voter case study.

# Analyzing U.S. Voter Demographics

## How many states had a % Voter Population below 55%? Which states?

In [87]:
merged_df

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level
0,Alabama,18 to 24,439000,428000,212000,155000,7.20,61.90
1,Alabama,25 to 34,576000,535000,359000,271000,12.58,61.90
2,Alabama,35 to 44,615000,582000,410000,330000,15.32,61.90
3,Alabama,45 to 64,1297000,1275000,1051000,939000,43.59,61.90
4,Alabama,65+,667000,660000,523000,459000,21.31,61.90
...,...,...,...,...,...,...,...,...
250,Wyoming,18 to 24,56000,55000,21000,18000,7.29,58.95
251,Wyoming,25 to 34,73000,71000,44000,39000,15.79,58.95
252,Wyoming,35 to 44,68000,66000,41000,36000,14.57,58.95
253,Wyoming,45 to 64,155000,154000,101000,95000,38.46,58.95


In [96]:
a = merged_df.groupby('State')['% Voter Population - State_Level'].max().reset_index().sort_values('% Voter Population - State_Level', ascending=False)

In [98]:
a.head()

Unnamed: 0,State,% Voter Population - State_Level
8,District Of Columbia,76.14
24,Mississippi,74.55
49,Wisconsin,73.65
23,Minnesota,73.25
21,Massachusetts,70.86


In [99]:
a[a['% Voter Population - State_Level'] < 55]

Unnamed: 0,State,% Voter Population - State_Level
43,Texas,53.81
3,Arkansas,53.27
36,Oklahoma,52.34
11,Hawaii,51.72
48,West Virginia,47.75


As we can see above there are 5 states that have ['% Voter Population - State_Level'] < 55. 

The five states are Texas (53.81%), Arkansas (53.27%), Oklahoma (52.34%), Hawaii (51.72%) and West Virginia (47.75%)

## How many confirmed voters in California were over 65 years old in 2012? 
## What percentage does that represent out of the total confirmed voters in California? 
## What percentage out of the confirmed voters in the entire country?

In [100]:
merged_df[merged_df['State']=='California']

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level
20,California,18 to 24,3962000,3479000,1787000,1447000,10.75,57.49
21,California,25 to 34,5357000,4153000,2548000,2070000,15.38,57.49
22,California,35 to 44,5030000,3675000,2478000,2118000,15.73,57.49
23,California,45 to 64,9356000,7827000,5459000,4926000,36.59,57.49
24,California,65+,4653000,4284000,3084000,2902000,21.56,57.49


### How many confirmed voters in California were over 65 years old in 2012?

From the above dataframe we can see that there are 2,902,000 confirmed voters in California were over 65 years old in 2012.

### What percentage does that represent out of the total confirmed voters in California?

From the above dataframe, we can see that 65 years old represented 21.56% of the total confirmed voters in California.

In [101]:
california = merged_df[merged_df['State']=='California']

In [112]:
percent_65_older_California_over_country = round((california[california['Age']=='65+']['Confirmed Voters'] / merged_df['Confirmed Voters'].sum() * 100), 2)
percent_65_older_California_over_country

24    2.18
Name: Confirmed Voters, dtype: float64

### What percentage out of the confirmed voters in the entire country?

From the above calculate, we can see that in California 65 years old represented 2.18% of the total confirmed voters over the entire country.

# Show both Citizen Population and Confirmed Voters by Age, as % of Column Total. 
# What percentage of the citizen population do 45 to 64 year olds represent? 
# What percentage of the confirmed voter population? 

In [114]:
# we calculated df_age_level with our initial Exploratory Data Analysis

df_age_level

Unnamed: 0,Age,Citizen Population,Confirmed Voters,% of Total Confirmed Voters,% Voter Population - Age_Level_overall
0,18 to 24,27537000,11351000,8.54,41.22
1,25 to 34,35474000,18978000,14.27,53.5
2,35 to 44,34266000,20965000,15.77,61.18
3,45 to 64,76641000,52013000,39.12,67.87
4,65+,41167000,29645000,22.3,72.01


In [115]:
df_age_level['% of Total Citizen Population'] = round((df_age_level['Citizen Population'] / df_age_level['Citizen Population'].sum() * 100), 2)

In [116]:
df_age_level.columns

Index(['Age', 'Citizen Population', 'Confirmed Voters',
       '% of Total Confirmed Voters', '% Voter Population - Age_Level_overall',
       '% of Total Citizen Population'],
      dtype='object')

### Show both Citizen Population and Confirmed Voters by Age, as % of Column Total. 


In [119]:
df_age_level = df_age_level[['Age', 
                            'Citizen Population', 
                            '% of Total Citizen Population',
                            'Confirmed Voters',
                            '% of Total Confirmed Voters', 
                            '% Voter Population - Age_Level_overall'
       ]]

df_age_level

Unnamed: 0,Age,Citizen Population,% of Total Citizen Population,Confirmed Voters,% of Total Confirmed Voters,% Voter Population - Age_Level_overall
0,18 to 24,27537000,12.8,11351000,8.54,41.22
1,25 to 34,35474000,16.49,18978000,14.27,53.5
2,35 to 44,34266000,15.93,20965000,15.77,61.18
3,45 to 64,76641000,35.63,52013000,39.12,67.87
4,65+,41167000,19.14,29645000,22.3,72.01


### What percentage of the citizen population do 45 to 64 year olds represent? 

From the above dataframe, we can see that 45 to 65 year olds represent 35.63% of the citizen population of the entire country.

### What percentage of the confirmed voter population? 
From the above dataframe, we can see that 45 to 65 year olds represent 35.63% of the Total Confirmed Voters of the entire country.

# Create a new calculated field named "Voter Turnout" (Confirmed Voters/Registered Voters), formatted as a percentage with 2 decimal points. 
# Which state had the highest voter turnout rate? 
# What about among 18-24 year old voters specifically?

In [129]:
df_state_level2 = df.groupby("State").sum().reset_index()

df_state_level2.head(5)

  df_state_level2 = df.groupby("State").sum().reset_index()


Unnamed: 0,State,Total Population,Citizen Population,Registered Voters,Confirmed Voters
0,Alabama,3594000,3480000,2555000,2154000
1,Alaska,516000,495000,360000,289000
2,Arizona,4863000,4315000,2811000,2412000
3,Arkansas,2198000,2110000,1376000,1124000
4,California,28358000,23418000,15356000,13463000


In [149]:
df_state_level2['Voter Turnout'] = round((df_state_level2['Confirmed Voters'] / df_state_level2['Registered Voters'] * 100), 2)
df_state_level2.sort_values('Voter Turnout', ascending=False).head(5)

Unnamed: 0,State,Total Population,Citizen Population,Registered Voters,Confirmed Voters,Voter Turnout
5,Colorado,3817000,3543000,2635000,2495000,94.69
49,Wisconsin,4351000,4247000,3318000,3128000,94.27
23,Minnesota,4054000,3903000,3085000,2859000,92.67
50,Wyoming,426000,419000,268000,247000,92.16
7,Delaware,693000,642000,469000,430000,91.68


In [150]:
merged_df['Voter Turnout'] = round((merged_df['Confirmed Voters'] / merged_df['Registered Voters'] * 100), 2)
merged_df[merged_df['Age']=='18 to 24'].sort_values('Voter Turnout', ascending=False).head(5)

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level,Voter Turnout
245,Wisconsin,18 to 24,539000,513000,308000,287000,9.18,73.65,93.18
25,Colorado,18 to 24,493000,461000,278000,259000,10.38,70.42,93.17
115,Minnesota,18 to 24,531000,510000,329000,303000,10.6,73.25,92.1
200,South Carolina,18 to 24,474000,451000,265000,243000,11.11,64.69,91.7
40,District Of Columbia,18 to 24,71000,61000,43000,38000,10.83,76.14,88.37


# As a politician seeking to improve voter turnout rates among young adults (18-24), which particular states would you target first?

In [152]:
merged_df[merged_df['Age']=='18 to 24'].sort_values('Voter Turnout', ascending=False).tail(5)

Unnamed: 0,State,Age,Total Population,Citizen Population,Registered Voters,Confirmed Voters,% of Total Confirmed Voters per State,% Voter Population - State_Level,Voter Turnout
130,Montana,18 to 24,82000,79000,53000,33000,6.67,65.65,62.26
205,South Dakota,18 to 24,95000,94000,52000,31000,8.36,61.22,59.62
215,Texas,18 to 24,2538000,2249000,971000,572000,6.62,53.81,58.91
15,Arkansas,18 to 24,288000,281000,126000,70000,6.23,53.27,55.56
240,West Virginia,18 to 24,163000,162000,78000,37000,5.37,47.75,47.44


Notes for Pandas Pivot table

https://www.youtube.com/watch?v=5yFox2cReTw

how to sort in pandas pivot table

https://www.statology.org/pandas-pivot-table-sort/

Notes to add a calculated field to a pandas pivot table

https://stackoverflow.com/questions/70281086/how-to-add-calculated-fields-in-pandas-pivot-table

In [151]:
# this is the visual pivot of the previous dateframe 'df_state_level'
 
pivot_StateLevel_VoterPopPer = pd.pivot_table(data=df, index=['State','Age'], values=['Citizen Population', 'Confirmed Voters'], aggfunc='sum')
p = pivot_StateLevel_VoterPopPer.sort_values(by=['Citizen Population'], ascending=False).reset_index()
p['% Voter Population'] = p['Confirmed Voters'] / p['Citizen Population']
pd.pivot_table(data=p, index=['State','Age']).sort_values(by=['State','% Voter Population'], ascending=[False,False])

Unnamed: 0_level_0,Unnamed: 1_level_0,% Voter Population,Citizen Population,Confirmed Voters
State,Age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Wyoming,65+,0.808219,73000,59000
Wyoming,45 to 64,0.616883,154000,95000
Wyoming,25 to 34,0.549296,71000,39000
Wyoming,35 to 44,0.545455,66000,36000
Wyoming,18 to 24,0.327273,55000,18000
...,...,...,...,...
Alabama,45 to 64,0.736471,1275000,939000
Alabama,65+,0.695455,660000,459000
Alabama,35 to 44,0.567010,582000,330000
Alabama,25 to 34,0.506542,535000,271000
