# Data Cleaning (Local)

**Overview of Notebook:**
- Data sourced from [MIT Election Data and Science Lab](https://doi.org/10.7910/DVN/CHYXUP)
- Cleaned names and split into two columns
- Removed non-names and one-name names
- Applied gender guesser package to name_1 column
- Removed candidates running for state-level office
- Identified unique # of female candidates
- Aggregated to county level by fips code
- Created gender proportion variables
- Removed counties with high levels of unclassified gender
- Added regional variables (Census region and division)

In [1]:
# Set Up
import pandas as pd
import numpy as np
import gender_guesser.detector as gender
import random as random

# Show the value of every bare expression in a cell, not just the last one;
# later cells rely on this to display several results without print().
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

# Remove the pandas display limits on rows and columns.
# NOTE(review): with no row limit, displaying a large frame without .head() prints all of it.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

%xmode Minimal

Exception reporting mode: Minimal


### Load Data 

In [2]:
# Read the local precinct file. Both FIPS columns are passed through a str converter
# so the codes stay text (a numeric parse would drop any leading zero).
fips_as_text = {'state_fips': str, 'county_fips': str}
local = pd.read_csv(
    'final_data/LOCAL_precinct_general_18.csv',
    converters=fips_as_text,
    low_memory=False,
)

In [3]:
# First look at the raw frame: sample rows, dimensions, dtypes and null counts.
local.head()
local.shape  # 1829286 rows and 25 columns
local.info()
local.isnull().sum()

# Distinct-value counts for the main identifying columns
# (dropna=False so a missing value, if any, counts as its own value).
n_states = local['state'].nunique(dropna=False)  # 32
n_offices = local['office'].nunique(dropna=False)  # 4839
n_precincts = local['precinct'].nunique(dropna=False)  # 63571
n_county_fips = local['county_fips'].nunique(dropna=False)  # 1444

print('unique states:', n_states)
print('unique office names:', n_offices)
print('unique precints:', n_precincts)
print('unique county fips:', n_county_fips)

Unnamed: 0,precinct,office,party_detailed,party_simplified,mode,votes,county_name,county_fips,jurisdiction_name,jurisdiction_fips,candidate,district,magnitude,dataverse,year,stage,state,special,writein,state_po,state_fips,state_cen,state_ic,date,readme_check
0,SEVERE,COUNTY SHERIFF,,,TOTAL,0,PERRY,1105,PERRY,1105,OVER VOTES,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False
1,STEWART,COUNTY SHERIFF,,,TOTAL,4,HALE,1065,HALE,1065,WRITEIN,,1.0,LOCAL,2018,GEN,ALABAMA,False,True,AL,1,63,41,2018-11-06,False
2,CVCC,CIRCUIT CLERK,,,TOTAL,0,RUSSELL,1113,RUSSELL,1113,OVER VOTES,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,True
3,10 JONES COMMUNITY CTR,CIRCUIT CLERK,,,TOTAL,0,AUTAUGA,1001,AUTAUGA,1001,OVER VOTES,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False
4,10 JONES COMMUNITY CTR,CIRCUIT CLERK,,,TOTAL,106,AUTAUGA,1001,AUTAUGA,1001,UNDER VOTES,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False


(1829286, 25)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1829286 entries, 0 to 1829285
Data columns (total 25 columns):
 #   Column             Dtype  
---  ------             -----  
 0   precinct           object 
 1   office             object 
 2   party_detailed     object 
 3   party_simplified   object 
 4   mode               object 
 5   votes              int64  
 6   county_name        object 
 7   county_fips        object 
 8   jurisdiction_name  object 
 9   jurisdiction_fips  int64  
 10  candidate          object 
 11  district           object 
 12  magnitude          float64
 13  dataverse          object 
 14  year               int64  
 15  stage              object 
 16  state              object 
 17  special            bool   
 18  writein            bool   
 19  state_po           object 
 20  state_fips         object 
 21  state_cen          int64  
 22  state_ic           int64  
 23  date               object 
 24  readme_check       bool   
dtypes: bool(3), float6

precinct                   0
office                     0
party_detailed        757018
party_simplified      755972
mode                       0
votes                      0
county_name                0
county_fips                0
jurisdiction_name          0
jurisdiction_fips          0
candidate               7430
district             1068429
magnitude                  0
dataverse                  0
year                       0
stage                      0
state                      0
special                    0
writein                    0
state_po                   0
state_fips                 0
state_cen                  0
state_ic                   0
date                       0
readme_check               0
dtype: int64

unique states: 32
unique office names: 4839
unique precints: 63571
unique county fips: 1444


In [4]:
# Keep only the rows that carry a candidate value.
has_candidate = local['candidate'].notna()
local_clean = local.loc[has_candidate]

n_missing_candidate = len(local) - len(local_clean)  # 7430
print('rows removed due to no candidate name:', n_missing_candidate)

rows removed due to no candidate name: 7430


### Clean Candidate Names

In [5]:
# Count the rows behind each distinct candidate string, most frequent first.
#local_clean.candidate.nunique() # 28319 unique values
candidate_names = (
    local_clean.groupby('candidate')
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts', ascending=False)
)

# Strings that appear on more than one row; the most frequent ones are reviewed below.
appears_repeatedly = candidate_names['counts'] > 1
candidate_names_duplicates = candidate_names.loc[appears_repeatedly]
#candidate_names_duplicates.shape # 24934 names
candidate_names_duplicates.head(50)

# Entries identified as irrelevant (not candidate names) during the review of the duplicates.
non_names = ['UNDERVOTES', 'WRITEIN', 'OVERVOTES', 'YES', 'NO', 'OVER VOTES', 'UNDER VOTES', 'TIMES BLANK VOTED',
             'MAINTAINED', 'REPEALED', 'REJECTED', 'BLANK BALLOTS', 'APPROVE', 'REJECT']

# Drop every row whose candidate value is one of those entries.
is_non_name = local_clean['candidate'].isin(non_names)
local_clean_names = local_clean.loc[~is_non_name]
print('non-names rows removed:', len(local_clean) - len(local_clean_names))

Unnamed: 0,candidate,counts
27239,UNDERVOTES,121917
28224,WRITEIN,115254
20350,OVERVOTES,113432
28244,YES,93916
20194,NO,89787
20349,OVER VOTES,21977
27238,UNDER VOTES,21977
26398,TIMES BLANK VOTED,9230
17446,MAINTAINED,7264
21868,REPEALED,7264


non-names rows removed: 611586


In [6]:
# Turn off pandas' chained-assignment warning: the frames used from here on are
# filtered selections that get new columns assigned to them.
pd.set_option('mode.chained_assignment', None)

# Nicknames are written inside double quotes; remove the quote characters only.
unquoted = local_clean_names['candidate'].str.replace('"', '', regex=False)
local_clean_names['candidate_clean'] = unquoted

#local_clean_names.head()

In [7]:
# Tabulate the cleaned candidate strings (groupby returns them in sorted order).
test = local_clean_names.groupby('candidate_clean').size().reset_index()
test.head(10)

# The first five entries of that table look like referendum titles rather than
# candidate names, so they are collected here for removal from the main df.
# NOTE(review): the selection is positional (first five rows) - re-check it if the input data changes.
drop_values = test['candidate_clean'].iloc[:5].tolist()

#drop_values

# Remove the rows carrying any of those titles.
is_referendum_title = local_clean_names['candidate_clean'].isin(drop_values)
local_clean_names2 = local_clean_names.loc[~is_referendum_title]
print("number of rows removed:", len(local_clean_names) - len(local_clean_names2))  # 307 names removed

Unnamed: 0,candidate_clean,0
0,1 EMERGENCY INFORMATION AM RADIO STATION,41
1,2 PLEASANT VALLEY WAY SIDEWALK REPAIRS BETWEEN BE,41
2,3 SPECIAL NEEDS PLAYGROUND OF INCLUSIVE PLAY,41
3,4 OSPAC IMPROVEMENT AND HANDICAP PARKING PROJECT,41
4,A - LOCATED AT THE PRESENT SITE OF ALBI STADIUM,143
5,A AILEEN WITKOWSKI,8
6,A COSTELLO,1
7,A GAYLE TROUTMAN,13
8,A GENE BUCKNER,48
9,A J SCHAEFER,2


number of rows removed: 307


In [8]:
# Split each cleaned name at its first space: the leading token becomes 'name_1'
# and everything after it becomes 'name_2' (missing when the name has no space).
name_parts = local_clean_names2['candidate_clean'].str.split(' ', n=1, expand=True)
local_clean_names2[['name_1', 'name_2']] = name_parts
local_clean_names2.head()

Unnamed: 0,precinct,office,party_detailed,party_simplified,mode,votes,county_name,county_fips,jurisdiction_name,jurisdiction_fips,candidate,district,magnitude,dataverse,year,stage,state,special,writein,state_po,state_fips,state_cen,state_ic,date,readme_check,candidate_clean,name_1,name_2
6,10 JONES COMMUNITY CTR,CIRCUIT CLERK,REPUBLICAN,REPUBLICAN,TOTAL,166,AUTAUGA,1001,AUTAUGA,1001,DEBRA HILL,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False,DEBRA HILL,DEBRA,HILL
10,100 TRINITY METHODIST,CIRCUIT CLERK,REPUBLICAN,REPUBLICAN,TOTAL,1447,AUTAUGA,1001,AUTAUGA,1001,DEBRA HILL,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False,DEBRA HILL,DEBRA,HILL
14,101 D,CIRCUIT CLERK,DEMOCRAT,DEMOCRAT,TOTAL,1374,MONTGOMERY,1101,MONTGOMERY,1101,GINA JOBE ISHMAN,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False,GINA JOBE ISHMAN,GINA,JOBE ISHMAN
18,102 V,CIRCUIT CLERK,DEMOCRAT,DEMOCRAT,TOTAL,2179,MONTGOMERY,1101,MONTGOMERY,1101,GINA JOBE ISHMAN,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False,GINA JOBE ISHMAN,GINA,JOBE ISHMAN
22,103 M,CIRCUIT CLERK,DEMOCRAT,DEMOCRAT,TOTAL,1598,MONTGOMERY,1101,MONTGOMERY,1101,GINA JOBE ISHMAN,,1.0,LOCAL,2018,GEN,ALABAMA,False,False,AL,1,63,41,2018-11-06,False,GINA JOBE ISHMAN,GINA,JOBE ISHMAN


In [9]:
# A one-token name cannot be identified as a first or a last name, so those rows are dropped.
local_clean_names2.isnull().sum()  # includes 114869 null values for name_2
has_second_part = local_clean_names2['name_2'].notna()
local_clean_names2 = local_clean_names2.loc[has_second_part]

len(local_clean_names2)  # resulting in 1,095,094 values

precinct                  0
office                    0
party_detailed       214165
party_simplified     214155
mode                      0
votes                     0
county_name               0
county_fips               0
jurisdiction_name         0
jurisdiction_fips         0
candidate                 0
district             628654
magnitude                 0
dataverse                 0
year                      0
stage                     0
state                     0
special                   0
writein                   0
state_po                  0
state_fips                0
state_cen                 0
state_ic                  0
date                      0
readme_check              0
candidate_clean           0
name_1                    0
name_2               114869
dtype: int64

1095094

### Classify Gender 

In [10]:
## set up gender classifier package
# NOTE(review): case_sensitive=False is presumably needed because the candidate names
# in this dataset are upper-case - confirm against the gender-guesser documentation.
g = gender.Detector(case_sensitive = False)

# Note: for the purpose of classifying gender, we only care about the first name and not the last name.
# The gender-guesser only yields results for single first names and not for double-barrelled names
# (such as Mary Ann). However, the first part of a double-barrelled name is likely to indicate the
# gender (for example, Mary yields mostly_female whereas Mary Ann yields unknown), so only the
# first token of each name (name_1) is classified.

# Demonstration calls; each bare expression is displayed in the cell output.
g.get_gender(u'mary') # yields mostly_female
g.get_gender(u'mary ann') # yields unknown
g.get_gender(u'john') # yields male
g.get_gender(u'john paul') # yields unknown

'mostly_female'

'unknown'

'male'

'unknown'

In [11]:
# classify gender for each "first" name
# The detector is queried once per distinct first name and the answer is reused for
# every row with that name. The frame has over a million rows but far fewer distinct
# first names, so this avoids repeating the same lookup row by row.
first_names = local_clean_names2['name_1']
gender_by_name = {name: g.get_gender(name) for name in first_names.unique()}

# one label per row, in row order
gender_package = [gender_by_name[name] for name in first_names]

# add these genders as a new column 
local_clean_names2['gender_package'] = gender_package

In [12]:
# Collapse the detector's labels into two categories and rename them to gender terms:
# 'female'/'mostly_female' -> 'woman', 'male'/'mostly_male' -> 'man'.
# Any other label (e.g. 'andy', 'unknown') is carried over unchanged.
gender_terms = {
    'mostly_female': 'woman',
    'female': 'woman',
    'mostly_male': 'man',
    'male': 'man',
}

gender_test = [gender_terms.get(label, label) for label in local_clean_names2['gender_package']]

#gender_test

local_clean_names2['derived_gender'] = gender_test

In [13]:
# review
# One row per distinct (candidate, derived gender) pair.
gender_class = (
    local_clean_names2
    .groupby(['candidate_clean', 'derived_gender'])
    .size()
    .reset_index()
    [['candidate_clean', 'derived_gender']]
)
gender_class.sample(5, random_state=0)

# Split the pairs by label.
label = gender_class['derived_gender']
women = gender_class.loc[label == 'woman']
men = gender_class.loc[label == 'man']
unknown = gender_class.loc[label == 'unknown']
andy = gender_class.loc[label == 'andy']
notclassified = len(unknown) + len(andy)
total = len(gender_class)

# Share of pairs that received a 'woman' or 'man' label.
success_rate = (len(women) + len(men)) / total
print("Success rate of gender classification:", round(success_rate, 3))

Unnamed: 0,candidate_clean,derived_gender
12913,JOE MULLINS,man
7068,DEWEY COHEN,man
2122,BLAKE SUTHERS,man
19605,MYRA BEGAY,woman
15986,LANCE WOODFORD,man


Success rate of gender classification: 0.945


### Remove State-Level offices

In [14]:
# Collapse the precinct rows to one row per county / candidate / gender / office;
# 'count' is the number of source rows behind each combination.
candidate_office_keys = ['county_fips', 'county_name', 'state_fips', 'state',
                         'candidate_clean', 'derived_gender', 'office']
df = local_clean_names2.groupby(candidate_office_keys).size().reset_index(name='count')

df.head()

Unnamed: 0,county_fips,county_name,state_fips,state,candidate_clean,derived_gender,office,count
0,1001,AUTAUGA,1,ALABAMA,BUSTER BARBER,man,COUNTY CORONER,20
1,1001,AUTAUGA,1,ALABAMA,DEBRA HILL,woman,CIRCUIT CLERK,20
2,1001,AUTAUGA,1,ALABAMA,JOE SEDINGER,man,COUNTY SHERIFF,20
3,1001,AUTAUGA,1,ALABAMA,JOY PACE BOOTH,woman,DISTRICT COURT JUDGE,20
4,1001,AUTAUGA,1,ALABAMA,KAREN H JACKSON,woman,JUDGE OF PROBATE,20


In [15]:
# Check to see if there are duplicate candidates:
# count the rows of df behind each (candidate, office) pair, largest first.
test = (
    df.groupby(['candidate_clean', 'office'])
    .size()
    .reset_index(name='county_count')
    .sort_values(by='county_count', ascending=False)
)
#test.head(50)

# it looks like many of the office titles for the highest duplicate counts are state-level offices 
# lets do a manual search of all the office names
test2 = df.groupby(['office']).size().reset_index()
test2.columns = ['office','county_count']
# Sort the per-office table itself. The original sorted `test` here, which replaced
# the per-office counts with a copy of the (candidate, office) table.
test2 = test2.sort_values(by = 'county_count', ascending = False)
#test2

In [16]:
# Office-title fragments that mark a state-level race (chosen from the test dfs explored above).
local_exclude = ['STATE', 'ATTORNEY GENERAL', "STATE'S ATTORNEY", "STATE SENATE", 'JUDGE OF THE COURT OF APPEALS']

# Upper-case every office title so it can be compared with the upper-case fragments.
local_clean_names2['office'] = local_clean_names2['office'].str.upper()

# The fragments are joined with '|' into one pattern; a row is dropped when its
# office title contains any of them.
state_level_pattern = '|'.join(local_exclude)
is_state_level = local_clean_names2['office'].str.contains(state_level_pattern)
local_cl = local_clean_names2.loc[~is_state_level].copy()

print('State-level values removed:', local_clean_names2.shape[0] - local_cl.shape[0])

State-level values removed: 100957


In [17]:
# Same restructuring as before, now on the frame without state-level offices:
# one row per county / candidate / gender, with 'count' = number of source rows.
candidate_keys = ['county_fips', 'county_name', 'state_fips', 'state',
                  'candidate_clean', 'derived_gender']
df2 = local_cl.groupby(candidate_keys).size().reset_index(name='count')

df2.head()
len(df2)  # 28795

Unnamed: 0,county_fips,county_name,state_fips,state,candidate_clean,derived_gender,count
0,1001,AUTAUGA,1,ALABAMA,BUSTER BARBER,man,20
1,1001,AUTAUGA,1,ALABAMA,DEBRA HILL,woman,20
2,1001,AUTAUGA,1,ALABAMA,JOE SEDINGER,man,20
3,1001,AUTAUGA,1,ALABAMA,JOY PACE BOOTH,woman,20
4,1001,AUTAUGA,1,ALABAMA,KAREN H JACKSON,woman,20


28795

In [18]:
# One indicator column per derived_gender value.
test = pd.get_dummies(df2['derived_gender'])
test.head()
len(test)

# Attach the indicators to the candidate table and drop the row-count column.
test_result = pd.concat([df2, test], axis='columns').drop(columns='count')
test_result.head()
len(test_result)

Unnamed: 0,andy,man,unknown,woman
0,0,1,0,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1


28795

Unnamed: 0,county_fips,county_name,state_fips,state,candidate_clean,derived_gender,andy,man,unknown,woman
0,1001,AUTAUGA,1,ALABAMA,BUSTER BARBER,man,0,1,0,0
1,1001,AUTAUGA,1,ALABAMA,DEBRA HILL,woman,0,0,0,1
2,1001,AUTAUGA,1,ALABAMA,JOE SEDINGER,man,0,1,0,0
3,1001,AUTAUGA,1,ALABAMA,JOY PACE BOOTH,woman,0,0,0,1
4,1001,AUTAUGA,1,ALABAMA,KAREN H JACKSON,woman,0,0,0,1


28795

### Aggregate to County Level

In [19]:
# Indicator columns for each derived_gender value.
test = pd.get_dummies(df2['derived_gender'])
test.head()
#len(test)

# Place the indicators next to the candidate rows, then remove 'count'.
with_indicators = pd.concat([df2, test], axis=1)
test_result = with_indicators.drop(columns=['count'])
test_result.head()
#len(test_result)

Unnamed: 0,andy,man,unknown,woman
0,0,1,0,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1


Unnamed: 0,county_fips,county_name,state_fips,state,candidate_clean,derived_gender,andy,man,unknown,woman
0,1001,AUTAUGA,1,ALABAMA,BUSTER BARBER,man,0,1,0,0
1,1001,AUTAUGA,1,ALABAMA,DEBRA HILL,woman,0,0,0,1
2,1001,AUTAUGA,1,ALABAMA,JOE SEDINGER,man,0,1,0,0
3,1001,AUTAUGA,1,ALABAMA,JOY PACE BOOTH,woman,0,0,0,1
4,1001,AUTAUGA,1,ALABAMA,KAREN H JACKSON,woman,0,0,0,1


In [20]:
# Sum the gender indicators within each county so that the county becomes the unit of analysis.
county_keys = ['county_fips', 'county_name', 'state_fips', 'state']
gender_columns = ['andy', 'man', 'unknown', 'woman']

final = (
    test_result
    .groupby(county_keys)
    .agg({column: 'sum' for column in gender_columns})
    .reset_index()
)

final.head()

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman
0,1001,AUTAUGA,1,ALABAMA,0,2.0,0,4
1,1003,BALDWIN,1,ALABAMA,0,9.0,1,6
2,1005,BARBOUR,1,ALABAMA,2,9.0,0,4
3,1007,BIBB,1,ALABAMA,0,6.0,0,4
4,1009,BLOUNT,1,ALABAMA,1,5.0,0,3


In [21]:
# Per-county totals and gender shares, rounded to three decimals.
# 'andy' and 'unknown' are reported together as the unclassified share.
final['total'] = final['andy'] + final['man'] + final['unknown'] + final['woman']
final['prop_wom'] = (final['woman'] / final['total']).round(3)
final['prop_men'] = (final['man'] / final['total']).round(3)
final['prop_unknown'] = ((final['andy'] + final['unknown']) / final['total']).round(3)

final.head()

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman,total,prop_wom,prop_men,prop_unknown
0,1001,AUTAUGA,1,ALABAMA,0,2.0,0,4,6.0,0.667,0.333,0.0
1,1003,BALDWIN,1,ALABAMA,0,9.0,1,6,16.0,0.375,0.562,0.062
2,1005,BARBOUR,1,ALABAMA,2,9.0,0,4,15.0,0.267,0.6,0.133
3,1007,BIBB,1,ALABAMA,0,6.0,0,4,10.0,0.4,0.6,0.0
4,1009,BLOUNT,1,ALABAMA,1,5.0,0,3,9.0,0.333,0.556,0.111


In [22]:
# Store the count columns as integers.
count_columns = ['andy', 'man', 'unknown', 'woman', 'total']
final = final.astype(dict.fromkeys(count_columns, int))
final.head()
#len(final)

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman,total,prop_wom,prop_men,prop_unknown
0,1001,AUTAUGA,1,ALABAMA,0,2,0,4,6,0.667,0.333,0.0
1,1003,BALDWIN,1,ALABAMA,0,9,1,6,16,0.375,0.562,0.062
2,1005,BARBOUR,1,ALABAMA,2,9,0,4,15,0.267,0.6,0.133
3,1007,BIBB,1,ALABAMA,0,6,0,4,10,0.4,0.6,0.0
4,1009,BLOUNT,1,ALABAMA,1,5,0,3,9,0.333,0.556,0.111


### Final Cleaning 

#### Remove Counties with Majority Unknown Gender

In [23]:
# Counties where half or more of the candidates could not be classified (prop_unknown >= 0.5).
is_high_unknown = final['prop_unknown'] >= 0.5
high_unknown = final.loc[is_high_unknown]
#high_unknown
len(high_unknown)  #16
#high_unknown.unknown.sum() #22

# These could be reviewed and assigned a gender manually, but they are dropped
# instead since the data set is already large enough.
final_local = final.loc[final['prop_unknown'] < 0.5]
len(final_local)  # 1278 rows

16

1278

#### Add Regional Variables

In [24]:
# Now let's add a new variable based on the region of the state
state_list = list(final_local.state.unique())
#state_list

# U.S. Census Bureau regions and divisions, written as division -> (region, states):
# https://www.census.gov/programs-surveys/economic-census/guidance-geographies/levels.html
# Each state is looked up by name, so the labels no longer depend on the order
# in which the states happen to appear in the data.
census_divisions = {
    'new_england': ('northeast', ['CONNECTICUT', 'MAINE', 'MASSACHUSETTS', 'NEW HAMPSHIRE',
                                  'RHODE ISLAND', 'VERMONT']),
    'middle_atlantic': ('northeast', ['NEW JERSEY', 'NEW YORK', 'PENNSYLVANIA']),
    'east_north_central': ('midwest', ['ILLINOIS', 'INDIANA', 'MICHIGAN', 'OHIO', 'WISCONSIN']),
    'west_north_central': ('midwest', ['IOWA', 'KANSAS', 'MINNESOTA', 'MISSOURI', 'NEBRASKA',
                                       'NORTH DAKOTA', 'SOUTH DAKOTA']),
    'south_atlantic': ('south', ['DELAWARE', 'DISTRICT OF COLUMBIA', 'FLORIDA', 'GEORGIA', 'MARYLAND',
                                 'NORTH CAROLINA', 'SOUTH CAROLINA', 'VIRGINIA', 'WEST VIRGINIA']),
    'east_south_central': ('south', ['ALABAMA', 'KENTUCKY', 'MISSISSIPPI', 'TENNESSEE']),
    'west_south_central': ('south', ['ARKANSAS', 'LOUISIANA', 'OKLAHOMA', 'TEXAS']),
    'mountain': ('west', ['ARIZONA', 'COLORADO', 'IDAHO', 'MONTANA', 'NEVADA', 'NEW MEXICO',
                          'UTAH', 'WYOMING']),
    'pacific': ('west', ['ALASKA', 'CALIFORNIA', 'HAWAII', 'OREGON', 'WASHINGTON']),
}

# state name -> (region, division)
state_geography = {
    state: (region, division)
    for division, (region, states) in census_divisions.items()
    for state in states
}

# Fail loudly on a state name the lookup does not know, rather than leave it unlabelled.
unmapped_states = [state for state in state_list if state not in state_geography]
if unmapped_states:
    raise ValueError('states without a census region/division: ' + ', '.join(map(str, unmapped_states)))

region_dict = {'STATE': state_list,
               'REGION': [state_geography[state][0] for state in state_list],
               'DIVISIONS': [state_geography[state][1] for state in state_list]}

region_df = pd.DataFrame(data = region_dict)
region_df.head()

Unnamed: 0,STATE,REGION,DIVISIONS
0,ALABAMA,south,east_south_central
1,ARIZONA,west,mountain
2,COLORADO,west,mountain
3,CONNECTICUT,northeast,new_england
4,DELAWARE,south,south_atlantic


### Explore Final Local Dataset

In [25]:
# Attach region and division to every county row; the left join keeps all counties.
final_local = final_local.merge(region_df, how="left", left_on='state', right_on="STATE")

final_local.head()

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman,total,prop_wom,prop_men,prop_unknown,STATE,REGION,DIVISIONS
0,1001,AUTAUGA,1,ALABAMA,0,2,0,4,6,0.667,0.333,0.0,ALABAMA,south,east_south_central
1,1003,BALDWIN,1,ALABAMA,0,9,1,6,16,0.375,0.562,0.062,ALABAMA,south,east_south_central
2,1005,BARBOUR,1,ALABAMA,2,9,0,4,15,0.267,0.6,0.133,ALABAMA,south,east_south_central
3,1007,BIBB,1,ALABAMA,0,6,0,4,10,0.4,0.6,0.0,ALABAMA,south,east_south_central
4,1009,BLOUNT,1,ALABAMA,1,5,0,3,9,0.333,0.556,0.111,ALABAMA,south,east_south_central


In [26]:
# 'STATE' was only the join key from the region table and duplicates 'state'.
final_local = final_local.drop(columns='STATE')
final_local.head()

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman,total,prop_wom,prop_men,prop_unknown,REGION,DIVISIONS
0,1001,AUTAUGA,1,ALABAMA,0,2,0,4,6,0.667,0.333,0.0,south,east_south_central
1,1003,BALDWIN,1,ALABAMA,0,9,1,6,16,0.375,0.562,0.062,south,east_south_central
2,1005,BARBOUR,1,ALABAMA,2,9,0,4,15,0.267,0.6,0.133,south,east_south_central
3,1007,BIBB,1,ALABAMA,0,6,0,4,10,0.4,0.6,0.0,south,east_south_central
4,1009,BLOUNT,1,ALABAMA,1,5,0,3,9,0.333,0.556,0.111,south,east_south_central


In [27]:
# Summary figures for the finished county-level frame.
n_candidates = final_local['total'].sum()
n_women = final_local['woman'].sum()
n_men = final_local['man'].sum()
counties_without_women = final_local.loc[final_local['woman'] == 0]
counties_without_men = final_local.loc[final_local['man'] == 0]

print("number of states:", final_local['state'].nunique())
print("number of counties:", final_local['county_fips'].nunique())
print("total number of candidates:", n_candidates)
print("number of women candidates:", n_women)
print("number of men candidates:", n_men)
print("total proportion of women candidates in dataset:", round(n_women / n_candidates, 3))
print("number of counties with no women candidates:", len(counties_without_women))
print("number of counties with no men candidates:", len(counties_without_men))
print("max proportion of unknown gender:", final_local['prop_unknown'].max())

number of states: 29
number of counties: 1278
total number of candidates: 28762
number of women candidates: 8088
number of men candidates: 19106
total proportion of women candidates in dataset: 0.281
number of counties with no women candidates: 187
number of counties with no men candidates: 60
max proportion of unknown gender: 0.4


### Save Data

In [28]:
# save as an object to be used later
# %store is an IPython magic that persists the variable across kernels;
# another notebook can load it back with `%store -r final_local`.
%store final_local

Stored 'final_local' (DataFrame)


In [29]:
# Alternatively, export the data as a CSV file
#final_local.to_csv('final_data/2018_local_election_data.csv')