# Merging

**Overview of Notebook:**
- Merged three dataframes together
- Removed null values

In [1]:
# set up 
import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


### Load Datasets

In [2]:
%store -r final_local
%store -r local_dg_sm
%store -r repro_cl

### Explore datasets

In [3]:
repro_cl.head()
repro_cl.shape

Unnamed: 0,county_id,state_id,state_name,no_wom_demand_contracep_pub,total_titlex_clinics,no_wom_dem_contracep_13,no_wom_dem_contracep_20,total_pub_clinics,DUM_no_PP_clinics,DUM_no_PP_clinics_titlex,DUM_no_fed_centers,DUM_no_fed_centers_titlex,DUM_no_HD_clinics,DUM_no_HD_clinics_titlex,DUM_no_hosp_clinics,DUM_no_hosp_clinics_titlex,DUM_no_other_clinics,DUM_no_other_clinics_titlex
0,1001,AL,Alabama,3650,1,6660,5770,2,0,0,1,0,1,1,0,0,0,0
1,1003,AL,Alabama,11500,2,22880,20020,6,0,0,1,0,1,1,0,0,0,0
2,1005,AL,Alabama,1640,2,2370,2030,3,0,0,1,0,1,1,0,0,0,0
3,1007,AL,Alabama,1480,1,2360,2090,5,0,0,1,0,1,1,0,0,0,0
4,1009,AL,Alabama,3490,1,6320,5500,2,0,0,1,0,1,1,0,0,0,0


(1252, 18)

In [4]:
final_local.head()
final_local.shape

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman,total,prop_wom,prop_men,prop_unknown,REGION,DIVISIONS
0,1001,AUTAUGA,1,ALABAMA,0,2,0,4,6,0.667,0.333,0.0,south,east_south_central
1,1003,BALDWIN,1,ALABAMA,0,9,1,6,16,0.375,0.562,0.062,south,east_south_central
2,1005,BARBOUR,1,ALABAMA,2,9,0,4,15,0.267,0.6,0.133,south,east_south_central
3,1007,BIBB,1,ALABAMA,0,6,0,4,10,0.4,0.6,0.0,south,east_south_central
4,1009,BLOUNT,1,ALABAMA,1,5,0,3,9,0.333,0.556,0.111,south,east_south_central


(1278, 14)

In [5]:
local_dg_sm.head()
local_dg_sm.shape

Unnamed: 0,state,county,fips,fips_clean,total_population,cvap,white_pct,black_pct,hispanic_pct,nonwhite_pct,...,lesshs_whites_pct,lesscollege_whites_pct,rural_pct,ruralurban_cc,median_hh_inc,pres_16_CLINTON,pres_16_THIRD,pres_16_TRUMP,pres_12_OBAMA,pres_12_ROMNEY
0,Alabama,Autauga,1001,1001,55049.0,40690.0,0.757,0.184,0.026,0.243,...,0.1,0.741,0.42,2.0,53099.0,0,0,1,0,1
1,Alabama,Baldwin,1003,1003,199510.0,151770.0,0.832,0.092,0.044,0.168,...,0.078,0.684,0.423,3.0,51365.0,0,0,1,0,1
2,Alabama,Barbour,1005,1005,26614.0,20375.0,0.459,0.479,0.043,0.541,...,0.196,0.814,0.678,6.0,33956.0,0,0,1,1,0
3,Alabama,Bibb,1007,1007,22572.0,17590.0,0.748,0.212,0.022,0.252,...,0.15,0.875,0.684,1.0,39776.0,0,0,1,0,1
4,Alabama,Blount,1009,1009,57704.0,42430.0,0.877,0.016,0.087,0.123,...,0.166,0.862,0.9,1.0,46212.0,0,0,1,0,1


(3113, 27)

### Merge Data

In [6]:
# merge 1 (reproduction and demographic data)
len(repro_cl)
len(local_dg_sm)

# merge left on reproductive health datset since it has the fewest # of states 
merge_1 = pd.merge(repro_cl, local_dg_sm,
                  how= 'left',
                  left_on = 'county_id',
                  right_on = 'fips_clean')

len(merge_1)

merge_cl = merge_1.drop(['state', 'county', 'fips', 'fips_clean'], axis=1)
merge_cl.columns

1252

3113

1252

Index(['county_id', 'state_id', 'state_name', 'no_wom_demand_contracep_pub',
       'total_titlex_clinics', 'no_wom_dem_contracep_13',
       'no_wom_dem_contracep_20', 'total_pub_clinics', 'DUM_no_PP_clinics',
       'DUM_no_PP_clinics_titlex', 'DUM_no_fed_centers',
       'DUM_no_fed_centers_titlex', 'DUM_no_HD_clinics',
       'DUM_no_HD_clinics_titlex', 'DUM_no_hosp_clinics',
       'DUM_no_hosp_clinics_titlex', 'DUM_no_other_clinics',
       'DUM_no_other_clinics_titlex', 'total_population', 'cvap', 'white_pct',
       'black_pct', 'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct',
       'female_pct', 'age29andunder_pct', 'age65andolder_pct',
       'clf_unemploy_pct', 'lesshs_pct', 'lesscollege_pct',
       'lesshs_whites_pct', 'lesscollege_whites_pct', 'rural_pct',
       'ruralurban_cc', 'median_hh_inc', 'pres_16_CLINTON', 'pres_16_THIRD',
       'pres_16_TRUMP', 'pres_12_OBAMA', 'pres_12_ROMNEY'],
      dtype='object')

In [7]:
# merge 2
local_df1 = pd.merge(merge_1, final_local,
              how = "left",
              left_on = 'county_id',
              right_on = 'county_fips',
              indicator = "name_merge_status")

### Explore Final Dataset

In [8]:
local_df1.shape # 1252 
local_df1.head()
local_df1.columns
local_df1.REGION.unique()

(1252, 60)

Unnamed: 0,county_id,state_id,state_name,no_wom_demand_contracep_pub,total_titlex_clinics,no_wom_dem_contracep_13,no_wom_dem_contracep_20,total_pub_clinics,DUM_no_PP_clinics,DUM_no_PP_clinics_titlex,...,man,unknown,woman,total,prop_wom,prop_men,prop_unknown,REGION,DIVISIONS,name_merge_status
0,1001,AL,Alabama,3650,1,6660,5770,2,0,0,...,2.0,0.0,4.0,6.0,0.667,0.333,0.0,south,east_south_central,both
1,1003,AL,Alabama,11500,2,22880,20020,6,0,0,...,9.0,1.0,6.0,16.0,0.375,0.562,0.062,south,east_south_central,both
2,1005,AL,Alabama,1640,2,2370,2030,3,0,0,...,9.0,0.0,4.0,15.0,0.267,0.6,0.133,south,east_south_central,both
3,1007,AL,Alabama,1480,1,2360,2090,5,0,0,...,6.0,0.0,4.0,10.0,0.4,0.6,0.0,south,east_south_central,both
4,1009,AL,Alabama,3490,1,6320,5500,2,0,0,...,5.0,0.0,3.0,9.0,0.333,0.556,0.111,south,east_south_central,both


Index(['county_id', 'state_id', 'state_name', 'no_wom_demand_contracep_pub',
       'total_titlex_clinics', 'no_wom_dem_contracep_13',
       'no_wom_dem_contracep_20', 'total_pub_clinics', 'DUM_no_PP_clinics',
       'DUM_no_PP_clinics_titlex', 'DUM_no_fed_centers',
       'DUM_no_fed_centers_titlex', 'DUM_no_HD_clinics',
       'DUM_no_HD_clinics_titlex', 'DUM_no_hosp_clinics',
       'DUM_no_hosp_clinics_titlex', 'DUM_no_other_clinics',
       'DUM_no_other_clinics_titlex', 'state_x', 'county', 'fips',
       'fips_clean', 'total_population', 'cvap', 'white_pct', 'black_pct',
       'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct', 'female_pct',
       'age29andunder_pct', 'age65andolder_pct', 'clf_unemploy_pct',
       'lesshs_pct', 'lesscollege_pct', 'lesshs_whites_pct',
       'lesscollege_whites_pct', 'rural_pct', 'ruralurban_cc', 'median_hh_inc',
       'pres_16_CLINTON', 'pres_16_THIRD', 'pres_16_TRUMP', 'pres_12_OBAMA',
       'pres_12_ROMNEY', 'county_fips', 'county_name', 

array(['south', 'west', nan, 'northeast', 'midwest'], dtype=object)

In [9]:
# clean the state names so they are all uppercase
local_df1['state_clean'] = [i.upper() for i in local_df1.state_name]

# check that we still have 28
local_df1.state_clean.nunique() # 28

28

In [10]:
# Keep just the columns we want
local_cl = local_df1[['county_fips', 'county_name', 'state_fips','state_clean','REGION', 'DIVISIONS','total',
                      'prop_wom', 'prop_men','prop_unknown','total_population', 'cvap', 'white_pct', 'black_pct',
                      'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct', 'female_pct',
                      'age29andunder_pct', 'age65andolder_pct', 'clf_unemploy_pct','lesshs_pct', 'lesscollege_pct', 
                      'lesshs_whites_pct','lesscollege_whites_pct', 'rural_pct', 'ruralurban_cc', 'median_hh_inc',
                      'pres_16_CLINTON', 'pres_16_THIRD', 'pres_16_TRUMP', 'pres_12_OBAMA','pres_12_ROMNEY',
                      'no_wom_demand_contracep_pub', 'total_titlex_clinics', 'no_wom_dem_contracep_13',
                      'no_wom_dem_contracep_20', 'total_pub_clinics', 'DUM_no_PP_clinics',
                      'DUM_no_PP_clinics_titlex', 'DUM_no_fed_centers', 'DUM_no_fed_centers_titlex', 
                      'DUM_no_HD_clinics', 'DUM_no_HD_clinics_titlex', 'DUM_no_hosp_clinics',
                      'DUM_no_hosp_clinics_titlex', 'DUM_no_other_clinics','DUM_no_other_clinics_titlex']]
# check this worked
local_cl.columns

Index(['county_fips', 'county_name', 'state_fips', 'state_clean', 'REGION',
       'DIVISIONS', 'total', 'prop_wom', 'prop_men', 'prop_unknown',
       'total_population', 'cvap', 'white_pct', 'black_pct', 'hispanic_pct',
       'nonwhite_pct', 'foreignborn_pct', 'female_pct', 'age29andunder_pct',
       'age65andolder_pct', 'clf_unemploy_pct', 'lesshs_pct',
       'lesscollege_pct', 'lesshs_whites_pct', 'lesscollege_whites_pct',
       'rural_pct', 'ruralurban_cc', 'median_hh_inc', 'pres_16_CLINTON',
       'pres_16_THIRD', 'pres_16_TRUMP', 'pres_12_OBAMA', 'pres_12_ROMNEY',
       'no_wom_demand_contracep_pub', 'total_titlex_clinics',
       'no_wom_dem_contracep_13', 'no_wom_dem_contracep_20',
       'total_pub_clinics', 'DUM_no_PP_clinics', 'DUM_no_PP_clinics_titlex',
       'DUM_no_fed_centers', 'DUM_no_fed_centers_titlex', 'DUM_no_HD_clinics',
       'DUM_no_HD_clinics_titlex', 'DUM_no_hosp_clinics',
       'DUM_no_hosp_clinics_titlex', 'DUM_no_other_clinics',
       'DUM_no_othe

In [11]:
# check for null
local_cl.isna().sum()

county_fips                    169
county_name                    169
state_fips                     169
state_clean                      0
REGION                         169
DIVISIONS                      169
total                          169
prop_wom                       169
prop_men                       169
prop_unknown                   169
total_population                 0
cvap                             0
white_pct                        0
black_pct                        0
hispanic_pct                     0
nonwhite_pct                     0
foreignborn_pct                  0
female_pct                       0
age29andunder_pct                0
age65andolder_pct                0
clf_unemploy_pct                 0
lesshs_pct                       0
lesscollege_pct                  0
lesshs_whites_pct                0
lesscollege_whites_pct           0
rural_pct                        0
ruralurban_cc                    0
median_hh_inc                    0
pres_16_CLINTON     

In [12]:
# remove null values 
local_cl = local_cl.dropna()
local_cl.isna().sum()

county_fips                    0
county_name                    0
state_fips                     0
state_clean                    0
REGION                         0
DIVISIONS                      0
total                          0
prop_wom                       0
prop_men                       0
prop_unknown                   0
total_population               0
cvap                           0
white_pct                      0
black_pct                      0
hispanic_pct                   0
nonwhite_pct                   0
foreignborn_pct                0
female_pct                     0
age29andunder_pct              0
age65andolder_pct              0
clf_unemploy_pct               0
lesshs_pct                     0
lesscollege_pct                0
lesshs_whites_pct              0
lesscollege_whites_pct         0
rural_pct                      0
ruralurban_cc                  0
median_hh_inc                  0
pres_16_CLINTON                0
pres_16_THIRD                  0
pres_16_TR

In [13]:
# create dummies of states, regions, divisions, and rural/urban codes
local_dummied = pd.get_dummies(local_cl, columns=['state_clean', 'REGION', 'DIVISIONS','ruralurban_cc'],
                               drop_first=False)
local_dummied.head()
local_dummied.columns
local_dummied.shape

Unnamed: 0,county_fips,county_name,state_fips,total,prop_wom,prop_men,prop_unknown,total_population,cvap,white_pct,...,DIVISIONS_west_south_central,ruralurban_cc_1.0,ruralurban_cc_2.0,ruralurban_cc_3.0,ruralurban_cc_4.0,ruralurban_cc_5.0,ruralurban_cc_6.0,ruralurban_cc_7.0,ruralurban_cc_8.0,ruralurban_cc_9.0
0,1001,AUTAUGA,1,6.0,0.667,0.333,0.0,55049.0,40690.0,0.757,...,0,0,1,0,0,0,0,0,0,0
1,1003,BALDWIN,1,16.0,0.375,0.562,0.062,199510.0,151770.0,0.832,...,0,0,0,1,0,0,0,0,0,0
2,1005,BARBOUR,1,15.0,0.267,0.6,0.133,26614.0,20375.0,0.459,...,0,0,0,0,0,0,1,0,0,0
3,1007,BIBB,1,10.0,0.4,0.6,0.0,22572.0,17590.0,0.748,...,0,1,0,0,0,0,0,0,0,0
4,1009,BLOUNT,1,9.0,0.333,0.556,0.111,57704.0,42430.0,0.877,...,0,1,0,0,0,0,0,0,0,0


Index(['county_fips', 'county_name', 'state_fips', 'total', 'prop_wom',
       'prop_men', 'prop_unknown', 'total_population', 'cvap', 'white_pct',
       'black_pct', 'hispanic_pct', 'nonwhite_pct', 'foreignborn_pct',
       'female_pct', 'age29andunder_pct', 'age65andolder_pct',
       'clf_unemploy_pct', 'lesshs_pct', 'lesscollege_pct',
       'lesshs_whites_pct', 'lesscollege_whites_pct', 'rural_pct',
       'median_hh_inc', 'pres_16_CLINTON', 'pres_16_THIRD', 'pres_16_TRUMP',
       'pres_12_OBAMA', 'pres_12_ROMNEY', 'no_wom_demand_contracep_pub',
       'total_titlex_clinics', 'no_wom_dem_contracep_13',
       'no_wom_dem_contracep_20', 'total_pub_clinics', 'DUM_no_PP_clinics',
       'DUM_no_PP_clinics_titlex', 'DUM_no_fed_centers',
       'DUM_no_fed_centers_titlex', 'DUM_no_HD_clinics',
       'DUM_no_HD_clinics_titlex', 'DUM_no_hosp_clinics',
       'DUM_no_hosp_clinics_titlex', 'DUM_no_other_clinics',
       'DUM_no_other_clinics_titlex', 'state_clean_ALABAMA',
       'state

(1083, 94)

### Final Diagnostics

In [14]:
print("number of states: " + str(local_cl.state_clean.nunique()))
print("number of counties: " + str(local_dummied.county_fips.nunique()))
print("total number of candidates: " + str(local_dummied.total.sum()))
print("number of women candidates: " + str(local_df1.woman.sum()))
print("number of men candidates: " + str(local_df1.man.sum()))
print("total proportion of women candidates in dataset: " + str(round((local_df1.woman.sum()/local_df1.total.sum()),3)))
print("number of counties with no women candidates: " + str(len(local_df1[local_df1.woman == 0])))
print("number of counties with no men candidates: " + str(len(local_df1[local_df1.man == 0])))
print("max proportion of unknown gender: " + str(local_dummied.prop_unknown.max()))

number of states: 28
number of counties: 1083
total number of candidates: 24862.0
number of women candidates: 6883.0
number of men candidates: 16564.0
total proportion of women candidates in dataset: 0.277
number of counties with no women candidates: 158
number of counties with no men candidates: 48
max proportion of unknown gender: 0.4


### Save DF

In [15]:
# Save DF 
%store local_cl
%store local_dummied

Stored 'local_cl' (DataFrame)
Stored 'local_dummied' (DataFrame)
