# Data Cleaning (Reproductive Health)

**Overview of Notebook:**
- Data sourced from the [Guttmacher Institute](https://data.guttmacher.org/counties)
- Loaded data for all states present in the original local datset
- Clean FIPS codes
- Concat dataframes together
- Create dummy variables 

In [1]:
# set up 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, Ridge 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import mean_squared_error, confusion_matrix

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

%xmode Minimal

Exception reporting mode: Minimal


### Check State Values in Output Data 

In [2]:
# Load output data
%store -r final_local

In [3]:
# Identify states in our output dataset
final_local.head()
final_local.state.nunique()  # 29
final_local.state.unique()

Unnamed: 0,county_fips,county_name,state_fips,state,andy,man,unknown,woman,total,prop_wom,prop_men,prop_unknown,REGION,DIVISIONS
0,1001,AUTAUGA,1,ALABAMA,0,2,0,4,6,0.667,0.333,0.0,south,east_south_central
1,1003,BALDWIN,1,ALABAMA,0,9,1,6,16,0.375,0.562,0.062,south,east_south_central
2,1005,BARBOUR,1,ALABAMA,2,9,0,4,15,0.267,0.6,0.133,south,east_south_central
3,1007,BIBB,1,ALABAMA,0,6,0,4,10,0.4,0.6,0.0,south,east_south_central
4,1009,BLOUNT,1,ALABAMA,1,5,0,3,9,0.333,0.556,0.111,south,east_south_central


29

array(['ALABAMA', 'ARIZONA', 'COLORADO', 'CONNECTICUT', 'DELAWARE',
       'DISTRICT OF COLUMBIA', 'FLORIDA', 'HAWAII', 'ILLINOIS', 'IOWA',
       'KENTUCKY', 'LOUISIANA', 'MAINE', 'MARYLAND', 'MINNESOTA',
       'MISSISSIPPI', 'MONTANA', 'NEVADA', 'NEW HAMPSHIRE', 'NEW JERSEY',
       'NORTH CAROLINA', 'OHIO', 'OKLAHOMA', 'RHODE ISLAND', 'VERMONT',
       'VIRGINIA', 'WASHINGTON', 'WEST VIRGINIA', 'WYOMING'], dtype=object)

In [4]:
# define states
states = ['AL', 'AZ', 'CO', 'CT', 'DE','FL','HI','IA','IL','KY','LA','MD','ME','MN','MS','MT','NC',
         'NH', 'NJ', 'NV', 'OH','OK', 'RI', 'VA', 'VT', 'WA', 'WV', 'WY']

# confirm we have all the states
# There was no reproductive health data from this source for DC so we should have 28
len(states) # 28

28

### Load Reproductive Health Data for these States

In [5]:
# load reproductive health data (updated the pathname as appropriate)
for i in states:
    globals()[f'{i}_data'] = pd.read_csv(f'final_data/repro_data/{i}data.csv')

In [6]:
# combine all dataframes into one 

# ones that will need dropped zero added
fips_data =  pd.concat([AL_data, AZ_data, CO_data, CT_data], axis = 0)
fips_data.state_name.unique()

# all other states
other_data = pd.concat([DE_data, FL_data, HI_data, IA_data, IL_data, KY_data, LA_data, MD_data, ME_data, MN_data,
                        MS_data, MT_data, NC_data, NH_data, NJ_data, NV_data, OH_data, OK_data, RI_data, VA_data,
                        VT_data, WA_data, WV_data, WY_data], axis = 0)
other_data.state_name.unique()

array(['Alabama', '2016', '2015', 'Arizona', 'Colorado', 'Connecticut'],
      dtype=object)

array(['Delaware', '2016', '2015', 'Florida', 'Hawaii', 'Iowa',
       'Illinois', 'Kentucky', 'Louisiana', 'Maryland', 'Maine',
       'Minnesota', 'Mississippi', 'Montana', 'North Carolina',
       'New Hampshire', 'New Jersey', 'Nevada', 'Ohio', 'Oklahoma',
       'Rhode Island', 'Virginia', 'Vermont', 'Washington',
       'West Virginia', 'Wyoming'], dtype=object)

### Clean states that will need dropped zero added to fips code

In [7]:
# pivot to wide
new_fips = pd.pivot(fips_data, 
                   index= ['county_id','state_id','state_name'],
                   columns= ['measure_name'], 
                   values = 'datum').reset_index()

# remove state-level rows 
fips_cl = new_fips[new_fips['state_id'].apply(lambda x: len(x) == 2)]


# add dropped zero to front of fips code
fips_cl['county_id'] = ["0" + str(i) for i in fips_cl.county_id]

# rename columns
fips_cl.columns = ['county_id', 'state_id', 'state_name', 'no_PP_clinics','no_PP_clinics_titlex',
                     'no_fed_centers','no_fed_centers_titlex','no_HD_clinics','no_HD_clinics_titlex',
                     'no_hosp_clinics','no_hosp_clinics_titlex', 'no_other_clinics', 'no_other_clinics_titlex',
                     'no_wom_demand_contracep_pub','total_titlex_clinics', 'no_wom_dem_contracep_13', 
                     'no_wom_dem_contracep_20','total_pub_clinics']

# remove na values
fips_cl = fips_cl.dropna()

# update column types
fips_cl = fips_cl.astype({'no_PP_clinics': int,'no_PP_clinics_titlex': int,'no_fed_centers': int,
                          'no_fed_centers_titlex': int,'no_HD_clinics': int,'no_HD_clinics_titlex':int,
                          'no_hosp_clinics': int,'no_hosp_clinics_titlex': int, 'no_other_clinics': int,
                          'no_other_clinics_titlex': int,'no_wom_demand_contracep_pub': int,
                          'total_titlex_clinics': int, 'no_wom_dem_contracep_13': int, 
                          'no_wom_dem_contracep_20': int,'total_pub_clinics':int})
fips_cl.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fips_cl['county_id'] = ["0" + str(i) for i in fips_cl.county_id]


Unnamed: 0,county_id,state_id,state_name,no_PP_clinics,no_PP_clinics_titlex,no_fed_centers,no_fed_centers_titlex,no_HD_clinics,no_HD_clinics_titlex,no_hosp_clinics,no_hosp_clinics_titlex,no_other_clinics,no_other_clinics_titlex,no_wom_demand_contracep_pub,total_titlex_clinics,no_wom_dem_contracep_13,no_wom_dem_contracep_20,total_pub_clinics
0,1001,AL,Alabama,0,0,1,0,1,1,0,0,0,0,3650,1,6660,5770,2
1,1003,AL,Alabama,0,0,4,0,2,2,0,0,0,0,11500,2,22880,20020,6
2,1005,AL,Alabama,0,0,1,0,2,2,0,0,0,0,1640,2,2370,2030,3
3,1007,AL,Alabama,0,0,4,0,1,1,0,0,0,0,1480,1,2360,2090,5
4,1009,AL,Alabama,0,0,1,0,1,1,0,0,0,0,3490,1,6320,5500,2


### Clean Other States

In [8]:
# pivot to wide
new_other = pd.pivot(other_data, 
                     index= ['county_id','state_id','state_name'],
                     columns= ['measure_name'], 
                     values = 'datum').reset_index()

# remove state-level rows
other_cl = new_other[new_other['state_id'].apply(lambda x: len(x) == 2)]

# rename columns
other_cl.columns = ['county_id', 'state_id', 'state_name', 'no_PP_clinics','no_PP_clinics_titlex',
                     'no_fed_centers','no_fed_centers_titlex','no_HD_clinics','no_HD_clinics_titlex',
                     'no_hosp_clinics','no_hosp_clinics_titlex', 'no_other_clinics', 'no_other_clinics_titlex',
                     'no_wom_demand_contracep_pub','total_titlex_clinics', 'no_wom_dem_contracep_13', 
                     'no_wom_dem_contracep_20','total_pub_clinics']

# remove na values
other_cl = other_cl.dropna()

# update column types
other_cl = other_cl.astype({'no_PP_clinics': int,'no_PP_clinics_titlex': int,'no_fed_centers': int,
                          'no_fed_centers_titlex': int,'no_HD_clinics': int,'no_HD_clinics_titlex':int,
                          'no_hosp_clinics': int,'no_hosp_clinics_titlex': int, 'no_other_clinics': int,
                          'no_other_clinics_titlex': int,'no_wom_demand_contracep_pub': int,
                          'total_titlex_clinics': int, 'no_wom_dem_contracep_13': int, 
                          'no_wom_dem_contracep_20': int,'total_pub_clinics':int})
other_cl.head()

Unnamed: 0,county_id,state_id,state_name,no_PP_clinics,no_PP_clinics_titlex,no_fed_centers,no_fed_centers_titlex,no_HD_clinics,no_HD_clinics_titlex,no_hosp_clinics,no_hosp_clinics_titlex,no_other_clinics,no_other_clinics_titlex,no_wom_demand_contracep_pub,total_titlex_clinics,no_wom_dem_contracep_13,no_wom_dem_contracep_20,total_pub_clinics
0,10001,DE,Delaware,1,1,1,1,5,5,0,0,4,3,11980,10,22250,19490,11
1,10003,DE,Delaware,2,2,6,5,9,8,0,0,3,3,31500,18,72620,64040,20
2,10005,DE,Delaware,0,0,2,2,8,8,0,0,0,0,10570,10,20480,18100,10
3,12001,FL,Florida,0,0,4,0,4,4,0,0,0,0,28090,4,44570,39750,8
4,12003,FL,Florida,0,0,1,0,1,1,0,0,0,0,1500,1,3180,2750,2


### Concat DFs together

In [9]:
# concat together
repro = pd.concat([other_cl, fips_cl], axis = 0)
repro = repro.sort_values(by = 'county_id')
repro.head()
len(repro.state_id.unique()) # 28 states

Unnamed: 0,county_id,state_id,state_name,no_PP_clinics,no_PP_clinics_titlex,no_fed_centers,no_fed_centers_titlex,no_HD_clinics,no_HD_clinics_titlex,no_hosp_clinics,no_hosp_clinics_titlex,no_other_clinics,no_other_clinics_titlex,no_wom_demand_contracep_pub,total_titlex_clinics,no_wom_dem_contracep_13,no_wom_dem_contracep_20,total_pub_clinics
0,1001,AL,Alabama,0,0,1,0,1,1,0,0,0,0,3650,1,6660,5770,2
1,1003,AL,Alabama,0,0,4,0,2,2,0,0,0,0,11500,2,22880,20020,6
2,1005,AL,Alabama,0,0,1,0,2,2,0,0,0,0,1640,2,2370,2030,3
3,1007,AL,Alabama,0,0,4,0,1,1,0,0,0,0,1480,1,2360,2090,5
4,1009,AL,Alabama,0,0,1,0,1,1,0,0,0,0,3490,1,6320,5500,2


28

In [10]:
# update to binary variables
columns = ['no_PP_clinics','no_PP_clinics_titlex','no_fed_centers','no_fed_centers_titlex','no_HD_clinics',
           'no_HD_clinics_titlex','no_hosp_clinics','no_hosp_clinics_titlex', 'no_other_clinics', 
           'no_other_clinics_titlex']

for col in columns: 
    repro[f'DUM_{col}'] = np.where(repro[col] != 0, 1, 0)

In [11]:
# drop non-dummied columns
repro_cl = repro.drop(columns, axis =1)

In [12]:
repro_cl.head()

Unnamed: 0,county_id,state_id,state_name,no_wom_demand_contracep_pub,total_titlex_clinics,no_wom_dem_contracep_13,no_wom_dem_contracep_20,total_pub_clinics,DUM_no_PP_clinics,DUM_no_PP_clinics_titlex,DUM_no_fed_centers,DUM_no_fed_centers_titlex,DUM_no_HD_clinics,DUM_no_HD_clinics_titlex,DUM_no_hosp_clinics,DUM_no_hosp_clinics_titlex,DUM_no_other_clinics,DUM_no_other_clinics_titlex
0,1001,AL,Alabama,3650,1,6660,5770,2,0,0,1,0,1,1,0,0,0,0
1,1003,AL,Alabama,11500,2,22880,20020,6,0,0,1,0,1,1,0,0,0,0
2,1005,AL,Alabama,1640,2,2370,2030,3,0,0,1,0,1,1,0,0,0,0
3,1007,AL,Alabama,1480,1,2360,2090,5,0,0,1,0,1,1,0,0,0,0
4,1009,AL,Alabama,3490,1,6320,5500,2,0,0,1,0,1,1,0,0,0,0


### Save Dataframe

In [13]:
%store repro_cl

Stored 'repro_cl' (DataFrame)
