## Webscrape More Updated Data From Guttmacher's State Profiles

In [1]:
# SCRAPING THE CONTENTS FROM https://states.guttmacher.org/policies/

# DEPENDENCIES
import requests
import json
import pandas as pd
import numpy as np

In [2]:
url = "https://states.guttmacher.org/policies/guttmacher.json"

In [3]:
response = requests.get(url).json()
# print(json.dumps(response, indent=4, sort_keys=True))

### Pulling the data from Guttmacher's JSON source

In [4]:
all_abortion_df = pd.json_normalize(response)

In [5]:
all_abortion_df.head()
print(type(all_abortion_df))
all_abortion_df.info()

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 23 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   state                                                       51 non-null     object 
 1   state_slug                                                  51 non-null     object 
 2   environment_type                                            51 non-null     object 
 3   policies_currently_in_effect                                51 non-null     object 
 4   wora_in_state_15_49                                         51 non-null     object 
 5   wora_in_state_15_49_breakdown                               51 non-null     object 
 6   residents_in_state_born_outside_us                          51 non-null     int64  
 7   income_level_below_fpl                               

In [6]:
abortion_2020_data = all_abortion_df[["state", 
                                      "state_slug", 
                                      "policies_currently_in_effect",
                                      "environment_type", 
                                      "residents_in_state_born_outside_us", 
                                      "income_level_below_fpl", 
                                      "abortions_obtained_in_2017",
                                      "abortions_per_1000_women_aged_15_44_in_2017",
                                      "clinics_provided_abortion_in_2017", 
                                      "counties_without_abortion_provider_in_2017",
                                      "residents_in_counties_without_an_abortion_provider_in_2017" ]]
# abortion_2020_data.head()

In [7]:
# # Adjusting columns to display percentages as floats instead of whole numbers
# abortion_2020_data.loc[:,"residents_in_state_born_outside_us"] *= 0.01
# abortion_2020_data.loc[:,"income_level_below_fpl"] *= 0.01
# abortion_2020_data.loc[:,"residents_in_counties_without_an_abortion_provider_in_2017"] *= 0.01

In [8]:
# Changing number of abortions obtained from object to integer

abortion_2020_data.loc[:,"abortions_obtained_in_2017"] = abortion_2020_data["abortions_obtained_in_2017"].astype(str)

comma = ","
abortion_2020_data.loc[:,"abortions_obtained_in_2017"] = [each.replace(comma, "") for each in abortion_2020_data["abortions_obtained_in_2017"]]
abortion_2020_data.loc[:,"abortions_obtained_in_2017"] = abortion_2020_data["abortions_obtained_in_2017"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [9]:
# abortion_2020_data.head()

### Cleaning Driving Distance Data

In [10]:
# Field "aowdd_15_49_after_14_weeks" dropped for lack of data
driving_distance = all_abortion_df[["state",
                                    "aowdd_15_49_6_weeks", 
                                    "aowdd_15_49_12_weeks",
                                    "aowdd_15_49_14_weeks",
                                    "aowdd_15_49_20_weeks",
                                    "aowdd_15_49_22_weeks", 
                                    "aowdd_15_49_24_weeks",
                                    "aowdd_15_49_after_24_weeks"]]
# driving_distance.head()

In [11]:
# Driving distance is shown as a string in the dataset, e.g. ""Average one-way driving distance for women aged 15-49 in Arizona to the nearest clinic that performs abortions up to 20 weeks = 11 miles"
# New columns are created below to pull the number of average miles from each field
driving_distance.loc[:,"week_6"] = driving_distance.aowdd_15_49_6_weeks.str.split("\s+").str[-2]
driving_distance.loc[:,"week_12"] = driving_distance.aowdd_15_49_12_weeks.str.split("\s+").str[-2]
driving_distance.loc[:,"week_14"] = driving_distance.aowdd_15_49_14_weeks.str.split("\s+").str[-2]
driving_distance.loc[:,"week_20"] = driving_distance.aowdd_15_49_20_weeks.str.split("\s+").str[-2]
driving_distance.loc[:,"week_22"] = driving_distance.aowdd_15_49_22_weeks.str.split("\s+").str[-2]
driving_distance.loc[:,"week_24"] = driving_distance.aowdd_15_49_24_weeks.str.split("\s+").str[-2]
driving_distance.loc[:,"week_24_plus"] = driving_distance.aowdd_15_49_after_24_weeks.str.split("\s+").str[-2]

# driving_distance.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


In [12]:
# driving_distance.info()

In [13]:
# Shows the average one-way driving distance for a woman to obtain an abortion at the given week threshold
driving_distance = driving_distance[["state", "week_6", "week_12", "week_14", "week_20", "week_22", "week_24", "week_24_plus"]]
driving_distance["week_24_plus"] = driving_distance["week_24_plus"].str.replace("in", "0")
# driving_distance.info()

In [14]:
driving_distance2 = driving_distance.astype({"week_6": float, "week_12": float, "week_14": float, "week_20": float,"week_22": float, "week_24": float, "week_24_plus": float})
# driving_distance2

In [15]:
driving_distance3 = driving_distance2.melt(id_vars=["state"], var_name="week_threshold", value_name="avg_oneway_miles").sort_values(["state"])
# driving_distance3.head()

### Cleaning Race/Ethnicity Data

In [16]:
wora_ethnicities = all_abortion_df[["state","wora_in_state_15_49_breakdown"]]
# wora_ethnicities.head()

In [17]:
wora_ethnicities2 = wora_ethnicities.wora_in_state_15_49_breakdown.str.split(" ", expand = True)
wora_ethnicities2 = wora_ethnicities2[[1,4,7,10,13]]
# wora_ethnicities2.head()

In [18]:
wora_ethnicities3 = pd.merge(wora_ethnicities, wora_ethnicities2, left_index=True, right_index=True)
wora_ethnicities3.rename(columns={"wora_in_state_15_49_breakdown":"original_string", 1:"Hispanic", 4:"White", 7:"Black", 10:"Asian", 13:"Other"}, inplace=True)
# wora_ethnicities3.info()

In [19]:
char = "%"
wora_ethnicities3["Hispanic"] = ([each.replace(char, "") for each in wora_ethnicities3["Hispanic"]])
wora_ethnicities3["White"] = ([each.replace(char, "") for each in wora_ethnicities3["White"]])
wora_ethnicities3["Black"] = ([each.replace(char, "") for each in wora_ethnicities3["Black"]])
wora_ethnicities3["Asian"] = ([each.replace(char, "") for each in wora_ethnicities3["Asian"]])
wora_ethnicities3["Other"] = ([each.replace(char, "") for each in wora_ethnicities3["Other"]])
# wora_ethnicities3.info()

In [20]:
wora_ethnicities4 = wora_ethnicities3.astype({"Hispanic": float, "White": float, "Black": float, "Asian": float,"Other": float})

In [21]:
wora_ethnicities4 = wora_ethnicities4[["Hispanic", "White", "Black", "Asian", "Other"]]

In [22]:
wora_ethnicities5 = pd.merge(wora_ethnicities3, wora_ethnicities4, left_index = True, right_index = True)
wora_ethnicities5 = wora_ethnicities5[["state", "Hispanic_y", "White_y", "Black_y","Asian_y", "Other_y"]]
wora_ethnicities5 = wora_ethnicities5.rename(columns={"Hispanic_y":"Hispanic", "White_y":"White", "Black_y":"Black", "Asian_y":"Asian", "Other_y":"Other"})
# wora_ethnicities5.info()

In [23]:
wora_ethnicities6 = wora_ethnicities5.melt(id_vars=["state"], var_name="Race/Ethnicity", value_name="Portion of WORA").sort_values(["state"])
wora_ethnicities6 = pd.DataFrame(wora_ethnicities6)
# wora_ethnicities6.head()

### Cleaning Age Breakdown

In [24]:
ages = all_abortion_df[["state","wora_age_breakdown"]]
# print(ages)

In [25]:
ages_2 = ages.wora_age_breakdown.str.split(" ", expand = True)
ages_2 = ages_2[[1,5,9,13,17, 21]]
# ages_2.head()

In [26]:
ages_3 = pd.merge(ages, ages_2, left_index=True, right_index=True)
ages_3.rename(columns={"wora_age_breakdown":"original_string", 1:"15-17 Years", 5:"18-19 Years", 9:"20-24 Years", 13:"25-29 Years", 17:"30-39 Years", 21:"40-49 Years"}, inplace=True)
# ages_3.head()

In [27]:
char = "%"
ages_3["15-17 Years"] = ([each.replace(char, "") for each in ages_3["15-17 Years"]])
ages_3["18-19 Years"] = ([each.replace(char, "") for each in ages_3["18-19 Years"]])
ages_3["20-24 Years"] = ([each.replace(char, "") for each in ages_3["20-24 Years"]])
ages_3["25-29 Years"] = ([each.replace(char, "") for each in ages_3["25-29 Years"]])
ages_3["30-39 Years"] = ([each.replace(char, "") for each in ages_3["30-39 Years"]])
ages_3["40-49 Years"] = ([each.replace(char, "") for each in ages_3["40-49 Years"]])


In [28]:
ages_4 = ages_3.astype({"15-17 Years": float, 
                        "18-19 Years": float,
                        "20-24 Years": float,  
                        "25-29 Years": float, 
                        "30-39 Years": float,
                        "40-49 Years": float})

In [29]:
ages_4 = ages_4[["15-17 Years", "18-19 Years","20-24 Years",  "25-29 Years","30-39 Years","40-49 Years"]]

In [30]:
ages_5 = pd.merge(ages, ages_4, left_index = True, right_index = True)
ages_5 = ages_5[["state", "15-17 Years", "18-19 Years", "20-24 Years", "25-29 Years", "30-39 Years", "40-49 Years"]]

In [31]:
ages_6 = ages_5.melt(id_vars=["state"], var_name="Age Group", value_name="Portion of WORA").sort_values(["state"])
# ages_6.head(15)
ages_6 = pd.DataFrame(ages_6)
ages_6.head()

Unnamed: 0,state,Age Group,Portion of WORA
0,Alabama,15-17 Years,8.0
102,Alabama,20-24 Years,14.0
153,Alabama,25-29 Years,15.0
204,Alabama,30-39 Years,29.0
255,Alabama,40-49 Years,28.0


### Cleaning the number of women of reproductive age (WORA) in the state from object to integer

In [32]:
wora_in_state = all_abortion_df[["state", "wora_in_state_15_49"]]
# wora_in_state.head(20)

In [33]:
# Changing the number of women of reproductive age from object to integer

# split "million" from the cell by splitting on space
appx_wora = wora_in_state.wora_in_state_15_49.str.split(expand = True)

# remove the non-digits from the first column in order to change the values to an integer
point = "."
appx_wora[0] = ([each.replace(comma, "") for each in appx_wora[0]])
appx_wora[0] = ([each.replace(point, "") for each in appx_wora[0]])
appx_wora[0] = appx_wora[0].astype(int)

# multiply the values that had been labeled "million" by 100,000 (removing the decimal earlier changed the scale of the number from a million to 100,000)
appx_wora.loc[appx_wora[1]=="million", "number"] = appx_wora[0]*100000
appx_wora["final_number"] = appx_wora["number"].fillna(appx_wora[0])

# merge this cleaned data back to the wora_in_state dataframe
wora_in_state_2 = pd.merge(wora_in_state, appx_wora["final_number"], left_index = True, right_index=True)


In [34]:
# Combine dataframes wora_in_state_2, wora_ethnicities5, ages_5, driving_distance2 onto abortion_2020_data

abortion_data_1 = pd.merge(abortion_2020_data, wora_in_state_2, left_index=True, right_index=True)
abortion_data_2 = pd.merge(abortion_data_1,wora_ethnicities5, left_index=True, right_index=True)
abortion_data_3 = pd.merge(abortion_data_2,ages_5, left_index=True, right_index=True)
abortion_data = pd.merge(abortion_data_3,driving_distance2, left_index=True, right_index=True)

abortion_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 35 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   state_x                                                     51 non-null     object 
 1   state_slug                                                  51 non-null     object 
 2   policies_currently_in_effect                                51 non-null     object 
 3   environment_type                                            51 non-null     object 
 4   residents_in_state_born_outside_us                          51 non-null     int64  
 5   income_level_below_fpl                                      51 non-null     int64  
 6   abortions_obtained_in_2017                                  51 non-null     int32  
 7   abortions_per_1000_women_aged_15_44_in_2017                 51 non-null     float64
 8   cl

  abortion_data_3 = pd.merge(abortion_data_2,ages_5, left_index=True, right_index=True)


In [35]:
# NOTE: the field names from the webscrape were not updated to reflect the year of data they actually display
# Confirmed from Guttmacher data available for download here: 
    # https://www.poynter.org/reporting-editing/2022/the-us-saw-930160-abortions-in-2020-reversing-a-30-year-decline/
# It includes both 2017 and 2020 values for the number of abortions and rate per 1000 women
# Unless stated otherwise, assume data is from 2017

abortion_data = abortion_data.rename(columns={"state": "State", 
                                              "state_slug":"State URL Slug", 
                                              "environment_type":"Restrictive Category",
                                              "policies_currently_in_effect": "Current Policies",
                                              "residents_in_state_born_outside_us":"% WORA Residents Non-US", 
                                              "income_level_below_fpl": "% WORA Below FPL",
                                              "abortions_obtained_in_2017":"Abortions Obtained in 2020", 
                                              "abortions_per_1000_women_aged_15_44_in_2017": "Abortions per 1,000 WORA in 2020", 
                                              "clinics_provided_abortion_in_2017": "Abortion Clinics", 
                                              "counties_without_abortion_provider_in_2017":"Counties w/o Abortion Providers", 
                                              "residents_in_counties_without_an_abortion_provider_in_2017":"% WORA w/o County Abortion Provider", 
                                              "final_number": "Appx. Number of WORA", 
                                              "Hispanic":"% WORA: Hispanic", 
                                              "Black":"% WORA: Black", 
                                              "White":"% WORA: White", 
                                              "Asian":"% WORA: Asian", \
                                              "Other":"% WORA: Other", 
                                              "15-17 Years": "% WORA: 15-17 Years",
                                              "18-19 Years": "% WORA: 18-19 Years", 
                                              "20-24 Years": "% WORA: 20-24 Years", 
                                              "25-29 Years": "% WORA: 25-29 Years",
                                              "30-39 Years": "% WORA: 30-39 Years",
                                              "40-49 Years": "% WORA: 40-49 Years", 
                                              "week_6":"Avg. Distance (mi): 6 Weeks", 
                                              "week_12":"Avg. Distance (mi): 12 Weeks", 
                                              "week_14":"Avg. Distance (mi): 14 Weeks", 
                                              "week_20":"Avg. Distance (mi): 20 Weeks", 
                                              "week_22":"Avg. Distance (mi): 22 Weeks", 
                                              "week_24":"Avg. Distance (mi): 24 Weeks", 
                                              "week_24_plus":"Avg. Distance (mi): 24+ Weeks"})


In [36]:
abortion_data2 = abortion_data[["State","State URL Slug","Restrictive Category","Current Policies","Abortions Obtained in 2020", 
                               "Abortions per 1,000 WORA in 2020","Appx. Number of WORA", "Abortion Clinics","Counties w/o Abortion Providers",
                               "% WORA w/o County Abortion Provider", "% WORA Residents Non-US","% WORA Below FPL",
                               "% WORA: Hispanic","% WORA: Black","% WORA: White","% WORA: Asian","% WORA: Other", 
                               "% WORA: 15-17 Years", "% WORA: 18-19 Years", "% WORA: 20-24 Years", 
                               "% WORA: 25-29 Years","% WORA: 30-39 Years", "% WORA: 40-49 Years",
                               "Avg. Distance (mi): 6 Weeks", "Avg. Distance (mi): 12 Weeks", "Avg. Distance (mi): 14 Weeks", 
                               "Avg. Distance (mi): 20 Weeks", "Avg. Distance (mi): 22 Weeks","Avg. Distance (mi): 24 Weeks","Avg. Distance (mi): 24+ Weeks"]]


In [37]:
abortion_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 30 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   State                                51 non-null     object 
 1   State URL Slug                       51 non-null     object 
 2   Restrictive Category                 51 non-null     object 
 3   Current Policies                     51 non-null     object 
 4   Abortions Obtained in 2020           51 non-null     int32  
 5   Abortions per 1,000 WORA in 2020     51 non-null     float64
 6   Appx. Number of WORA                 51 non-null     float64
 7   Abortion Clinics                     51 non-null     int64  
 8   Counties w/o Abortion Providers      51 non-null     int64  
 9   % WORA w/o County Abortion Provider  51 non-null     int64  
 10  % WORA Residents Non-US              51 non-null     int64  
 11  % WORA Below FPL                  

In [38]:
abortion_data_cleaned = abortion_data2
abortion_data_cleaned

Unnamed: 0,State,State URL Slug,Restrictive Category,Current Policies,Abortions Obtained in 2020,"Abortions per 1,000 WORA in 2020",Appx. Number of WORA,Abortion Clinics,Counties w/o Abortion Providers,% WORA w/o County Abortion Provider,...,% WORA: 25-29 Years,% WORA: 30-39 Years,% WORA: 40-49 Years,Avg. Distance (mi): 6 Weeks,Avg. Distance (mi): 12 Weeks,Avg. Distance (mi): 14 Weeks,Avg. Distance (mi): 20 Weeks,Avg. Distance (mi): 22 Weeks,Avg. Distance (mi): 24 Weeks,Avg. Distance (mi): 24+ Weeks
0,Alabama,alabama,Most Restrictive,Abortion is completely banned with very limite...,5700,6.0,1100000.0,5,93,59,...,15.0,29.0,28.0,,115.0,118.0,137.0,179.0,474.0,507.0
1,Alaska,alaska,Protective,Only physicians can provide abortions and not ...,1240,8.6,161000.0,4,86,32,...,18.0,33.0,26.0,,9.0,11.0,,,,
2,Arizona,arizona,Restrictive,Pre-Roe abortion ban is still in place and can...,13320,9.3,1600000.0,8,80,18,...,15.0,28.0,27.0,,,,11.0,,22.0,25.0
3,Arkansas,arkansas,Most Restrictive,Abortion is completely banned with very limite...,3250,5.6,674000.0,3,97,77,...,15.0,29.0,28.0,,,,160.0,328.0,,360.0
4,California,california,Protective,"Abortion is banned at fetal viability, general...",154060,19.2,9200000.0,161,40,3,...,16.0,30.0,27.0,,4.0,8.0,9.0,12.0,,14.0
5,Colorado,colorado,Protective,Abortion is not restricted based on gestationa...,13420,11.2,1400000.0,18,80,27,...,16.0,32.0,26.0,,9.0,,11.0,20.0,22.0,851.0
6,Connecticut,connecticut,Some Restrictions/Protections,"Abortion is banned at fetal viability, general...",11170,16.7,780000.0,26,13,5,...,13.0,28.0,29.0,,6.0,,15.0,,17.0,64.0
7,DC,dc,Some Restrictions/Protections,State Medicaid coverage of abortion care is ba...,9410,48.9,212000.0,4,0,0,...,23.0,35.0,18.0,,,,7.0,,15.0,50.0
8,Delaware,delaware,Some Restrictions/Protections,"Abortion is banned at fetal viability, general...",1830,10.0,207000.0,4,33,18,...,15.0,30.0,27.0,,,,2.0,,,3.0
9,Florida,florida,Restrictive,Abortion is banned at 24 weeks and later | Pa...,77400,19.7,4600000.0,65,73,24,...,15.0,30.0,29.0,,8.0,10.0,12.0,16.0,22.0,910.0


In [39]:
ages_6.to_csv("WORA_ages_unpivoted.csv")
wora_ethnicities6.to_csv("WORA_ethnicity_unpivoted.csv")
driving_distance3.to_csv("Avg_Drive_Distance.csv")
abortion_data_cleaned.to_csv("Guttmacher_Webscrape.csv")