In [16]:
import pandas as pd
import numpy as np
from functools import reduce
from pathlib import Path

## <center> Reading in Data </center>
<center> We've collected our data from the below sources: </center>
<br>

<center> <a href="https://www.census.gov/data/datasets/time-series/demo/popest/2010s-state-total.html" target="_blank"> State Population Data</a> </center>
<br>
<center> <a href="http://www.usa.com/rank/us--average-education-index--state-rank.htm" target="_blank">State Educational Ranking Data</a> </center>
<br>

<center> <a href="https://ssti.org/blog/useful-stats-capita-personal-income-state-2010-2015" target="_blank">State Income Data</a> </center>
<br>
<center> <a href="https://wonder.cdc.gov/ucd-icd10.html" target="_blank">Crude Death Rate where Drug Overdose is the Cause of Death Data</a> </center>





In [2]:
# Locations of the raw CSV tables (hosted on raw.githubusercontent.com).
state_income_data_url = 'https://raw.githubusercontent.com/wafiakmal/Turquoise-Team-Data-Analysis-DS-Salaries/main/00_Raw_Data/pds/state%20income%202010.csv'
state_educ_url = 'https://raw.githubusercontent.com/wafiakmal/Turquoise-Team-Data-Analysis-DS-Salaries/main/00_Raw_Data/pds/states%20education%20rank%202010%20thru%202014.csv'
policy_url = 'https://raw.githubusercontent.com/wafiakmal/Turquoise-Team-Data-Analysis-DS-Salaries/main/00_Raw_Data/pds/States%20with%20missing%20policys.csv'
state_pop_url = 'https://raw.githubusercontent.com/wafiakmal/Turquoise-Team-Data-Analysis-DS-Salaries/main/00_Raw_Data/pds/state%20population%202010.csv'
state_crud_death_url = 'https://raw.githubusercontent.com/wafiakmal/Turquoise-Team-Data-Analysis-DS-Salaries/main/00_Raw_Data/pds/Underlying%20Cause%20of%20Death%2C%201999-2020.csv'

# Read each table into its own DataFrame with pandas' default CSV parsing.
state_income = pd.read_csv(state_income_data_url)
state_educ = pd.read_csv(state_educ_url)
missing_policy = pd.read_csv(policy_url)
state_pop = pd.read_csv(state_pop_url)
state_crud_death = pd.read_csv(state_crud_death_url)





## Merging data where the matching key is State

In [18]:
# Frames are folded left to right, so the column order of the result follows this list.
to_merge_data_frames = [state_income, state_educ, missing_policy, state_pop, state_crud_death]


def _outer_join_on_state(left, right):
    """Outer-merge two frames on their shared 'State' column."""
    return pd.merge(left, right, how='outer', on=['State'])


# An outer join keeps a state even when it is absent from some tables;
# the columns it lacks are filled with NaN.
df_merged = reduce(_outer_join_on_state, to_merge_data_frames)

## Dropping NA to get rid of NA row and District of Columbia

In [19]:
# Drop every row that has a missing value in at least one column.
df_merged = df_merged.dropna(axis=0, how='any')

## Creating columns to calculate averages
Per state (Florida, Texas, Washington), we're comparing their educational ranking, population from 2010, crude death rate (where cause of death is related to a drug overdose), and income. Below is an example.  
<center> income(state_x)/income(Florida) </center>
<br>
We're comparing these demographics to our three states to ensure that the chosen states are similar and proper for comparison.
<br>
<br>
We're also creating a flag column to evaluate each state's policy on the opioid crisis, which is based on Table 1 from: 
<br>
<br>
<center> <em> How States Are Tackling the Opioid Crisis: </em> </center>
<center> <a href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5349480/" target="_blank">Read paper here</a> </center>
<br>
<p> This paper is from 2017, however we're comfortable using this data as we're counting which policies are missing in each state. Therefore, if the policy is missing in 2017, then we can reasonably assume that the policy was missing in 2010 as well.
 


In [20]:
# Divisors used to express each state's figures as a ratio of a reference state,
# keyed first by measure and then by the suffix used in the generated column name.
# NOTE(review): presumably these are Florida's, Washington's and Texas's own 2010
# values for each measure -- verify against the raw tables.
REFERENCE_VALUES = {
    'income': {'fl': 38718.0, 'wash': 42821.0, 'tx': 38282.0},
    'edu': {'fl': 37.0, 'wash': 9.0, 'tx': 43.0},
    'pop': {'fl': 18801310.0, 'wash': 6724540.0, 'tx': 25145561.0},
    'crude': {'fl': 16.2701, 'wash': 13.6069, 'tx': 9.5206},
}

# Source column in df_merged for each measure (names copied verbatim, including
# the stray spaces present in the raw headers).
SOURCE_COLUMNS = {
    'income': '2010 average income',
    'edu': 'Education Rank',
    'pop': '2010 popula tion ',
    'crude': 'Crude Rate',
}

# A state is flagged when its missing-policy count is strictly greater than the threshold.
# NOTE(review): presumably each threshold is the reference state's own count -- confirm.
POLICY_THRESHOLDS = {'fl': 4.0, 'wash': 3.0, 'tx': 5.0}

# The divisions are vectorised over the whole column, so no per-row loop is needed.
# Columns are created measure by measure (fl, wash, tx within each measure).
for measure, references in REFERENCE_VALUES.items():
    for state_tag, reference in references.items():
        df_merged[f'df_merged_{measure}_{state_tag}'] = (
            df_merged[SOURCE_COLUMNS[measure]] / reference
        )

# Boolean flag columns: True where the missing-policy count exceeds the threshold.
missing_policy_count = df_merged['Sum of MISSING from Questionarre ']
for state_tag, threshold in POLICY_THRESHOLDS.items():
    df_merged[f'df_merged_policy_flag_{state_tag}'] = missing_policy_count > threshold

# Independent copy for the metric columns added in later cells.
df_metrics = df_merged.copy()

## After calculating comparisons
After we create the columns that compare demographics to Florida, Texas, and Washington, respectively, we compute an average of each comparison. We're using this as a measure of "closeness."

In [21]:
# Each closeness metric is the row-wise mean of the four ratio columns
# (income, education rank, population, crude rate) for one reference state.
metric_inputs = {
    'eval_metric_fl': ["df_merged_income_fl", "df_merged_edu_fl", "df_merged_pop_fl", "df_merged_crude_fl"],
    'eval_metric_tx': ["df_merged_income_tx", "df_merged_edu_tx", "df_merged_pop_tx", "df_merged_crude_tx"],
    'eval_metric_wash': ["df_merged_income_wash", "df_merged_edu_wash", "df_merged_pop_wash", "df_merged_crude_wash"],
}

for metric_name, ratio_columns in metric_inputs.items():
    df_metrics[metric_name] = df_metrics[ratio_columns].mean(axis=1)

## Creating bounds for "closeness" 
After averaging the comparisons, we keep only the states whose averaged ratio is within ±20% of the reference state (Texas, Washington, or Florida), i.e. between 0.8 and 1.2, where 1.0 means identical to the reference state.

In [22]:
def calc_new_col(row, lower=0.8, upper=1.2):
    """Report whether ``row['eval_metric_fl']`` lies inside the inclusive bounds.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'eval_metric_fl'``.
    lower, upper : float, optional
        Inclusive bounds. The defaults (0.8 and 1.2) are the values that were
        previously hard-coded, so existing one-argument calls behave the same.

    Returns
    -------
    bool
        True when ``lower <= row['eval_metric_fl'] <= upper``, otherwise False.
        A NaN metric fails both comparisons and therefore yields False.
    """
    metric = row['eval_metric_fl']
    # bool() keeps the return a plain Python bool even for NumPy scalars.
    return bool(lower <= metric <= upper)
# Vectorised inclusive range check (0.8 <= metric <= 1.2); NaN metrics come out False.
df_metrics['in_bounds_fl'] = df_metrics['eval_metric_fl'].between(0.8, 1.2)


def calc_new_col_wash(row, lower=0.8, upper=1.2):
    """Report whether ``row['eval_metric_wash']`` lies inside the inclusive bounds.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'eval_metric_wash'``.
    lower, upper : float, optional
        Inclusive bounds. The defaults (0.8 and 1.2) are the values that were
        previously hard-coded, so existing one-argument calls behave the same.

    Returns
    -------
    bool
        True when ``lower <= row['eval_metric_wash'] <= upper``, otherwise False.
        A NaN metric fails both comparisons and therefore yields False.
    """
    metric = row['eval_metric_wash']
    # bool() keeps the return a plain Python bool even for NumPy scalars.
    return bool(lower <= metric <= upper)

# Vectorised inclusive range check (0.8 <= metric <= 1.2); NaN metrics come out False.
df_metrics['in_bounds_wash'] = df_metrics['eval_metric_wash'].between(0.8, 1.2)


def calc_new_col_tx(row, lower=0.8, upper=1.2):
    """Report whether ``row['eval_metric_tx']`` lies inside the inclusive bounds.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the key ``'eval_metric_tx'``.
    lower, upper : float, optional
        Inclusive bounds. The defaults (0.8 and 1.2) are the values that were
        previously hard-coded, so existing one-argument calls behave the same.

    Returns
    -------
    bool
        True when ``lower <= row['eval_metric_tx'] <= upper``, otherwise False.
        A NaN metric fails both comparisons and therefore yields False.
    """
    metric = row['eval_metric_tx']
    # bool() keeps the return a plain Python bool even for NumPy scalars.
    return bool(lower <= metric <= upper)

# Vectorised inclusive range check (0.8 <= metric <= 1.2); NaN metrics come out False.
df_metrics['in_bounds_tx'] = df_metrics['eval_metric_tx'].between(0.8, 1.2)





## Using our bounds and choosing states based on their implemented policies
After bounding our comparison metric, we're selecting states where the bound is satisfied and the number of missing policies is greater than that of the corresponding comparison state (Texas, Florida, or Washington).

In [23]:
# A row qualifies for a reference state when both its in-bounds column and the
# matching policy-flag column are True.
florida_mask = df_metrics['in_bounds_fl'].eq(True) & df_metrics['df_merged_policy_flag_fl'].eq(True)
washington_mask = df_metrics['in_bounds_wash'].eq(True) & df_metrics['df_merged_policy_flag_wash'].eq(True)
texas_mask = df_metrics['in_bounds_tx'].eq(True) & df_metrics['df_merged_policy_flag_tx'].eq(True)

df_florida = df_metrics.loc[florida_mask]
df_washington = df_metrics.loc[washington_mask]
df_texas = df_metrics.loc[texas_mask]

## Below are dataframes containing the states that will be used in our analysis for comparison.

### We're analyzing Florida against: California, Nevada, New York, and Texas

In [10]:
# One-column frame holding the 'State' values selected in df_florida.
intermediate = {"States": list(df_florida['State'])}
FL_control = pd.DataFrame(intermediate)

# Write it out as CSV (to_csv's default also writes the row index).
filepath = Path('../20_intermediate_files/fl_control.csv')
FL_control.to_csv(filepath)

### We're analyzing Washington against: Alaska, Hawaii, Iowa, Kansas, Maine, Massachusetts, Minnesota, Montana, Nebraska, North Dakota, Oregon, South Dakota, Virginia, and Wyoming

In [12]:
# One-column frame holding the 'State' values selected in df_washington.
intermediate = {"States": list(df_washington['State'])}
WA_control = pd.DataFrame(intermediate)

# Write it out as CSV (to_csv's default also writes the row index).
filepath = Path('../20_intermediate_files/wa_control.csv')
WA_control.to_csv(filepath)

### We're analyzing Texas against: Arkansas, California, Georgia, Missouri, New York, and Wyoming

In [11]:
# One-column frame holding the 'State' values selected in df_texas.
intermediate = {"States": list(df_texas['State'])}
TX_control = pd.DataFrame(intermediate)

# Write it out as CSV (to_csv's default also writes the row index).
filepath = Path('../20_intermediate_files/tx_control.csv')
TX_control.to_csv(filepath)

In [None]:
# Concatenate the three selections in Florida, Washington, Texas order.
# A state picked for more than one reference state appears once per selection.
combined_states = (
    list(df_florida['State'])
    + list(df_washington['State'])
    + list(df_texas['State'])
)
intermediate = {"States": combined_states}
states_control = pd.DataFrame(intermediate)

# Write it out as CSV (to_csv's default also writes the row index).
filepath = Path('../20_intermediate_files/states_control.csv')
states_control.to_csv(filepath)