In [316]:
import pandas as pd 
import numpy as np

# Read in data
mortality = pd.read_csv('../10_Clean_Data/mortality/mortality.csv')

population = pd.read_csv('../10_Clean_Data/population/final_population_data.csv')

shipments = pd.read_parquet('../01_Source_Data/shipments/shipments_data.gzip')

# First we reshape the population
population_melted = population.melt(id_vars=["GISJOIN", "STATE_CODE", "COUNTY_CODE", "COUNTY", "STATE"],
          var_name="Year",
          value_name="Population")
population_melted.head()


Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Year,Population
0,G0100010,1,1,Autauga County,Alabama,2003,46348
1,G0100030,1,3,Baldwin County,Alabama,2003,151574
2,G0100050,1,5,Barbour County,Alabama,2003,28805
3,G0100070,1,7,Bibb County,Alabama,2003,21224
4,G0100090,1,9,Blount County,Alabama,2003,54195


In [317]:
#in population, change District Of Columbia to District of Columbia for STATE
population_melted['STATE'] = population_melted['STATE'].replace('District Of Columbia', 'District of Columbia')

In [318]:
#remove Alaska from mortality and shipment data
mortality['State'] = mortality['State'].str.strip()
mortality = mortality[mortality['State'] != 'AK']

In [319]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State


In [320]:
shipments = shipments[shipments['BUYER_STATE'] != 'AK']

In [321]:
shipments = shipments[shipments['BUYER_STATE'] != 'PR']

In [322]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State


In [323]:
#value counts for county and state pair for mortality
(mortality.groupby(['County', 'State']).size()).sort_values(ascending=False)

County                   State
El Paso County           CO       13
East Baton Rouge Parish  LA       13
Durham County            NC       13
Salt Lake County         UT       13
DuPage County            IL       13
                                  ..
Lawrence County          AL        1
Tipton County            TN        1
McKinley County          NM        1
Delaware County          OK        1
Van Zandt County         TX        1
Length: 1039, dtype: int64

In [324]:
mortality.head()

Unnamed: 0,County,Year,Deaths,State
0,Acadia Parish,2003,11,LA
1,Ada County,2003,17,ID
2,Adams County,2003,42,CO
3,Aiken County,2003,10,SC
4,Alachua County,2003,11,FL


In [325]:
shipments.head()

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,Year,MME,CALC_BASE_WT_IN_GM
0,ABBEVILLE,SC,2006,3136215.0,2506.08439
1,ABBEVILLE,SC,2007,3232603.0,2623.718375
2,ABBEVILLE,SC,2008,3070698.0,2574.677256
3,ABBEVILLE,SC,2009,3827607.0,3110.779538
4,ABBEVILLE,SC,2010,4612935.0,3695.582848


In [326]:
#check for nas 
population_melted.isna().sum()
shipments.isna().sum()
mortality.isna().sum()


County    0
Year      0
Deaths    0
State     0
dtype: int64

In [327]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State


In [328]:
# remove the second word from COUNTY variable and make it lowercase
#population_melted['COUNTY'] = population_melted['COUNTY'].str.replace(' county', '', case=False).str.lower()
population_melted['COUNTY'] = population_melted['COUNTY'].str.replace(' county', '', case=False).str.replace(' parish', '', case=False).str.lower()
population_melted.head()

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Year,Population
0,G0100010,1,1,autauga,Alabama,2003,46348
1,G0100030,1,3,baldwin,Alabama,2003,151574
2,G0100050,1,5,barbour,Alabama,2003,28805
3,G0100070,1,7,bibb,Alabama,2003,21224
4,G0100090,1,9,blount,Alabama,2003,54195


In [329]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State


In [330]:
# remove the second word from BUYER_COUNTY variable in shipments and make it lowercase
shipments['BUYER_COUNTY'] = shipments['BUYER_COUNTY'].str.replace(' county', '', case=False).str.replace(' parish', '', case=False).str.lower()
shipments.head()

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,Year,MME,CALC_BASE_WT_IN_GM
0,abbeville,SC,2006,3136215.0,2506.08439
1,abbeville,SC,2007,3232603.0,2623.718375
2,abbeville,SC,2008,3070698.0,2574.677256
3,abbeville,SC,2009,3827607.0,3110.779538
4,abbeville,SC,2010,4612935.0,3695.582848


In [331]:
mortality['County'] = mortality['County'].str.replace(' county', '', case=False).str.replace(' parish', '', case=False).str.lower()
mortality.head()

Unnamed: 0,County,Year,Deaths,State
0,acadia,2003,11,LA
1,ada,2003,17,ID
2,adams,2003,42,CO
3,aiken,2003,10,SC
4,alachua,2003,11,FL


In [332]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State


In [333]:
state_dict = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}
state_dict = dict(map(reversed, state_dict.items()))


In [334]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State


In [335]:
#add a column to mortality to match the state abbreviations

#remove whitespace from state column before the state name
mortality['State'] = mortality['State'].str.strip()

mortality['State Name'] = mortality['State'].map(state_dict)

In [336]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State,State Name


In [337]:
mortality.head()

Unnamed: 0,County,Year,Deaths,State,State Name
0,acadia,2003,11,LA,Louisiana
1,ada,2003,17,ID,Idaho
2,adams,2003,42,CO,Colorado
3,aiken,2003,10,SC,South Carolina
4,alachua,2003,11,FL,Florida


In [338]:
#do the same for shipments
shipments['BUYER_STATE'] = shipments['BUYER_STATE'].str.strip()
shipments['State Name'] = shipments['BUYER_STATE'].map(state_dict)

In [339]:
shipments.head()

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,Year,MME,CALC_BASE_WT_IN_GM,State Name
0,abbeville,SC,2006,3136215.0,2506.08439,South Carolina
1,abbeville,SC,2007,3232603.0,2623.718375,South Carolina
2,abbeville,SC,2008,3070698.0,2574.677256,South Carolina
3,abbeville,SC,2009,3827607.0,3110.779538,South Carolina
4,abbeville,SC,2010,4612935.0,3695.582848,South Carolina


NOW WE HAVE ALL 3 DATASETS AND NECESSARY DATA CLEANING AND ADDITIONS ARE DONE. LETS CHECK ALL 3 now

In [340]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State,State Name


In [341]:
shipments.head()

Unnamed: 0,BUYER_COUNTY,BUYER_STATE,Year,MME,CALC_BASE_WT_IN_GM,State Name
0,abbeville,SC,2006,3136215.0,2506.08439,South Carolina
1,abbeville,SC,2007,3232603.0,2623.718375,South Carolina
2,abbeville,SC,2008,3070698.0,2574.677256,South Carolina
3,abbeville,SC,2009,3827607.0,3110.779538,South Carolina
4,abbeville,SC,2010,4612935.0,3695.582848,South Carolina


In [342]:
mortality.head(5)

Unnamed: 0,County,Year,Deaths,State,State Name
0,acadia,2003,11,LA,Louisiana
1,ada,2003,17,ID,Idaho
2,adams,2003,42,CO,Colorado
3,aiken,2003,10,SC,South Carolina
4,alachua,2003,11,FL,Florida


In [343]:
population_melted.head()

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Year,Population
0,G0100010,1,1,autauga,Alabama,2003,46348
1,G0100030,1,3,baldwin,Alabama,2003,151574
2,G0100050,1,5,barbour,Alabama,2003,28805
3,G0100070,1,7,bibb,Alabama,2003,21224
4,G0100090,1,9,blount,Alabama,2003,54195


In [344]:
#checking number of unique counties state pairs in each

print(len(mortality.groupby(['County', 'State Name'])))
print(len(shipments.groupby(['BUYER_COUNTY', 'State Name'])))
print(len(population_melted.groupby(['COUNTY', 'STATE'])))

1039
3038
3143


In [345]:
#years in each, make year in shipments int
mortality['Year'] = mortality['Year'].astype(int)
shipments['Year'] = shipments['Year'].astype(int)
population_melted['Year'] = population_melted['Year'].astype(int)


print("mortality:",  mortality['Year'].unique())
print("shipments:", shipments['Year'].unique())
print("population:", population_melted['Year'].unique())




mortality: [2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015]
shipments: [2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019]
population: [2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015]


In [346]:
mortality[mortality['State'] == 'AK']

Unnamed: 0,County,Year,Deaths,State,State Name


In [347]:
#now merge the three datasets

#first merge mortality and population outer
mortality_population = pd.merge(mortality,population_melted, how='outer', left_on=['County', 'State Name', 'Year'], right_on=['COUNTY', 'STATE', 'Year'],validate="1:1",
indicator=True)

mortality_population._merge.value_counts()

_merge
right_only    32981
both           7878
left_only        10
Name: count, dtype: int64

In [348]:
#where county is nan in mortality population
mortality_population[mortality_population['County'].isna()]


Unnamed: 0,County,Year,Deaths,State,State Name,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Population,_merge
7888,,2003,,,,G0100010,1.0,1.0,autauga,Alabama,46348.0,right_only
7889,,2003,,,,G0100050,1.0,5.0,barbour,Alabama,28805.0,right_only
7890,,2003,,,,G0100070,1.0,7.0,bibb,Alabama,21224.0,right_only
7891,,2003,,,,G0100090,1.0,9.0,blount,Alabama,54195.0,right_only
7892,,2003,,,,G0100110,1.0,11.0,bullock,Alabama,11330.0,right_only
...,...,...,...,...,...,...,...,...,...,...,...,...
40864,,2015,,,,G5600370,56.0,37.0,sweetwater,Wyoming,45604.0,right_only
40865,,2015,,,,G5600390,56.0,39.0,teton,Wyoming,21800.0,right_only
40866,,2015,,,,G5600410,56.0,41.0,uinta,Wyoming,21154.0,right_only
40867,,2015,,,,G5600430,56.0,43.0,washakie,Wyoming,8586.0,right_only


In [349]:
#just checking the merge
mortality_population[mortality_population._merge == "left_only"]

Unnamed: 0,County,Year,Deaths,State,State Name,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Population,_merge
1577,la porte,2006,10.0,IN,Indiana,,,,,,,left_only
2121,la porte,2007,16.0,IN,Indiana,,,,,,,left_only
3306,la porte,2009,10.0,IN,Indiana,,,,,,,left_only
3929,la porte,2010,11.0,IN,Indiana,,,,,,,left_only
4592,la porte,2011,21.0,IN,Indiana,,,,,,,left_only
5282,la porte,2012,15.0,IN,Indiana,,,,,,,left_only
5983,la porte,2013,16.0,IN,Indiana,,,,,,,left_only
6702,la porte,2014,20.0,IN,Indiana,,,,,,,left_only
7467,la porte,2015,14.0,IN,Indiana,,,,,,,left_only
7539,mc kean,2015,11.0,PA,Pennsylvania,,,,,,,left_only


In [350]:
mortality_population

Unnamed: 0,County,Year,Deaths,State,State Name,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Population,_merge
0,acadia,2003,11.0,LA,Louisiana,G2200010,22.0,1.0,acadia,Louisiana,59209.0,both
1,ada,2003,17.0,ID,Idaho,G1600010,16.0,1.0,ada,Idaho,325482.0,both
2,adams,2003,42.0,CO,Colorado,G0800010,8.0,1.0,adams,Colorado,381370.0,both
3,aiken,2003,10.0,SC,South Carolina,G4500030,45.0,3.0,aiken,South Carolina,146903.0,both
4,alachua,2003,11.0,FL,Florida,G1200010,12.0,1.0,alachua,Florida,221717.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...
40864,,2015,,,,G5600370,56.0,37.0,sweetwater,Wyoming,45604.0,right_only
40865,,2015,,,,G5600390,56.0,39.0,teton,Wyoming,21800.0,right_only
40866,,2015,,,,G5600410,56.0,41.0,uinta,Wyoming,21154.0,right_only
40867,,2015,,,,G5600430,56.0,43.0,washakie,Wyoming,8586.0,right_only


In [351]:
mortality_population[mortality_population['County'] == 'district of columbia']

Unnamed: 0,County,Year,Deaths,State,State Name,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Population,_merge
100,district of columbia,2003,106.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,557620.0,both
520,district of columbia,2004,102.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,554239.0,both
982,district of columbia,2005,87.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,582049.0,both
1463,district of columbia,2006,113.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,585419.0,both
2000,district of columbia,2007,90.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,586409.0,both
2592,district of columbia,2008,69.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,591513.0,both
3172,district of columbia,2009,56.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,596618.0,both
3783,district of columbia,2010,83.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,601723.0,both
4449,district of columbia,2011,79.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,603741.0,both
5134,district of columbia,2012,71.0,DC,District of Columbia,G1100010,11.0,1.0,district of columbia,District of Columbia,605759.0,both


In [352]:
#show where shipments is null for State Name
shipments[shipments['State Name'].isnull()]

#remove this county
shipments = shipments[shipments['State Name'].notna()]

In [353]:
shipments.isna().sum()

BUYER_COUNTY          0
BUYER_STATE           0
Year                  0
MME                   0
CALC_BASE_WT_IN_GM    0
State Name            0
dtype: int64

In [354]:
shipments.groupby(['BUYER_COUNTY', 'State Name', 'Year']).size().reset_index(name='count').sort_values(by='count', ascending=False)

Unnamed: 0,BUYER_COUNTY,State Name,Year,count
0,abbeville,South Carolina,2006,1
27907,northumberland,Pennsylvania,2019,1
27889,northern mariana islands,Northern Mariana Islands,2010,1
27890,northern mariana islands,Northern Mariana Islands,2011,1
27891,northern mariana islands,Northern Mariana Islands,2012,1
...,...,...,...,...
13947,gilpin,Colorado,2016,1
13948,gilpin,Colorado,2017,1
13949,gilpin,Colorado,2018,1
13950,glacier,Montana,2006,1


In [355]:
#merge shipments and mortality_population
mortality_population_shipments = pd.merge(mortality_population,shipments, how='outer', left_on=['COUNTY', 'STATE', 'Year'], right_on=['BUYER_COUNTY', 'State Name', 'Year'], indicator = 'merge_indicator')

mortality_population_shipments['merge_indicator'].value_counts()

merge_indicator
both          29442
right_only    12398
left_only     11427
Name: count, dtype: int64

In [356]:
mortality_population_shipments[mortality_population_shipments.merge_indicator == "right_only"]['Year'].value_counts()

Year
2018    2993
2019    2992
2017    2988
2016    2985
2006      44
2007      44
2008      44
2009      44
2010      44
2011      44
2012      44
2013      44
2014      44
2015      44
Name: count, dtype: int64

In [357]:
#check the right onlys before 2015
mortality_population_shipments[(mortality_population_shipments.merge_indicator == "right_only") & (mortality_population_shipments['Year'] < 2015)]

#get the unique county names from above
mortality_population_shipments[(mortality_population_shipments.merge_indicator == "right_only") & (mortality_population_shipments['Year'] < 2015)]['BUYER_COUNTY'].unique()

array(['de kalb', 'de soto', 'de witt', 'dewitt', 'guam', 'la porte',
       'la salle', 'northern mariana islands', 'obrien', 'prince georges',
       'queen annes', 'radford', 'saint bernard', 'saint charles',
       'saint clair', 'saint croix', 'saint francis', 'saint francois',
       'saint helena', 'saint james', 'saint john', 'saint johns',
       'saint joseph', 'saint landry', 'saint lawrence', 'saint louis',
       'saint louis city', 'saint lucie', 'saint martin', 'saint mary',
       'saint marys', 'saint tammany', 'saint thomas', 'sainte genevieve',
       'salem', 'st john the baptist', 'st joseph'], dtype=object)

In [358]:
#we can just remove the above and also remove data after 2015
mortality_population_shipments = mortality_population_shipments[(mortality_population_shipments['Year'] <= 2015) & (mortality_population_shipments.merge_indicator != "right_only")]

In [359]:
mortality_population_shipments

Unnamed: 0,County,Year,Deaths,State,State Name_x,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,Population,_merge,BUYER_COUNTY,BUYER_STATE,MME,CALC_BASE_WT_IN_GM,State Name_y,merge_indicator
0,acadia,2003,11.0,LA,Louisiana,G2200010,22.0,1.0,acadia,Louisiana,59209.0,both,,,,,,left_only
1,ada,2003,17.0,ID,Idaho,G1600010,16.0,1.0,ada,Idaho,325482.0,both,,,,,,left_only
2,adams,2003,42.0,CO,Colorado,G0800010,8.0,1.0,adams,Colorado,381370.0,both,,,,,,left_only
3,aiken,2003,10.0,SC,South Carolina,G4500030,45.0,3.0,aiken,South Carolina,146903.0,both,,,,,,left_only
4,alachua,2003,11.0,FL,Florida,G1200010,12.0,1.0,alachua,Florida,221717.0,both,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40864,,2015,,,,G5600370,56.0,37.0,sweetwater,Wyoming,45604.0,right_only,sweetwater,WY,2.412509e+07,17488.438403,Wyoming,both
40865,,2015,,,,G5600390,56.0,39.0,teton,Wyoming,21800.0,right_only,teton,WY,3.343610e+06,2609.264725,Wyoming,both
40866,,2015,,,,G5600410,56.0,41.0,uinta,Wyoming,21154.0,right_only,uinta,WY,1.135771e+07,8459.162985,Wyoming,both
40867,,2015,,,,G5600430,56.0,43.0,washakie,Wyoming,8586.0,right_only,washakie,WY,7.700956e+06,5477.434225,Wyoming,both


In [360]:
#clean up and only keep relevant columns
mortality_population_shipments = mortality_population_shipments[['COUNTY', "STATE", 'State Name_x', 'Year', 'Deaths', 'Population', 'BUYER_COUNTY', 'BUYER_STATE', 'MME', 'CALC_BASE_WT_IN_GM', 'merge_indicator']]

In [361]:
mortality_population_shipments

Unnamed: 0,COUNTY,STATE,State Name_x,Year,Deaths,Population,BUYER_COUNTY,BUYER_STATE,MME,CALC_BASE_WT_IN_GM,merge_indicator
0,acadia,Louisiana,Louisiana,2003,11.0,59209.0,,,,,left_only
1,ada,Idaho,Idaho,2003,17.0,325482.0,,,,,left_only
2,adams,Colorado,Colorado,2003,42.0,381370.0,,,,,left_only
3,aiken,South Carolina,South Carolina,2003,10.0,146903.0,,,,,left_only
4,alachua,Florida,Florida,2003,11.0,221717.0,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
40864,sweetwater,Wyoming,,2015,,45604.0,sweetwater,WY,2.412509e+07,17488.438403,both
40865,teton,Wyoming,,2015,,21800.0,teton,WY,3.343610e+06,2609.264725,both
40866,uinta,Wyoming,,2015,,21154.0,uinta,WY,1.135771e+07,8459.162985,both
40867,washakie,Wyoming,,2015,,8586.0,washakie,WY,7.700956e+06,5477.434225,both


In [None]:
#remove la prte and mc kean county since we dont have population data for them so better to clean
mortality_population_shipments = mortality_population_shipments[mortality_population_shipments['County'] != 'la porte']
mortality_population_shipments = mortality_population_shipments[mortality_population_shipments['County'] != 'mc kean']

mortality_population_shipments

Unnamed: 0,County,STATE,State Name_x,Year,Deaths,Population,BUYER_COUNTY,BUYER_STATE,MME,CALC_BASE_WT_IN_GM,merge_indicator
0,acadia,Louisiana,Louisiana,2003,11.0,59209.0,,,,,left_only
1,ada,Idaho,Idaho,2003,17.0,325482.0,,,,,left_only
2,adams,Colorado,Colorado,2003,42.0,381370.0,,,,,left_only
3,aiken,South Carolina,South Carolina,2003,10.0,146903.0,,,,,left_only
4,alachua,Florida,Florida,2003,11.0,221717.0,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
40864,,Wyoming,,2015,,45604.0,sweetwater,WY,2.412509e+07,17488.438403,both
40865,,Wyoming,,2015,,21800.0,teton,WY,3.343610e+06,2609.264725,both
40866,,Wyoming,,2015,,21154.0,uinta,WY,1.135771e+07,8459.162985,both
40867,,Wyoming,,2015,,8586.0,washakie,WY,7.700956e+06,5477.434225,both


In [364]:
merged = mortality_population_shipments

In [365]:
merged
#we still havent removed 2003,2004,2005 data (as that data is not in shipments) but we might need it later


Unnamed: 0,COUNTY,STATE,State Name_x,Year,Deaths,Population,BUYER_COUNTY,BUYER_STATE,MME,CALC_BASE_WT_IN_GM,merge_indicator
0,acadia,Louisiana,Louisiana,2003,11.0,59209.0,,,,,left_only
1,ada,Idaho,Idaho,2003,17.0,325482.0,,,,,left_only
2,adams,Colorado,Colorado,2003,42.0,381370.0,,,,,left_only
3,aiken,South Carolina,South Carolina,2003,10.0,146903.0,,,,,left_only
4,alachua,Florida,Florida,2003,11.0,221717.0,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...
40864,sweetwater,Wyoming,,2015,,45604.0,sweetwater,WY,2.412509e+07,17488.438403,both
40865,teton,Wyoming,,2015,,21800.0,teton,WY,3.343610e+06,2609.264725,both
40866,uinta,Wyoming,,2015,,21154.0,uinta,WY,1.135771e+07,8459.162985,both
40867,washakie,Wyoming,,2015,,8586.0,washakie,WY,7.700956e+06,5477.434225,both


In [366]:
#calculate death rate

merged['Death Rate'] = merged['Deaths']/merged['Population']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['Death Rate'] = merged['Deaths']/merged['Population']


In [368]:
#impute the missing values for death rate based on the avg of that county
merged['Death Rate'] = merged.groupby(['COUNTY'])['Death Rate'].transform(lambda x: x.fillna(x.mean()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged['Death Rate'] = merged.groupby(['COUNTY'])['Death Rate'].transform(lambda x: x.fillna(x.mean()))


In [376]:
#find alabama in 2003
merged[(merged['STATE'] == 'Alabama') & (merged['Year'] == 2003)].head(20)

Unnamed: 0,COUNTY,STATE,State Name_x,Year,Deaths,Population,BUYER_COUNTY,BUYER_STATE,MME,CALC_BASE_WT_IN_GM,merge_indicator,Death Rate
15,baldwin,Alabama,Alabama,2003,10.0,151574.0,,,,,left_only,6.6e-05
164,jefferson,Alabama,Alabama,2003,69.0,659191.0,,,,,left_only,0.000105
232,mobile,Alabama,Alabama,2003,26.0,399943.0,,,,,left_only,6.5e-05
7888,autauga,Alabama,,2003,,46348.0,,,,,left_only,
7889,barbour,Alabama,,2003,,28805.0,,,,,left_only,
7890,bibb,Alabama,,2003,,21224.0,,,,,left_only,8.8e-05
7891,blount,Alabama,,2003,,54195.0,,,,,left_only,0.00018
7892,bullock,Alabama,,2003,,11330.0,,,,,left_only,
7893,butler,Alabama,,2003,,20765.0,,,,,left_only,0.000211
7894,calhoun,Alabama,,2003,,112097.0,,,,,left_only,0.000172


In [None]:
#look at Alabama in 2003
merged[(merged['STATE'] == 'North Carolina') & (merged['Year'] == 2003)]


Unnamed: 0,County,STATE,State Name_x,Year,Deaths,Population,BUYER_COUNTY,BUYER_STATE,MME,CALC_BASE_WT_IN_GM,merge_indicator,Death Rate
41,brunswick,North Carolina,North Carolina,2003,10.0,81607.0,,,,,left_only,0.000123
43,buncombe,North Carolina,North Carolina,2003,22.0,213228.0,,,,,left_only,0.000103
44,burke,North Carolina,North Carolina,2003,10.0,89514.0,,,,,left_only,0.000112
57,catawba,North Carolina,North Carolina,2003,17.0,147405.0,,,,,left_only,0.000115
75,cleveland,North Carolina,North Carolina,2003,12.0,98238.0,,,,,left_only,0.000122
...,...,...,...,...,...,...,...,...,...,...,...,...
9605,,North Carolina,,2003,,42665.0,,,,,left_only,
9606,,North Carolina,,2003,,113391.0,,,,,left_only,
9607,,North Carolina,,2003,,75358.0,,,,,left_only,
9608,,North Carolina,,2003,,37295.0,,,,,left_only,


In [None]:
#count unique buyer-county-year rows in shipments
##shipments.groupby(['BUYER_COUNTY', 'BUYER_STATE', 'Year']).size().reset_index(name='count').sort_values(by='count', ascending=False)

In [None]:
#shipments

In [None]:
#shipments.isna().sum()
