Step 1: Load the data

In [1]:
import pandas as pd

pd.set_option("mode.copy_on_write", True)

In [2]:
death = pd.read_csv("drug_deaths.txt", delimiter="\t")

In [3]:
population = pd.read_csv("population.csv")

In [4]:
population

Unnamed: 0,FIPS,State,State_FIPS,County_FIPS,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,1001,AL,1,1,45909.0,46800.0,48366.0,49676.0,51328.0,52405.0,53277.0,54135.0,54761.0,55229.0,54970.0,54747.0,54922.0,54903.0,55302.0
1,1003,AL,1,3,147957.0,151509.0,156266.0,160970.0,168121.0,172404.0,175827.0,179406.0,183121.0,186579.0,190203.0,194978.0,199306.0,203101.0,207787.0
2,1005,AL,1,5,28653.0,28594.0,28287.0,28027.0,27861.0,27757.0,27808.0,27657.0,27325.0,27344.0,27172.0,26946.0,26768.0,26300.0,25828.0
3,1007,AL,1,7,21199.0,21399.0,21721.0,22042.0,22099.0,22438.0,22705.0,22941.0,22858.0,22736.0,22657.0,22510.0,22541.0,22553.0,22590.0
4,1009,AL,1,9,52551.0,53457.0,54124.0,54624.0,55485.0,56240.0,57055.0,57341.0,57372.0,57561.0,57585.0,57630.0,57536.0,57535.0,57487.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,56037,WY,56,37,37428.0,37450.0,38026.0,38739.0,39749.0,41470.0,42358.0,44133.0,43580.0,44000.0,45032.0,45189.0,44996.0,44780.0,44319.0
3133,56039,WY,56,39,18837.0,19066.0,19467.0,19632.0,20014.0,20472.0,20988.0,21232.0,21298.0,21422.0,21643.0,22335.0,22801.0,23083.0,23255.0
3134,56041,WY,56,41,19587.0,19480.0,19470.0,19494.0,19709.0,20171.0,20613.0,21054.0,21090.0,20901.0,21008.0,20969.0,20835.0,20777.0,20711.0
3135,56043,WY,56,43,7988.0,7976.0,7960.0,8022.0,7979.0,8169.0,8229.0,8423.0,8531.0,8451.0,8410.0,8417.0,8277.0,8282.0,8180.0


Step 2: Clean the County Code

In [5]:
print(population["FIPS"].dtype)
print(death["County Code"].dtype)

int64
float64


* We will merge death's Couty Code with populaiton's FIPS. Couty code is now a float so we need to convert it to int.

In [6]:
death["County Code"] = death["County Code"].astype(int)

Step 3: Subset the population data based on states and years
* We can get the state code from the County Code. In this way, we double check if we get the correct state. Also we can subset population df with only the relevant states, and we can increase our data processing time by subsetting it.

In [7]:
import math
death["State Code"] = death["County Code"].apply(lambda x: float(math.floor(x / 1000)))
unique_state_code = death["State Code"].unique()
print(unique_state_code)

[17. 39. 12. 47. 48. 53. 54. 20. 41. 31. 56.]


* There are 12 unique state code from the death dataset: [17. 39. 12. 47. 48. 53. 54. 20. 41. 31. 56.]. So we selected the data correctly
* Next step we subset the popultion data based on the state code.

In [8]:
subset_population = population[population['State_FIPS'].isin(unique_state_code)]

In [18]:
print(death["Year"].min())
print(death["Year"].max())

2003
2015


In [24]:
subset_population = subset_population.drop(["2002", "2016"], axis=1)

KeyError: "['2002', '2016'] not found in axis"

In [25]:
subset_population

Unnamed: 0,FIPS,State,State_FIPS,County_FIPS,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
315,12001,FL,12,1,227022.0,229867.0,233756.0,239506.0,242685.0,244888.0,246657.0,247624.0,249879.0,251596.0,252585.0,255606.0,259215.0
316,12003,FL,12,3,23555.0,24142.0,24832.0,25571.0,26212.0,26725.0,27124.0,27067.0,27055.0,27059.0,27008.0,27123.0,27357.0
317,12005,FL,12,5,155044.0,158804.0,162917.0,165644.0,165345.0,166267.0,167464.0,169209.0,169587.0,171818.0,174704.0,178435.0,181678.0
318,12007,FL,12,7,27035.0,27703.0,28098.0,28506.0,28825.0,28961.0,28979.0,28536.0,28430.0,27052.0,26804.0,26562.0,26759.0
319,12009,FL,12,9,504847.0,518722.0,529907.0,535138.0,539719.0,542378.0,542109.0,544000.0,544442.0,547119.0,550478.0,555838.0,566133.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3132,56037,WY,56,37,37450.0,38026.0,38739.0,39749.0,41470.0,42358.0,44133.0,43580.0,44000.0,45032.0,45189.0,44996.0,44780.0
3133,56039,WY,56,39,19066.0,19467.0,19632.0,20014.0,20472.0,20988.0,21232.0,21298.0,21422.0,21643.0,22335.0,22801.0,23083.0
3134,56041,WY,56,41,19480.0,19470.0,19494.0,19709.0,20171.0,20613.0,21054.0,21090.0,20901.0,21008.0,20969.0,20835.0,20777.0
3135,56043,WY,56,43,7976.0,7960.0,8022.0,7979.0,8169.0,8229.0,8423.0,8531.0,8451.0,8410.0,8417.0,8277.0,8282.0


Step 4: Check the valid counties number

In [9]:
print(subset_population["FIPS"].nunique())
print(death["County Code"].nunique())

957
312


death_unique_county = death["County Code"].unique()
population_unique_county = death[]

Step 5: 

In [26]:
melted_data = death.pivot_table(index=['County', 'County Code', 'State Code'], columns='Year', values='Deaths').reset_index()
melted_data.columns.name = None

In [31]:
melted_data
rows_without_nan = melted_data[melted_data.notnull().all(axis=1)]
print(rows_without_nan.shape[0])
print(rows_without_nan[['County', 'County Code']])
large_county_code = rows_without_nan['County Code']

94
                    County  County Code
14       Benton County, WA        53005
16        Bexar County, TX        48029
17       Blount County, TN        47009
20     Brazoria County, TX        48039
22      Brevard County, FL        12009
..                     ...          ...
299     Wichita County, TX        48485
300        Will County, IL        17197
303  Williamson County, TX        48491
305   Winnebago County, IL        17201
310      Yakima County, WA        53077

[94 rows x 2 columns]


In [32]:
large_county = subset_population[subset_population['FIPS'].isin(large_county_code)]
small_county = subset_population[~subset_population['FIPS'].isin(large_county_code)]

In [33]:
large_county.describe()

Unnamed: 0,FIPS,State_FIPS,County_FIPS,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
count,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0
mean,33630.095745,33.521277,108.819149,585023.6,592210.9,600008.1,609214.5,616142.3,622991.1,630010.5,636724.6,643875.7,651661.5,659058.9,667548.3,676753.5
std,16563.246899,16.540422,105.01295,739248.5,739496.8,740763.2,745848.2,749151.2,755583.4,764984.3,773384.0,782542.0,792574.6,801856.6,811209.5,820064.0
min,12009.0,12.0,5.0,78465.0,78428.0,78341.0,78178.0,78323.0,78379.0,78782.0,78921.0,79366.0,79284.0,78843.0,78236.0,77416.0
25%,12113.5,12.0,40.0,219846.5,224058.8,228162.0,235455.5,243578.0,244977.0,249084.0,251886.2,253712.8,252316.2,252676.2,253064.0,256203.0
50%,39132.0,39.0,84.0,342260.5,351206.5,355020.0,360099.5,370265.5,374122.0,375147.5,376073.5,377076.0,379036.5,381361.5,384576.0,388012.5
75%,48160.5,48.0,120.5,624057.5,637100.2,652933.5,680501.8,690745.5,696682.2,704987.0,712686.2,717024.8,725936.0,740185.5,756202.5,776604.8
max,54081.0,54.0,491.0,5294739.0,5252021.0,5207615.0,5165495.0,5154235.0,5161831.0,5181728.0,5198977.0,5219636.0,5239105.0,5252513.0,5254108.0,5243371.0


In [34]:
small_county.describe()

Unnamed: 0,FIPS,State_FIPS,County_FIPS,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
count,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0,863.0
mean,37511.071842,37.381228,129.843569,35122.001159,35517.659328,35910.225956,36387.229432,36818.848204,37203.719583,37536.67555,37843.234067,38036.443801,38229.696408,38401.010429,38635.26883,38897.329085
std,13861.454655,13.835526,118.138353,51732.099807,52861.936376,53975.223724,55235.61157,56460.125068,57634.009953,58742.639978,59794.139487,60598.243137,61436.301869,62245.698374,63234.459302,64231.67415
min,12001.0,12.0,1.0,72.0,55.0,70.0,75.0,79.0,61.0,77.0,84.0,95.0,86.0,106.0,89.0,119.0
25%,20164.0,20.0,44.0,7570.5,7584.5,7654.5,7688.5,7703.5,7718.0,7716.0,7837.0,7758.0,7712.0,7662.0,7611.5,7619.0
50%,41065.0,41.0,97.0,19152.0,19232.0,19392.0,19472.0,19512.0,19670.0,19659.0,19759.0,19686.0,19532.0,19509.0,19578.0,19745.0
75%,48268.0,48.0,169.0,40842.5,41072.0,41614.0,41846.5,42209.0,42672.0,42624.0,42907.0,43265.5,43063.0,43275.0,43388.5,43359.0
max,56045.0,56.0,507.0,632475.0,653779.0,674982.0,695352.0,715264.0,736694.0,757468.0,779091.0,795305.0,807930.0,818047.0,829065.0,838741.0
