## Using the nhgis platform to get census data for the years 2006-2015

For 2006 and 2007, we extracted the population for all counties from the 1970-2007 data hosted on the nhgis website. 

In [153]:
import pandas as pd
import numpy as np

# Read in the data
pop2006 = pd.read_csv("nhgis0001_ds231_2006_county.csv")
pop2007 = pd.read_csv("nhgis0001_ds231_2007_county.csv")

In [154]:
# merge both pop2006 and pop2007 on STATEFP and COUNTYFP
pop = pd.merge(pop2006, pop2007, on=["STATEFP", "COUNTYFP"])

# only keep the columns we need
pop = pop[
    [
        "GISJOIN_x",
        "STATEFP",
        "COUNTYFP",
        "COUNTY_x",
        "STATE_x",
        "AGWD001_x",
        "AGWD001_y",
    ]
]

# rename columns
pop.columns = ["GISJOIN", "STATEFP", "COUNTYFP", "COUNTY", "STATE", "2006", "2007"]
pop

Unnamed: 0,GISJOIN,STATEFP,COUNTYFP,COUNTY,STATE,2006,2007
0,G0100010,1,1,Autauga,Alabama,49039,49834
1,G0100030,1,3,Baldwin,Alabama,168154,172815
2,G0100050,1,5,Barbour,Alabama,29420,29736
3,G0100070,1,7,Bibb,Alabama,21307,21485
4,G0100090,1,9,Blount,Alabama,55708,56866
...,...,...,...,...,...,...,...
3138,G5600370,56,37,Sweetwater,Wyoming,38001,39320
3139,G5600390,56,39,Teton,Wyoming,19588,20073
3140,G5600410,56,41,Uinta,Wyoming,19688,20071
3141,G5600430,56,43,Washakie,Wyoming,7675,7805


On the nhgis data, we couldn't find the population for 2008 as a single year, but found a dataset for 2008-2012 population, which we are using as a proxy for 2008 only. 

In [155]:
df2008 = pd.read_csv("data2008.csv", engine="python")
df2008

# join with pop dataframe sith statea, countya in df2008 and  statefp, countyfp in pop
df2008 = pd.merge(
    df2008, pop, left_on=["STATEA", "COUNTYA"], right_on=["STATEFP", "COUNTYFP"]
)

# drop columns we don't need
df2008 = df2008[
    [
        "GISJOIN_x",
        "STATEFP",
        "COUNTYFP",
        "COUNTY_x",
        "STATE_x",
        "2006",
        "2007",
        "QSPE001",
    ]
]

# rename columns
df2008.columns = [
    "GISJOIN",
    "STATE_CODE",
    "COUNTY_CODE",
    "COUNTY",
    "STATE",
    "2006",
    "2007",
    "2008",
]

df2008

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008
0,G0100010,1,1,Autauga County,Alabama,49039,49834,54590
1,G0100030,1,3,Baldwin County,Alabama,168154,172815,183226
2,G0100050,1,5,Barbour County,Alabama,29420,29736,27469
3,G0100070,1,7,Bibb County,Alabama,21307,21485,22769
4,G0100090,1,9,Blount County,Alabama,55708,56866,57466
...,...,...,...,...,...,...,...,...
3138,G5600370,56,37,Sweetwater County,Wyoming,38001,39320,43890
3139,G5600390,56,39,Teton County,Wyoming,19588,20073,21326
3140,G5600410,56,41,Uinta County,Wyoming,19688,20071,20942
3141,G5600430,56,43,Washakie County,Wyoming,7675,7805,8425


Next: Merging with 2010 data, we couldn't find 2009 data on the website, so merging with 2010 first, then will try to extrapolate for 2009. 

In [156]:
data2010 = pd.read_csv("2010pop.csv", engine="python")
data2010

data2010 = pd.merge(
    data2010,
    df2008,
    right_on=["STATE_CODE", "COUNTY_CODE"],
    left_on=["STATEA", "COUNTYA"],
)
data2010

# drop columns we don't need
data2010 = data2010[
    [
        "GISJOIN_x",
        "STATE_CODE",
        "COUNTY_CODE",
        "COUNTY_x",
        "STATE_x",
        "2006",
        "2007",
        "2008",
        "H7V001",
    ]
]

# rename columns
data2010.columns = [
    "GISJOIN",
    "STATE_CODE",
    "COUNTY_CODE",
    "COUNTY",
    "STATE",
    "2006",
    "2007",
    "2008",
    "2010",
]

In [157]:
# drop rows with missing values

data2010 = data2010.dropna()

data2010

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008,2010
0,G0100010,1,1,Autauga County,Alabama,49039,49834,54590,54571
1,G0100030,1,3,Baldwin County,Alabama,168154,172815,183226,182265
2,G0100050,1,5,Barbour County,Alabama,29420,29736,27469,27457
3,G0100070,1,7,Bibb County,Alabama,21307,21485,22769,22915
4,G0100090,1,9,Blount County,Alabama,55708,56866,57466,57322
...,...,...,...,...,...,...,...,...,...
3138,G5600370,56,37,Sweetwater County,Wyoming,38001,39320,43890,43806
3139,G5600390,56,39,Teton County,Wyoming,19588,20073,21326,21294
3140,G5600410,56,41,Uinta County,Wyoming,19688,20071,20942,21118
3141,G5600430,56,43,Washakie County,Wyoming,7675,7805,8425,8533


In [158]:
# add 3 new columns for 2009, 2011 and 2012 and fill them with the data from 2008

data2010["2009"] = data2010["2008"]
data2010["2011"] = data2010["2008"]
data2010["2012"] = data2010["2008"]

data2010

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008,2010,2009,2011,2012
0,G0100010,1,1,Autauga County,Alabama,49039,49834,54590,54571,54590,54590,54590
1,G0100030,1,3,Baldwin County,Alabama,168154,172815,183226,182265,183226,183226,183226
2,G0100050,1,5,Barbour County,Alabama,29420,29736,27469,27457,27469,27469,27469
3,G0100070,1,7,Bibb County,Alabama,21307,21485,22769,22915,22769,22769,22769
4,G0100090,1,9,Blount County,Alabama,55708,56866,57466,57322,57466,57466,57466
...,...,...,...,...,...,...,...,...,...,...,...,...
3138,G5600370,56,37,Sweetwater County,Wyoming,38001,39320,43890,43806,43890,43890,43890
3139,G5600390,56,39,Teton County,Wyoming,19588,20073,21326,21294,21326,21326,21326
3140,G5600410,56,41,Uinta County,Wyoming,19688,20071,20942,21118,20942,20942,20942
3141,G5600430,56,43,Washakie County,Wyoming,7675,7805,8425,8533,8425,8425,8425


ADDING 2013 data, it has population data on certain states, so we adding it for those and filling rest as NAN for now


In [159]:
# load 2013 data from csv
data2013 = pd.read_csv("data2013.csv", engine="python")
data2013 = data2013[["GISJOIN", "STATEA", "COUNTYA", "COUNTY", "STATE", "SBLE001"]]

In [162]:
# join the data2010 with data2013 but keep all rows in 2010, if that row not in 2013, then add NaN (do right on left on)
data2010 = pd.merge(
    data2010, data2013, how="left", left_on=["GISJOIN"], right_on=["GISJOIN"]
)

In [164]:
# change state_x and county_x to state and county
data2010 = data2010[
    [
        "GISJOIN",
        "STATE_CODE",
        "COUNTY_CODE",
        "COUNTY_x",
        "STATE_x",
        "2006",
        "2007",
        "2008",
        "2009",
        "2010",
        "2011",
        "2012",
        "SBLE001",
    ]
]

In [166]:
# change SBLE001 to 2013 - just come column cleaning
data2010.columns = [
    "GISJOIN",
    "STATE_CODE",
    "COUNTY_CODE",
    "COUNTY",
    "STATE",
    "2006",
    "2007",
    "2008",
    "2009",
    "2010",
    "2011",
    "2012",
    "2013",
]

2014


In [170]:
# add 2014 and 2015 data now and merge just like we did for 2013

data2014 = pd.read_csv("data2014.csv", engine="python")
data2014 = data2014[["GISJOIN", "STATEA", "COUNTYA", "COUNTY", "STATE", "AAA5E001"]]
data2014

Unnamed: 0,GISJOIN,STATEA,COUNTYA,COUNTY,STATE,AAA5E001
0,G0100030,1,3,Baldwin County,Alabama,200111
1,G0100150,1,15,Calhoun County,Alabama,115916
2,G0100430,1,43,Cullman County,Alabama,81289
3,G0100490,1,49,DeKalb County,Alabama,71065
4,G0100510,1,51,Elmore County,Alabama,80977
...,...,...,...,...,...,...
823,G7201130,72,113,Ponce Municipio,Puerto Rico,153540
824,G7201270,72,127,San Juan Municipio,Puerto Rico,365575
825,G7201350,72,135,Toa Alta Municipio,Puerto Rico,74837
826,G7201370,72,137,Toa Baja Municipio,Puerto Rico,84165


In [176]:
data2010 = data2010[
    [
        "GISJOIN",
        "STATE_CODE",
        "COUNTY_CODE",
        "COUNTY_x",
        "STATE_x",
        "2006",
        "2007",
        "2008",
        "2009",
        "2010",
        "2011",
        "2012",
        "2013",
        "AAA5E001_x",
    ]
]

In [184]:
# change AAA5E001_x to 2014 - just come column cleaning
data2010.columns = [
    "GISJOIN",
    "STATE_CODE",
    "COUNTY_CODE",
    "COUNTY",
    "STATE",
    "2006",
    "2007",
    "2008",
    "2009",
    "2010",
    "2011",
    "2012",
    "2013",
    "2014",
]

# change 2013 and 2014 to integer (exclude nan PLS)

data2010["2013"] = (
    pd.to_numeric(data2010["2013"], errors="coerce").fillna(0).astype(int)
)
data2010["2014"] = (
    pd.to_numeric(data2010["2014"], errors="coerce").fillna(0).astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2010['2013'] = pd.to_numeric(data2010['2013'], errors='coerce').fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2010['2014'] = pd.to_numeric(data2010['2014'], errors='coerce').fillna(0).astype(int)


In [187]:
# now add 2015 data
data2015 = pd.read_csv("data2015.csv", engine="python")
data2015 = data2015[["GISJOIN", "STATEA", "COUNTYA", "COUNTY", "STATE", "ACK2E001"]]

In [188]:
data2015

Unnamed: 0,GISJOIN,STATEA,COUNTYA,COUNTY,STATE,ACK2E001
0,G0100030,1,3,Baldwin County,Alabama,203709
1,G0100150,1,15,Calhoun County,Alabama,115620
2,G0100430,1,43,Cullman County,Alabama,82005
3,G0100490,1,49,DeKalb County,Alabama,71130
4,G0100510,1,51,Elmore County,Alabama,81468
...,...,...,...,...,...,...
825,G7201130,72,113,Ponce Municipio,Puerto Rico,149028
826,G7201270,72,127,San Juan Municipio,Puerto Rico,355074
827,G7201350,72,135,Toa Alta Municipio,Puerto Rico,74368
828,G7201370,72,137,Toa Baja Municipio,Puerto Rico,82065


In [189]:
data2010 = pd.merge(
    data2010, data2015, how="left", left_on=["GISJOIN"], right_on=["GISJOIN"]
)

In [191]:
data2010 = data2010[
    [
        "GISJOIN",
        "STATE_CODE",
        "COUNTY_CODE",
        "COUNTY_x",
        "STATE_x",
        "2006",
        "2007",
        "2008",
        "2009",
        "2010",
        "2011",
        "2012",
        "2013",
        "2014",
        "ACK2E001",
    ]
]

In [192]:
# change ACK2E001 to 2015 - just come column cleaning

data2010.columns = [
    "GISJOIN",
    "STATE_CODE",
    "COUNTY_CODE",
    "COUNTY",
    "STATE",
    "2006",
    "2007",
    "2008",
    "2009",
    "2010",
    "2011",
    "2012",
    "2013",
    "2014",
    "2015",
]

In [194]:
data2010
data2010["2015"] = (
    pd.to_numeric(data2010["2015"], errors="coerce").fillna(0).astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2010['2015'] = pd.to_numeric(data2010['2015'], errors='coerce').fillna(0).astype(int)


In [196]:
final = data2010

# FINAL DATASET

In [198]:
final.head()

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,G0100010,1,1,Autauga County,Alabama,49039,49834,54590,54590,54571,54590,54590,0,0,0
1,G0100030,1,3,Baldwin County,Alabama,168154,172815,183226,183226,182265,183226,183226,195540,200111,203709
2,G0100050,1,5,Barbour County,Alabama,29420,29736,27469,27469,27457,27469,27469,0,0,0
3,G0100070,1,7,Bibb County,Alabama,21307,21485,22769,22769,22915,22769,22769,0,0,0
4,G0100090,1,9,Blount County,Alabama,55708,56866,57466,57466,57322,57466,57466,0,0,0


But there are a few problems with this. The `nhgis` data source does not have complete population data for some years. In some years there is no proper data available, whereas for some years there is data for some years.

Here is the summary:
1. Complete data for `2006`, `2007`, `2010`
2. There is data for 2008-2012 - single population
3. Data for 2013,2014,2015 is incomplete (only for some states)

Since we have complete data for 2007 and 2010, we can just extrapolate and assume that the change in 2008 and 2009 will be uniform. For example, if population for Durham County was 100000 in 2007 and 130000 in 2010, we can assume uniform change and have 110000 in 2008 and 120000 in 2009, which will be suffecient for our analysis. 

In [204]:
# check data for 2007 and 2010 and extrapolate for 2008 and 2009
# Calculate the yearly change from 2007 to 2010
# Calculate the yearly change from 2007 to 2010
yearly_change = (final["2010"] - final["2007"]) / 3

# Calculate the population data for 2008 and 2009
final.loc[:, "2008"] = (final["2007"] + yearly_change).astype(int)
final.loc[:, "2009"] = (final["2007"] + 2 * yearly_change).astype(int)

In [205]:
final

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,G0100010,1,1,Autauga County,Alabama,49039,49834,51413,52992,54571,54590,54590,0,0,0
1,G0100030,1,3,Baldwin County,Alabama,168154,172815,175965,179115,182265,183226,183226,195540,200111,203709
2,G0100050,1,5,Barbour County,Alabama,29420,29736,28976,28216,27457,27469,27469,0,0,0
3,G0100070,1,7,Bibb County,Alabama,21307,21485,21961,22438,22915,22769,22769,0,0,0
4,G0100090,1,9,Blount County,Alabama,55708,56866,57018,57170,57322,57466,57466,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,G5600370,56,37,Sweetwater County,Wyoming,38001,39320,40815,42310,43806,43890,43890,0,0,0
3139,G5600390,56,39,Teton County,Wyoming,19588,20073,20480,20887,21294,21326,21326,0,0,0
3140,G5600410,56,41,Uinta County,Wyoming,19688,20071,20420,20769,21118,20942,20942,0,0,0
3141,G5600430,56,43,Washakie County,Wyoming,7675,7805,8047,8290,8533,8425,8425,0,0,0


We had obtained data for `2008-2012` and filled in for all those years, but as we already filled above for 2008 and 2009, we are using those values for 2012, since they were higher than 2010 census data, and population generally increases. 

Now we will do the same for 2011, use `2010` and `2012` to see the change and assume it was uniform for 2011 also hence just use the midpoint

In [207]:
# Calculate the yearly change from 2010 to 2012
yearly_change = (final["2012"] - final["2010"]) / 2

# Calculate the population data for 2011
final.loc[:, "2011"] = (data2010["2010"] + yearly_change).astype(int)

# Filling in missing values for 2013, 2014, 2015 using moving avg of last 5 years

For `2013`, `2014` and `2015`, we obtained the data from `nhgis` but the problem is that data was for limited number of counties and not for all. For the rest of the counties, we are using the 5 year moving average to fill in

In [224]:
######### 2013 #########
# find avg growth rate in from 2008 to 2012 and extrapolate for 2013, 2014 and 2015

# Calculate the yearly change from 2008 to 2012
yearly_change = (final["2012"] - final["2008"]) / 4

# multiply with 2012 to get 2013, 2014 and 2015

final.loc[:, "2013"] = (final["2012"] + yearly_change).astype(int)

######### 2014 #########

yearly_change = (final["2013"] - final["2009"]) / 4

final.loc[:, "2014"] = (final["2013"] + yearly_change).astype(int)

######### 2015 #########

yearly_change = (final["2014"] - final["2010"]) / 4

final.loc[:, "2015"] = (final["2014"] + yearly_change).astype(int)

final.drop("Avg_Growth_Rate_2013", axis=1, inplace=True)

In [225]:
#### OUR FINAL DATA ####
final

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,G0100010,1,1,Autauga County,Alabama,49039,49834,51413,52992,54571,54580,54590,55384,55982,56334
1,G0100030,1,3,Baldwin County,Alabama,168154,172815,175965,179115,182265,182745,183226,185041,186522,187586
2,G0100050,1,5,Barbour County,Alabama,29420,29736,28976,28216,27457,27463,27469,27092,26811,26649
3,G0100070,1,7,Bibb County,Alabama,21307,21485,21961,22438,22915,22842,22769,22971,23104,23151
4,G0100090,1,9,Blount County,Alabama,55708,56866,57018,57170,57322,57394,57466,57578,57680,57769
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,G5600370,56,37,Sweetwater County,Wyoming,38001,39320,40815,42310,43806,43848,43890,44658,45245,45604
3139,G5600390,56,39,Teton County,Wyoming,19588,20073,20480,20887,21294,21310,21326,21537,21699,21800
3140,G5600410,56,41,Uinta County,Wyoming,19688,20071,20420,20769,21118,21030,20942,21072,21147,21154
3141,G5600430,56,43,Washakie County,Wyoming,7675,7805,8047,8290,8533,8479,8425,8519,8576,8586


In [227]:
# save it to a csv
final.to_csv("final_population_data.csv", index=False)

In [226]:
# remove where state is Alaska
final = final[final["STATE"] != "Alaska"]

In [232]:
# which county has the county code 91
final[(final["COUNTY_CODE"] == 91) & (final["STATE_CODE"] == 12)]
# which state is 12

Unnamed: 0,GISJOIN,STATE_CODE,COUNTY_CODE,COUNTY,STATE,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
365,G1200910,12,91,Okaloosa County,Florida,182834,180911,180881,180851,180822,182111,183400,184029,184823,185823


In [233]:
final.columns

Index(['GISJOIN', 'STATE_CODE', 'COUNTY_CODE', 'COUNTY', 'STATE', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'],
      dtype='object')