Ref: [Census Data PDF](https://github.com/GoCodeColorado/GoCodeColorado-kbase-public/blob/187410313442847c357e04fb553a121941b297bf/Resources_for_Participants/Data/DOLA_Census_Data_GoCodeColorado.pdf)<br>
Datasets: 
[2013](https://data.colorado.gov/Demographics/Census-Block-Groups-in-Colorado-2013/9gri-r239) 
[2014](https://data.colorado.gov/Demographics/Census-Block-Groups-in-Colorado-2014/cmkv-zd4f) 
[2015](https://data.colorado.gov/Demographics/Census-Block-Groups-in-Colorado-2015/6hee-tnp6) 
[2016](https://data.colorado.gov/Demographics/Census-Block-Groups-in-Colorado-2016/iku4-4bpx) 
[2017](https://data.colorado.gov/Demographics/Census-Block-Groups-in-Colorado-2017/ty5m-9xub) <br>
Last update: Apr 19th, 2019 by Meng "Leo" Luo

**Note**: <br>
1. To join with other data sets, use the main key 'geoid'/'geonum'. It is similar to a FIPS code and looks like '1080310004003', which breaks down as: 1 - US, 08 - Colorado, 031 - Denver, 000400 - tract ID, 3 - census block group.
2. Columns missing more than 25% of their values are dropped entirely.

In [0]:
## Initiating
import pandas as pd
import numpy as np

In [0]:
## Load the first year (2013); later cells read both `df` and `df_5years`

year = 2013
filename = ''.join(['census_block_groups_', str(year), '.csv'])
# geonum is parsed as text rather than as a number; Year tags every row with its source year
df = pd.read_csv(filename, dtype={'geonum': str}).assign(Year=year)
print(df.tail())
df_5years = df

In [0]:
## Check shape and NaN
def nancheck(df):
    """Print a frame's shape followed by the number of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    print('Shape of the dataframe:', df.shape, '\n')
    print('Missing value counts:')
    # count() tallies the non-null cells of each column, so subtracting it
    # from the row total leaves the number of nulls per column.
    n_rows = df.shape[0]
    missing_per_column = n_rows - df.count()
    print(missing_per_column, '\n')

# Report the shape and per-column missing-value counts of the frame loaded above
nancheck(df)

Shape of the dataframe: (3532, 156) 

Missing value counts:
geonum           0
pop              0
hispanic         0
white_nh         0
black_nh         0
ntvam_nh         0
asian_nh         0
hawpi_nh         0
other_nh         0
twoplus_nh       0
male             0
female           0
ageless5         0
age5_9           0
age10_14         0
age15_19         0
age20_24         0
age25_29         0
age30_34         0
age35_39         0
age40_44         0
age45_49         0
age50_54         0
age55_59         0
age60_64         0
age65_69         0
age70_74         0
age75_79         0
age80_84         0
age85pl          0
              ... 
ps_uni           0
ps_below         0
ps_bel150        0
tot_l18       3532
puni_65pl     3532
pov_l18       3532
pov_65pl      3532
avghhsize       21
hhi_l20k         0
hhi20_30         0
hhi30_40         0
hhi40_50         0
hhi50_60         0
hhi60_75         0
hhi75_100        0
hhi100_125       0
hhi125_150       0
hhi150_200       0
hhi200_pl

In [0]:

def loadcensus(year):
    """Read one year's census block-group CSV, resolved relative to the working directory.

    Parameters
    ----------
    year : int
        Year to load; selects the file ``census_block_groups_<year>.csv``.

    Returns
    -------
    pandas.DataFrame
        The file's contents with ``geonum`` read as a string and a ``Year``
        column set to ``year`` on every row.
    """
    csv_name = ''.join(['census_block_groups_', str(year), '.csv'])
    yearly = pd.read_csv(csv_name, dtype={'geonum': str})
    return yearly.assign(Year=year)

# Build the multi-year frame with one concat, seeded from the 2013 frame `df`
# rather than from `df_5years` itself. The previous loop extended `df_5years`
# in place, so re-running this cell appended 2014-2017 a second time.
# NOTE: row labels are not reset, so index values repeat across years.
df_5years = pd.concat([df] + [loadcensus(yr) for yr in range(2014, 2018)])
df_5years.tail()

Unnamed: 0,Year,age10_14,age15_19,age18_24,age20_24,age25_29,age30_34,age35_39,age40_44,age45_49,...,v50k_100k,v750k_1m,v_1m_plus,v_l_50k,vac_hu,w_16pl_nh,walk,white_nh,wrk_home,wrkrs_16pl
3527,2017,13,42,47,36,50,26,47,56,65,...,9,8,0,0,157,,,651,,
3528,2017,420,192,245,176,186,158,357,329,348,...,0,26,15,67,73,,,3690,,
3529,2017,566,159,185,159,437,1099,1260,911,767,...,0,227,23,18,151,,,6598,,
3530,2017,37,104,191,110,176,63,87,51,26,...,0,0,0,0,25,,,667,,
3531,2017,0,2896,3252,409,0,0,0,0,0,...,0,0,0,0,0,,,2612,,


In [0]:
nancheck(df_5years)

Shape of the dataframe: (17660, 156) 

Missing value counts:
Year              0
age10_14          0
age15_19          0
age18_24          0
age20_24          0
age25_29          0
age30_34          0
age35_39          0
age40_44          0
age45_49          0
age50_54          0
age55_59          0
age5_9            0
age60_64          0
age65_69          0
age70_74          0
age75_79          0
age80_84          0
age85pl           0
ageless18         0
ageless5          0
armedfrcs         0
asian_nh          0
avghhsize       108
b1939_e           0
b1940_1949        0
b1950_1959        0
b1960_1969        0
b1970_1979        0
b1980_1989        0
              ...  
same_house    17660
same_state    17660
t_10_19       17660
t_20_29       17660
t_30_39       17660
t_40_59       17660
t_60_pl       17660
t_less_10     17660
tot_l18       17660
tr_other      17660
twoplus_nh        0
undergrad     17660
unemp             0
v100k_150k        0
v150k_200k        0
v200k_250k        0

In [0]:
## More cleansing - drop columns that are missing in too many rows
# A column is kept only when at least 75% of the rows hold a value in it
min_non_null = len(df_5years) * .75
df_5years_clean = df_5years.dropna(axis='columns', thresh=min_non_null)
nancheck(df_5years_clean)

Shape of the dataframe: (17660, 107) 

Missing value counts:
Year            0
age10_14        0
age15_19        0
age18_24        0
age20_24        0
age25_29        0
age30_34        0
age35_39        0
age40_44        0
age45_49        0
age50_54        0
age55_59        0
age5_9          0
age60_64        0
age65_69        0
age70_74        0
age75_79        0
age80_84        0
age85pl         0
ageless18       0
ageless5        0
armedfrcs       0
asian_nh        0
avghhsize     108
b1939_e         0
b1940_1949      0
b1950_1959      0
b1960_1969      0
b1970_1979      0
b1980_1989      0
             ... 
pop25plus       0
ps_bel150       0
ps_below        0
ps_uni          0
r1000t1249      0
r1250t1499      0
r1500t1999      0
r2000pl         0
r400t599        0
r600t799        0
r800t999        0
rented          0
rnocshr         0
rnt_occ_hu      0
rntl400         0
twoplus_nh      0
unemp           0
v100k_150k      0
v150k_200k      0
v200k_250k      0
v250k_300k      0
v30

In [0]:
# Write the cleaned multi-year frame to CSV, leaving the row index out of the file
output_csv = 'Cleaned_CensusBlockGroup_2013-17.csv'
df_5years_clean.to_csv(output_csv, index=False)