In [1]:
# See covid_virtual_env.sh for environment setup
import pandas as pd, ftplib, io, pickle

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_rows', 500)

# Census Data

The US Census provides data access through it's FTP site:
* Website: https://www.census.gov/programs-surveys/acs/data/data-via-ftp.html
* HTTPS: https://www2.census.gov/programs-surveys/acs/
* FTP:  ftp://ftp2.census.gov

You do NOT need a user name and password. If prompted for one, use "anonymous" as the user name, and no password.


In [2]:
# Test if we can login
ftp = ftplib.FTP('ftp2.census.gov')
print(ftp.login())
print(ftp.quit())

230-Server: ftp2.census.gov
230-
230-Personal Identifiable Information (PII) shall not be placed on the FTP
230-server without prior special arrangement and in conjunction with ITSO.
230-
230-NOTE: The data available for anonymous FTP download on this FTP server are
230-also available over the Web:
230-http://www2.census.gov
230 Login successful.
221 Goodbye.


### Loging in to the Cencus COVID Data
Data does not appear to have any actual covid data, but looks like it was compiled for HHS for coronavirus research.
* Each file looks to be state specific, and contains county level detail for what appears to be almost all of the ACS data available.

In [3]:
# Login and navigate to the desired data, and explore the directory
ftp = ftplib.FTP('ftp2.census.gov')
ftp.login()
ftp.cwd('programs-surveys/acs/data/covid_19/Data_Profiles_for_HHS/050-County_By_State')
ftp.dir()
ftp.quit()

-rw-rw-r--    1 jenki415 i-acso      52069 Apr 28 07:17 dp02_ak.csv
-rw-rw-r--    1 jenki415 i-acso     125479 Apr 28 07:17 dp02_al.csv
-rw-rw-r--    1 jenki415 i-acso     134649 Apr 28 07:17 dp02_ar.csv
-rw-rw-r--    1 jenki415 i-acso      35930 Apr 28 07:17 dp02_az.csv
-rw-rw-r--    1 jenki415 i-acso     124719 Apr 28 07:17 dp02_ca.csv
-rw-rw-r--    1 jenki415 i-acso     114944 Apr 28 07:17 dp02_co.csv
-rw-rw-r--    1 jenki415 i-acso      23002 Apr 28 07:17 dp02_ct.csv
-rw-rw-r--    1 jenki415 i-acso       7986 Apr 28 07:17 dp02_dc.csv
-rw-rw-r--    1 jenki415 i-acso      12152 Apr 28 07:17 dp02_de.csv
-rw-rw-r--    1 jenki415 i-acso     139008 Apr 28 07:17 dp02_fl.csv
-rw-rw-r--    1 jenki415 i-acso     283467 Apr 28 07:17 dp02_ga.csv
-rw-rw-r--    1 jenki415 i-acso      15207 Apr 28 07:17 dp02_hi.csv
-rw-rw-r--    1 jenki415 i-acso     172597 Apr 28 07:17 dp02_ia.csv
-rw-rw-r--    1 jenki415 i-acso      79946 Apr 28 07:17 dp02_id.csv
-rw-rw-r--    1 jenki415 i-acso     186406 Apr 2

'221 Goodbye.'

In [4]:
# Get the files there
ftp = ftplib.FTP('ftp2.census.gov')
ftp.login()
ftp.cwd('programs-surveys/acs/data/covid_19/Data_Profiles_for_HHS/050-County_By_State')

files = ftp.nlst()
print(files)
ftp.quit()

['dp02_ak.csv', 'dp02_al.csv', 'dp02_ar.csv', 'dp02_az.csv', 'dp02_ca.csv', 'dp02_co.csv', 'dp02_ct.csv', 'dp02_dc.csv', 'dp02_de.csv', 'dp02_fl.csv', 'dp02_ga.csv', 'dp02_hi.csv', 'dp02_ia.csv', 'dp02_id.csv', 'dp02_il.csv', 'dp02_in.csv', 'dp02_ks.csv', 'dp02_ky.csv', 'dp02_la.csv', 'dp02_ma.csv', 'dp02_md.csv', 'dp02_me.csv', 'dp02_mi.csv', 'dp02_mn.csv', 'dp02_mo.csv', 'dp02_ms.csv', 'dp02_mt.csv', 'dp02_nc.csv', 'dp02_nd.csv', 'dp02_ne.csv', 'dp02_nh.csv', 'dp02_nj.csv', 'dp02_nm.csv', 'dp02_nv.csv', 'dp02_ny.csv', 'dp02_oh.csv', 'dp02_ok.csv', 'dp02_or.csv', 'dp02_pa.csv', 'dp02_pr.csv', 'dp02_ri.csv', 'dp02_sc.csv', 'dp02_sd.csv', 'dp02_tn.csv', 'dp02_tx.csv', 'dp02_ut.csv', 'dp02_va.csv', 'dp02_vt.csv', 'dp02_wa.csv', 'dp02_wi.csv', 'dp02_wv.csv', 'dp02_wy.csv', 'dp03_ak.csv', 'dp03_al.csv', 'dp03_ar.csv', 'dp03_az.csv', 'dp03_ca.csv', 'dp03_co.csv', 'dp03_ct.csv', 'dp03_dc.csv', 'dp03_de.csv', 'dp03_fl.csv', 'dp03_ga.csv', 'dp03_hi.csv', 'dp03_ia.csv', 'dp03_id.csv', 'dp03_il.

'221 Goodbye.'

### Download all the COVID csv files to a pickle

Loop through all the files, do some transformations, load them into a list containing a data frame for each file, and save to a pickle.
* ISO-8859-1 encoding looks to handle the special characters in some names better

In [5]:
# Login
ftp = ftplib.FTP('ftp2.census.gov')
ftp.login()
ftp.cwd('programs-surveys/acs/data/covid_19/Data_Profiles_for_HHS/050-County_By_State')

# Load and transform files
data = pd.DataFrame()
data_list = []
for file in files:
    download_file = io.BytesIO()
    ftp.retrbinary("RETR {}".format(file), download_file.write)
    download_file.seek(0) # after writing go back to the start of the virtual file
    df = pd.read_csv(download_file, encoding = "ISO-8859-1") # read virtual file into pandas
    
    # Transform the df to get ready to transpose
    df = df.drop(columns=['Line Number','Table ID'])
    df = df[df.columns.drop(list(df.filter(regex='Percent')))]
    df.columns = df.columns.str.rstrip(' Estimate')  

    df = df.rename(columns = {'Description':'County'})

    # Transpose the df so that we can aggregate files with rows for each county
    df = df.T
    
    # Fix the column names
    col_names = df.iloc[0,]
    df.columns = col_names

    # Drop the row with the column names
    df = df.drop(df.index[0])

    # Add a state columns
    df["State"] = file[-6:-4].upper()

    # Add the data to a list
    data_list.append(df)
    
ftp.quit()

'221 Goodbye.'

Now dump the list to a pickle.

In [6]:
pickle.dump(data_list, open('census.p','wb'))

### Inspecting the aggregate COVID data

Load the data from pickle

In [7]:
data_list = pickle.load(open('census.p','rb'))

There are four different types of tables. These are actually indicated by the 'dp##' in the file name, but also in a field we dropped called 'Table ID'.  

In [8]:
# Print the shapes for all the data frames in our list of data frames
for i in range(len(data_list)):
    print(data_list[i].shape)

(29, 198)
(67, 198)
(75, 198)
(15, 198)
(58, 198)
(64, 198)
(8, 198)
(1, 198)
(3, 198)
(67, 198)
(159, 198)
(5, 198)
(99, 198)
(44, 198)
(102, 198)
(92, 198)
(105, 198)
(120, 198)
(64, 198)
(14, 198)
(24, 198)
(16, 198)
(83, 198)
(87, 198)
(115, 198)
(82, 198)
(56, 198)
(100, 198)
(53, 198)
(93, 198)
(10, 198)
(21, 198)
(33, 198)
(17, 198)
(62, 198)
(88, 198)
(77, 198)
(36, 198)
(67, 198)
(78, 198)
(5, 198)
(46, 198)
(66, 198)
(95, 198)
(254, 198)
(29, 198)
(133, 198)
(14, 198)
(39, 198)
(72, 198)
(55, 198)
(23, 198)
(29, 168)
(67, 168)
(75, 168)
(15, 168)
(58, 168)
(64, 168)
(8, 168)
(1, 168)
(3, 168)
(67, 168)
(159, 168)
(5, 168)
(99, 168)
(44, 168)
(102, 168)
(92, 168)
(105, 168)
(120, 168)
(64, 168)
(14, 168)
(24, 168)
(16, 168)
(83, 168)
(87, 168)
(115, 168)
(82, 168)
(56, 168)
(100, 168)
(53, 168)
(93, 168)
(10, 168)
(21, 168)
(33, 168)
(17, 168)
(62, 168)
(88, 168)
(77, 168)
(36, 168)
(67, 168)
(78, 168)
(5, 168)
(46, 168)
(66, 168)
(95, 168)
(254, 168)
(29, 168)
(133, 168)
(14,

#### Inspecting the 2nd frame
* State = AR, and Table = dp02
* Table Name: Selected Social Characteristics in the USA

In [9]:
data_list[2].shape

(75, 198)

In [10]:
data_list[2].head()

County,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES,HOUSEHOLDS BY TYPE,Total households,Family households (families),With own children of the householder under 18 years,Married-couple family,With own children of the householder under 18 years.1,"Male householder, no wife present, family",With own children of the householder under 18 years.2,"Female householder, no husband present, family",...,Swiss,Ukrainian,Welsh,West Indian (excluding Hispanic origin groups),NaN,COMPUTERS AND INTERNET USE,Total households.1,With a computer,With a broadband Internet subscription,State
Arkansas County,,,7682,4727,1857,3331,1029,353,201,1043,...,0,0,52,0,,,7682,6067,4828,AR
Ashley County,,,7915,5432,1976,4107,1319,284,71,1041,...,0,0,53,0,,,7915,6003,4787,AR
Baxter County,,,18370,12104,3466,9955,2494,592,222,1557,...,275,20,130,162,,,18370,15352,13007,AR
Benton County,,,94333,68778,33529,55774,26147,3537,1986,9467,...,710,185,1235,270,,,94333,86384,61145,AR
Boone County,,,14870,10351,3890,8141,2749,783,462,1427,...,137,0,356,32,,,14870,12611,11552,AR


#### Inspecting the 54th data frame
* State = AR, Table = dp03
* Table Name: Selected Economic Characteristics

In [11]:
data_list[54].shape

(75, 168)

In [12]:
data_list[54].head()

County,SELECTED ECONOMIC CHARACTERISTICS,EMPLOYMENT STATUS,Population 16 years and over,In labor force,Civilian labor force,Employed,Unemployed,Armed Forces,Not in labor force,NaN,...,Under 18 years,Related children of the householder under 18 years,Related children of the householder under 5 years,Related children of the householder 5 to 17 years,18 years and over,18 to 64 years,65 years and over,People in families,Unrelated individuals 15 years and over,State
Arkansas County,,,14579,8316,8316,7796,520,0,6263,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),AR
Ashley County,,,16167,8301,8301,7728,573,0,7866,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),AR
Baxter County,,,34881,16968,16968,16232,736,0,17913,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),AR
Benton County,,,197430,128941,128841,125343,3498,100,68489,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),AR
Boone County,,,29661,16892,16885,16145,740,7,12769,,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),AR


#### Inspecting the 106th data frame
* State = AR, Table = dp04
* Table Name: Selected Housing Characteristics

In [13]:
data_list[106].shape

(75, 186)

In [14]:
data_list[106].head()

County,SELECTED HOUSING CHARACTERISTICS,HOUSING OCCUPANCY,Total housing units,Occupied housing units,Vacant housing units,NaN,Homeowner vacancy rate,Rental vacancy rate,NaN.1,UNITS IN STRUCTURE,...,Occupied units paying rent (excluding units where GRAPI cannot be computed),Less than 15.0 percent,15.0 to 19.9 percent,20.0 to 24.9 percent,25.0 to 29.9 percent,30.0 to 34.9 percent,35.0 percent or more,NaN.2,Not computed,State
Arkansas County,,,9449,7682,1767,,2.9,9.0,,,...,2180,417,175,277,300,177,834,,608,AR
Ashley County,,,10148,7915,2233,,2.3,16.8,,,...,1709,172,210,191,201,65,870,,375,AR
Baxter County,,,22859,18370,4489,,2.1,10.2,,,...,3997,718,643,488,538,376,1234,,444,AR
Benton County,,,102838,94333,8505,,1.7,2.9,,,...,29121,7173,4948,3971,3124,1780,8125,,2594,AR
Boone County,,,16971,14870,2101,,2.5,5.7,,,...,3731,707,559,375,605,386,1099,,501,AR


#### Inspecting the 158th data frame
* State = AR, Table = dp05
* Table Name: ACS Demographic and Housing Estimates

In [15]:
data_list[158].shape

(75, 107)

In [16]:
data_list[158].head()

County,ACS DEMOGRAPHIC AND HOUSING ESTIMATES,SEX AND AGE,Total population,Male,Female,Sex ratio (males per 100 females),NaN,Under 5 years,5 to 9 years,10 to 14 years,...,Two races including Some other race,"Two races excluding Some other race, and Three or more races",NaN.1,Total housing units,NaN.2,"CITIZEN, VOTING AGE POPULATION","Citizen, 18 and over population",Male.1,Female.1,State
Arkansas County,,,18124,8733,9391,93.0,,1207,1313,878,...,6,174,,9449,,,13853,6500,7353,AR
Ashley County,,,20537,10013,10524,95.1,,1247,1707,1051,...,0,112,,10148,,,15288,7200,8088,AR
Baxter County,,,41219,19992,21227,94.2,,1721,2076,2139,...,0,711,,22859,,,33827,16182,17645,AR
Benton County,,,258980,128399,130581,98.3,,18379,18906,20702,...,112,6115,,102838,,,170777,82723,88054,AR
Boone County,,,37288,18350,18938,96.9,,2288,2344,2425,...,0,654,,16971,,,28667,13962,14705,AR


#### Do all the tables have the same number of counties?

In [17]:
# Inspecting the output below indicates, yes!
for i in range(int(len(data_list)/4)):
    print(data_list[i].shape[0], data_list[i+52].shape[0], data_list[i+52*2].shape[0], data_list[i+52*3].shape[0])

29 29 29 29
67 67 67 67
75 75 75 75
15 15 15 15
58 58 58 58
64 64 64 64
8 8 8 8
1 1 1 1
3 3 3 3
67 67 67 67
159 159 159 159
5 5 5 5
99 99 99 99
44 44 44 44
102 102 102 102
92 92 92 92
105 105 105 105
120 120 120 120
64 64 64 64
14 14 14 14
24 24 24 24
16 16 16 16
83 83 83 83
87 87 87 87
115 115 115 115
82 82 82 82
56 56 56 56
100 100 100 100
53 53 53 53
93 93 93 93
10 10 10 10
21 21 21 21
33 33 33 33
17 17 17 17
62 62 62 62
88 88 88 88
77 77 77 77
36 36 36 36
67 67 67 67
78 78 78 78
5 5 5 5
46 46 46 46
66 66 66 66
95 95 95 95
254 254 254 254
29 29 29 29
133 133 133 133
14 14 14 14
39 39 39 39
72 72 72 72
55 55 55 55
23 23 23 23


### Creating a master census data table

Concatinate the four table types horizontally.

In [18]:
data = pd.DataFrame()
data_list_2 = []
for i in range(int(len(data_list)/4)):
    df = pd.concat([data_list[i], data_list[i+52], data_list[i+2*52], data_list[i+3*52]], axis=1)
    data_list_2.append(df)

Remove Puerto Rico as the column names are different

In [19]:
# Puerto Rico table title says "Selected Social Characteristics in "Puerto Rico" which causes a problem with concatentate
data_list_2.pop(39)

County,SELECTED SOCIAL CHARACTERISTICS IN PUERTO RICO,HOUSEHOLDS BY TYPE,Total households,Family households (families),With own children of the householder under 18 years,Married-couple family,With own children of the householder under 18 years.1,"Male householder, no wife present, family",With own children of the householder under 18 years.2,"Female householder, no husband present, family",...,Two races including Some other race,"Two races excluding Some other race, and Three or more races",NaN,Total housing units,NaN.1,"CITIZEN, VOTING AGE POPULATION","Citizen, 18 and over population",Male,Female,State
Adjuntas Municipio,,,5861,4194,1487,2464,661,408,152,1322,...,0,0,,7594,,,14215,6838,7377,PR
Aguada Municipio,,,12819,9253,2930,6107,1552,895,406,2251,...,15,296,,17555,,,30987,15099,15888,PR
Aguadilla Municipio,,,21604,14998,5882,8458,2620,1339,558,5201,...,0,106,,27241,,,42913,20464,22449,PR
Aguas Buenas Municipio,,,8469,6270,2029,3635,914,542,209,2093,...,0,0,,10762,,,20759,9901,10858,PR
Aibonito Municipio,,,8301,6370,1936,3600,732,783,348,1987,...,0,0,,9800,,,18611,8785,9826,PR
Añasco Municipio,,,8933,6516,2046,4108,1081,559,227,1849,...,0,114,,12524,,,22022,10469,11553,PR
Arecibo Municipio,,,31788,21884,7587,12654,3525,1523,617,7707,...,0,28,,41309,,,69914,32862,37052,PR
Arroyo Municipio,,,6002,4351,1805,2355,726,326,158,1670,...,0,0,,8446,,,13998,6315,7683,PR
Barceloneta Municipio,,,8010,5680,2226,3006,927,437,144,2237,...,0,13,,10094,,,19086,8853,10233,PR
Barranquitas Municipio,,,8918,6745,2258,3566,618,787,269,2392,...,0,0,,10764,,,21995,10667,11328,PR


Concatinate the rows for all the states

In [20]:
data = pd.concat(data_list_2)

In [21]:
data.head()

County,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES,HOUSEHOLDS BY TYPE,Total households,Family households (families),With own children of the householder under 18 years,Married-couple family,With own children of the householder under 18 years.1,"Male householder, no wife present, family",With own children of the householder under 18 years.2,"Female householder, no husband present, family",...,Two races including Some other race,"Two races excluding Some other race, and Three or more races",NaN,Total housing units,NaN.1,"CITIZEN, VOTING AGE POPULATION","Citizen, 18 and over population",Male,Female,State
Aleutians East Borough,,,860,538,243,346,122,79,29,113,...,0,222,,1106,,,2122,1221,901,AK
Aleutians West Census Ar,,,1176,716,407,510,288,87,51,119,...,7,255,,1967,,,3511,2308,1203,AK
Anchorage Municipality,,,106524,70176,33794,52005,23424,5921,3183,12250,...,278,24232,,116493,,,212747,109333,103414,AK
Bethel Census Ar,,,4558,3478,1967,2041,1212,559,319,878,...,0,670,,6027,,,11411,5974,5437,AK
Bristol Bay Borough,,,333,209,93,160,61,18,10,31,...,0,78,,940,,,686,419,267,AK


In [22]:
data.shape

(3142, 659)

Dump the master table as a pickle file

In [23]:
pickle.dump(data, open('census_master.p','wb'))

Print the columns we have aggregated

In [24]:
for col in data.columns:
    print(col)

SELECTED SOCIAL CHARACTERISTICS IN THE UNITED STATES
HOUSEHOLDS BY TYPE
Total households
Family households (families)
With own children of the householder under 18 years
Married-couple family
With own children of the householder under 18 years
Male householder, no wife present, family
With own children of the householder under 18 years
Female householder, no husband present, family
With own children of the householder under 18 years
Nonfamily households
Householder living alone
65 years and over
nan
Households with one or more people under 18 years
Households with one or more people 65 years and over
nan
Average household size
Average family size
nan
RELATIONSHIP
Population in households
Householder
Spouse
Child
Other relatives
Nonrelatives
Unmarried partner
nan
MARITAL STATUS
Males 15 years and over
Never married
Now married, except separated
Separated
Widowed
Divorced
nan
Females 15 years and over
Never married
Now married, except separated
Separated
Widowed
Divorced
nan
FERTILITY
Nu