Sources
* https://www.ntia.doc.gov/page/download-digital-nation-datasets
* https://www.ntia.doc.gov/files/ntia/data_central_downloads/docs/november-2017-techdocs.pdf

Useful variables:
* peinhome
* peinwork
* pemphone
* hemobdat
* hehomte1
* hefaminc
* pesex
* prtage
* peeduca 
* ptdtrace
* prdthsp
* prnmchld
* gestfips

In [1]:
# pandas is used from this first cell onwards; import it here so the notebook
# survives "Restart Kernel -> Run All" on a fresh kernel.
import pandas as pd

# Load the CPS extract.
cps = pd.read_csv("data/nov19-cps.csv")

# Keep only the columns whose name never contains "wgt" followed by a digit
# (presumably the numbered replicate weights -- TODO confirm against the techdocs).
# Un-numbered weight columns such as pwsswgt / hwhhwgt are kept.
cps = cps.filter(regex = "^((?!wgt[0-9]).)*$", axis = 1)

# Boolean indicators. The P_* flags are built from pe* items and the H_* flags
# from he* items. Each flag is True where the item is coded 1 (presumably "yes"
# -- confirm against the codebook); P_smartphone_resp is True for any positive
# pemphone code.
cps["P_at_home"]         = cps.peinhome == 1
cps["P_at_work"]         = cps.peinwork == 1
cps["P_smartphone"]      = cps.pemphone == 1
cps["P_smartphone_resp"] = cps.pemphone > 0
cps["H_mobdat"]          = cps.hemobdat == 1
cps["H_highsp"]          = cps.hehomte1 == 1

### Lots of contortions to get Census places.

In [2]:
# Short names of the cities of interest.
cities = [
    "new_york", "los_angeles", "chicago", "houston", "phoenix",
    "philadelphia", "san_antonio", "san_diego", "dallas", "san_jose",
    "austin", "jacksonville", "san_francisco", "columbus", "fort_worth",
    "indianapolis", "charlotte", "seattle", "denver", "washington",
]

# For each city, the geography codes a record must match to be tagged with it:
# state (gestfips), CBSA (gtcbsa), plus gtcbsast / gtindvpc and, for a few
# cities, the county (gtco).
city_dict = {
    "new_york"      : dict(gestfips=36, gtcbsa=35620, gtcbsast=1, gtindvpc=1),
    "los_angeles"   : dict(gestfips=6,  gtcbsa=31080, gtcbsast=1, gtindvpc=1, gtco=37),
    "chicago"       : dict(gestfips=17, gtcbsa=16980, gtcbsast=1, gtindvpc=1),
    "houston"       : dict(gestfips=48, gtcbsa=26420, gtcbsast=1, gtindvpc=1),
    "phoenix"       : dict(gestfips=4,  gtcbsa=38060, gtcbsast=1, gtindvpc=1),
    "philadelphia"  : dict(gestfips=42, gtcbsa=37980, gtcbsast=1, gtindvpc=0, gtco=101),
    "san_antonio"   : dict(gestfips=48, gtcbsa=41700, gtcbsast=1, gtindvpc=0),
    "san_diego"     : dict(gestfips=6,  gtcbsa=41740, gtcbsast=1, gtindvpc=1),
    "dallas"        : dict(gestfips=48, gtcbsa=19100, gtcbsast=1, gtindvpc=1),
    "san_jose"      : dict(gestfips=6,  gtcbsa=41940, gtcbsast=1, gtindvpc=1),
    "austin"        : dict(gestfips=48, gtcbsa=12420, gtcbsast=1, gtindvpc=0),
    "jacksonville"  : dict(gestfips=12, gtcbsa=27260, gtcbsast=1, gtindvpc=0),
    "san_francisco" : dict(gestfips=6,  gtcbsa=41860, gtcbsast=1, gtindvpc=1, gtco=75),  # SF county
    "columbus"      : dict(gestfips=39, gtcbsa=18140, gtcbsast=1, gtindvpc=0),
    "fort_worth"    : dict(gestfips=48, gtcbsa=19100, gtcbsast=1, gtindvpc=2),
    "indianapolis"  : dict(gestfips=18, gtcbsa=26900, gtcbsast=1, gtindvpc=1),
    "charlotte"     : dict(gestfips=37, gtcbsa=16740, gtcbsast=1, gtindvpc=1),
    "seattle"       : dict(gestfips=53, gtcbsa=42660, gtcbsast=1, gtindvpc=1),
    "denver"        : dict(gestfips=8,  gtcbsa=19740, gtcbsast=1, gtindvpc=1),
    "washington"    : dict(gestfips=11, gtcbsa=47900, gtcbsast=1, gtindvpc=1),
}

# Start with every record untagged, then label the rows matching each city's
# criteria (all conditions AND-ed together).
cps["city"] = ""
for city, criteria in city_dict.items():
    clauses = [f"({column} == {code})" for column, code in criteria.items()]
    matched = cps.query(" & ".join(clauses))
    cps.loc[matched.index, "city"] = city

# Independent copy holding only the records that were assigned to a city.
cps_city = cps.loc[cps["city"] != ""].copy()

In [3]:
# Short names of the metro areas (CBSAs) of interest, in the order they are
# reported further down (this list is later used to reindex the results).
cbsa = ["new_york", "los_angeles", "chicago", "dallas-fort_worth", "houston", 
        "philadelphia", "washington", "miami", "atlanta", "boston", 
        "san_francisco", "riverside", "phoenix", "detroit", "seattle", 
        "minneapolis", "san_diego", "tampa", "st_louis", "baltimore"]

# gtcbsa code -> short metro name.
cbsa_dict = {
    35620 : "new_york", 31080 : "los_angeles", 16980 : "chicago", 19100 : "dallas-fort_worth", 26420 : "houston", 
    37980 : "philadelphia", 47900 : "washington", 33100 : "miami", 12060 : "atlanta", 14460 : "boston", 
    41860 : "san_francisco", 40140 : "riverside", 38060 : "phoenix", 19820 : "detroit", 42660 : "seattle", 
    33460 : "minneapolis", 41740 : "san_diego", 45300 : "tampa", 41180 : "st_louis", 12580 : "baltimore"
}

# Translate each record's CBSA code to its short name in a single pass.
# Codes that are not in cbsa_dict come back as missing and are stored as "",
# which is the "not one of the selected metros" marker used below.
cps["cbsa"] = cps.gtcbsa.map(cbsa_dict).fillna("")

# Independent copy holding only the records in one of the selected metros.
cps_cbsa = cps[cps.cbsa != ""].copy()

In [4]:
# Sanity check: list the distinct geography-code combinations tagged as Houston.
geo_columns = ["city", "gestfips", "gtcbsa", "gtcsa", "gtmetsta", "gtcbsast", "gtindvpc", "gtco"]
cps.loc[cps["city"] == "houston", geo_columns].drop_duplicates()

Unnamed: 0,city,gestfips,gtcbsa,gtcsa,gtmetsta,gtcbsast,gtindvpc,gtco
9133,houston,48,26420,0,1,1,1,0


These weighted population totals look about right. Washington, DC is _way_ oversampled in the raw records, so the weights are NECESSARY.

In [5]:
# Sum of the pwsswgt weights per city, largest first.
city_weight_totals = cps_city["pwsswgt"].groupby(cps_city["city"]).sum()
city_weight_totals.sort_values(ascending=False)

city
new_york         8.278783e+06
los_angeles      4.074095e+06
chicago          2.496310e+06
houston          2.191824e+06
san_antonio      1.851952e+06
philadelphia     1.645956e+06
phoenix          1.523442e+06
san_diego        1.292693e+06
fort_worth       1.138746e+06
dallas           1.023687e+06
indianapolis     9.692320e+05
columbus         9.394237e+05
jacksonville     9.317886e+05
austin           8.087954e+05
denver           8.071363e+05
san_francisco    8.046062e+05
seattle          7.749811e+05
san_jose         7.281600e+05
charlotte        7.028493e+05
washington       7.005770e+05
Name: pwsswgt, dtype: float64

Internet in home.

In [22]:
def _weighted_mean(grp, weight_col):
    """Weighted mean of every numeric/boolean column of ``grp``.

    Each selected column is multiplied row-wise by ``grp[weight_col]``, summed,
    and divided by the sum of the weights. Non-numeric columns are dropped.
    The weight column itself is numeric, so it also appears in the result.

    Uses the public ``select_dtypes`` instead of the private
    ``_get_numeric_data`` so the helper does not depend on pandas internals.
    """
    weights = grp[weight_col]
    numeric = grp.select_dtypes(include=["number", "bool"])
    return numeric.multiply(weights, axis=0).sum() / weights.sum()


def h_weighted_mean(grp):
    """Weighted mean of ``grp``'s numeric columns using the ``hwhhwgt`` weights.

    Returns a Series indexed by column name.
    """
    return _weighted_mean(grp, "hwhhwgt")


def p_weighted_mean(grp):
    """Weighted mean of ``grp``'s numeric columns using the ``pwsswgt`` weights.

    Returns a Series indexed by column name.
    """
    return _weighted_mean(grp, "pwsswgt")

def access(df, grouper):
    """Tabulate weighted internet-access shares for each group in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Records containing the raw items ``prtage``, ``peinhome``,
        ``peinwork``, ``pemphone``, ``hemobdat`` and ``hehomte1``, the boolean
        flags ``P_at_home``, ``P_at_work``, ``P_smartphone``,
        ``P_smartphone_resp``, ``H_mobdat`` and ``H_highsp``, and the weights
        ``pwsswgt`` and ``hwhhwgt``.
    grouper : str or list of str
        Column(s) of ``df`` to group by (e.g. ``"city"`` or ``"cbsa"``).

    Returns
    -------
    pandas.DataFrame
        One row per group and one column per flag, rounded to 3 decimals and
        sorted by ``H_highsp`` in descending order. A group with no qualifying
        rows for a flag gets NaN in that column.
    """

    prime_age = "(18 <= prtage) & (prtage < 65)"

    def group_share(rows, flag, weight, weighted_mean):
        # Hand the aggregator only the flag and its weight. The full frame has
        # many more numeric columns, and averaging all of them just to keep
        # one is wasted work. It also keeps the grouping column out of apply.
        return rows.groupby(grouper)[[flag, weight]].apply(weighted_mean)[flag]

    # P_* flags use pwsswgt. The first three are restricted to ages 18-64 and
    # to rows with a positive code on the underlying item.
    home   = group_share(df.query("(peinhome > 0) & " + prime_age), "P_at_home",    "pwsswgt", p_weighted_mean)
    work   = group_share(df.query("(peinwork > 0) & " + prime_age), "P_at_work",    "pwsswgt", p_weighted_mean)
    mphone = group_share(df.query("(pemphone > 0) & " + prime_age), "P_smartphone", "pwsswgt", p_weighted_mean)

    # NOTE(review): unlike the three measures above, this one is computed over
    # every row (no age or positive-code filter) -- confirm that is intended.
    mphone_resp = group_share(df, "P_smartphone_resp", "pwsswgt", p_weighted_mean)

    # H_* flags use hwhhwgt, restricted to rows with a positive code.
    # NOTE(review): if df holds one row per person, each household is counted
    # once per member here -- confirm whether rows should be de-duplicated first.
    mobile = group_share(df.query("hemobdat > 0"), "H_mobdat", "hwhhwgt", h_weighted_mean)
    highsp = group_share(df.query("hehomte1 > 0"), "H_highsp", "hwhhwgt", h_weighted_mean)

    assembled = pd.concat([home, work, mphone, mphone_resp, mobile, highsp], axis = 1).round(3)

    return assembled.sort_values("H_highsp", ascending = False)

In [23]:
# Access indicators for each tagged city.
access(df=cps_city, grouper="city")

Unnamed: 0_level_0,P_at_home,P_at_work,P_smartphone,P_smartphone_resp,H_mobdat,H_highsp
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
columbus,0.793,0.683,0.841,0.973,0.922,0.955
san_jose,0.929,0.798,0.94,0.978,0.965,0.952
washington,0.883,0.835,0.885,0.964,0.939,0.942
seattle,0.881,0.801,0.893,0.935,0.941,0.936
san_diego,0.828,0.679,0.861,0.987,0.838,0.933
san_francisco,0.683,0.612,0.669,0.993,0.909,0.932
austin,0.887,0.744,0.924,0.955,0.963,0.915
new_york,0.76,0.583,0.775,0.955,0.916,0.913
denver,0.804,0.643,0.811,0.95,0.91,0.903
dallas,0.82,0.589,0.886,0.927,0.937,0.875


In [35]:
# Access indicators per metro area, put in the order of the `cbsa` list and
# with display-friendly labels ("dallas-fort_worth" -> "Dallas-Fort Worth").
cbsa_access = (
    access(cps_cbsa, "cbsa")
    .reindex(cbsa)
    .rename(index=lambda label: label.replace("_", " ").title())
)

cbsa_access

Unnamed: 0_level_0,P_at_home,P_at_work,P_smartphone,P_smartphone_resp,H_mobdat,H_highsp
cbsa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
New York,0.798,0.605,0.803,0.963,0.889,0.928
Los Angeles,0.733,0.515,0.778,0.962,0.913,0.825
Chicago,0.818,0.65,0.838,0.964,0.877,0.869
Dallas-Fort Worth,0.816,0.626,0.862,0.957,0.912,0.895
Houston,0.766,0.586,0.787,0.964,0.906,0.865
Philadelphia,0.787,0.65,0.784,0.969,0.886,0.888
Washington,0.821,0.697,0.829,0.961,0.905,0.916
Miami,0.771,0.604,0.8,0.969,0.896,0.901
Atlanta,0.792,0.603,0.825,0.965,0.905,0.847
Boston,0.769,0.623,0.774,0.964,0.914,0.921


In [43]:
# Labels for heprinoh codes 1..10, in code order.
reason_labels = [
    "Don't need it or not interested",
    "Can't afford it",
    "Not worth the cost",
    "Can use it elsewhere",
    "Not available in area",
    "No device",
    "Privacy or security concerns",
    "Safety concerns",
    "Moving",
    "Other",
]

# heprinoh code -> label; -1 is labelled "NA".
why_no_net = dict(enumerate(reason_labels, start=1))
why_no_net[-1] = "NA"

cps["why_no_net"] = cps["heprinoh"].replace(why_no_net)

# Unweighted record counts per reason, among rows with a positive code.
cps.loc[cps["heprinoh"] > 0, "why_no_net"].value_counts()

Don't need it or not interested    11188
Can't afford it                     3257
Other                               1707
Not available in area                831
Can use it elsewhere                 646
Not worth the cost                   520
No device                            478
Privacy or security concerns         299
Moving                                97
Safety concerns                       77
Name: why_no_net, dtype: int64

In [8]:
# Every column whose name does not contain "wgt".
non_weight_columns = [name for name in cps.columns if "wgt" not in name]
print(non_weight_columns)

['hrhhid', 'hrmonth', 'hryear4', 'hurespli', 'hufinal', 'hetenure', 'hehousut', 'hetelhhd', 'hetelavl', 'hephoneo', 'hefaminc', 'hutypea', 'hutypb', 'hutypc', 'hrintsta', 'hrnumhou', 'hrhtype', 'hrmis', 'huinttyp', 'huprscnt', 'hrlonglk', 'hrhhid2', 'hwhhwtln', 'hubus', 'hubusl1', 'hubusl2', 'hubusl3', 'hubusl4', 'gereg', 'gediv', 'gestfips', 'gtcbsa', 'gtco', 'gtcbsast', 'gtmetsta', 'gtindvpc', 'gtcbsasz', 'gtcsa', 'perrp', 'peparent', 'prtage', 'prtfage', 'pemaritl', 'pespouse', 'pesex', 'peafever', 'peafnow', 'peeduca', 'ptdtrace', 'prdthsp', 'puchinhh', 'pulineno', 'prfamnum', 'prfamrel', 'prfamtyp', 'pehspnon', 'prmarsta', 'prpertyp', 'penatvty', 'pemntvty', 'pefntvty', 'prcitshp', 'prcitflg', 'prinusyr', 'puslfprx', 'pemlr', 'puwk', 'pubus1', 'pubus2ot', 'pubusck1', 'pubusck2', 'pubusck3', 'pubusck4', 'puretot', 'pudis', 'peret1', 'pudis1', 'pudis2', 'puabsot', 'pulay', 'peabsrsn', 'peabspdo', 'pemjot', 'pemjnum', 'pehrusl1', 'pehrusl2', 'pehrftpt', 'pehruslt', 'pehrwant', 'pehrr

### CSA doesn't work.... 

Missing: 288 houston, 429 phoenix, 476 st louis

In [19]:
csa = {"new_york" : 408, "los_angeles" : 348, "chicago" : 176, "washington" : 548, "san_jose" : 488, 
       "boston" : 148, "dallas-fort_worth" : 206, "houston" : 288, "philadelphia" : 428, "miami" : 370,
       "atlanta" : 122, "detroit" : 220, "phoenix" : 429, "seattle" : 500, "orlando" : 422, 
       "minneapolis" : 378, "denver" : 216, "cleveland" : 184, "portland" : 440, "st_louis" : 476}