In [1]:
# Add user specific python libraries to path
import sys
sys.path.insert(0, "/home/smehra/local-packages")

In [2]:
import numpy as np
import pandas as pd

import geopandas as gpd
from shapely.geometry import Point, Polygon

# enable automated generational garbage collection
import gc
gc.enable()

import matplotlib.pyplot as plt
%matplotlib inline

import time
from datetime import timedelta  
from datetime import date
from datetime import datetime


# Compute counties for each POI

In [64]:
# get all POI observations

path_to_files = []

# get list of files with user data in event directory    
path_to_files = ['/data/tmp/covid/SafeGraph/core_places/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part1.csv', 
                 '/data/tmp/covid/SafeGraph/core_places/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part1.csv', 
                 '/data/tmp/covid/SafeGraph/core_places/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part1.csv', 
                 '/data/tmp/covid/SafeGraph/core_places/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part1.csv', 
                 '/data/tmp/covid/SafeGraph/core_places/CoreRecords-CORE_POI-2019_03-2020-03-25/core_poi-part1.csv']

files = []
for path in path_to_files:
    file = pd.read_csv(path)
    files.append(file)

# concatenate all partition files
core_poi = pd.concat(files, ignore_index=True)
core_poi = core_poi[['safegraph_place_id', 'region', 'latitude', 'longitude']]
core_poi.rename(columns={"region": "state_code"}, inplace = True)

geo_poi = gpd.GeoDataFrame(core_poi, geometry = gpd.points_from_xy(core_poi.longitude, core_poi.latitude))
geo_poi.head()


Unnamed: 0,safegraph_place_id,state_code,latitude,longitude,geometry
0,sg:001341fe7e794ab6bd65bb80759a1ac6,CA,37.890704,-122.118315,POINT (-122.11831 37.89070)
1,sg:002921847f104c93a6df47bb831dedd2,IL,42.255103,-89.085241,POINT (-89.08524 42.25510)
2,sg:002d2e9f047d47ca8661abb8939096cf,NY,42.214633,-74.220827,POINT (-74.22083 42.21463)
3,sg:0060616e5f184585a026a0b7e8192fbd,NJ,40.948824,-74.154553,POINT (-74.15455 40.94882)
4,sg:0061c1d8e124406cbcc97e40eb1dd585,NC,34.207249,-77.86301,POINT (-77.86301 34.20725)


In [68]:
# map state code to state names
state_codes = pd.read_csv('/data/tmp/smehra/tmp/state_codes.csv')
geo_poi = geo_poi.merge(state_codes, on = ['state_code'], how = 'left')
geo_poi.sort_values(['state'], inplace = True)
geo_poi.reset_index(drop=True, inplace = True)
geo_poi.head()

Unnamed: 0,safegraph_place_id,state_code,latitude,longitude,geometry,state
0,sg:512f0acfe6824058af4714694506a29b,AL,32.643121,-86.192513,POINT (-86.19251 32.64312),Alabama
1,sg:49c1b9881d724286b20752492e7c7743,AL,33.669355,-85.824686,POINT (-85.82469 33.66935),Alabama
2,sg:aaa7fdb1b373418f92379441a1f08eb6,AL,34.730719,-86.588217,POINT (-86.58822 34.73072),Alabama
3,sg:393a4b83159d45458fd2348e0b3eff42,AL,33.37862,-86.98819,POINT (-86.98819 33.37862),Alabama
4,sg:4f7bb749ffed45c2bed48936482c12c6,AL,34.712877,-86.575784,POINT (-86.57578 34.71288),Alabama


In [77]:
# read adm2 shapefile
counties = gpd.read_file('/data/tmp/smehra/tmp/gpl-covid/data/interim/adm/adm2/adm2.shp')
# keep only USA
counties = shpfile[counties.adm0_name == "USA"]
# drop administrative units with no geometry
counties = counties[counties.geometry.notna()]
# sort by state
counties.sort_values(['adm1_name', 'adm2_name'], inplace = True)
counties.reset_index(drop=True, inplace = True)
counties.head()


Unnamed: 0,adm0_name,adm1_name,adm2_name,latitude,longitude,name_alt,population,area_km2,pop_densit,geometry
0,USA,Alabama,Autauga,32.535523,-86.64291,,54571.0,2193.108651,24.882944,"POLYGON ((-86.81896 32.34027, -86.81084 32.347..."
1,USA,Alabama,Baldwin,30.736267,-87.724924,,182265.0,5733.910758,31.787206,"MULTIPOLYGON (((-87.51056 30.33972, -87.51083 ..."
2,USA,Alabama,Barbour,31.869798,-85.395443,,27457.0,3222.520295,8.52035,"POLYGON ((-85.13026 31.76386, -85.13504 31.768..."
3,USA,Alabama,Bibb,32.998646,-87.126384,,22915.0,2299.448326,9.965434,"POLYGON ((-87.42078 32.87446, -87.42028 32.971..."
4,USA,Alabama,Blount,33.981583,-86.567974,,57322.0,2455.105991,23.348075,"POLYGON ((-86.95322 33.81542, -86.95449 33.845..."


In [None]:
# create a results dataset
poi_with_adm_info = geo_poi.copy()

# get a sorted list of unique states
states = sorted(geo_poi.state.unique())

for state in states:
    
    # get all relevant counties within the state
    relevant_counties = counties[counties.adm1_name == state]
    relevant_poi = geo_poi[(geo_poi.state == state)]
    
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S") + ' Current State: ' + state)
    print(datetime.now().strftime("%d/%m/%Y %H:%M:%S") + ' Number of poi: ' + str(len(relevant_poi)))

    # iterate through all poi's within that state
    for poi in relevant_poi.itertuples():
        
        # iterate through all counties within that state
        for county in relevant_counties.itertuples():
            
            # check if county countains the poi
            if county.geometry.contains(poi.geometry):
                
                # update results database
                poi_with_adm_info.at[poi.Index, 'adm2_name'] = county.adm2_name
                break
                

In [104]:
poi_with_adm_info.rename(columns={"state": "adm1_name"}, inplace = True)
poi_with_adm_info.drop(columns = ['geometry'], inplace = True)
poi_with_adm_info.to_csv('/data/tmp/smehra/tmp/covid/poi_with_adm_info.csv', index = False)
poi_with_adm_info.head()

Unnamed: 0,safegraph_place_id,state_code,latitude,longitude,adm1_name,adm2_name
0,sg:512f0acfe6824058af4714694506a29b,AL,32.643121,-86.192513,Alabama,Elmore
1,sg:49c1b9881d724286b20752492e7c7743,AL,33.669355,-85.824686,Alabama,Calhoun
2,sg:aaa7fdb1b373418f92379441a1f08eb6,AL,34.730719,-86.588217,Alabama,Madison
3,sg:393a4b83159d45458fd2348e0b3eff42,AL,33.37862,-86.98819,Alabama,Jefferson
4,sg:4f7bb749ffed45c2bed48936482c12c6,AL,34.712877,-86.575784,Alabama,Madison


# Analyze Social Distancing Metrics

#### Load Raw Data

In [None]:
from os import listdir, environ
from os.path import isfile, join, exists

# get list of files with user data in event directory    
all_files = []
year_directory = '/data/covid/SafeGraph/sg-social-distancing/2020'

for month_directory in listdir(year_directory):

    if(month_directory not in ['.DS_Store', '._.DS_Store']):
        for day_directory in listdir(year_directory + '/' + month_directory):    
    
            if(day_directory not in ['.DS_Store', '._.DS_Store']):
                for csv_file in listdir(year_directory + '/' + month_directory + '/' + day_directory):    
                    
                    if(csv_file not in ['.DS_Store', '._.DS_Store']):
                        path_to_file = year_directory + '/' + month_directory + '/' + day_directory + '/' + csv_file
                        all_files.append(path_to_file)

date_parser = lambda x: pd.datetime.strptime(x[:-6], '%Y-%m-%dT%H:%M:%S')

# read all partition files
csv_datasets = []
for path in all_files:
    csv = pd.read_csv(path, parse_dates=['date_range_start', 'date_range_end'], date_parser=date_parser)
    csv_datasets.append(csv)

raw_social_dist_data = pd.concat(csv_datasets, ignore_index=True)
raw_social_dist_data.head()


#### Select required rows and derationalize columns

In [7]:
from ast import literal_eval

raw_data_cleaned = raw_social_dist_data[['origin_census_block_group', 
                                             'date_range_start', 
                                             'device_count', 
                                             'distance_traveled_from_home', 
                                             'bucketed_distance_traveled', 
                                             'completely_home_device_count', 
                                             'median_home_dwell_time', 
                                             'part_time_work_behavior_devices', 
                                             'full_time_work_behavior_devices']]

raw_data_cleaned.rename(inplace = True,
                        columns = {'origin_census_block_group': 'census_block_group', 
                                   'date_range_start': 'date', 
                                   'device_count': 'total_device_count', 
                                   'distance_traveled_from_home': 'avg_dist_traveled_from_home',  
                                   'completely_home_device_count': 'devices_home_all_day',
                                   'median_home_dwell_time': 'avg_time_at_home', 
                                   'part_time_work_behavior_devices': 'devices_outside_home_3_to_6_hours', 
                                   'full_time_work_behavior_devices': 'devices_outside_home_more_than_6_hours'})

raw_data_cleaned.loc[raw_data_cleaned.bucketed_distance_traveled.notna(), 'bucketed_distance_traveled'] \
= raw_data_cleaned[raw_data_cleaned.bucketed_distance_traveled.notna()].bucketed_distance_traveled.apply( \
    lambda x: literal_eval(str(x)))

raw_data_cleaned.loc[raw_data_cleaned.bucketed_distance_traveled.isna(), 'bucketed_distance_traveled'] = [{}]

raw_data_cleaned['devices_travelled_less_than_1km'] \
= raw_data_cleaned.bucketed_distance_traveled.apply( \
    lambda x: x['<1000'] if ('<1000' in x.keys()) else np.NaN)

raw_data_cleaned['devices_travelled_1_to_2km'] \
= raw_data_cleaned.bucketed_distance_traveled.apply( \
    lambda x: x['1001-2000'] if ('1001-2000' in x.keys()) else np.NaN)

raw_data_cleaned['devices_travelled_2_to_8km'] \
= raw_data_cleaned.bucketed_distance_traveled.apply( \
    lambda x: x['2001-8000'] if ('2001-8000' in x.keys()) else np.NaN)

raw_data_cleaned['devices_travelled_8_to_16km'] \
= raw_data_cleaned.bucketed_distance_traveled.apply( \
    lambda x: x['8001-16000'] if ('8001-16000' in x.keys()) else np.NaN)

raw_data_cleaned['devices_travelled_16_to_50km'] \
= raw_data_cleaned.bucketed_distance_traveled.apply( \
    lambda x: x['16001-50000'] if ('16001-50000' in x.keys()) else np.NaN)

raw_data_cleaned['devices_travelled_more_than_50km'] \
= raw_data_cleaned.bucketed_distance_traveled.apply( \
    lambda x: x['>50000'] if ('>50000' in x.keys()) else np.NaN)

raw_data_cleaned.drop(columns = ['bucketed_distance_traveled'], inplace = True)

raw_data_cleaned.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,census_block_group,date,total_device_count,avg_dist_traveled_from_home,devices_home_all_day,avg_time_at_home,devices_outside_home_3_to_6_hours,devices_outside_home_more_than_6_hours,devices_travelled_less_than_1km,devices_travelled_1_to_2km,devices_travelled_2_to_8km,devices_travelled_8_to_16km,devices_travelled_16_to_50km,devices_travelled_more_than_50km
0,10150007002,2020-01-01,80,5431.0,25,752,5,6,15.0,1.0,15.0,6.0,2.0,8.0
1,10299598001,2020-01-01,156,15016.0,39,797,10,17,14.0,4.0,13.0,25.0,34.0,17.0
2,10299598001,2020-01-01,156,15016.0,39,797,10,17,14.0,4.0,13.0,25.0,34.0,17.0
3,10730109006,2020-01-01,38,7419.0,14,713,2,6,1.0,2.0,8.0,5.0,2.0,2.0
4,11250103023,2020-01-01,131,11979.0,30,750,13,31,12.0,9.0,17.0,26.0,15.0,20.0


#### Compute and save census block level metrics

In [5]:
census_block_metrics = raw_data_cleaned.copy()

census_block_metrics['census_block_group'] = census_block_metrics.census_block_group.astype(str).str.zfill(12) 

census_block_metrics['total_device_count'] = census_block_metrics.total_device_count.astype(float)
census_block_metrics['avg_time_at_home'] = census_block_metrics.avg_time_at_home.astype(float)

census_block_metrics['devices_home_all_day'] = census_block_metrics.devices_home_all_day.astype(float)
census_block_metrics['perc_devices_home_all_day'] = (census_block_metrics.devices_home_all_day/census_block_metrics.total_device_count)*100

census_block_metrics['perc_devices_travelled_less_than_1km'] = (census_block_metrics.devices_travelled_less_than_1km/census_block_metrics.total_device_count)*100
census_block_metrics['perc_devices_travelled_1_to_2km'] = (census_block_metrics.devices_travelled_1_to_2km/census_block_metrics.total_device_count)*100
census_block_metrics['perc_devices_travelled_2_to_8km'] = (census_block_metrics.devices_travelled_2_to_8km/census_block_metrics.total_device_count)*100
census_block_metrics['perc_devices_travelled_8_to_16km'] = (census_block_metrics.devices_travelled_8_to_16km/census_block_metrics.total_device_count)*100
census_block_metrics['perc_devices_travelled_16_to_50km'] = (census_block_metrics.devices_travelled_16_to_50km/census_block_metrics.total_device_count)*100
census_block_metrics['perc_devices_travelled_more_than_50km'] = (census_block_metrics.devices_travelled_more_than_50km/census_block_metrics.total_device_count)*100

census_block_metrics['devices_outside_home_3_to_6_hours'] = census_block_metrics.devices_outside_home_3_to_6_hours.astype(float)
census_block_metrics['perc_devices_outside_home_3_to_6_hours'] = (census_block_metrics.devices_outside_home_3_to_6_hours/census_block_metrics.total_device_count)*100
census_block_metrics['devices_outside_home_more_than_6_hours'] = census_block_metrics.devices_outside_home_more_than_6_hours.astype(float)
census_block_metrics['perc_devices_outside_home_more_than_6_hours'] = (census_block_metrics.devices_outside_home_more_than_6_hours/census_block_metrics.total_device_count)*100

census_block_metrics = census_block_metrics[['census_block_group', 
                                             'date', 
                                             'total_device_count', 
                                             'avg_time_at_home',
                                             'avg_dist_traveled_from_home', 
                                             'perc_devices_home_all_day', 
                                             'perc_devices_outside_home_3_to_6_hours', 
                                             'perc_devices_outside_home_more_than_6_hours', 
                                             'perc_devices_travelled_less_than_1km',
                                             'perc_devices_travelled_1_to_2km',
                                             'perc_devices_travelled_2_to_8km',
                                             'perc_devices_travelled_8_to_16km',
                                             'perc_devices_travelled_16_to_50km',
                                             'perc_devices_travelled_more_than_50km']]

census_block_metrics.to_csv('/data/tmp/smehra/aggregated_data/covid/safe_graph_social_distance_metrics_census_block_level.csv', index = False)
census_block_metrics.head()


Unnamed: 0,census_block_group,date,total_device_count,avg_time_at_home,avg_dist_traveled_from_home,perc_devices_home_all_day,perc_devices_outside_home_3_to_6_hours,perc_devices_outside_home_more_than_6_hours,perc_devices_travelled_less_than_1km,perc_devices_travelled_1_to_2km,perc_devices_travelled_2_to_8km,perc_devices_travelled_8_to_16km,perc_devices_travelled_16_to_50km,perc_devices_travelled_more_than_50km
0,10150007002,2020-01-01,80.0,752.0,5431.0,31.25,6.25,7.5,18.75,1.25,18.75,7.5,2.5,10.0
1,10299598001,2020-01-01,156.0,797.0,15016.0,25.0,6.410256,10.897436,8.974359,2.564103,8.333333,16.025641,21.794872,10.897436
2,10299598001,2020-01-01,156.0,797.0,15016.0,25.0,6.410256,10.897436,8.974359,2.564103,8.333333,16.025641,21.794872,10.897436
3,10730109006,2020-01-01,38.0,713.0,7419.0,36.842105,5.263158,15.789474,2.631579,5.263158,21.052632,13.157895,5.263158,5.263158
4,11250103023,2020-01-01,131.0,750.0,11979.0,22.900763,9.923664,23.664122,9.160305,6.870229,12.977099,19.847328,11.450382,15.267176


#### Compute county level metrics

In [9]:
county_metrics = raw_data_cleaned.copy()
county_metrics['census_block_group'] = county_metrics.census_block_group.astype(str).str.zfill(12) 
county_metrics['county_code'] = county_metrics.census_block_group.astype(str).str.slice(0, 5)

county_metrics = county_metrics.groupby(['county_code', 'date']).agg({'total_device_count': 'sum', 
                                                                      'avg_dist_traveled_from_home': 'mean',
                                                                      'devices_home_all_day': 'sum', 
                                                                      'avg_time_at_home': 'mean',
                                                                      'devices_outside_home_3_to_6_hours': 'sum',
                                                                      'devices_outside_home_more_than_6_hours': 'sum',
                                                                      'devices_travelled_less_than_1km': 'sum',
                                                                      'devices_travelled_1_to_2km': 'sum',
                                                                      'devices_travelled_2_to_8km': 'sum',
                                                                      'devices_travelled_8_to_16km': 'sum',
                                                                      'devices_travelled_16_to_50km': 'sum',
                                                                      'devices_travelled_more_than_50km': 'sum'})

county_metrics.reset_index(inplace = True)
county_metrics.head()


Unnamed: 0,county_code,date,devices_outside_home_more_than_6_hours,devices_travelled_2_to_8km,total_device_count,devices_travelled_8_to_16km,devices_outside_home_3_to_6_hours,avg_dist_traveled_from_home,devices_home_all_day,devices_travelled_more_than_50km,devices_travelled_16_to_50km,devices_travelled_1_to_2km,avg_time_at_home,devices_travelled_less_than_1km
0,1001,2020-01-01,847,1008.0,5315,474.0,430,10180.75,1593,665.0,652.0,157.0,839.28125,536.0
1,1001,2020-01-02,1021,1012.0,5192,700.0,593,12603.84375,1048,577.0,1093.0,208.0,763.96875,381.0
2,1001,2020-01-03,1166,1049.0,5234,697.0,675,11974.8125,1068,654.0,1224.0,190.0,805.0625,255.0
3,1001,2020-01-04,871,1190.0,5127,616.0,429,10665.78125,1157,530.0,912.0,221.0,875.125,354.0
4,1001,2020-01-05,683,1298.0,5112,628.0,417,9521.9375,1257,461.0,777.0,179.0,836.875,348.0


In [10]:
county_metrics['total_device_count'] = county_metrics.total_device_count.astype(float)
county_metrics['avg_time_at_home'] = county_metrics.avg_time_at_home.astype(float)

county_metrics['devices_home_all_day'] = county_metrics.devices_home_all_day.astype(float)
county_metrics['perc_devices_home_all_day'] = (county_metrics.devices_home_all_day/county_metrics.total_device_count)*100

county_metrics['perc_devices_travelled_less_than_1km'] = (county_metrics.devices_travelled_less_than_1km/county_metrics.total_device_count)*100
county_metrics['perc_devices_travelled_1_to_2km'] = (county_metrics.devices_travelled_1_to_2km/county_metrics.total_device_count)*100
county_metrics['perc_devices_travelled_2_to_8km'] = (county_metrics.devices_travelled_2_to_8km/county_metrics.total_device_count)*100
county_metrics['perc_devices_travelled_8_to_16km'] = (county_metrics.devices_travelled_8_to_16km/county_metrics.total_device_count)*100
county_metrics['perc_devices_travelled_16_to_50km'] = (county_metrics.devices_travelled_16_to_50km/county_metrics.total_device_count)*100
county_metrics['perc_devices_travelled_more_than_50km'] = (county_metrics.devices_travelled_more_than_50km/county_metrics.total_device_count)*100

county_metrics['devices_outside_home_3_to_6_hours'] = county_metrics.devices_outside_home_3_to_6_hours.astype(float)
county_metrics['perc_devices_outside_home_3_to_6_hours'] = (county_metrics.devices_outside_home_3_to_6_hours/county_metrics.total_device_count)*100
county_metrics['devices_outside_home_more_than_6_hours'] = county_metrics.devices_outside_home_more_than_6_hours.astype(float)
county_metrics['perc_devices_outside_home_more_than_6_hours'] = (county_metrics.devices_outside_home_more_than_6_hours/county_metrics.total_device_count)*100

county_metrics = county_metrics[['county_code', 
                                 'date', 
                                 'total_device_count', 
                                 'avg_time_at_home',
                                 'avg_dist_traveled_from_home', 
                                 'perc_devices_home_all_day', 
                                 'perc_devices_outside_home_3_to_6_hours', 
                                 'perc_devices_outside_home_more_than_6_hours', 
                                 'perc_devices_travelled_less_than_1km',
                                 'perc_devices_travelled_1_to_2km',
                                 'perc_devices_travelled_2_to_8km',
                                 'perc_devices_travelled_8_to_16km',
                                 'perc_devices_travelled_16_to_50km',
                                 'perc_devices_travelled_more_than_50km']]

county_metrics.to_csv('/data/tmp/smehra/aggregated_data/covid/safe_graph_social_distance_metrics_county_level.csv', index = False)
county_metrics.head()


Unnamed: 0,county_code,date,total_device_count,avg_time_at_home,avg_dist_traveled_from_home,perc_devices_home_all_day,perc_devices_outside_home_3_to_6_hours,perc_devices_outside_home_more_than_6_hours,perc_devices_travelled_less_than_1km,perc_devices_travelled_1_to_2km,perc_devices_travelled_2_to_8km,perc_devices_travelled_8_to_16km,perc_devices_travelled_16_to_50km,perc_devices_travelled_more_than_50km
0,1001,2020-01-01,5315.0,839.28125,10180.75,29.971778,8.09031,15.93603,10.084666,2.953904,18.965193,8.918156,12.267168,12.511759
1,1001,2020-01-02,5192.0,763.96875,12603.84375,20.1849,11.421418,19.664869,7.338213,4.006163,19.491525,13.48228,21.051618,11.113251
2,1001,2020-01-03,5234.0,805.0625,11974.8125,20.405044,12.896446,22.277417,4.871991,3.630111,20.042033,13.316775,23.385556,12.495224
3,1001,2020-01-04,5127.0,875.125,10665.78125,22.566803,8.367466,16.988492,6.904623,4.310513,23.210454,12.014823,17.78818,10.337429
4,1001,2020-01-05,5112.0,836.875,9521.9375,24.589202,8.157277,13.36072,6.807512,3.501565,25.391236,12.28482,15.199531,9.017997
