## Pre-processing: collect each POI's visitors, location and features from the SafeGraph datasets

In [None]:
import numpy as np
import pandas as pd
import ast
# import pyproj
# import geopandas as gpd  
import matplotlib.pyplot as plt
import pickle
import tqdm
import json
import time
import os
import math

## Select the target city and time period

In [None]:
# Target city and time period. ATTENTION: change `specific_area` to process
# another city; `specific_period` is matched as a substring of the month tag
# taken from each SafeGraph file name.
specific_area = 'Philadelphia'
specific_period = '2019'

# Output folders for the per-city filtered files and their normalized versions.
filtdata_dir = '/data/'+specific_area+'/filt_safegraph/'
normdata_dir = '/data/'+specific_area+'/normalized_safegraph/'

# exist_ok=True replaces the separate os.path.exists() check, so a folder that
# appears between the check and the call can no longer raise FileExistsError.
for directory in (os.path.dirname(filtdata_dir), os.path.dirname(normdata_dir)):
    os.makedirs(directory, exist_ok=True)

# Census block groups of the study area, read from the city's CBG feature file.
cbg_temp = pd.read_csv('/data/' + specific_area + '/' + specific_area +'_cbg_features_group.csv',usecols=['census_block_group'])
specific_cbgs = cbg_temp.census_block_group.tolist()

# Filter and normalize the SafeGraph POI patterns

In [None]:
# Folder holding the raw SafeGraph pattern dumps.
safegraph_dir = '/data/rawdata/Datasets/SafeGraphDatasets/SafeGraph_new/'

# Only entries whose name contains "patterns-" are considered.
files = [name for name in os.listdir(safegraph_dir) if 'patterns-' in name]

# Characters 9..15 of every file name serve as its month tag.
months = sorted({name[9:16] for name in files})

# Month tags that contain the requested period string.
specific_month = [tag for tag in months if specific_period in tag]

# Every "part" file whose name contains one of the selected month tags.
specific_month_files = sorted(
    name
    for name in files
    for tag in specific_month
    if tag in name and 'part' in name
)

In [None]:
# Reduce every selected SafeGraph file to the POIs whose `poi_cbg` lies in the
# study area, add a per-visitor scaling factor, and write the result to
# `filtdata_dir` as 'filt_<city>_<original file name>'.
for x in specific_month_files:
    print(x)
    df = pd.read_csv(safegraph_dir+x)
    print(df.shape)
    # .copy() yields an independent frame: adding a column to the bare
    # boolean-mask selection triggers pandas' SettingWithCopyWarning.
    specific_filt_poi_df = df[df.poi_cbg.isin(specific_cbgs)].copy()
    # norm = state-scaled visits divided by the raw visitor count.
    # NOTE(review): rows with raw_visitor_counts == 0 would give inf/NaN here;
    # confirm the input never contains them.
    specific_filt_poi_df['norm'] = specific_filt_poi_df.normalized_visits_by_state_scaling / specific_filt_poi_df.raw_visitor_counts
    print(specific_filt_poi_df.shape)
    specific_filt_poi_df.to_csv(filtdata_dir+'filt_'+ specific_area + '_' +x,index=False)

In [None]:
 
# Rewrite the `visitor_home_cbgs` JSON of every filtered file: keep only home
# CBGs inside the study area and scale their counts by the row's `norm`
# factor. Results go to `normdata_dir` as 'normalized_<city>_<file name>'.

# Built once: set membership is O(1), whereas `in specific_cbgs` scanned the
# whole list for every key of every row.
specific_cbg_set = set(specific_cbgs)

for x in specific_month_files:
    print(x)
    specific_norm_poi_df = pd.read_csv(filtdata_dir+'filt_'+ specific_area + '_' +x)
    for index, row in specific_norm_poi_df.iterrows():
        temp_dict = json.loads(row['visitor_home_cbgs'])
        # Keys that are not purely digits are dropped before int() is applied,
        # so int() never sees a non-numeric key.
        temp_dict = {
            key: value * row['norm']
            for key, value in temp_dict.items()
            if key.isdigit() and int(key) in specific_cbg_set
        }
        specific_norm_poi_df.at[index, 'visitor_home_cbgs'] = json.dumps(temp_dict)

    specific_norm_poi_df.to_csv(normdata_dir+'normalized_'+ specific_area + '_' +x,index=False)

In [None]:

def merge_dicts(dict1, dict2):
    """Return a new dict holding the entries of both inputs.

    Starts from a shallow copy of ``dict1``; every entry of ``dict2`` is then
    either inserted (key not present yet) or added onto the existing value
    with ``+=``. Neither input mapping gets keys added or rebound.
    """
    combined = dict1.copy()
    for name, amount in dict2.items():
        if name not in combined:
            combined[name] = amount
            continue
        combined[name] += amount
    return combined

In [None]:

def update_dataframes(df1, df2):
    """Combine the rows of ``df2`` with matching rows of ``df1``.

    For every row of ``df2``:

    * if its ``placekey`` occurs in ``df1``, the first matching ``df1`` row is
      copied and its ``visitor_home_cbgs`` dict is replaced by the sum of both
      dicts (via ``merge_dicts``);
    * otherwise the ``df2`` row is taken over unchanged.

    Parameters:
        df1: frame with at least ``placekey`` and ``visitor_home_cbgs`` (dict)
            columns; it is only read.
        df2: frame with the same two columns; it is only read.

    Returns:
        A new DataFrame with one row per row of ``df2`` and a fresh 0..n-1
        index. An empty ``df2`` yields an empty frame with ``df2``'s columns
        (``pd.concat`` would raise ValueError on an empty list).
    """
    # Position of the first row for each placekey in df1, built once so the
    # loop below does a dict lookup instead of scanning df1 twice per row.
    # Missing placekeys are skipped and therefore never match.
    first_position = {}
    for position, key in enumerate(df1['placekey']):
        if pd.notna(key):
            first_position.setdefault(key, position)

    merged_rows = []
    for _, row in tqdm.tqdm(df2.iterrows(), total=len(df2)):
        position = first_position.get(row['placekey'])
        if position is not None:
            matching_row = df1.iloc[position].copy()
            dict_temp = merge_dicts(matching_row['visitor_home_cbgs'], row['visitor_home_cbgs'])
            matching_row['visitor_home_cbgs'] = dict_temp
            merged_rows.append(matching_row.to_frame().T)
        else:
            merged_rows.append(pd.DataFrame(row).transpose())

    if not merged_rows:
        return pd.DataFrame(columns=df2.columns)

    return pd.concat(merged_rows, ignore_index=True)

In [None]:

# Accumulate, over all selected files, one visitor_home_cbgs dict per placekey
# and write the result to '<city>_poi_visitor.csv'.
specific_poi_dfs = pd.DataFrame()
for x in specific_month_files: 
    print(x)

    # Only the POI id and its (already normalized) home-CBG visitor JSON are needed.
    specific_poi_df = pd.read_csv(normdata_dir+'normalized_'+ specific_area + '_' +x,usecols=['placekey','visitor_home_cbgs'])
    
    # Decode the JSON strings into dicts and drop POIs whose dict is empty.
    specific_poi_df['visitor_home_cbgs'] = specific_poi_df['visitor_home_cbgs'].map(lambda x: json.loads(x)) 
    specific_poi_df = specific_poi_df[specific_poi_df['visitor_home_cbgs'].apply(lambda x: len(x) > 0)]
    specific_poi_df = specific_poi_df.reset_index(drop=True)
    
    if specific_poi_dfs.empty:
        # Nothing accumulated yet: this file becomes the running total.
        specific_poi_dfs = specific_poi_df
    else:
    
        # One row per POI of the current file; POIs already accumulated carry
        # the summed dict, new POIs carry their own dict.
        specific_poi_df_temp = update_dataframes(specific_poi_dfs, specific_poi_df)
        
        
        # The outer merge re-adds POIs present only in the running total; for
        # those the '_temp' value is missing and the '_old' value is used.
        specific_poi_dfs = specific_poi_df_temp.merge(specific_poi_dfs, on=['placekey'], how='outer',suffixes=('_temp', '_old')) 
        specific_poi_dfs['visitor_home_cbgs'] = specific_poi_dfs['visitor_home_cbgs_temp'].fillna(specific_poi_dfs['visitor_home_cbgs_old'])
        specific_poi_dfs = specific_poi_dfs.drop(['visitor_home_cbgs_old','visitor_home_cbgs_temp'], axis=1)
        
    print(specific_poi_dfs.shape)
    
# Serialize the dicts back to JSON strings before writing the CSV.
specific_poi_dfs['visitor_home_cbgs'] = specific_poi_dfs.visitor_home_cbgs.map(lambda x: json.dumps(x))
specific_poi_dfs.to_csv('/data/'+specific_area+'/'+specific_area+'_poi_visitor.csv',index=False) 
# Drop the reference to the accumulated frame once it has been written.
del specific_poi_dfs

------------------------  

### Get POI locations and features


In [None]:

# Write '<city>_poi_location.csv': placekey, latitude and longitude for every
# POI listed in '<city>_poi_visitor.csv'. A POI already collected from an
# earlier file is not taken again from a later one.
poi_list=pd.read_csv('/data/'+specific_area+'/'+specific_area+'_poi_visitor.csv',usecols=['placekey'])
print(poi_list.shape)

loc_columns = ['placekey','latitude','longitude']
# Sets give O(1) membership tests instead of scanning an array for every row.
wanted_placekeys = set(poi_list['placekey'].dropna())
seen_placekeys = set()
loc_frames = []

for x in specific_month_files:
    print(x)
    poi_loc = pd.read_csv(normdata_dir+'normalized_'+ specific_area + '_' +x,usecols=loc_columns)
    keep = poi_loc['placekey'].isin(wanted_placekeys) & ~poi_loc['placekey'].isin(seen_placekeys)
    # Select by column NAME: read_csv(usecols=...) keeps the file's column
    # order, so labelling row values by position could swap latitude and
    # longitude without any error.
    poi_loc_temp = poi_loc.loc[keep, loc_columns]
    loc_frames.append(poi_loc_temp)
    # Updated only after the whole file is filtered, so de-duplication happens
    # across files, not within one file.
    seen_placekeys.update(poi_loc_temp['placekey'])

# A single concat at the end; fall back to an empty frame when no file matched.
poi_locs = pd.concat(loc_frames, axis=0) if loc_frames else pd.DataFrame(columns=loc_columns)

print(poi_locs.shape)
poi_locs.to_csv('/data/'+specific_area+'/'+specific_area+'_poi_location.csv',index=False)
del poi_locs

In [None]:

# Write '<city>_poi_features.csv': the descriptive columns of every POI listed
# in '<city>_poi_visitor.csv', with exact duplicate rows removed.
feature_columns = ['placekey', 'location_name', 'top_category', 'naics_code','poi_cbg']  # any feature you need

poi_features = pd.DataFrame()
poi_list=pd.read_csv('/data/'+specific_area+'/'+specific_area+'_poi_visitor.csv',usecols=['placekey'])
print(poi_list.shape)
# A set gives O(1) membership tests instead of scanning an array for every row.
wanted_placekeys = set(poi_list['placekey'].dropna())

for x in specific_month_files:
    print(x)
    # NOTE: the name `boston_poi_df` is historical; the frame holds the POIs
    # of whatever city `specific_area` names.
    boston_poi_df = pd.read_csv(normdata_dir+'normalized_'+ specific_area + '_' +x,usecols=feature_columns)
    # Select by column NAME: read_csv(usecols=...) keeps the file's column
    # order, so labelling row values by position could mislabel columns.
    poi_features_temp = boston_poi_df.loc[boston_poi_df['placekey'].isin(wanted_placekeys), feature_columns]
    poi_features = pd.concat([poi_features, poi_features_temp], axis=0)

# The same POI appears in several files; keep one copy of each identical row.
poi_features = poi_features.drop_duplicates()
poi_features = poi_features.reset_index(drop=True)
poi_features.to_csv('/data/'+specific_area+'/'+specific_area+'_poi_features.csv',index=False) #['placekey', 'location_name', 'top_category', 'naics_code', 'poi_cbg']

poi_features.head()