## data_process
1. Calculate each POI's real (observed) racial segregation index.
2. Compute the gap between each POI's visitor racial proportions and the demographic proportions of the POI's own CBG.

In [None]:
import numpy as np
import pandas as pd
import json
import os
from numpy import radians, sin, cos, arcsin, sqrt

## Specific city and specific time period

In [None]:
# Study area and period; both are used to build every input/output path below.
specific_area = 'Philadelphia'
specific_period = '2019'

# Load only the CBG identifier column of the feature table and keep the ids
# as a plain Python list.
cbg_temp = pd.read_csv(
    f'/data/{specific_area}/{specific_area}_cbg_features_group.csv',
    usecols=['census_block_group'],
)
specific_cbgs = cbg_temp['census_block_group'].tolist()

# Segregation

In [None]:
# CBG-level features: population, racial ratios, income and coordinates.
# The two non-Hispanic columns are shortened to 'white_ratio' / 'black_ratio'
# so every racial column follows the '<race>_ratio' pattern.
specific_features = pd.read_csv(
    f'/data/{specific_area}/{specific_area}_cbg_features_group.csv',
    usecols=[
        'census_block_group', 'Total population', 'asian_ratio',
        'Hispanic_ratio', 'non_Hispanic_white_ratio',
        'non_Hispanic_black_ratio', 'average_income', 'longitude', 'latitude',
    ],
).rename(columns={
    'non_Hispanic_white_ratio': 'white_ratio',
    'non_Hispanic_black_ratio': 'black_ratio',
})

# POI visitor table: 'visitor_home_cbgs' is stored as a JSON object string
# and is decoded into a dict; 'sum_visitor' is the sum of that dict's values.
specific_poi_dfs = pd.read_csv(f'/data/{specific_area}/{specific_area}_poi_visitor.csv')
specific_poi_dfs['visitor_home_cbgs'] = specific_poi_dfs['visitor_home_cbgs'].map(json.loads)
specific_poi_dfs['sum_visitor'] = specific_poi_dfs['visitor_home_cbgs'].apply(
    lambda home_counts: sum(home_counts.values())
)
specific_poi_dfs.head()

### racial_seg

In [None]:
# Four explicitly tracked groups; whatever share is left over in a CBG is
# stored as 'other' so the five ratios add up to 1 before rounding.
racial_feature_list = ['Hispanic', 'black', 'asian', 'white']
tracked_columns = [race + '_ratio' for race in racial_feature_list]
specific_features['other_ratio'] = 1 - specific_features[tracked_columns].sum(axis=1)
racial_feature_list.append('other')

ratio_columns = [item + '_ratio' for item in racial_feature_list]
specific_features[ratio_columns] = specific_features[ratio_columns].round(6)

# Population-weighted share of every group over all CBGs in the table.
sum_population_num = specific_features['Total population'].sum()
base_proportion = {}
for race in racial_feature_list:
    group_population = (
        specific_features[race + '_ratio'] * specific_features['Total population']
    ).sum()
    base_proportion[race] = group_population / sum_population_num

print('base_racail_proportion:', base_proportion)

# Scaling factor later multiplied with the summed absolute deviations;
# it is derived from the smallest group share.
seg_coefficient = 1 / (2 * (1 - min(base_proportion.values())))
print('seg_coefficient:', seg_coefficient)

In [None]:
def Get_racialseg(row, base_propotion, seg_coefficient, features=None):
    """Compute the racial segregation index of one POI.

    Every visitor count in ``row['visitor_home_cbgs']`` is split across the
    racial groups according to the ``<group>_ratio`` columns of the visitor's
    home CBG. The resulting visitor shares are compared with the reference
    shares, and the summed absolute deviation is scaled by
    ``seg_coefficient``.

    Args:
        row: Mapping-like POI record (e.g. a DataFrame row) whose
            ``'visitor_home_cbgs'`` entry is a dict of CBG id -> visitor count.
        base_propotion: Dict of group name -> reference share. Its keys define
            which ``<group>_ratio`` columns are read. (The parameter name
            keeps its historical spelling so existing callers keep working.)
        seg_coefficient: Factor applied to the summed absolute deviations.
        features: CBG feature table with a ``'census_block_group'`` column and
            one ``<group>_ratio`` column per group. Defaults to the
            notebook-level ``specific_features`` table.

    Returns:
        The segregation index as a float, or ``np.nan`` when none of the
        visitors' home CBGs is present in ``features`` (or they carry a total
        weight of zero).
    """
    if features is None:
        # Fall back to the table loaded earlier in the notebook.
        features = specific_features

    races = list(base_propotion)
    ratio_columns = [race + '_ratio' for race in races]

    # Build a CBG -> ratio-vector lookup in a single pass so each visitor key
    # costs O(1) instead of a scan of the whole table. Iterating in reverse
    # lets the first listed row win if a CBG id appears more than once.
    cbg_ids = features['census_block_group'].tolist()
    ratio_rows = features[ratio_columns].to_numpy(dtype=float)
    ratios_by_cbg = dict(zip(reversed(cbg_ids), ratio_rows[::-1]))

    weighted_visitors = np.zeros(len(races))
    for home_cbg, visitor_count in row['visitor_home_cbgs'].items():
        try:
            cbg_id = int(home_cbg)
        except ValueError:
            # A key that is not an integer string cannot match any CBG id.
            continue
        ratios = ratios_by_cbg.get(cbg_id)
        if ratios is not None:
            weighted_visitors += ratios * visitor_count

    total_weight = sum(weighted_visitors.tolist())
    if total_weight == 0:
        # No visitor could be attributed to a known CBG.
        return np.nan

    deviation = float(0)
    for race, group_weight in zip(races, weighted_visitors.tolist()):
        deviation += abs(group_weight / total_weight - base_propotion[race])
    return float(deviation * seg_coefficient)

# Row-wise evaluation: each POI row is passed to Get_racialseg together with
# the city-wide shares and the scaling coefficient.
specific_poi_dfs['racial_segregation_index'] = specific_poi_dfs.apply(
    Get_racialseg,
    axis=1,
    args=(base_proportion, seg_coefficient),
)

specific_poi_dfs.head()

In [None]:
# Persist the POI table with its segregation index, report its shape, then
# drop the in-memory frame.
segregation_csv = f'/data/{specific_area}/{specific_area}_{specific_period}_segregationindex.csv'
specific_poi_dfs.to_csv(segregation_csv, index=False)
print(specific_poi_dfs.shape)
del specific_poi_dfs

# Gap between visitors and local demographics

In [None]:
def disN7(lon1, lat1, lon2, lat2):
    """Return the haversine great-circle distance between two points.

    Coordinates are given in degrees (longitude first, then latitude) and are
    converted to radians internally. The sphere radius is 6371, the Earth's
    mean radius in kilometres, so the result is in the same unit.
    """
    lam1, phi1, lam2, phi2 = (radians(angle) for angle in (lon1, lat1, lon2, lat2))
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    # Haversine term: squared half-chord length between the two points.
    haversine = sin(delta_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2) ** 2
    central_angle = 2 * arcsin(sqrt(haversine))
    earth_radius = 6371
    return central_angle * earth_radius

In [None]:
# POIs that have Yelp data: only the key and the CBG the POI sits in.
Yelp_df = pd.read_csv(
    f'/data/{specific_area}/{specific_area}_poi_with_yelp_review_image_imagestext_GPT4v.csv',
    usecols=['placekey', 'poi_cbg'],
)

# Segregation indices written by the previous section, rounded to 6 decimals.
poi_seg_df = pd.read_csv(
    f'/data/{specific_area}/{specific_area}_{specific_period}_segregationindex.csv'
)
poi_seg_df['racial_segregation_index'] = poi_seg_df['racial_segregation_index'].round(6)

poi_location_df = pd.read_csv(f'/data/{specific_area}/{specific_area}_poi_location.csv')

# Left joins keep every Yelp POI even when the other tables have no record.
poi_df = (
    Yelp_df
    .merge(poi_seg_df, on='placekey', how='left')
    .merge(poi_location_df, on='placekey', how='left')
)
# poi_df['racial_predict_segregation_index']=np.NAN

In [None]:
# Population-weighted share of each group over all CBGs, rounded to 6
# decimals; used as the fallback when a POI's CBG is not in the table.
mean_array = {}
for racial in racial_feature_list:
    group_total = np.sum(
        specific_features[racial + '_ratio'] * specific_features['Total population']
    )
    mean_array[racial] = round(
        group_total / specific_features['Total population'].sum(), 6
    )

# Same scaling factor as before, now derived from the rounded shares.
smallest_share = min(mean_array.values())
seg_coefficient = round(1 / (2 * (1 - smallest_share)), 6)
print('mean_population:', mean_array, '\nseg_coefficient:', seg_coefficient)

In [None]:
def getracial(cbg_row, racial):
    """Return the share of group ``racial`` for the given CBG rows.

    ``cbg_row`` is a DataFrame with a ``'Total population'`` column and a
    ``<racial>_ratio`` column. For a non-empty frame the result is the
    population-weighted mean of that ratio, rounded to 6 decimals. For an
    empty frame the module-level ``mean_array[racial]`` is returned instead.
    """
    if not cbg_row.empty:
        population = cbg_row['Total population']
        group_population = np.sum(cbg_row[racial + '_ratio'] * population)
        return round(group_population / population.sum(), 6)
    return mean_array[racial]

# For every group, attach the share found in the POI's own CBG
# ('<group>_selfcbg'); getracial falls back to the city-wide mean when the
# CBG is not present in specific_features.
for racial in racial_feature_list:
    poi_df[racial + '_selfcbg'] = poi_df['poi_cbg'].apply(
        lambda poi_cbg: getracial(
            specific_features.loc[specific_features['census_block_group'] == poi_cbg],
            racial,
        )
    )
poi_df.head()

In [None]:
import tqdm
import ast

def _parse_visitor_counts(raw):
    """Return the CBG -> visitor-count dict held in a visitor_home_cbgs cell."""
    if isinstance(raw, dict):
        return raw
    if not isinstance(raw, str):
        # The left merge leaves NaN here for POIs that have no record in the
        # segregation table; treat them as having no known visitors.
        return {}
    return ast.literal_eval(raw)


poi_dif_list = []
pre_part_name = '_selfcbg'

# String form of every CBG id, matching the keys of visitor_home_cbgs.
# This is the same for every POI, so it is computed once up front.
cbg_keys = [str(int(cbg)) for cbg in specific_features['census_block_group']]

for index, row in tqdm.tqdm(poi_df.iterrows(), total=len(poi_df)):
    new_row = row.copy()

    # 1. Real flow: visitors per CBG, aligned with specific_features' rows
    #    (0 for CBGs that sent no visitors to this POI).
    real_visitor_dict = _parse_visitor_counts(row['visitor_home_cbgs'])
    real_flow_Series = pd.Series(
        [real_visitor_dict.get(cbg, 0) for cbg in cbg_keys],
        index=specific_features.index,
        dtype=float,
    )

    # 2. Per group: share among the POI's visitors ('<group>_real') and its
    #    gap to the share in the POI's own CBG ('<group>_dif').
    sum_real_flow = real_flow_Series.sum()
    for racial in racial_feature_list:
        if sum_real_flow == 0:
            # No visitor from a known CBG: the share is undefined.
            group_real_flow = np.nan
        else:
            group_real_flow = (
                real_flow_Series * specific_features[racial + '_ratio']
            ).sum() / sum_real_flow
        group_pre_flow = row[racial + pre_part_name]

        new_row[racial + '_dif'] = group_real_flow - group_pre_flow
        new_row[racial + '_real'] = group_real_flow

    poi_dif_list.append(new_row)

poi_dif_df = pd.DataFrame(poi_dif_list)
poi_dif_df.head()

In [None]:
# Columns written to disk: the POI key and its segregation index, the
# own-CBG share of every group, then each group's gap and real visitor share.
save_columns = [
    'placekey', 'racial_segregation_index',
    'Hispanic_selfcbg', 'black_selfcbg', 'asian_selfcbg', 'white_selfcbg', 'other_selfcbg',
    'Hispanic_dif', 'Hispanic_real',
    'black_dif', 'black_real',
    'asian_dif', 'asian_real',
    'white_dif', 'white_real',
    'other_dif', 'other_real',
]
poi_save = poi_dif_df[save_columns]

poi_save.to_csv(
    f'/data/{specific_area}/{specific_area}_realseg+population_dif.csv',
    index=False,
)