## Extracted RE`EM model training dataset

In [None]:
import pandas as pd
import numpy as np
from numpy import radians, sin, cos, arcsin, sqrt

In [None]:
# City whose files are processed; the name is interpolated into every data path below.
city = 'Tucson'
# Prefix for input paths, which are built as data_path + '/data/<city>/...'.
data_path = './'
# Prefix for output paths, which are built as save_path + '/data/train-dataset/<city>/...'.
save_path = './' #ATTENTION

### POI Population 

In [None]:
def disN7(lon1, lat1, lon2, lat2):
    """Return the great-circle (haversine) distance in kilometres.

    All four arguments are coordinates in decimal degrees. Each may be a
    scalar or an array-like (list, NumPy array, pandas Series); NumPy
    broadcasting applies, so one point can be compared against whole columns
    of coordinates in a single call.
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    d_lon = lon2 - lon1
    d_lat = lat2 - lat1
    haversine = sin(d_lat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(d_lon / 2) ** 2
    # Floating-point rounding can push the term marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN; clamp it.
    haversine = np.clip(haversine, 0.0, 1.0)
    earth_radius_km = 6371
    return 2 * arcsin(sqrt(haversine)) * earth_radius_km

In [None]:
# Load the census-block-group (CBG) feature table, keeping only the columns
# needed for the population features built below.
cbg_columns = [
    'census_block_group', 'Total population', 'longitude', 'latitude',
    'average_income_group', 'Hispanic_ratio', 'non_Hispanic_white_ratio',
    'non_Hispanic_black_ratio', 'asian_ratio',
]
cbg_feature_df = pd.read_csv(
    data_path + f'/data/{city}/{city}_cbg_features_group.csv',
    usecols=cbg_columns,
)
# Shorten the two non-Hispanic column names so every group follows '<group>_ratio'.
cbg_feature_df = cbg_feature_df.rename(
    columns={
        'non_Hispanic_white_ratio': 'white_ratio',
        'non_Hispanic_black_ratio': 'black_ratio',
    }
)

racial_feature_list = ['Hispanic', 'black', 'asian', 'white']
named_ratio_columns = [f'{group}_ratio' for group in racial_feature_list]
# Whatever share is not covered by the four named groups is booked as 'other'.
cbg_feature_df['other_ratio'] = 1 - cbg_feature_df[named_ratio_columns].sum(axis=1)
racial_feature_list.append('other')

all_ratio_columns = [f'{group}_ratio' for group in racial_feature_list]
cbg_feature_df[all_ratio_columns] = cbg_feature_df[all_ratio_columns].round(6)
# cbg_feature_df.head()  # author's note on the table shape: 1328*10

In [None]:
import os
# Locate the POI table: a CSV in the city folder whose name starts with the
# review/image prefix. Only the location columns are needed in this cell.
image_files = os.listdir(data_path+f'/data/{city}/')
pattern = f'{city}_poi_with_yelp_review_image_imagestext_'
csv_files = [file for file in image_files if file.startswith(pattern) and file.endswith('.csv')]
if not csv_files:
    # Fail with the searched location instead of a bare IndexError on csv_files[0].
    raise FileNotFoundError(f"No '{pattern}*.csv' file found in {data_path}/data/{city}/")
# NOTE(review): if several files match, the first one in os.listdir() order is used.
poi_df = pd.read_csv(data_path+f'/data/{city}/'+csv_files[0],usecols=['placekey','longitude','poi_cbg','latitude'])

def getracial(cbg_row):
    """Return the population-weighted share of each racial group.

    ``cbg_row`` is a DataFrame holding one or more CBG rows. For every name in
    the module-level ``racial_feature_list`` the ``<name>_ratio`` column is
    weighted by ``Total population`` and divided by the summed population.
    The shares are rounded to 6 decimals and returned as a 1-D NumPy array in
    list order.
    """
    weights = cbg_row['Total population']
    total_population = weights.sum()
    shares = [
        round(np.sum(cbg_row[group + '_ratio'] * weights) / total_population, 6)
        for group in racial_feature_list
    ]
    return np.array(shares)

def _own_cbg_racial_mix(cbg_id):
    """Racial composition of the CBG rows whose id equals ``cbg_id``."""
    in_same_cbg = cbg_feature_df['census_block_group'] == cbg_id
    return getracial(cbg_feature_df[in_same_cbg])


# 'self' holds the racial composition of the CBG each POI is located in.
poi_df['self'] = poi_df['poi_cbg'].apply(_own_cbg_racial_mix)
poi_df.head()

In [None]:
# Population-weighted racial shares over the whole CBG table, used as the
# fallback for POIs whose own-CBG composition is entirely NaN.
city_population = cbg_feature_df['Total population']
city_shares = [
    round(np.sum(cbg_feature_df[group + '_ratio'] * city_population) / city_population.sum(), 6)
    for group in racial_feature_list
]
mean_array = np.array(city_shares)


def _fallback_to_city_mean(shares):
    """Replace an all-NaN composition with the table-wide mean."""
    if np.all(np.isnan(shares)):
        return mean_array
    return shares


poi_df['self'] = poi_df['self'].apply(_fallback_to_city_mean)

poi_df.head()

In [None]:
# Search radii handed to getsurround() as `dis`; they are compared against disN7() output, which is in km.
distance = [0.5,1,2,5,10]

def getsurround(lon1,lat1,dis):
    """Return the racial composition of all CBGs within ``dis`` km of a point.

    The distance from (``lon1``, ``lat1``) to the longitude/latitude of every
    row in the module-level ``cbg_feature_df`` is computed with ``disN7``.
    Rows strictly closer than ``dis`` are aggregated with ``getracial``.

    Returns:
        The array produced by ``getracial`` for the selected rows, or
        ``np.nan`` when no row lies within the radius.
    """
    # Named distance_km so it does not shadow the module-level `distance` list.
    distance_km = disN7(lon1, lat1, cbg_feature_df['longitude'], cbg_feature_df['latitude'])
    nearby_cbgs = cbg_feature_df[distance_km < dis]
    if nearby_cbgs.empty:
        # np.NaN was removed in NumPy 2.0; np.nan works on every version.
        return np.nan
    return getracial(nearby_cbgs)

# For every radius, attach the surrounding racial composition to each POI.
# POIs with no CBG inside the radius inherit their own-CBG composition.
for radius_km in distance:
    surrounding = poi_df.apply(
        lambda poi: getsurround(poi['longitude'], poi['latitude'], radius_km),
        axis=1,
    )
    poi_df[radius_km] = surrounding
    # Report how many POIs had no CBG within this radius before filling.
    print(radius_km, poi_df[radius_km].isna().sum())
    poi_df[radius_km] = poi_df[radius_km].fillna(poi_df['self'])

poi_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_gravity_datasets.csv', index=False)
poi_df.head()

## Embedding Component Datasets

In [None]:
import pandas as pd
import numpy as np
import ast
import random
from sklearn.model_selection import train_test_split
import tqdm

max_review_num = 50  # cap on the reviews kept per POI in the test split
MAX_TRAIN_REVIEW=15  # reviews drawn per sample for the train/val splits
image_files = os.listdir(data_path+f'/data/{city}/')
pattern = f'{city}_poi_with_yelp_review_image_imagestext_'
csv_files = [file for file in image_files if file.startswith(pattern) and file.endswith('.csv')]
if not csv_files:
    # Fail with the searched location instead of a bare IndexError on csv_files[0].
    raise FileNotFoundError(f"No '{pattern}*.csv' file found in {data_path}/data/{city}/")
review_df = pd.read_csv(data_path+f'/data/{city}/'+csv_files[0],usecols=['placekey','review_num','review'])

seg_df = pd.read_csv(data_path+f'/data/{city}/{city}_2019_segregationindex.csv',usecols=['placekey','racial_segregation_index'])
df = pd.merge(review_df,seg_df,on='placekey',how='inner')

# 60/20/20 train/val/test split. Passing `df` once yields the same partition
# as passing it twice: the shuffled indices depend only on the sample count
# and random_state.
train_df, temp_test_df = train_test_split(df, test_size=0.4, random_state=42)
# Work on explicit copies so the column assignments below do not raise
# pandas' SettingWithCopyWarning.
train_df = train_df.copy()
temp_test_df = temp_test_df.copy()
# The 'review' column is stored as the text of a Python literal; parse it back.
train_df['review'] = train_df['review'].apply(ast.literal_eval)
temp_test_df['review'] = temp_test_df['review'].apply(ast.literal_eval)

val_df, test_df = train_test_split(temp_test_df, test_size=0.5, random_state=42)
val_df = val_df.copy()
test_df = test_df.copy()

# NOTE(review): random.sample is unseeded, so the sampled reviews differ
# between runs even though the split itself is fixed by random_state.
# Test split: keep up to max_review_num reviews, stored as a list of strings.
test_df['review'] = test_df['review'].apply(lambda x:random.sample(x, min(len(x), max_review_num)))
test_df['review'] = test_df['review'].apply(lambda x: [f'Review {index+1}: {item["text"]}' for index, item in enumerate(x)])

# Validation split: keep up to MAX_TRAIN_REVIEW reviews, joined into one string.
val_df['review'] = val_df['review'].apply(lambda x:random.sample(x, min(len(x), MAX_TRAIN_REVIEW)))
val_df['review'] = val_df['review'].apply(lambda x: '\n'.join([f'Review {index+1}: {item["text"]}' for index, item in enumerate(x)]))


def get_sample_count(length):
    """Return how many review samples to draw for a POI with ``length`` reviews.

    Inclusive upper bounds map to counts as follows: <=10 -> 1, <=20 -> 2,
    <=40 -> 4, <=50 -> 5, <=80 -> 8, and anything larger -> 10.
    """
    thresholds = ((10, 1), (20, 2), (40, 4), (50, 5), (80, 8))
    for upper_bound, count in thresholds:
        if length <= upper_bound:
            return count
    return 10

def sample_review(review):
    """Draw several random review subsets from one POI's review list.

    The number of subsets comes from ``get_sample_count(len(review))``. Each
    subset holds ``min(len(review), MAX_TRAIN_REVIEW)`` reviews picked without
    replacement. Returns a list of those subsets.
    """
    draws = get_sample_count(len(review))
    subset_size = min(len(review), MAX_TRAIN_REVIEW)
    sampled_reviews = []
    for _ in range(draws):
        sampled_reviews.append(random.sample(review, subset_size))
    return sampled_reviews

# Expand every training POI into one row per sampled review subset; each
# subset is rendered as numbered lines joined into a single string.
new_rows = []
for _, poi_row in tqdm.tqdm(train_df.iterrows()):
    for subset in sample_review(poi_row['review']):
        numbered = [f'Review {pos+1}: {entry["text"]}' for pos, entry in enumerate(subset)]
        expanded_row = poi_row.copy()
        expanded_row['review'] = '\n'.join(numbered)
        new_rows.append(expanded_row)

new_train_df = pd.DataFrame(new_rows).reset_index(drop=True)
print(new_train_df.shape)
new_train_df.head()


# Attach the 0.5-radius population feature written by the gravity-dataset cell.
population = pd.read_csv(
    save_path+f'/data/train-dataset/{city}/{city}_gravity_datasets.csv',
    usecols=['placekey','0.5'],
)
new_train_df, val_df, test_df = [
    split.merge(population, on='placekey', how='inner')
    for split in (new_train_df, val_df, test_df)
]


new_train_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_popu+allreview_traindata.csv', index=False)
val_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_popu+allreview_valdata.csv', index=False)
test_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_popu+allreview_testdata.csv', index=False)

new_train_df.shape,val_df.shape,test_df.shape,new_train_df.columns

In [None]:
#train/val every review

import pandas as pd
import numpy as np
import ast
import random
from sklearn.model_selection import train_test_split
import tqdm

max_review_num = 50
MAX_TRAIN_REVIEW=15

# Load the review table and the segregation index, then join them per POI.
image_files = os.listdir(data_path+f'/data/{city}/')
pattern = f'{city}_poi_with_yelp_review_image_imagestext_'
csv_files = [name for name in image_files if name.startswith(pattern) and name.endswith('.csv')]
review_df = pd.read_csv(
    data_path+f'/data/{city}/'+csv_files[0],
    usecols=['placekey','review_num','review'],
)
seg_df = pd.read_csv(
    data_path+f'/data/{city}/{city}_2019_segregationindex.csv',
    usecols=['placekey','racial_segregation_index'],
)
df = pd.merge(review_df, seg_df, on='placekey', how='inner')


def _parse_sample_and_number(raw_reviews):
    """Parse the stored literal, keep up to max_review_num reviews, number them."""
    parsed = ast.literal_eval(raw_reviews)
    kept = random.sample(parsed, min(len(parsed), max_review_num))
    return [f'Review {pos+1}: {entry["text"]}' for pos, entry in enumerate(kept)]


# Every split keeps its reviews as a list of numbered strings.
df['review'] = df['review'].apply(_parse_sample_and_number)


# 60/20/20 train/val/test split with fixed seeds.
new_train_df, temp_test_df, *_ = train_test_split(df, df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_test_df, test_size=0.5, random_state=42)

# Attach the 0.5-radius population feature written by the gravity-dataset cell.
population = pd.read_csv(
    save_path+f'/data/train-dataset/{city}/{city}_gravity_datasets.csv',
    usecols=['placekey','0.5'],
)
new_train_df, val_df, test_df = [
    split.merge(population, on='placekey', how='inner')
    for split in (new_train_df, val_df, test_df)
]


new_train_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_popu+allreview_load_traindata.csv', index=False)
val_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_popu+allreview_load_valdata.csv', index=False)
test_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_popu+allreview_load_testdata.csv', index=False)

new_train_df.shape,val_df.shape,test_df.shape #((1072, 5), (1072, 5))

## GET Review Embedding
Note: The first Fusion step must run after the embedding fine-tuning step, so execute the following code only after the Embedding Component has been trained.

In [None]:
import pandas as pd
import glob
import os

city = 'Tucson'
# file_dir = '/code/RE`EM/model/R&E_embeddingMLP_example/embedding_result' #ATTENTION
# Placeholder: set this to the folder name of the trained embedding model.
emb_adapter_foler = 'trained model folder name'
file_dir = f"./code/REEM/trained-gte-model-trained-MLP/{city}/"+emb_adapter_foler


def _load_embedding_results(prefix):
    """Concatenate every ``<prefix>*.csv`` file found anywhere under ``file_dir``.

    Raises:
        FileNotFoundError: if no matching file exists, instead of the opaque
            "No objects to concatenate" error from ``pd.concat([])``.
    """
    frames = []
    for root, _dirs, files in os.walk(file_dir):
        for filename in files:
            if filename.startswith(prefix) and filename.endswith(".csv"):
                frames.append(pd.read_csv(os.path.join(root, filename)))
    if not frames:
        raise FileNotFoundError(f"No '{prefix}*.csv' files found under {file_dir!r}")
    return pd.concat(frames, ignore_index=True)


# One concatenated frame per split, gathered from the embedding result files.
train_df = _load_embedding_results("embedding_train_result_")
val_df = _load_embedding_results("embedding_val_result_")
test_df = _load_embedding_results("embedding_test_result_")


# Attach the 0.5-radius population feature written by the gravity-dataset cell.
population = pd.read_csv(save_path+f'/data/train-dataset/{city}/{city}_gravity_datasets.csv',usecols=['placekey','0.5'])
train_df = train_df.merge(population,on='placekey',how='inner')
val_df = val_df.merge(population,on='placekey',how='inner')
test_df = test_df.merge(population,on='placekey',how='inner')

train_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu_traindata.csv',index=False)
val_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu_valdata.csv',index=False)
test_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu_testdata.csv',index=False)

train_df.shape,val_df.shape,test_df.shape

## Reasoning Component Datasets


In [None]:
import pandas as pd
import ast
import os

# NOTE(review): 'target-city' looks like a placeholder; the files read below
# are named after `city`, so it presumably has to match the city used when
# the embedding datasets were written. Confirm before running.
city = 'target-city'
file_dir = f'./data/{city}/rating/'
save_path = './'

# Collect every rating result file below file_dir, then read them in that order.
rating_paths = []
for root, _dirs, files in os.walk(file_dir):
    for filename in files:
        if filename.startswith("rating_result_") and filename.endswith(".csv"):
            rating_paths.append(os.path.join(root, filename))
dfs = [pd.read_csv(path) for path in rating_paths]
print(len(dfs))

# Keep a single visitor_home_cbgs column: drop the '_y' copy and rename the '_x' copy.
file = (
    pd.concat(dfs, ignore_index=True)
    .drop(columns=['visitor_home_cbgs_y'])
    .rename(columns={'visitor_home_cbgs_x': 'visitor_home_cbgs'})
)
print(file.shape)
# 'rating' is stored as the text of a Python literal; parse it back.
file['rating'] = file['rating'].apply(ast.literal_eval)


train_df = pd.read_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu_traindata.csv')
val_df = pd.read_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu_valdata.csv')
test_df = pd.read_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu_testdata.csv')

# Attach the ratings to every split by POI key.
train_df, val_df, test_df = [
    split.merge(file, on='placekey', how='inner')
    for split in (train_df, val_df, test_df)
]

train_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu&rating_traindata.csv', index=False)
val_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu&rating_valdata.csv', index=False)
test_df.to_csv(save_path+f'/data/train-dataset/{city}/{city}_allembedding&popu&rating_testdata.csv', index=False)

train_df.shape,val_df.shape,test_df.shape