## Build the adjacency matrices, the train_idx / val_idx / test_idx splits, and the corresponding combined file

In [None]:
import os
import pandas as pd
import numpy as np
from numpy import radians, sin, cos, arcsin, sqrt

In [None]:
def disN7(lon1, lat1, lon2, lat2):
    """Return the haversine (great-circle) distance between two points.

    All four arguments are given in degrees and are converted to radians
    one by one, so each may be a scalar or an array-like (the result then
    broadcasts accordingly).

    The result is the central angle multiplied by 6371 -- presumably the
    mean Earth radius in kilometres, so the distance is in km.
    """
    lam1, phi1, lam2, phi2 = (radians(v) for v in (lon1, lat1, lon2, lat2))
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = sin(delta_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2) ** 2
    central_angle = 2 * arcsin(sqrt(hav))
    radius = 6371
    return central_angle * radius

# load csv

In [None]:
def LoadDf(csv_dir):
    """Read a POI CSV and convert its serialized columns to float32 data.

    Args:
        csv_dir: Path of the CSV file handed to ``pd.read_csv``.

    Returns:
        The loaded DataFrame where
        * ``racial_segregation_index`` is cast to ``np.float32``;
        * each ``0.5`` cell (text with one leading and one trailing
          character around whitespace-separated numbers) becomes a
          float32 array;
        * each ``embedding`` / ``rating`` cell is evaluated as a Python
          expression and turned into a float32 array.
    """
    frame = pd.read_csv(csv_dir)

    seg_col = 'racial_segregation_index'
    frame[seg_col] = frame[seg_col].astype(np.float32)

    def _parse_space_separated(cell):
        # Drop the first and last character, then parse the numbers between.
        inner = cell[1:-1]
        return np.fromstring(inner, sep=' ').astype(np.float32)

    frame['0.5'] = frame['0.5'].apply(_parse_space_separated)

    # NOTE(review): eval() executes whatever text is stored in the CSV.
    # Only load files from a trusted source, or consider switching to
    # ast.literal_eval once the stored format is confirmed to be literals.
    for col in ('embedding', 'rating'):
        frame[col] = frame[col].apply(
            lambda cell: np.array(eval(cell)).astype(np.float32)
        )
    return frame

In [None]:
# Where the per-city split CSVs live and how they are named:
#   <dataset_dir>/<city>/<city>_<datasets_file>_{train,val,test}data.csv
dataset_dir = './data/train-dataset/'
city = 'New Orleans'
datasets_file = 'allembedding&popu&rating'

# Build the three paths from the shared stem, then load each split.
_split_stem = f'{city}/{city}_{datasets_file}'
train_dir, val_dir, test_dir = (
    os.path.join(dataset_dir, f'{_split_stem}_{_split}data.csv')
    for _split in ('train', 'val', 'test')
)
train_df, val_df, test_df = (
    LoadDf(_path) for _path in (train_dir, val_dir, test_dir)
)

In [None]:

print('raw shape', train_df.shape, val_df.shape, test_df.shape)

# Collect every placekey that appears in more than one split and drop it
# from all three splits, so that no POI leaks between train / val / test.
#
# Only the key column takes part in the joins: merging the full frames
# would copy every column (including the object-array columns) just to
# read 'placekey' back out. The resulting key Series are the same.
_train_key_col = train_df[['placekey']]
_val_key_col = val_df[['placekey']]
_test_key_col = test_df[['placekey']]
duplicates_train_val = _train_key_col.merge(_val_key_col, on='placekey', how='inner')['placekey']
duplicates_train_test = _train_key_col.merge(_test_key_col, on='placekey', how='inner')['placekey']
duplicates_val_test = _test_key_col.merge(_val_key_col, on='placekey', how='inner')['placekey']
duplicates = pd.concat([duplicates_train_val, duplicates_train_test, duplicates_val_test]).unique()

train_df = train_df[~train_df['placekey'].isin(duplicates)]
val_df = val_df[~val_df['placekey'].isin(duplicates)]
test_df = test_df[~test_df['placekey'].isin(duplicates)]
print('processed shape', train_df.shape, val_df.shape, test_df.shape)

In [None]:
train_list = train_df.placekey.tolist()
val_list = val_df.placekey.tolist()
test_list = test_df.placekey.tolist()
all_df = pd.concat([train_df, val_df, test_df])

# Membership sets are built once: testing `x in some_list` for every row
# is a linear scan each time, which makes the labelling quadratic.
_train_keys = set(train_list)
_val_keys = set(val_list)
_test_keys = set(test_list)


def assign_split(x):
    """Map a placekey to its split code.

    Returns 2 for a train key, 1 for a validation key, 0 for a test key,
    and None when the key belongs to none of the three splits. Train is
    checked first, then validation, then test.
    """
    if x in _train_keys:
        return 2
    if x in _val_keys:
        return 1
    if x in _test_keys:
        return 0
    return None


all_df['split'] = all_df['placekey'].apply(assign_split)
# Keep one row per placekey and renumber rows 0..n-1; the adjacency
# matrices built later are indexed by these row positions.
all_df = all_df.drop_duplicates(subset='placekey', keep='first')
all_df = all_df.reset_index(drop=True)

all_df.shape,all_df.columns,all_df.iloc[-1].name

## GET Adj

In [None]:
#Review
from sklearn.metrics.pairwise import cosine_similarity

# Connect every node to the Neigh_num nodes whose 'embedding' vectors are
# most cosine-similar to its own.
Neigh_num = 5
embedding_matrix = np.vstack(all_df['embedding'].values)
similarity_matrix = cosine_similarity(embedding_matrix)
adjacency_matrix = np.zeros((len(all_df), len(all_df)), dtype=int)
for i, similarity_vector in enumerate(similarity_matrix):
    # similarity_vector is a view of row i, so this overwrites the diagonal
    # entry in similarity_matrix and keeps the node from picking itself.
    similarity_vector[i] = -1
    nearest_indices = np.argsort(-similarity_vector)[:Neigh_num]
    # Update the adjacency matrix: one directed edge i -> j per neighbour.
    adjacency_matrix[i, nearest_indices] = 1

# Save the adjacency matrix to a text file, one "i,j" line per edge.
# np.nonzero yields the set entries in row-major order, i.e. the same
# order as scanning all (i, j) pairs, without an n*n Python-level loop.
save_dif = os.path.join(dataset_dir,f'{city}/{city}_review_adj{Neigh_num}.txt')
edge_src, edge_dst = np.nonzero(adjacency_matrix)
with open(save_dif, 'w') as f:
    f.writelines(f"{src},{dst}\n" for src, dst in zip(edge_src, edge_dst))

print(f"saved in {save_dif}")

In [None]:
#rating
from sklearn.metrics.pairwise import cosine_similarity

# Connect every node to the Neigh_num nodes whose 'rating' vectors are
# most cosine-similar to its own.
Neigh_num = 5
embedding_matrix = np.vstack(all_df['rating'].values)
similarity_matrix = cosine_similarity(embedding_matrix)
adjacency_matrix = np.zeros((len(all_df), len(all_df)), dtype=int)
for i, similarity_vector in enumerate(similarity_matrix):
    # similarity_vector is a view of row i, so this overwrites the diagonal
    # entry in similarity_matrix and keeps the node from picking itself.
    similarity_vector[i] = -1
    nearest_indices = np.argsort(-similarity_vector)[:Neigh_num]
    # Update the adjacency matrix: one directed edge i -> j per neighbour.
    adjacency_matrix[i, nearest_indices] = 1

# Save the adjacency matrix to a text file, one "i,j" line per edge.
# np.nonzero yields the set entries in row-major order, i.e. the same
# order as scanning all (i, j) pairs, without an n*n Python-level loop.
save_dif = os.path.join(dataset_dir,f'{city}/{city}_rating_adj{Neigh_num}.txt')
edge_src, edge_dst = np.nonzero(adjacency_matrix)
with open(save_dif, 'w') as f:
    f.writelines(f"{src},{dst}\n" for src, dst in zip(edge_src, edge_dst))

print(f"Saved in {save_dif} ")

In [None]:
#location
def disN7(lon1, lat1, lon2, lat2):
    """Return the haversine (great-circle) distance between two points.

    All four arguments are given in degrees and are converted to radians
    one by one, so each may be a scalar or an array-like (the result then
    broadcasts accordingly).

    The result is the central angle multiplied by 6371 -- presumably the
    mean Earth radius in kilometres, so the distance is in km.
    """
    lam1, phi1, lam2, phi2 = (radians(v) for v in (lon1, lat1, lon2, lat2))
    delta_lam = lam2 - lam1
    delta_phi = phi2 - phi1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = sin(delta_phi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(delta_lam / 2) ** 2
    central_angle = 2 * arcsin(sqrt(hav))
    radius = 6371
    return central_angle * radius

Neigh_num = 5

# Connect every node to its Neigh_num geographically nearest nodes.
# Coordinates are pulled out as plain NumPy arrays so that everything below
# is positional: matrix row i is always the i-th row of all_df, whatever
# the DataFrame's index labels are, and the sort follows NumPy rules
# rather than pandas' Series.argsort handling.
longitudes = all_df['longitude'].to_numpy()
latitudes = all_df['latitude'].to_numpy()

adjacency_matrix = np.zeros((len(all_df), len(all_df)), dtype=int)
for i, (lon, lat) in enumerate(zip(longitudes, latitudes)):
    distances = disN7(lon, lat, longitudes, latitudes)
    # A node must not be chosen as its own neighbour.
    distances[i] = float('inf')
    nearest_indices = np.argsort(distances)[:Neigh_num]
    # Update the adjacency matrix: one directed edge i -> j per neighbour.
    adjacency_matrix[i, nearest_indices] = 1

# Save the adjacency matrix to a text file, one "i,j" line per edge.
# np.nonzero yields the set entries in row-major order, i.e. the same
# order as scanning all (i, j) pairs, without an n*n Python-level loop.
save_dif = os.path.join(dataset_dir,f'{city}/{city}_location_adj{Neigh_num}.txt')
edge_src, edge_dst = np.nonzero(adjacency_matrix)
with open(save_dif, 'w') as f:
    f.writelines(f"{src},{dst}\n" for src, dst in zip(edge_src, edge_dst))

print(f"Saved in {save_dif}")

In [None]:
def read_adjacency_matrix_from_txt(file_path, num_nodes):
    """Rebuild a dense adjacency matrix from an edge-list text file.

    Args:
        file_path: Text file with one ``i,j`` pair of integers per line.
        num_nodes: Side length of the square matrix to allocate.

    Returns:
        A ``(num_nodes, num_nodes)`` integer array holding 1 at every
        ``[i, j]`` listed in the file and 0 elsewhere.

    Raises:
        ValueError: If a non-blank line is not two comma-separated integers.
        IndexError: If an index does not fit into the matrix.
    """
    adjacency_matrix = np.zeros((num_nodes, num_nodes), dtype=int)

    with open(file_path, 'r') as f:
        for line in f:
            entry = line.strip()
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on int('').
            if not entry:
                continue
            i, j = map(int, entry.split(','))
            adjacency_matrix[i, j] = 1

    return adjacency_matrix


# Round-trip check: read the edge list back into a dense matrix.
# In file order, the most recent assignment to save_dif is the location
# adjacency path, so that is the file being reloaded here.
num_nodes = len(all_df)
adjacency_matrix = read_adjacency_matrix_from_txt(save_dif, num_nodes)
# Notebook display of the reloaded matrix's shape.
adjacency_matrix.shape

## Save

In [None]:
# Flatten the array-valued columns to comma-separated text, then write the
# combined table (all splits plus the 'split' label) to <city>_GNNGraph.csv.
# NOTE: this overwrites the columns of all_df in place, so the cell is
# meant to be run once per freshly built all_df.
for _col in ('embedding', 'rating'):
    all_df[_col] = all_df[_col].apply(lambda vec: ','.join(str(v) for v in vec))

_graph_csv = os.path.join(dataset_dir, f'{city}/{city}_GNNGraph.csv')
all_df.to_csv(_graph_csv, index=False)