In [1]:
import pandas as pd
import geohash2 as gh
from tqdm import tqdm
import json
from datetime import datetime

In [2]:
# Load the GPS records. Later cells read the 'Latitude', 'Longitude',
# 'Person ID' and 'Timestamp' columns of this frame.
data = pd.read_csv('../model/gps_data.csv')

def create_geohash(lat, lon, precision=12):
	"""Encode a latitude/longitude pair as a geohash string.

	Args:
		lat: Latitude in decimal degrees.
		lon: Longitude in decimal degrees.
		precision: Number of characters in the returned hash. The default of
			12 matches the 12-character hashes this notebook has produced so
			far; a smaller value yields larger cells, so more records share
			a hash.

	Returns:
		The geohash string returned by ``gh.encode``.
	"""
	return gh.encode(lat, lon, precision)

In [3]:
# Tag every GPS record with the geohash of its coordinates. Walking the two
# coordinate columns in lockstep yields the same values as a row-wise
# DataFrame.apply(axis=1) without constructing a Series for every row.
data['Geohash'] = [
	create_geohash(lat, lon)
	for lat, lon in zip(data['Latitude'], data['Longitude'])
]

In [14]:
# Collapse the GPS records into one row per geohash cell.
#
# 'Person ID' and 'Timestamp' are built from one record per (cell, person) --
# the first such record in row order -- so both lists have the same length and
# position k in each refers to the same person. Deduplicating the IDs with
# set() while collecting every timestamp does not preserve that pairing, and
# the pair-building loop indexes both lists with the same position.
#
# 'Latitude' / 'Longitude' remain the mean over *all* records in the cell.
# Resulting columns: Geohash, Person ID, Timestamp, Latitude, Longitude.
grouped_data = (
	data.drop_duplicates(subset=['Geohash', 'Person ID'])
	.groupby('Geohash')
	.agg({
		'Person ID': list,
		'Timestamp': list
	})
	.join(data.groupby('Geohash')[['Latitude', 'Longitude']].mean())
	.reset_index()
)

In [15]:
# Display the per-geohash aggregation as this cell's output.
grouped_data

Unnamed: 0,Geohash,Person ID,Timestamp,Latitude,Longitude
0,wx4d5wzfphbt,[24],[2009-01-12 01:25:12],39.764011,116.180378
1,wx4d5y8z3u65,[39],[2009-05-11 00:36:27],39.763536,116.181528
2,wx4d5y9mxrgz,[39],[2009-05-11 00:36:32],39.763255,116.182453
3,wx4d5y9q3g9p,[24],[2009-01-12 01:25:22],39.763361,116.182213
4,wx4d5yb1hxk2,[39],[2009-05-11 00:36:22],39.763852,116.180619
...,...,...,...,...,...
16139033,wx4uj5s8k7d1,[163],[2009-02-21 04:16:35],40.097413,116.593279
16139034,wx4uj5sx1h1t,[82],[2008-08-01 08:43:00],40.098575,116.593139
16139035,wx4uj5t41491,[10],[2007-12-28 09:06:12],40.097708,116.593825
16139036,wx4uj5thgkku,[144],[2008-12-16 01:40:29],40.098190,116.593927


In [17]:
# One dict per unordered pair of distinct people recorded in the same geohash cell.
location_pairs = []

# Walk the columns in lockstep rather than via grouped_data.iterrows(), which
# builds a Series for every row.
cells = zip(
	grouped_data['Geohash'],
	grouped_data['Person ID'],
	grouped_data['Timestamp'],
	grouped_data['Latitude'],
	grouped_data['Longitude'],
)

# total= is passed explicitly because the zip iterator has no len(); without
# it tqdm can only show a running count, not a progress bar or ETA.
for geohash, person_ids, timestamps, latitude, longitude in tqdm(cells, total=len(grouped_data)):
	# NOTE(review): person_ids[k] and timestamps[k] are treated as belonging to
	# the same person -- confirm the aggregation that builds grouped_data keeps
	# the two lists index-aligned.
	visitor_count = len(person_ids)
	if visitor_count < 2:
		# A cell with a single visitor cannot produce a pair.
		continue

	# Create pairs of different people at the same location with their timestamps
	for i in range(visitor_count):
		for j in range(i + 1, visitor_count):
			location_pairs.append({
				'Geohash': geohash,
				'Person ID 1': person_ids[i],
				'Person ID 2': person_ids[j],
				'Timestamps': {
					person_ids[i]: timestamps[i],
					person_ids[j]: timestamps[j]
				},
				'Latitude': latitude,
				'Longitude': longitude
			})

16139038it [10:01, 26842.61it/s]


In [19]:
# Serialise every co-location pair to the edges file as indented JSON.
with open('../files/edges/new_geolife.json', mode='w') as edges_out:
	json.dump(location_pairs, fp=edges_out, indent=4)