In [1]:
import os
import json

import numpy as np
import pandas as pd
import geohash2 as gh

from tqdm import tqdm

In [2]:
if not os.path.isdir('images/'):
	os.makedirs('images/')
if not os.path.isdir('htmls/'):
	os.makedirs('htmls/')
if not os.path.isdir('data/'):
	os.makedirs('data/')

# 2. Load geolife data

In [3]:
data_path = './geolife/Data/'

dataframes = []
index = 0

for user_folder in tqdm(os.listdir(data_path)):
	user_folder_path = os.path.join(data_path, user_folder)

	if os.path.isdir(user_folder_path):
		trajectory_dir = os.path.join(user_folder_path, 'Trajectory')
		
		if os.path.exists(trajectory_dir) and os.path.isdir(trajectory_dir):
			for trajectory_file in os.listdir(trajectory_dir):
				if trajectory_file.endswith('.plt'):
					trajectory_file_path = os.path.join(trajectory_dir, trajectory_file)
					df = pd.read_csv(trajectory_file_path, header=None, skiprows=6)
					df.insert(0, 'person', trajectory_file.replace(".plt", ""))
					df.insert(0, 'file', int(user_folder))
					dataframes.append(df)
			index += 1

gps_data = pd.concat(dataframes, ignore_index=True)
column_names = ['Person ID','Trajectory', 'Latitude', 'Longitude', '0', 'Altitude', 'NumDays', 'Date', 'Time']
gps_data.columns = column_names
gps_data['Timestamp'] = pd.to_datetime(gps_data['Date'] + ' ' + gps_data['Time'])
gps_data = gps_data.drop(columns=['NumDays', 'Date', 'Time', '0'])
gps_data = gps_data[gps_data['Latitude'] >= 39.75]
gps_data = gps_data[gps_data['Latitude'] <= 40.1]
gps_data = gps_data[gps_data['Longitude'] >= 116.18]
gps_data = gps_data[gps_data['Longitude'] <= 116.6]
gps_data.to_csv("data/geolife_gps_data.csv", index=False)

100%|██████████| 182/182 [03:03<00:00,  1.01s/it]


# 3. Load geolife labels

In [4]:
data_path = './geolife/Data/'

dataframes = []

for user_folder in os.listdir(data_path):
	user_folder_path = os.path.join(data_path, user_folder)

	labels_file_path = os.path.join(user_folder_path, 'labels.txt')
					
	if os.path.exists(labels_file_path):
		labels_df = pd.read_csv(labels_file_path, sep='\t')
		
		dataframes.append(labels_df)

data_labels = pd.concat(dataframes, ignore_index=True)
data_labels.to_csv("data/geolife_labels_data.csv", index=False)

# 4. Create Nodes

In [5]:
unique_taxi_ids = gps_data['Person ID'].unique()
unique_taxi_ids_df = pd.DataFrame({'Person ID': unique_taxi_ids})
unique_taxi_ids_df = unique_taxi_ids_df.sort_values(by='Person ID')
unique_taxi_ids_df = unique_taxi_ids_df.reset_index(drop=True)
unique_taxi_ids_df.to_csv('data/geolife_nodes.csv', index=False)

# 5. Add Geohash codes

In [6]:
def create_geohash(lat, lon):
	return gh.encode(lat, lon, precision=8)

In [7]:
codes = []
columns = gps_data.columns.to_list()
lat = columns.index('Latitude')
lon = columns.index('Longitude')
for i in tqdm(range(len(gps_data))):
	codes.append(create_geohash(gps_data.iat[i, lat], gps_data.iat[i, lon]))

geohash_data = gps_data.copy(deep=True)
geohash_data['Geohash'] = codes

geohash_data.to_csv('data/geolife_geohash_size_8.csv',index=False)

100%|██████████| 18165801/18165801 [09:13<00:00, 32832.43it/s]


# 6. Group users based on geohash codes

In [17]:
# create pairs of meetings for two persons
def createEdges(geohash_data):
	locations_meets = []
	print("loaded")
	geohash_count = len(geohash_data['Geohash'].unique())
	print(geohash_count)
	print("grouping")
	codes = geohash_data.groupby('Geohash')
	print("grouped")

	t = 0
	k = 0
	for code, group in tqdm(codes):
		t += 1
		lat = group['Latitude'].mean()
		lon = group['Longitude'].mean()
		person_ids = group['Person ID'].values.tolist()
		times = group['Timestamp'].values.tolist()

		locations_meets.append({
			'Geohash': code,
			"persons": person_ids,
			"times": times,
			'Latitude': lat,
			'Longitude': lon
		})

		if len(locations_meets) > 1000:
			if not (os.path.isdir('data/geolife_edges/')):
				os.makedirs('data/geolife_edges/')

			with open('data/geolife_edges/geolige_meets_'+str(k)+'.json', 'w') as json_file:
				json.dump(locations_meets, json_file, indent=4)

			k += 1
			locations_meets = []
	
	print(str(t / float(geohash_count)), t, '/', geohash_count)
	
	if len(locations_meets) > 0:
		if not (os.path.isdir('data/geolife_edges/')):
			os.makedirs('data/geolife_edges/')

		with open('data/geolife_edges/geolife_meets_' + str(k) + '.json', 'w') as json_file:
			json.dump(locations_meets, json_file, indent=4)

	return k

In [18]:
k = createEdges(geohash_data)
print(k)
edges = []

# for i in tqdm(range(421)):
for i in tqdm(range(k)):
	d = pd.read_json('data/geolife_edges/geolige_meets_' + str(i) + '.json')
	edges.append(d.copy(deep=True))
edges = pd.concat(edges)
edges.to_csv('data/geolife_edges_size_8.csv', index=False)

loaded
421526
grouping
grouped


100%|██████████| 421526/421526 [01:39<00:00, 4221.67it/s]


1.0 421526 / 421526
421


100%|██████████| 421/421 [00:10<00:00, 39.71it/s]


# 7. Geohash meets without time

In [19]:
def create_meet_geohash():
	edges_all = pd.read_csv('data/geolife_edges_size_8.csv')
	persons = []
	p_times = []
	count_p = []
	count_t = []
	t_diff = []
	
	for i in tqdm(range(len(edges_all))):
		x = json.loads(edges_all.iloc[i]['times'].replace("'","\""))
		p = json.loads(edges_all.iloc[i]['persons'])
		assert len(x) == len(p)
		t = {}
		for j in np.unique(p):
			t[str(j)] = []
		for j in range(len(x)):
			t[str(p[j])].append(x[j])
		persons.append(json.dumps(np.unique(p).tolist()))
		p_times.append(json.dumps(t))
		count_p.append(len(np.unique(p)))
		count_t.append(len(x))
		xt = np.array(x, dtype='datetime64[s]')
		t_diff.append(abs(np.timedelta64(xt.max() - xt.min(), 's').astype('int')))

	meet_edges = edges_all.copy(deep=True)
	meet_edges['persons'] = persons
	meet_edges['times'] = p_times
	meet_edges['count_p'] = count_p
	meet_edges['count_t'] = count_t
	meet_edges['diff_time'] = t_diff

	meet_edges.to_csv("data/geolife_groupby_geohash_size_8.csv", index=False)
	meet_edges[(meet_edges['diff_time'] >= 0) & (meet_edges['count_p'] > 1)].to_csv("data/geolife_meet_geohash_size_8.csv", index=False)


In [20]:
create_meet_geohash()

100%|██████████| 421421/421421 [01:04<00:00, 6565.02it/s]


In [21]:
def create_meet_edges_without_time():

	print('loading')
	meet_geohash = pd.read_csv("data/geolife_meet_geohash_size_8.csv")
	meets = {}
	print('starting')
	for i in tqdm(range(len(meet_geohash))):
		p = json.loads(meet_geohash.iloc[i]['persons'])
		for a in range(len(p)):
			a_key = p[a]
			for b in range(a+1,len(p)):
				b_key = p[b]
						
				a_in = a_key in meets.keys()
				b_in = b_key in meets.keys()
				if not a_in and not b_in:
					meets[a_key]=set()
					meets[a_key].add(b_key)
				elif a_in:
					meets[a_key].add(b_key)
				elif b_in:
					meets[b_key].add(a_key)
				else:
					raise Exception("")

	edges = []
	for a in meets.keys():
		for b in meets[a]:
			edges.append([a,b])
	edges = pd.DataFrame(edges, columns=['A', 'B'])
	edges.to_csv('data/geolife_meet_edges.csv',index=False)


In [22]:
create_meet_edges_without_time()

loading
starting


100%|██████████| 240746/240746 [00:10<00:00, 23149.52it/s]


# 8. Geohash meets with time

In [23]:
def create_meet_edges(min_seconds, max_seconds, min_times, max_times):

	print('loading')
	meet_geohash = pd.read_csv("data/geolife_meet_geohash_size_8.csv")
	meet_geohash = meet_geohash[(meet_geohash['count_t'] <= max_times) & (meet_geohash['count_t'] >= min_times)].copy(deep=True)
	meet_geohash.reset_index(drop=True, inplace=True)
	meets = {}
	print('starting')
	for i in tqdm(range(len(meet_geohash))):
		users = json.loads(meet_geohash.iloc[i]['times'])
		keys = list(users.keys())
		for a in range(len(keys)):
			a_key = keys[a]
			a_times = np.array(users[a_key], dtype='datetime64[s]')
			for b in range(a+1,len(keys)):
				b_key = keys[b]
				b_times = np.array(users[b_key], dtype='datetime64[s]')

				for a_time in a_times:
					for b_time in b_times:
						diff = abs(np.timedelta64(b_time - a_time, 's').astype('int'))
						if min_seconds <= diff <= max_seconds:
							a_in = a_key in meets.keys()
							b_in = b_key in meets.keys()
							if not a_in and not b_in:
								meets[a_key]=set()
								meets[a_key].add(b_key)
							elif a_in:
								meets[a_key].add(b_key)
							elif b_in:
								meets[b_key].add(a_key)
							else:
								raise Exception("")

	for i in meets.keys():
		meets[i]=list(meets[i])
		
	if not (os.path.isdir('data/meets/')):
			os.makedirs('data/meets/')
			
	with open('data/meets/meets_size_8_'+str(min_seconds)+'-'+str(max_seconds)+'s-'+str(min_times)+'-'+str(max_times)+'times.json', "w") as write_file:
		json.dump(meets, write_file)

	edges = []
	for a in meets.keys():
		for b in meets[a]:
			edges.append([a,b])
	edges = pd.DataFrame(edges, columns=['A', 'B'])
	edges.to_csv('data/meets/geolife_meet_edges_size_8_'+str(min_seconds)+'-'+str(max_seconds)+'s-'+str(min_times)+'-'+str(max_times)+'times.csv',index=False)


In [24]:
# create_meet_edges(0, 120, 0, 10)
create_meet_edges(0, 120, 0, 100)
# create_meet_edges(0, 120, 0, 1000)


loading
starting



overflow encountered in scalar absolute

100%|██████████| 206349/206349 [04:28<00:00, 768.84it/s] 
