In [1]:
import json
import os
import pandas as pd
import load_geolife
import load_tdrive
import add_geohash
import create_nodes
import geohash_location_count
import geohash_user_count
import create_geohash_meet
import create_geohash_intervals
import geohash_pairs
import find_duplicates

In [3]:
def create_dirs():
	"""Create every output directory used by this notebook, if it is missing.

	Safe to call repeatedly: directories that already exist are left untouched.
	Paths are relative to the current working directory.
	"""
	required_dirs = (
		'images/',
		'htmls/',
		'data/',
		'data/duplicates',
		'data/geolife_geohash_intervals',
		# The original listed 'data/tdrive_pairs' twice; the geolife/tdrive pairing of
		# the neighbouring entries suggests the first one was meant to be this
		# directory. NOTE(review): confirm that a stage actually writes here.
		'data/geolife_pairs',
		'data/tdrive_geohash_intervals',
		'data/tdrive_pairs',
		# The local/global "meets" cells of this notebook write their CSVs into
		# 'data2/'; DataFrame.to_csv does not create missing directories.
		'data2/',
	)
	for directory in required_dirs:
		# exist_ok=True replaces the separate isdir() check and is race-free.
		os.makedirs(directory, exist_ok=True)


def removeDuplicates(in_file, duplicates_file, out_file):
	"""Copy `in_file` to `out_file`, leaving out rows that belong to duplicate tracks.

	A track is identified by the key "<Person ID>_<Trajectory>.plt" built from the
	columns of `in_file`; it is dropped when the same key appears in
	`duplicates_file` as "<user>_<file_name>". The key is stored in a helper
	column named `px`, which is also written to `out_file`.
	"""
	known_duplicates = pd.read_csv(duplicates_file)
	tracks = pd.read_csv(in_file)

	# Build the matching key on the data side ...
	person = tracks['Person ID'].astype('string')
	trajectory = tracks['Trajectory'].astype('string')
	tracks['px'] = person + "_" + trajectory + ".plt"

	# ... and on the duplicates side (file_name is used as-is, without adding ".plt").
	duplicate_keys = known_duplicates['user'].astype('string') + "_" + known_duplicates['file_name'].astype('string')

	is_duplicate = tracks['px'].isin(duplicate_keys)
	tracks[~is_duplicate].to_csv(out_file, index=False)

In [4]:
# Display settings for DataFrame output in this notebook:
# show at most 10 columns and allow lines up to 1000 characters wide.
pd.options.display.max_columns = 10
pd.options.display.width = 1000

In [10]:
# GeoLife pipeline. Every stage reads files written by an earlier stage, so the
# call order matters. Stage descriptions are inferred from the helper names.
GEOLIFE_RAW_DIR = './geolife/Data/'
GEOLIFE_DUPLICATES_DIR = './data/duplicates/'
GEOLIFE_GPS_CSV = './data/geolife_gps_data.csv'
GEOLIFE_GEOHASH_CSV = './data/geolife_geohash_size_8.csv'
GEOLIFE_DEDUP_CSV = './data/geolife_geohash_size_8_no_duplicates.csv'
GEOLIFE_MEET_DIR = './data/geolife_geohash_meet/'
GEOLIFE_MEET_CSV = './data/geolife_geohash_meet_size_8.csv'

create_dirs()

# Load the raw data and attach geohashes.
load_geolife.load_geolife(GEOLIFE_RAW_DIR, GEOLIFE_GPS_CSV, './data/geolife_labels_data.csv')
add_geohash.add_geohash(GEOLIFE_GPS_CSV, GEOLIFE_GEOHASH_CSV)

# Detect duplicate tracks and drop the ones flagged by content.
find_duplicates.find_duplicates_by_filename(GEOLIFE_RAW_DIR, GEOLIFE_DUPLICATES_DIR)
find_duplicates.find_duplicates_by_content(GEOLIFE_RAW_DIR, GEOLIFE_DUPLICATES_DIR)
removeDuplicates(
	GEOLIFE_GEOHASH_CSV,
	'./data/duplicates/geolife_user_track_duplicates_by_content.csv',
	GEOLIFE_DEDUP_CSV,
)

# Derived tables, all computed from the de-duplicated data.
create_nodes.create_nodes(GEOLIFE_DEDUP_CSV, './data/geolife_nodes.csv')
geohash_location_count.geohash_location_count(GEOLIFE_DEDUP_CSV, './data/geolife_geohash_8_location_counts.csv')
geohash_user_count.geohash_user_count(GEOLIFE_DEDUP_CSV, './data/geolife_geohash_8_user_counts.csv')

# Meets: per-geohash JSON files -> one CSV -> interval CSV.
count_files = create_geohash_meet.createGeohashMeetJsons(GEOLIFE_DEDUP_CSV, GEOLIFE_MEET_DIR)
create_geohash_meet.createGeohashMeetCSV(count_files, GEOLIFE_MEET_DIR, GEOLIFE_MEET_CSV)
create_geohash_intervals.create_geohash_intervals(GEOLIFE_MEET_CSV, './data/geolife_geohash_meet_intervals_size_8.csv')

100%|██████████| 5046480/5046480 [1:08:02<00:00, 1236.26it/s]


Local meets (GeoLife)

In [4]:
import pandas as pd
import json
from tqdm import tqdm

def create_geohash_time_meets_local(in_file='data/geolife_geohash_meet_intervals_size_8.csv', out_file='data2/geolife_local_meets.csv'):
	"""Group the users that met at a geohash by (geohash, time) and write them to CSV.

	Parameters:
		in_file: CSV with a `geohash` column and a `meets_intervals` column. The
			latter holds a JSON object mapping a user id to a list of intervals.
		out_file: destination CSV with columns `geohash`, `time`, `person_ids`.
			Its parent directory is created when missing.

	For a row with several users, every pair of users with overlapping intervals
	(fields 1 and 2 of an interval are compared as its bounds) is recorded as a
	sorted tuple under the key (geohash, field 5 of the first user's interval).
	For a row with a single user, the bare user id is recorded under
	(geohash, field 5 of that user's first interval).

	NOTE(review): fields 1/2 look like start/end and field 5 like a time label;
	confirm against the code that writes `meets_intervals`.
	"""
	import os  # function-scope import: this cell only imports pandas, json and tqdm

	df = pd.read_csv(in_file)
	meets_dict = {}

	for _, row in tqdm(df.iterrows(), total=len(df)):
		meets_intervals = json.loads(row['meets_intervals'])
		keys = list(meets_intervals.keys())

		if len(keys) > 1:
			intervals = [meets_intervals[key] for key in keys]
			for i in range(len(keys)):
				for j in range(i + 1, len(keys)):
					# The pair key only depends on the two users, not on the intervals.
					pair = tuple(sorted([keys[i], keys[j]]))
					for interval_i in intervals[i]:
						for interval_j in intervals[j]:
							if interval_i[1] <= interval_j[2] and interval_j[1] <= interval_i[2]:
								meets_dict.setdefault((row['geohash'], interval_i[5]), set()).add(pair)
		elif keys:
			user = keys[0]
			user_intervals = meets_intervals[user]
			# Guard against a user with no intervals (previously an IndexError).
			if user_intervals:
				meets_dict.setdefault((row['geohash'], user_intervals[0][5]), set()).add(user)
		# A row without any user (previously a StopIteration) contributes nothing.

	# Sets have no stable order across runs; sort so the CSV is reproducible.
	# key=repr keeps this working if a set mixes pair tuples and bare user ids.
	new_data = [
		{'geohash': key[0], 'time': key[1], 'person_ids': sorted(members, key=repr)}
		for key, members in meets_dict.items()
	]
	# Explicit columns keep the header even when no meeting was found.
	new_df = pd.DataFrame(new_data, columns=['geohash', 'time', 'person_ids'])

	out_dir = os.path.dirname(out_file)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	new_df.to_csv(out_file, index=False)

# Run the GeoLife local-meets extraction; the result is written to data2/geolife_local_meets.csv.
create_geohash_time_meets_local()


415946it [00:43, 9624.37it/s] 


Global meets (GeoLife)

In [5]:
import pandas as pd
import json
from tqdm import tqdm

def create_geohash_time_meets_global(in_file='data/geolife_geohash_meet_intervals_size_8.csv', out_file='data2/geolife_global_meets.csv'):
	"""Aggregate, per pair of users, every place and the time span in which they met.

	Parameters:
		in_file: CSV with `geohash`, `Latitude`, `Longitude` and a
			`meets_intervals` column holding a JSON object that maps a user id to
			a list of intervals.
		out_file: destination CSV with columns `users`, `meetings`, `geohashes`,
			`time`. Its parent directory is created when missing.

	Two users meet in a row when one interval of each overlaps (fields 1 and 2 of
	an interval are compared as its bounds). For every such overlap the row's
	(Latitude, Longitude) is appended to `meetings` and its geohash is added to
	`geohashes`. `time` is [smallest field 1, largest field 2] seen so far.

	NOTE(review): `time` takes its lower bound only from the first user's
	interval and its upper bound only from the second user's, where "first" and
	"second" follow the key order of the JSON object. This is kept as in the
	original; confirm whether the union or intersection of both intervals was
	intended.
	"""
	import os  # function-scope import: this cell only imports pandas, json and tqdm

	df = pd.read_csv(in_file)
	meets_dict = {}

	for _, row in tqdm(df.iterrows(), total=len(df)):
		meets_intervals = json.loads(row['meets_intervals'])
		keys = list(meets_intervals.keys())
		intervals = [meets_intervals[key] for key in keys]

		# With fewer than two users the pair loops below simply do not run.
		for i in range(len(keys)):
			for j in range(i + 1, len(keys)):
				# The pair key only depends on the two users, not on the intervals.
				pair = tuple(sorted([keys[i], keys[j]]))
				for interval_i in intervals[i]:
					for interval_j in intervals[j]:
						if not (interval_i[1] <= interval_j[2] and interval_j[1] <= interval_i[2]):
							continue
						info = meets_dict.setdefault(pair, {'users': pair, 'meetings': [], 'geohashes': set(), 'time': [None, None]})
						info['meetings'].append((row['Latitude'], row['Longitude']))
						info['geohashes'].add(row['geohash'])
						span = info['time']
						if span[0] is None or interval_i[1] < span[0]:
							span[0] = interval_i[1]
						if span[1] is None or interval_j[2] > span[1]:
							span[1] = interval_j[2]

	# Sets have no stable order across runs; sort so the CSV is reproducible.
	new_data = [
		{'users': info['users'], 'meetings': info['meetings'], 'geohashes': sorted(info['geohashes']), 'time': info['time']}
		for info in meets_dict.values()
	]
	# Explicit columns keep the header even when no meeting was found.
	new_df = pd.DataFrame(new_data, columns=['users', 'meetings', 'geohashes', 'time'])

	out_dir = os.path.dirname(out_file)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	new_df.to_csv(out_file, index=False)

# Run the GeoLife global-meets aggregation; the result is written to data2/geolife_global_meets.csv.
create_geohash_time_meets_global()


415946it [00:44, 9266.63it/s] 


In [6]:
# T-Drive pipeline. Every stage reads files written by an earlier stage, so the
# call order matters. Stage descriptions are inferred from the helper names.
TDRIVE_GPS_CSV = './data/tdrive_gps_data.csv'
TDRIVE_GEOHASH_CSV = './data/tdrive_geohash_size_8.csv'
TDRIVE_MEET_DIR = './data/tdrive_geohash_meet/'
TDRIVE_MEET_CSV = './data/tdrive_geohash_meet_size_8.csv'

create_dirs()

# Load the raw data and attach geohashes.
load_tdrive.load_tdrive('./tdrive/taxi_log_2008_by_id', TDRIVE_GPS_CSV)
add_geohash.add_geohash(TDRIVE_GPS_CSV, TDRIVE_GEOHASH_CSV)

# Derived tables, all computed from the geohashed data.
create_nodes.create_nodes(TDRIVE_GEOHASH_CSV, './data/tdrive_nodes.csv')
geohash_location_count.geohash_location_count(TDRIVE_GEOHASH_CSV, './data/tdrive_geohash_8_location_counts.csv')
geohash_user_count.geohash_user_count(TDRIVE_GEOHASH_CSV, './data/tdrive_geohash_8_user_counts.csv')

# Meets: per-geohash JSON files -> one CSV -> interval CSV.
count_files = create_geohash_meet.createGeohashMeetJsons(TDRIVE_GEOHASH_CSV, TDRIVE_MEET_DIR)
create_geohash_meet.createGeohashMeetCSV(count_files, TDRIVE_MEET_DIR, TDRIVE_MEET_CSV)
create_geohash_intervals.create_geohash_intervals(TDRIVE_MEET_CSV, './data/tdrive_geohash_meet_intervals_size_8.csv')

100%|██████████| 13818537/13818537 [06:55<00:00, 33267.56it/s]


loaded
587021
grouping...
grouped
0.0 0 / 587021
0.001703516569253911 1000 / 587021
0.003407033138507822 2000 / 587021
0.005110549707761733 3000 / 587021
0.006814066277015644 4000 / 587021
0.008517582846269554 5000 / 587021
0.010221099415523465 6000 / 587021
0.011924615984777375 7000 / 587021
0.013628132554031287 8000 / 587021
0.015331649123285197 9000 / 587021
0.017035165692539107 10000 / 587021
0.01873868226179302 11000 / 587021
0.02044219883104693 12000 / 587021
0.02214571540030084 13000 / 587021
0.02384923196955475 14000 / 587021
0.02555274853880866 15000 / 587021
0.027256265108062575 16000 / 587021
0.028959781677316485 17000 / 587021
0.030663298246570395 18000 / 587021
0.032366814815824305 19000 / 587021
0.034070331385078215 20000 / 587021
0.035773847954332125 21000 / 587021
0.03747736452358604 22000 / 587021
0.03918088109283995 23000 / 587021
0.04088439766209386 24000 / 587021
0.04258791423134777 25000 / 587021
0.04429143080060168 26000 / 587021
0.04599494736985559 27000 / 587021

100%|██████████| 586303/586303 [1:14:18<00:00, 131.50it/s] 
100%|██████████| 587021/587021 [2:07:41<00:00, 76.62it/s]   
100%|██████████| 587021/587021 [01:19<00:00, 7411.90it/s] 
7885it [00:01, 6064.28it/s]
100%|██████████| 1886666/1886666 [14:49<00:00, 2122.04it/s]
 25%|██▌       | 476919/1886666 [1:37:45<4:48:58, 81.31it/s]  


KeyboardInterrupt: 

Local meets (T-Drive)

In [2]:
import pandas as pd
import json
from tqdm import tqdm

def create_geohash_time_meets_local(in_file='data/tdrive_geohash_meet_intervals_size_8.csv', out_file='data2/tdrive_local_meets.csv'):
	"""Group the users that met at a geohash by (geohash, time) and write them to CSV.

	Parameters:
		in_file: CSV with a `geohash` column and a `meets_intervals` column. The
			latter holds a JSON object mapping a user id to a list of intervals.
		out_file: destination CSV with columns `geohash`, `time`, `person_ids`.
			Its parent directory is created when missing.

	For a row with several users, every pair of users with overlapping intervals
	(fields 1 and 2 of an interval are compared as its bounds) is recorded as a
	sorted tuple under the key (geohash, field 5 of the first user's interval).
	For a row with a single user, the bare user id is recorded under
	(geohash, field 5 of that user's first interval).

	NOTE(review): fields 1/2 look like start/end and field 5 like a time label;
	confirm against the code that writes `meets_intervals`.
	"""
	import os  # function-scope import: this cell only imports pandas, json and tqdm

	df = pd.read_csv(in_file)
	meets_dict = {}

	for _, row in tqdm(df.iterrows(), total=len(df)):
		meets_intervals = json.loads(row['meets_intervals'])
		keys = list(meets_intervals.keys())

		if len(keys) > 1:
			intervals = [meets_intervals[key] for key in keys]
			for i in range(len(keys)):
				for j in range(i + 1, len(keys)):
					# The pair key only depends on the two users, not on the intervals.
					pair = tuple(sorted([keys[i], keys[j]]))
					for interval_i in intervals[i]:
						for interval_j in intervals[j]:
							if interval_i[1] <= interval_j[2] and interval_j[1] <= interval_i[2]:
								meets_dict.setdefault((row['geohash'], interval_i[5]), set()).add(pair)
		elif keys:
			user = keys[0]
			user_intervals = meets_intervals[user]
			# Guard against a user with no intervals (previously an IndexError).
			if user_intervals:
				meets_dict.setdefault((row['geohash'], user_intervals[0][5]), set()).add(user)
		# A row without any user (previously a StopIteration) contributes nothing.

	# Sets have no stable order across runs; sort so the CSV is reproducible.
	# key=repr keeps this working if a set mixes pair tuples and bare user ids.
	new_data = [
		{'geohash': key[0], 'time': key[1], 'person_ids': sorted(members, key=repr)}
		for key, members in meets_dict.items()
	]
	# Explicit columns keep the header even when no meeting was found.
	new_df = pd.DataFrame(new_data, columns=['geohash', 'time', 'person_ids'])

	out_dir = os.path.dirname(out_file)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	new_df.to_csv(out_file, index=False)

# Run the T-Drive local-meets extraction; the result is written to data2/tdrive_local_meets.csv.
create_geohash_time_meets_local()


587021it [01:22, 7128.22it/s] 


Global meets (T-Drive)

In [3]:
import pandas as pd
import json
from tqdm import tqdm

def create_geohash_time_meets_global(in_file='data/tdrive_geohash_meet_intervals_size_8.csv', out_file='data2/tdrive_global_meets.csv'):
	"""Aggregate, per pair of users, every place and the time span in which they met.

	Parameters:
		in_file: CSV with `geohash`, `Latitude`, `Longitude` and a
			`meets_intervals` column holding a JSON object that maps a user id to
			a list of intervals.
		out_file: destination CSV with columns `users`, `meetings`, `geohashes`,
			`time`. Its parent directory is created when missing.

	Two users meet in a row when one interval of each overlaps (fields 1 and 2 of
	an interval are compared as its bounds). For every such overlap the row's
	(Latitude, Longitude) is appended to `meetings` and its geohash is added to
	`geohashes`. `time` is [smallest field 1, largest field 2] seen so far.

	NOTE(review): `time` takes its lower bound only from the first user's
	interval and its upper bound only from the second user's, where "first" and
	"second" follow the key order of the JSON object. This is kept as in the
	original; confirm whether the union or intersection of both intervals was
	intended.
	"""
	import os  # function-scope import: this cell only imports pandas, json and tqdm

	df = pd.read_csv(in_file)
	meets_dict = {}

	for _, row in tqdm(df.iterrows(), total=len(df)):
		meets_intervals = json.loads(row['meets_intervals'])
		keys = list(meets_intervals.keys())
		intervals = [meets_intervals[key] for key in keys]

		# With fewer than two users the pair loops below simply do not run.
		for i in range(len(keys)):
			for j in range(i + 1, len(keys)):
				# The pair key only depends on the two users, not on the intervals.
				pair = tuple(sorted([keys[i], keys[j]]))
				for interval_i in intervals[i]:
					for interval_j in intervals[j]:
						if not (interval_i[1] <= interval_j[2] and interval_j[1] <= interval_i[2]):
							continue
						info = meets_dict.setdefault(pair, {'users': pair, 'meetings': [], 'geohashes': set(), 'time': [None, None]})
						info['meetings'].append((row['Latitude'], row['Longitude']))
						info['geohashes'].add(row['geohash'])
						span = info['time']
						if span[0] is None or interval_i[1] < span[0]:
							span[0] = interval_i[1]
						if span[1] is None or interval_j[2] > span[1]:
							span[1] = interval_j[2]

	# Sets have no stable order across runs; sort so the CSV is reproducible.
	new_data = [
		{'users': info['users'], 'meetings': info['meetings'], 'geohashes': sorted(info['geohashes']), 'time': info['time']}
		for info in meets_dict.values()
	]
	# Explicit columns keep the header even when no meeting was found.
	new_df = pd.DataFrame(new_data, columns=['users', 'meetings', 'geohashes', 'time'])

	out_dir = os.path.dirname(out_file)
	if out_dir:
		os.makedirs(out_dir, exist_ok=True)
	new_df.to_csv(out_file, index=False)

# Run the T-Drive global-meets aggregation; the result is written to data2/tdrive_global_meets.csv.
create_geohash_time_meets_global()


587021it [01:23, 7027.64it/s] 
