Import data

In [3]:
import pandas as pd
import os
from tqdm import tqdm

# Path to data
data_path = '../data/'

Load data

In [4]:
# Create an empty list to store DataFrames
dataframes = []
index = 0

# Traverse through user directories
for user_folder in tqdm(os.listdir(data_path)):
	user_folder_path = os.path.join(data_path, user_folder)

	# Check if it's a directory
	if os.path.isdir(user_folder_path):
		trajectory_dir = os.path.join(user_folder_path, 'Trajectory')
		
		# Check if 'Trajectory' directory exists
		if os.path.exists(trajectory_dir) and os.path.isdir(trajectory_dir):
			# Loop through trajectory files for each user
			for trajectory_file in os.listdir(trajectory_dir):
				if trajectory_file.endswith('.plt'):
					trajectory_file_path = os.path.join(trajectory_dir, trajectory_file)
					
					df = pd.read_csv(trajectory_file_path, header=None, skiprows=6)
					df.insert(0, 'person', index)
					dataframes.append(df)
			index += 1

  0%|          | 0/182 [00:00<?, ?it/s]

100%|██████████| 182/182 [04:01<00:00,  1.33s/it]


Concat data

In [5]:
# Concatenate all DataFrames into one
gps_data = pd.concat(dataframes, ignore_index=True)
gps_data

Unnamed: 0,person,0,1,2,3,4,5,6
0,0,39.984702,116.318417,0,492.000000,39744.120185,2008-10-23,02:53:04
1,0,39.984683,116.318450,0,492.000000,39744.120255,2008-10-23,02:53:10
2,0,39.984686,116.318417,0,492.000000,39744.120313,2008-10-23,02:53:15
3,0,39.984688,116.318385,0,492.000000,39744.120370,2008-10-23,02:53:20
4,0,39.984655,116.318263,0,492.000000,39744.120428,2008-10-23,02:53:25
...,...,...,...,...,...,...,...,...
24876973,181,40.914867,111.710500,0,3802.493438,39521.152731,2008-03-14,03:39:56
24876974,181,40.914267,111.710333,0,3795.931759,39521.153669,2008-03-14,03:41:17
24876975,181,40.912467,111.710667,0,3795.931759,39521.154884,2008-03-14,03:43:02
24876976,181,40.911517,111.711317,0,3779.527559,39521.155185,2008-03-14,03:43:28


Add labels

In [6]:
# Assign column names
column_names = ['Person ID', 'Latitude', 'Longitude', '0', 'Altitude', 'NumDays', 'Date', 'Time']

# Rename columns in the DataFrame
gps_data.columns = column_names
gps_data

Unnamed: 0,Person ID,Latitude,Longitude,0,Altitude,NumDays,Date,Time
0,0,39.984702,116.318417,0,492.000000,39744.120185,2008-10-23,02:53:04
1,0,39.984683,116.318450,0,492.000000,39744.120255,2008-10-23,02:53:10
2,0,39.984686,116.318417,0,492.000000,39744.120313,2008-10-23,02:53:15
3,0,39.984688,116.318385,0,492.000000,39744.120370,2008-10-23,02:53:20
4,0,39.984655,116.318263,0,492.000000,39744.120428,2008-10-23,02:53:25
...,...,...,...,...,...,...,...,...
24876973,181,40.914867,111.710500,0,3802.493438,39521.152731,2008-03-14,03:39:56
24876974,181,40.914267,111.710333,0,3795.931759,39521.153669,2008-03-14,03:41:17
24876975,181,40.912467,111.710667,0,3795.931759,39521.154884,2008-03-14,03:43:02
24876976,181,40.911517,111.711317,0,3779.527559,39521.155185,2008-03-14,03:43:28


Remove date, time and num days and replace them with timestamp

In [7]:
gps_data['Timestamp'] = pd.to_datetime(gps_data['Date'] + ' ' + gps_data['Time'])

# Drop the 'NumDays', 'Date', and 'Time' columns
gps_data = gps_data.drop(columns=['NumDays', 'Date', 'Time', '0'])

Recude data to Beijing Metro area only

In [8]:
gps_data = gps_data[gps_data['Latitude'] >= 39.43]
gps_data = gps_data[gps_data['Latitude'] <= 40.63]
gps_data = gps_data[gps_data['Longitude'] >= 115.8]
gps_data = gps_data[gps_data['Longitude'] <= 117.2]

gps_data

Unnamed: 0,Person ID,Latitude,Longitude,Altitude,Timestamp
0,0,39.984702,116.318417,492.000000,2008-10-23 02:53:04
1,0,39.984683,116.318450,492.000000,2008-10-23 02:53:10
2,0,39.984686,116.318417,492.000000,2008-10-23 02:53:15
3,0,39.984688,116.318385,492.000000,2008-10-23 02:53:20
4,0,39.984655,116.318263,492.000000,2008-10-23 02:53:25
...,...,...,...,...,...
24876951,181,39.988783,116.299000,114.829396,2008-02-17 10:43:09
24876952,181,39.989500,116.298667,98.425197,2008-02-17 10:43:56
24876953,181,39.990067,116.298350,131.233596,2008-02-17 10:44:26
24876954,181,39.989517,116.298417,150.918635,2008-02-17 10:45:23


Save data

In [9]:
# Save DataFrame to a CSV file
gps_data.to_csv('../model/gps_data.csv', index=False)

Load Labels

In [10]:
# Create an empty list to store DataFrames
dataframes = []

# Traverse through user directories
for user_folder in os.listdir(data_path):
	user_folder_path = os.path.join(data_path, user_folder)

	labels_file_path = os.path.join(user_folder_path, 'labels.txt')
					
	# Check if the labels file exists
	if os.path.exists(labels_file_path):
		labels_df = pd.read_csv(labels_file_path, sep='\t')
		
		dataframes.append(labels_df)

labels_data = pd.concat(dataframes, ignore_index=True)
labels_data

Unnamed: 0,Start Time,End Time,Transportation Mode
0,2007/06/26 11:32:29,2007/06/26 11:40:29,bus
1,2008/03/28 14:52:54,2008/03/28 15:59:59,train
2,2008/03/28 16:00:00,2008/03/28 22:02:00,train
3,2008/03/29 01:27:50,2008/03/29 15:59:59,train
4,2008/03/29 16:00:00,2008/03/30 15:59:59,train
...,...,...,...
14713,2008/11/17 06:59:58,2008/11/17 07:06:16,bus
14714,2008/11/17 07:06:16,2008/11/17 07:14:32,walk
14715,2008/11/29 01:58:05,2008/11/29 02:01:39,bus
14716,2008/11/29 02:01:39,2008/11/29 02:07:57,walk


In [11]:
# Save DataFrame to a CSV file
labels_data.to_csv('../model/labels_data.csv', index=False)