In [None]:
# Check that every GPS fix in the raw Geolife .plt files has valid coordinates:
# latitude within [-90, 90] and longitude within [-180, 180].
# Offending fixes are printed as "<user folder>/<file>,(<lat>, <lon>)".
import os

geolife_dir = 'Geolife Trajectories 1.3/Data/'
folder_list = os.listdir(geolife_dir)

for folder in folder_list:
    trajectory_dir = os.path.join(geolife_dir, folder, 'Trajectory')
    # Skip stray entries (e.g. hidden files) that are not user folders.
    if not os.path.isdir(trajectory_dir):
        continue
    for plt_name in os.listdir(trajectory_dir):
        plt_path = os.path.join(trajectory_dir, plt_name)
        with open(plt_path, 'r', newline='', encoding='utf-8') as f:
            for line in f:
                row = line.rstrip('\r\n').split(',')
                # Only lines with exactly 7 comma-separated fields are GPS
                # records; anything else (the file header) is ignored.
                if len(row) != 7:
                    continue
                lat, lon = float(row[0]), float(row[1])
                # Written as a negated range test so that NaN coordinates are
                # reported as invalid as well.
                if not (-90 <= lat <= 90 and -180 <= lon <= 180):
                    print(f"{folder}/{plt_name},{row[0],row[1]}")

In [1]:
# preprocessing step for the geolife dataset
import pandas as pd
import numpy as np
import os
 
data_dir = 'Geolife Trajectories 1.3/Data/'

# Collect, for every user folder that ships a labels.txt, the path of that
# label file and of the matching Trajectory directory. The two lists are kept
# index-aligned. Folders are visited in sorted order because os.listdir()
# returns entries in arbitrary order and the trajectory ids assigned further
# down depend on the order in which the folders are processed.
dirlist = sorted(os.listdir(data_dir))
label_dirs = []
folder_dirs = []
for user_folder in dirlist:
    label_file = os.path.join(data_dir, user_folder, 'labels.txt')
    if os.path.exists(label_file):
        label_dirs.append(label_file)
        folder_dirs.append(os.path.join(data_dir, user_folder, 'Trajectory'))

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are angles in degrees (they are converted with
    ``np.radians``). The haversine formula is evaluated with an Earth radius
    of 6371 km. Only NumPy ufuncs and arithmetic are used.
    """
    earth_radius_km = 6371

    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    sin_half_dlat = np.sin(half_dlat)
    sin_half_dlon = np.sin(half_dlon)
    cos_lat1 = np.cos(np.radians(lat1))
    cos_lat2 = np.cos(np.radians(lat2))

    # Haversine of the central angle between the two points.
    hav = sin_half_dlat * sin_half_dlat + cos_lat1 * cos_lat2 * sin_half_dlon * sin_half_dlon
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_km * central_angle

# Running trajectory-file counter; it becomes the first component of every id.
num = 0
# central beijing coords, for map centres
BEIJING = [39.9, 116.41]
# bbox limits for beijing extent: B1 holds the lower latitude/longitude bounds,
# B2 the upper ones (only used by the optional filter noted in the helper).
B1 = 39.8, 116.2
B2 = 40.0, 116.5

# Column layout of a .plt record and of the frame written to disk.
PLT_COLUMNS = ['Latitude', 'Longitude', 'Not_Important1', 'Altitude', 'Not_Important2', 'Date', 'Time']
OUTPUT_COLUMNS = ['Latitude', 'Longitude', 'Datetime_1s', 'id', 'mode']

# Tunable thresholds of the cleaning pipeline.
MAX_GAP = pd.Timedelta(minutes=10)   # a longer silence starts a new trajectory
MAX_TRAJ_KM = 10                     # trajectories are cut every 10 km travelled
MIN_TRAJ_KM = 1                      # minimum travelled distance to keep
MIN_POINTS = 10                      # minimum number of records to keep
MIN_EXTENT_KM = 1                    # minimum bounding-box diagonal to keep

# Final modes and their integer codes.
MODE_CODES = {'walk': 0, 'bike': 1, 'bus': 2, 'drive': 3, 'train': 4}


def _preprocess_plt_file(file_path, label, traj_no):
    """Clean one .plt trajectory file and return its labelled, split points.

    Parameters
    ----------
    file_path : str
        Path of the .plt file; its first 6 lines are skipped as header.
    label : pandas.DataFrame
        Label table of the user. Columns 0 and 1 are the (already parsed)
        start and end timestamps of an interval, column 2 is its mode.
    traj_no : int
        Number used as the first component of the trajectory ids.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns in ``OUTPUT_COLUMNS``; empty when nothing in
        the file survives the filters. Ids have the form
        ``<traj_no>_<gap split>_<10 km chunk>``.
    """
    data = pd.read_csv(file_path, header=None, skiprows=6, names=PLT_COLUMNS)

    # merge date and time
    data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])
    data = data[['Latitude', 'Longitude', 'Altitude', 'Datetime']].copy()

    # merge label and data: a point gets the mode of the label interval that
    # contains it (later intervals win). The column is created up front so it
    # exists even when the label table is empty.
    data['mode'] = pd.Series(np.nan, index=data.index, dtype=object)
    first_fix = data['Datetime'].min()
    last_fix = data['Datetime'].max()
    # Only intervals overlapping this file's time span can match any point.
    overlapping = label[(label.iloc[:, 1] >= first_fix) & (label.iloc[:, 0] <= last_fix)]
    for start, end, mode in overlapping.iloc[:, :3].itertuples(index=False, name=None):
        in_interval = (data['Datetime'] >= start) & (data['Datetime'] <= end)
        data.loc[in_interval, 'mode'] = mode

    # Optional: retain positions in Beijing city only (disabled).
    # data = data[(data['Latitude'] > B1[0]) & (data['Latitude'] < B2[0])
    #             & (data['Longitude'] > B1[1]) & (data['Longitude'] < B2[1])]

    # floor timestamps to 1 s and keep the first record of every second
    data['Datetime_1s'] = data['Datetime'].dt.floor('1s')
    data = data.drop_duplicates(subset=['Datetime_1s'], keep='first')

    # remove stopping points: records whose position equals the previous one
    moved = (data['Latitude'] != data['Latitude'].shift()) | (data['Longitude'] != data['Longitude'].shift())
    data = data.loc[moved, ['Latitude', 'Longitude', 'Datetime_1s', 'mode']].copy()
    if data.empty:
        return pd.DataFrame(columns=OUTPUT_COLUMNS)

    # start a new trajectory wherever consecutive records are more than
    # MAX_GAP apart
    new_traj = data['Datetime_1s'].diff() > MAX_GAP
    data['id'] = str(traj_no) + '_' + new_traj.cumsum().astype(str)

    # Distance from the previous point of the same trajectory. The haversine
    # formula is applied to the two actual positions; feeding it the
    # coordinate differences relative to (0, 0) would measure every step at
    # the equator and overestimate east-west movement at higher latitudes.
    previous = data.groupby('id')[['Latitude', 'Longitude']].shift()
    data['distance'] = haversine_distance(previous['Latitude'], previous['Longitude'],
                                          data['Latitude'], data['Longitude'])
    # accumulated distance along each trajectory (NaN on its first point)
    data['accum_dis'] = data.groupby('id')['distance'].cumsum()

    # cut trajectories into chunks of MAX_TRAJ_KM travelled distance
    chunk_no = (data['accum_dis'] // MAX_TRAJ_KM).fillna(0).astype(int).astype(str)
    data['id'] = data['id'] + '_' + chunk_no

    # remove trajectories shorter than MIN_TRAJ_KM.
    # NOTE(review): accum_dis is measured from the start of the gap-split
    # trajectory, so every chunk after the first passes this test; short
    # trailing chunks are only removed by the extent filter below.
    length_km = data.groupby('id')['accum_dis'].max()
    data = data[data['id'].isin(length_km.index[length_km > MIN_TRAJ_KM])]

    # filter trajectories with fewer than MIN_POINTS records
    n_points = data.groupby('id').size()
    data = data[data['id'].isin(n_points.index[n_points >= MIN_POINTS])]
    if data.empty:
        return pd.DataFrame(columns=OUTPUT_COLUMNS)

    # remove stay points: trajectories whose bounding-box diagonal is shorter
    # than MIN_EXTENT_KM
    bbox = data.groupby('id').agg(
        max_lat=('Latitude', 'max'),
        min_lat=('Latitude', 'min'),
        max_lon=('Longitude', 'max'),
        min_lon=('Longitude', 'min'),
    )
    extent_km = haversine_distance(bbox['max_lat'], bbox['max_lon'], bbox['min_lat'], bbox['min_lon'])
    data = data[data['id'].isin(extent_km.index[extent_km >= MIN_EXTENT_KM])]

    return data[OUTPUT_COLUMNS]


# Process every trajectory file of every labelled user. The cleaned frames are
# collected in a list and concatenated once instead of growing a DataFrame
# inside the loop.
cleaned = []
for label_path, traj_dir in zip(label_dirs, folder_dirs):
    label = pd.read_csv(label_path, sep='\t', header=0)
    label['Start Time'] = pd.to_datetime(label['Start Time'])
    label['End Time'] = pd.to_datetime(label['End Time'])
    # Sorted so that the ids derived from `num` are reproducible between runs.
    for file_name in sorted(os.listdir(traj_dir)):
        processed = _preprocess_plt_file(os.path.join(traj_dir, file_name), label, num)
        num += 1
        if not processed.empty:
            cleaned.append(processed)

if cleaned:
    traj = pd.concat(cleaned, ignore_index=True)
else:
    traj = pd.DataFrame(columns=OUTPUT_COLUMNS)

#merge and select the final modes ['walk', 'bike', 'bus', 'drive', 'train']
traj.loc[traj['mode'].isin(['car', 'taxi']), 'mode'] = 'drive'
traj.loc[traj['mode'].isin(['train', 'subway']), 'mode'] = 'train'
traj = traj[traj['mode'].isin(list(MODE_CODES))].copy()
# Assign the encoded column back instead of calling replace(inplace=True) on a
# column selection, which does not reliably update the frame.
traj['mode'] = traj['mode'].map(MODE_CODES)

#show the distribution of the modes at point level
print(traj['mode'].value_counts())

traj.to_csv('geolife_processed_full.csv',index=False)

  data['Datetime'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])


In [None]:
#split the data into segments with parameter M
import pandas as pd
M = 32
df = pd.read_csv('geolife_processed_full.csv')

# A segment is a run of at most M consecutive points that belong to the same
# trajectory id and share the same mode. Segments are numbered 0, 1, 2, ...
# within each trajectory, so every resulting id is unique and contains exactly
# one mode. (Counting per (id, mode) pair alone would give the first segment of
# every mode in a trajectory the same id and would join non-adjacent runs of
# the same mode.)
run_start = (df['id'] != df['id'].shift()) | (df['mode'] != df['mode'].shift())
run_no = run_start.cumsum()
# position of each point inside its run, in blocks of M points
chunk_no = df.groupby(run_no).cumcount() // M
segment_start = run_start | (chunk_no != chunk_no.shift())
df['segment_id'] = (segment_start.astype(int).groupby(df['id']).cumsum() - 1).astype(int)
df['id'] = df['id'] + '_' + df['segment_id'].astype(str)

#show the distribution of the modes at segment level
print(df.groupby('mode')['id'].nunique())

df.to_csv('geolife_processed_full_truncated{}.csv'.format(M),index=False)

In [None]:
# Export the three trajectories that contain the most distinct modes, for visualization
import pandas as pd
df = pd.read_csv('geolife_processed_full.csv')

# number of distinct modes observed in each trajectory
modes_per_trajectory = df.groupby('id')['mode'].nunique()
top_three_ids = modes_per_trajectory.nlargest(3).index.tolist()

# one CSV per selected trajectory
for traj_id in top_three_ids:
    selected = df.loc[df['id'] == traj_id]
    selected.to_csv(f'trajectory to show_{traj_id}.csv', index=False)