In [73]:
import numpy as np
import pandas as pd
import glob
import os.path
import datetime
import os

In [74]:
def read_plt(plt_file):
    points = pd.read_csv(plt_file, skiprows=6, header=None)

    # for clarity rename columns
    points.rename(inplace=True, columns={0: 'lat', 1: 'lon', 3: 'alt', 5: 'day', 6: 'hour'})
    
    date_format = '%Y-%m-%d %H:%M:%S'
    points['time'] = pd.to_datetime(points['day'] + ' ' + points['hour'], format=date_format)

    # remove unused columns
    points.drop(inplace=True, columns=[2, 4, 'day', 'hour'])

    return points

plt_file = "/kaggle/input/microsoft-geolife-gps-trajectory-dataset/Geolife Trajectories 1.3/Data/000/Trajectory/20081023025304.plt"
df_plt = read_plt(plt_file)
df_plt

Unnamed: 0,lat,lon,alt,time
0,39.984702,116.318417,492,2008-10-23 02:53:04
1,39.984683,116.318450,492,2008-10-23 02:53:10
2,39.984686,116.318417,492,2008-10-23 02:53:15
3,39.984688,116.318385,492,2008-10-23 02:53:20
4,39.984655,116.318263,492,2008-10-23 02:53:25
...,...,...,...,...
903,40.009172,116.321211,88,2008-10-23 11:10:52
904,40.009204,116.321130,86,2008-10-23 11:10:57
905,40.009243,116.321050,85,2008-10-23 11:11:02
906,40.009269,116.320978,84,2008-10-23 11:11:07


In [75]:
mode_names = ['walk', 'bike', 'bus', 'car', 'subway','train', 'airplane', 'boat', 'run', 'motorcycle', 'taxi']
mode_ids = {s : i + 1 for i, s in enumerate(mode_names)} #walk: 1, ..., taxi: 11

def read_labels(labels_file):
    labels = pd.read_csv(labels_file, skiprows=1, header=None, sep='\s+')
    labels['start_time'] = pd.to_datetime(labels[0] + ' ' + labels[1])
    labels['end_time'] = pd.to_datetime(labels[2] + ' ' + labels[3])
    labels.rename(inplace=True, columns = {4: 'label'})

    labels.drop(inplace=True, columns=[0,1,2,3])
    labels['label'] = [mode_ids[i] for i in labels['label']]

    return labels

labels_file = "/kaggle/input/microsoft-geolife-gps-trajectory-dataset/Geolife Trajectories 1.3/Data/065/labels.txt"
labels = read_labels(labels_file)
labels

Unnamed: 0,label,start_time,end_time
0,1,2011-08-24 13:51:21,2011-08-24 13:52:44
1,2,2011-08-25 00:36:03,2011-08-25 00:58:25
2,2,2011-08-25 14:44:57,2011-08-25 14:58:30
3,1,2011-08-26 01:01:58,2011-08-26 01:24:01
4,1,2011-08-28 01:11:21,2011-08-28 07:20:04
...,...,...,...
201,1,2012-01-07 12:01:34,2012-01-07 12:04:37
202,3,2012-01-07 12:04:38,2012-01-07 12:11:09
203,1,2012-01-07 12:11:10,2012-01-07 12:24:10
204,11,2012-01-08 05:30:59,2012-01-08 05:59:00


In [76]:
def apply_labels(points, labels):
    indices = labels['start_time'].searchsorted(points['time'], side='right') - 1    
    #indices:  chỉ ra khoảng thời gian nào trong labels dataframe mà point có thể thuộc vào
    
    no_label_condition = (indices < 0) | (points['time'].values >= labels['end_time'].iloc[indices].values)
    points['label'] = labels['label'].iloc[indices].values
    points.loc[no_label_condition, 'label'] = 0
    

def read_user(user_folder):
    labels = None

    plt_files = glob.glob(os.path.join(user_folder, 'Trajectory', '*.plt'))
    df = pd.concat([read_plt(f) for f in plt_files])

    labels_file = os.path.join(user_folder, 'labels.txt')
    if os.path.exists(labels_file):
        labels = read_labels(labels_file)
        apply_labels(df, labels)
    else:
        df['label'] = 0

    return df

user_folder = "/kaggle/input/microsoft-geolife-gps-trajectory-dataset/Geolife Trajectories 1.3/Data/065"
df_user = read_user(user_folder)
df_user

Unnamed: 0,lat,lon,alt,time,label
0,39.974437,116.337132,393.0,2010-05-08 02:23:41,0
1,39.974376,116.336183,456.0,2010-05-08 02:23:46,0
2,39.975417,116.335274,758.0,2010-05-08 02:25:06,0
3,39.974968,116.335099,674.0,2010-05-08 02:25:11,0
4,39.974796,116.334956,643.0,2010-05-08 02:25:16,0
...,...,...,...,...,...
4675,39.978547,116.332460,0.0,2011-09-01 13:38:30,2
4676,39.978570,116.332377,0.0,2011-09-01 13:38:30,2
4677,39.978577,116.332328,0.0,2011-09-01 13:38:30,2
4678,39.978577,116.332360,0.0,2011-09-01 13:38:30,2


In [77]:
def correct_label(df_user):
    df_user = df_user[df_user['label'] != 0].copy()
    if len(df_user) != 0:
        df_user['label'] = df_user['label'] - 1
    return df_user

In [78]:
df_user = correct_label(df_user)
df_user

Unnamed: 0,lat,lon,alt,time,label
0,39.978795,116.308245,0.0,2011-09-23 09:52:33,0
1,39.978805,116.308210,0.0,2011-09-23 09:52:34,0
2,39.978703,116.308490,0.0,2011-09-23 09:52:51,0
3,39.978715,116.308550,0.0,2011-09-23 09:52:52,0
4,39.978717,116.308557,0.0,2011-09-23 09:52:52,0
...,...,...,...,...,...
4675,39.978547,116.332460,0.0,2011-09-01 13:38:30,1
4676,39.978570,116.332377,0.0,2011-09-01 13:38:30,1
4677,39.978577,116.332328,0.0,2011-09-01 13:38:30,1
4678,39.978577,116.332360,0.0,2011-09-01 13:38:30,1


In [79]:
df_user[(df_user['label'] == -1) | (df_user['label'] == 11)]

Unnamed: 0,lat,lon,alt,time,label


In [80]:
df_user['time'].value_counts()

time
2011-09-29 01:46:56    5
2011-10-25 00:34:38    5
2011-10-25 00:34:54    5
2011-10-25 00:34:53    5
2011-10-25 00:34:52    5
                      ..
2011-10-01 16:46:16    1
2011-10-01 16:46:11    1
2011-10-01 16:46:06    1
2011-10-01 16:46:01    1
2011-09-01 13:38:31    1
Name: count, Length: 79102, dtype: int64

In [81]:
def remove_duplication(df_user):
    df_user = df_user.drop_duplicates(subset=['time'], keep='first')
    return df_user

df_user = remove_duplication(df_user)

In [82]:
df_user['time'].value_counts().max()

1

# Segmentation

In [83]:
def segment_user_trajectory(user_folder, df_user, threshold_timediff = (60*2)):
    
    user_name = user_folder.split('/')[-1]
    tid = 0
    modified_groups = []
    
    for segmentor, group in df_user.groupby('label'):
        group = group.sort_values(by = 'time', ascending = True)
        group.index = range(len(group))

        group['tid'] = 'tid'
        group['time_diff'] = group['time'].diff().dt.total_seconds()
        group['time_diff'] = group['time_diff'].fillna(0)
#         print(group)

        indices = list((group[group['time_diff'] > threshold_timediff]).index)
        
        
        if indices:
            if indices[-1] <= len(group)-1:
                indices.append(len(group))
#             print(indices)
            
            start = 0
            for index in indices:
                                            
                group.loc[start:(index-1), 'tid'] = f'{user_name}_{tid}'
                                            
#                 print("---------------------------------------")
#                 print("start", start)
#                 print("index", index-1)                
#                 print(group.loc[start:(index-1)])
#                 print("---------------------------------------")
                                            
                start = index
                tid = tid + 1
        else:
#             print(indices)
            group['tid'] = f'{user_name}_{tid}'
            tid = tid + 1
            
            
        modified_groups.append(group)
        
    df_modified = pd.concat(modified_groups)
    
    return df_modified

df_user_segment = segment_user_trajectory(user_folder, df_user)

In [84]:
df_user_segment.head(10)

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff
0,39.978887,116.332237,0.0,2011-08-24 13:51:21,0,065_0,0.0
1,39.978683,116.332422,0.0,2011-08-24 13:51:40,0,065_0,19.0
2,39.97865,116.33247,0.0,2011-08-24 13:51:41,0,065_0,1.0
3,39.978628,116.332495,0.0,2011-08-24 13:51:42,0,065_0,1.0
4,39.978605,116.332522,0.0,2011-08-24 13:51:43,0,065_0,1.0
5,39.978578,116.332543,0.0,2011-08-24 13:51:44,0,065_0,1.0
6,39.97856,116.332563,0.0,2011-08-24 13:51:45,0,065_0,1.0
7,39.97854,116.332582,0.0,2011-08-24 13:51:46,0,065_0,1.0
8,39.978525,116.332597,0.0,2011-08-24 13:51:47,0,065_0,1.0
9,39.978505,116.332615,0.0,2011-08-24 13:51:48,0,065_0,1.0


In [85]:
df_user_segment[df_user_segment['tid'] == 'tid']

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff


In [86]:
df_user_segment['tid'].value_counts()

tid
065_227    2473
065_252    1784
065_236    1676
065_140    1263
065_162    1183
           ... 
065_112       1
065_151       1
065_122       1
065_126       1
065_65        1
Name: count, Length: 284, dtype: int64

# Partition

In [87]:
def partition_user_trajectory(df_user_segment, max_point = 100):
    
    modified_groups = []
    
    for tid, group in list(df_user_segment.groupby('tid')):
        partition = 0

        group = group.sort_values(by = 'time', ascending = True)
        group.index = range(len(group))
        group['partition'] = 'partition'

#         print(group)
        
        if len(group) > max_point:
            for i in range(0, len(group), max_point):
                if (len(group)-1) - i + 1 >= max_point:
                    group.loc[i:(i+max_point-1), 'partition'] = f'{tid}_{partition}'
                    partition += 1
                    
                else:
                    group.loc[i:, 'partition'] = f'{tid}_{partition}'
                    partition += 1
                    
                
        else:
            group['partition'] = f'{tid}_{partition}'
            partition += 1
            
        modified_groups.append(group)
        
    df_modified = pd.concat(modified_groups)
    df_modified.index = range(len(df_modified))
    
    return df_modified

df_user_partition = partition_user_trajectory(df_user_segment)
df_user_partition

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition
0,39.978887,116.332237,0.0,2011-08-24 13:51:21,0,065_0,0.0,065_0_0
1,39.978683,116.332422,0.0,2011-08-24 13:51:40,0,065_0,19.0,065_0_0
2,39.978650,116.332470,0.0,2011-08-24 13:51:41,0,065_0,1.0,065_0_0
3,39.978628,116.332495,0.0,2011-08-24 13:51:42,0,065_0,1.0,065_0_0
4,39.978605,116.332522,0.0,2011-08-24 13:51:43,0,065_0,1.0,065_0_0
...,...,...,...,...,...,...,...,...
79097,39.978097,116.328763,0.0,2011-08-30 15:14:30,1,065_99,1.0,065_99_4
79098,39.978137,116.328762,0.0,2011-08-30 15:14:31,1,065_99,1.0,065_99_4
79099,39.978177,116.328765,0.0,2011-08-30 15:14:32,1,065_99,1.0,065_99_4
79100,39.978222,116.328767,0.0,2011-08-30 15:14:33,1,065_99,1.0,065_99_4


In [88]:
df_user_partition[df_user_partition['partition'] == 'partition']

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition


In [89]:
partition_counts = df_user_partition['partition'].value_counts().sort_index()
partition_counts

partition
065_0_0       31
065_100_0    100
065_100_1     52
065_101_0     66
065_102_0     85
            ... 
065_99_3     100
065_99_4      23
065_9_0      100
065_9_1      100
065_9_2       52
Name: count, Length: 947, dtype: int64

# Remove unsatisfied sub-trajectory

In [90]:
def remove_unsatisfied_partition(df_user_partition, threshold_num_points = 20 ):
    partition_counts = df_user_partition['partition'].value_counts().sort_index()
    satisfied_partition = partition_counts[partition_counts >= threshold_num_points].index
    satisfied_df_user = df_user_partition[df_user_partition['partition'].isin(satisfied_partition)]
#     if len(satisfied_df_user) == 0:
#         print("empty")
    return satisfied_df_user

satisfied_df_user = remove_unsatisfied_partition(df_user_partition)
satisfied_df_user

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition
0,39.978887,116.332237,0.0,2011-08-24 13:51:21,0,065_0,0.0,065_0_0
1,39.978683,116.332422,0.0,2011-08-24 13:51:40,0,065_0,19.0,065_0_0
2,39.978650,116.332470,0.0,2011-08-24 13:51:41,0,065_0,1.0,065_0_0
3,39.978628,116.332495,0.0,2011-08-24 13:51:42,0,065_0,1.0,065_0_0
4,39.978605,116.332522,0.0,2011-08-24 13:51:43,0,065_0,1.0,065_0_0
...,...,...,...,...,...,...,...,...
79097,39.978097,116.328763,0.0,2011-08-30 15:14:30,1,065_99,1.0,065_99_4
79098,39.978137,116.328762,0.0,2011-08-30 15:14:31,1,065_99,1.0,065_99_4
79099,39.978177,116.328765,0.0,2011-08-30 15:14:32,1,065_99,1.0,065_99_4
79100,39.978222,116.328767,0.0,2011-08-30 15:14:33,1,065_99,1.0,065_99_4


In [91]:
partition_counts = satisfied_df_user['partition'].value_counts().sort_index()
partition_counts

partition
065_0_0       31
065_100_0    100
065_100_1     52
065_101_0     66
065_102_0     85
            ... 
065_99_3     100
065_99_4      23
065_9_0      100
065_9_1      100
065_9_2       52
Name: count, Length: 868, dtype: int64

In [92]:
partition_counts.min()

20

# Features Engineering

In [93]:
import pandas as pd
import numpy as np
from math import atan2, degrees, sin, cos, sqrt, radians

In [94]:
# Calculate distance between consecutive coordinates
def calculate_distance(lat1, lon1, alt1, lat2, lon2, alt2):
    R = 6371  # Radius of the Earth in kilometers

    # Convert coordinates to radians
    lat1_rad = np.deg2rad(lat1)
    lon1_rad = np.deg2rad(lon1)
    lat2_rad = np.deg2rad(lat2)
    lon2_rad = np.deg2rad(lon2)

    # Haversine formula to calculate surface distance
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    surface_distance = R * c * 1000 # in meter

    # Calculate altitude difference
    dalt = alt2 - alt1  # in kilometers (assuming altitudes are provided in kilometers)

    # Combine surface distance and altitude difference using Pythagorean theorem
    distance = np.sqrt(surface_distance ** 2 + dalt ** 2) 

    return distance

In [95]:
def calculate_bearing_and_pitch(lat1, lon1, alt1, lat2, lon2, alt2):
    # Horizontal bearing calculation
    delta_lon = np.radians(lon2 - lon1)
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)

    y = np.sin(delta_lon) * np.cos(lat2_rad)
    x = np.cos(lat1_rad) * np.sin(lat2_rad) - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(delta_lon)

    bearing = np.degrees(np.arctan2(y, x))
    bearing = (bearing + 360) % 360

    # Vertical angle (pitch) calculation
    # Calculate the horizontal distance using the Haversine formula
    dlat = lat2_rad - lat1_rad
    dlon = delta_lon
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    horizontal_distance = 6371 * c * 1000  # in meters

    # Calculate the vertical angle (pitch)
    dalt = (alt2 - alt1) # in meters
    pitch = np.degrees(np.arctan2(dalt, horizontal_distance))

    return bearing, pitch

In [96]:
def create_attribute(df_user):
    
    modified_groups = []
    
    for partition, df in df_user.groupby('partition'):
        
        df['alt'] = df['alt'] * 0.3048  # Convert altitude from feet to meters
        df['time_diff'] = df['time'].diff().dt.total_seconds()
        df['distance'] = calculate_distance(df['lat'].shift().values, df['lon'].shift().values, df['alt'].shift().values, 
                                            df['lat'].values, df['lon'].values, df['alt'].values)
        
        df['speed'] = df['distance'] / df['time_diff']
        df.fillna(0, inplace = True)
        df['acceleration'] = df['speed'].diff() / df['time_diff']
        df['bearing'], df['pitch'] = calculate_bearing_and_pitch(df['lat'].shift().values, df['lon'].shift().values, df['alt'].shift().values,
                                                                 df['lat'].values, df['lon'].values , df['alt'].values)
        df.fillna(0, inplace = True)

        modified_groups.append(df)
    df_engineering = pd.concat(modified_groups)
    
    return df_engineering

df_user_engineering = create_attribute(satisfied_df_user)
df_user_engineering

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition,distance,speed,acceleration,bearing,pitch
0,39.978887,116.332237,0.0,2011-08-24 13:51:21,0,065_0,0.0,065_0_0,0.000000,0.000000,0.000000,0.000000,0.0
1,39.978683,116.332422,0.0,2011-08-24 13:51:40,0,065_0,19.0,065_0_0,27.562210,1.450643,0.076350,145.116065,0.0
2,39.978650,116.332470,0.0,2011-08-24 13:51:41,0,065_0,1.0,065_0_0,5.540648,5.540648,4.090005,131.987190,0.0
3,39.978628,116.332495,0.0,2011-08-24 13:51:42,0,065_0,1.0,065_0_0,3.215897,3.215897,-2.324751,138.517718,0.0
4,39.978605,116.332522,0.0,2011-08-24 13:51:43,0,065_0,1.0,065_0_0,3.448841,3.448841,0.232944,138.789624,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
75911,39.978753,116.332020,0.0,2011-08-28 14:17:41,0,065_9,1.0,065_9_2,3.710850,3.710850,-0.418014,95.732403,0.0
75912,39.978748,116.332078,0.0,2011-08-28 14:17:42,0,065_9,1.0,065_9_2,5.001392,5.001392,1.290543,96.382399,0.0
75913,39.978743,116.332153,0.0,2011-08-28 14:17:43,0,065_9,1.0,065_9_2,6.414647,6.414647,1.413254,94.972194,0.0
75914,39.978732,116.332248,0.0,2011-08-28 14:17:44,0,065_9,1.0,065_9_2,8.197937,8.197937,1.783291,99.104954,0.0


# Complete the dataframe

In [97]:
def read_all_users(folder):
    subfolders = os.listdir(folder)
    dfs = []
    for i, sf in enumerate(subfolders):
        print('[%d/%d] processing user %s' % (i + 1, len(subfolders), sf))
        user_folder = os.path.join(folder,sf)
        df_user = read_user(user_folder)
        df_user = correct_label(df_user)
        if len(df_user) != 0:
            df_user = remove_duplication(df_user)
            df_user_segment = segment_user_trajectory(user_folder, df_user)
            df_user_partition = partition_user_trajectory(df_user_segment)
            satisfied_df_user = remove_unsatisfied_partition(df_user_partition)
            if len(satisfied_df_user) != 0:
                df_user_engineering = create_attribute(satisfied_df_user)
                df_user_engineering['user'] = int(sf)
                dfs.append(df_user_engineering)
                
    return pd.concat(dfs)

df = read_all_users('/kaggle/input/microsoft-geolife-gps-trajectory-dataset/Geolife Trajectories 1.3/Data')

[1/182] processing user 135
[2/182] processing user 057
[3/182] processing user 086
[4/182] processing user 121
[5/182] processing user 061
[6/182] processing user 048
[7/182] processing user 053
[8/182] processing user 164
[9/182] processing user 147
[10/182] processing user 145
[11/182] processing user 051
[12/182] processing user 137
[13/182] processing user 095
[14/182] processing user 018
[15/182] processing user 044
[16/182] processing user 016
[17/182] processing user 007
[18/182] processing user 009
[19/182] processing user 012
[20/182] processing user 029
[21/182] processing user 025
[22/182] processing user 078
[23/182] processing user 001
[24/182] processing user 056
[25/182] processing user 006
[26/182] processing user 120
[27/182] processing user 109
[28/182] processing user 042
[29/182] processing user 000
[30/182] processing user 127
[31/182] processing user 150
[32/182] processing user 158
[33/182] processing user 082
[34/182] processing user 055
[35/182] processing use

In [98]:
df

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition,distance,speed,acceleration,bearing,pitch,user
0,39.990883,116.418150,57.0000,2007-05-04 01:45:11,0,086_0,0.0,086_0_0,0.000000,0.000000,0.000000,0.000000,0.000000,86
1,39.991183,116.418033,57.0000,2007-05-04 01:45:30,0,086_0,19.0,086_0_0,34.807640,1.831981,0.096420,343.408860,0.000000,86
2,39.991033,116.418000,57.0000,2007-05-04 01:46:00,0,086_0,30.0,086_0_0,16.919249,0.563975,-0.042267,189.662214,0.000000,86
3,39.991150,116.417850,57.0000,2007-05-04 01:46:39,0,086_0,39.0,086_0_0,18.209539,0.466911,-0.002489,315.431762,0.000000,86
4,39.990950,116.417833,57.0000,2007-05-04 01:47:08,0,086_0,29.0,086_0_0,22.284265,0.768423,0.010397,183.653119,0.000000,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406833,39.930511,116.317868,110.3376,2008-05-02 10:07:40,0,085_9,3.0,085_9_0,0.000000,0.000000,-0.087373,0.000000,0.000000,85
406834,39.930512,116.317869,111.8616,2008-05-02 10:07:42,0,085_9,2.0,085_9_0,1.530428,0.765214,0.382607,37.481837,84.746714,85
406835,39.930511,116.317855,113.0808,2008-05-02 10:07:47,0,085_9,5.0,085_9_0,1.709918,0.341984,-0.084646,264.678336,45.480894,85
406836,39.930494,116.317802,113.3856,2008-05-02 10:07:52,0,085_9,5.0,085_9_0,4.908041,0.981608,0.127925,247.300941,3.560484,85


In [99]:
df[ (df['tid'] == 'tid') | (df['partition'] == 'partition')]

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition,distance,speed,acceleration,bearing,pitch,user


In [100]:
df['partition'].nunique()

53410

In [101]:
partition_counts = df['partition'].value_counts().sort_index()
partition_counts

partition
010_100_0     66
010_101_0    100
010_102_0    100
010_102_1    100
010_102_2    100
            ... 
179_99_1     100
179_99_2     100
179_99_3      59
179_9_0      100
179_9_1       24
Name: count, Length: 53410, dtype: int64

In [102]:
partition_counts.max(), partition_counts.min()

(100, 20)

### Statistic

In [103]:
df.describe()

Unnamed: 0,lat,lon,alt,time,label,time_diff,distance,speed,acceleration,bearing,pitch,user
count,4801790.0,4801790.0,4801790.0,4801790,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0
mean,39.41052,114.2887,201.2535,2009-01-03 19:40:13.247898624,2.208988,2.612586,20.42229,7.571969,0.0580833,179.1678,0.1499245,93.96207
min,18.2499,-179.9696,-7022.988,2007-04-12 14:24:39,0.0,0.0,0.0,0.0,-2336.049,0.0,-90.0,10.0
25%,39.91366,116.3107,3.99288,2008-07-06 09:01:36,0.0,1.0,2.164675,1.115208,-0.2053499,88.61482,0.0,65.0
50%,39.97473,116.3384,42.0624,2008-10-02 01:56:47,2.0,2.0,6.152032,3.120739,0.0,178.4463,0.0,85.0
75%,39.9915,116.4188,63.00216,2009-03-14 04:09:46.500000,3.0,2.0,18.13442,10.35342,0.2162381,270.0,0.0,128.0
max,400.1667,179.9969,15837.99,2011-12-31 15:16:42,10.0,120.0,27515.26,4142.182,4140.748,359.9828,90.0,179.0
std,2.502948,16.07272,775.5197,,2.3262,4.698116,99.21455,12.34773,3.510371,109.2907,11.81535,49.41931


# Train_test_split

In [104]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Get unique 'tid' values
unique_partition = df['partition'].unique()

# Split unique 'tid' values
train_partitions, test_partitions = train_test_split(unique_partition, test_size=0.2, random_state=42)

# Filter the original DataFrame based on the split 'tid' values
train_raw_df = df[df['partition'].isin(train_partitions)]
test_raw_df = df[df['partition'].isin(test_partitions)]

In [105]:
train_raw_df.to_csv('geolife_train_raw_df.csv', index=False)
test_raw_df.to_csv('geolife_test_raw_df.csv', index=False)

# Noise Filtering

In [106]:
distance_mean = df.groupby('partition')['distance'].mean() 
distance_mean

partition
010_100_0    1.216117
010_101_0    2.518508
010_102_0    1.374399
010_102_1    1.569133
010_102_2    1.673482
               ...   
179_99_1     3.644470
179_99_2     3.778537
179_99_3     2.542235
179_9_0      7.656240
179_9_1      1.639758
Name: distance, Length: 53410, dtype: float64

In [107]:
partition_counts = df['partition'].value_counts().sort_index()
partition_counts

partition
010_100_0     66
010_101_0    100
010_102_0    100
010_102_1    100
010_102_2    100
            ... 
179_99_1     100
179_99_2     100
179_99_3      59
179_9_0      100
179_9_1       24
Name: count, Length: 53410, dtype: int64

In [108]:
term1 = partition_counts * distance_mean
term1

partition
010_100_0     80.263713
010_101_0    251.850790
010_102_0    137.439880
010_102_1    156.913267
010_102_2    167.348203
                ...    
179_99_1     364.446964
179_99_2     377.853668
179_99_3     149.991866
179_9_0      765.623962
179_9_1       39.354197
Length: 53410, dtype: float64

In [109]:
term2 = partition_counts.sum()
term2

4801790

In [110]:
mean_distance = term1.sum() / term2
mean_distance

20.422289101953595

### Median Filtering

In [111]:
import numpy as np
from scipy.signal import medfilt
import matplotlib.pyplot as plt
from tqdm import tqdm

In [112]:
df.columns

Index(['lat', 'lon', 'alt', 'time', 'label', 'tid', 'time_diff', 'partition',
       'distance', 'speed', 'acceleration', 'bearing', 'pitch', 'user'],
      dtype='object')

In [113]:
def noise_filtering (df, kernel_size = 12):
    location_features = ['lat', 'lon', 'alt']
    
    filtered_groups = []
    for segmentor, group in tqdm(df.groupby('partition')):
        for colname in location_features:
            group[colname] = medfilt(group[colname], kernel_size=kernel_size)

        group['distance'] = calculate_distance(group['lat'].shift().values, group['lon'].shift().values, group['alt'].shift().values,
                                               group['lat'].values, group['lon'].values, group['alt'].values)

        group['speed'] = group['distance'] / group['time_diff']
        group.fillna(0, inplace = True)
        group['acceleration'] = group['speed'].diff() / group['time_diff']
        group['bearing'], group['pitch'] = calculate_bearing_and_pitch(group['lat'].shift().values, group['lon'].shift().values, group['alt'].shift().values,
                                                                       group['lat'].values, group['lon'].values , group['alt'].values)
        group.fillna(0, inplace = True)
        
        filtered_groups.append(group)
    filtered_df = pd.concat(filtered_groups) 
    
    return filtered_df

In [114]:
filtered_df = noise_filtering (df, kernel_size = 21)
filtered_df.describe()

100%|██████████| 53410/53410 [03:50<00:00, 231.62it/s]


Unnamed: 0,lat,lon,alt,time,label,time_diff,distance,speed,acceleration,bearing,pitch,user
count,4801790.0,4801790.0,4801790.0,4801790,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0,4801790.0
mean,39.41038,114.2887,200.9064,2009-01-03 19:40:13.247899648,2.208988,2.612586,17.64793,6.654359,0.03567696,148.8845,0.2122175,93.96207
min,18.2499,-179.63,-7010.004,2007-04-12 14:24:39,0.0,0.0,0.0,0.0,-2056.666,0.0,-90.0,10.0
25%,39.91361,116.3107,3.99288,2008-07-06 09:01:36,0.0,1.0,1.019885,0.5337356,-0.1772478,37.45874,0.0,65.0
50%,39.97471,116.3384,42.00144,2008-10-02 01:56:47,2.0,2.0,4.336602,2.16872,0.0,164.3636,0.0,85.0
75%,39.99148,116.4188,61.99632,2009-03-14 04:09:46.500000,3.0,2.0,15.9994,9.014246,0.1853249,265.6209,0.0,128.0
max,58.76032,179.6656,15837.99,2011-12-31 15:16:42,10.0,120.0,25158.99,4113.401,4113.047,359.9916,90.0,179.0
std,2.49755,16.0726,775.0115,,2.3262,4.698116,90.45933,11.7358,2.870802,113.8853,16.00081,49.41931


In [115]:
df[df['speed'] >= 1000]

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition,distance,speed,acceleration,bearing,pitch,user
33548,40.06925,116.311428,110.3376,2009-04-16 13:47:48,4,082_151,2.0,082_151_3,2098.457128,1049.228564,517.762503,22.217805,1.414916,82
63414,39.987187,116.303453,65.01384,2008-10-18 05:31:06,0,062_122,1.0,062_122_1,4142.181866,4142.181866,4140.747659,276.773025,0.0,62
246914,39.817603,119.48152,-84.00288,2008-09-01 09:03:46,2,062_864,10.0,062_864_0,10351.635283,1035.163528,103.516353,295.800668,-83.162315,62
139,40.06965,116.3296,51.0,2007-06-27 11:13:48,0,117_4,6.0,117_4_0,8689.889203,1448.314867,241.385811,358.831052,0.191209,117
127977,39.94901,116.45565,56.9976,2008-06-22 01:38:48,10,163_811,3.0,163_811_1,7419.786119,2473.26204,820.747837,137.423744,-0.046367,163
325662,40.069333,116.329483,70.0,2007-05-29 14:41:18,0,128_50,4.0,128_50_0,10461.856582,2615.464145,653.866036,358.088933,0.0,128
465855,40.072035,116.61324,24.0792,2008-10-03 09:46:57,6,010_545,1.0,010_545_0,2348.80039,2348.80039,2348.80039,174.042727,-89.820006,10


In [116]:
filtered_df[filtered_df['speed'] >= 1000]

Unnamed: 0,lat,lon,alt,time,label,tid,time_diff,partition,distance,speed,acceleration,bearing,pitch,user
63414,39.982823,116.303455,65.01384,2008-10-18 05:31:06,0,062_122,1.0,062_122_1,4113.400701,4113.400701,4113.047114,270.015511,0.0,62
139,40.06965,116.32875,43.0,2007-06-27 11:13:48,0,117_4,6.0,117_4_0,8688.055648,1448.009275,241.334879,0.0,0.13849,117
325662,40.069333,116.32865,39.0,2007-05-29 14:41:18,0,128_50,4.0,128_50_0,10456.029602,2614.007401,653.50185,0.0,0.0,128
127977,39.955878,116.449205,59.00928,2008-06-22 01:38:48,10,163_811,3.0,163_811_1,6486.593528,2162.197843,717.059772,136.438006,-0.035269,163
352685,39.950942,116.309625,68.2752,2008-04-30 10:27:41,1,167_778,2.0,167_778_0,3248.804621,1624.402311,812.149288,210.718929,-0.215018,167


# Train_test_split

In [117]:
train_filtered_df = filtered_df[filtered_df['partition'].isin(train_partitions)]
test_filtered_df = filtered_df[filtered_df['partition'].isin(test_partitions)]

In [118]:
train_filtered_df.to_csv('geolife_train_filtered_df.csv', index=False)
test_filtered_df.to_csv('geolife_test_filtered_df.csv', index=False)