In [1]:
from pymove import filters
from pymove import MoveDataFrame
import numpy as np
from numpy.testing import assert_array_equal
import pandas as pd

In [2]:
from pymove.utils.constants import (
    DATETIME,
    DAY,
    DIST_PREV_TO_NEXT,
    DIST_TO_NEXT,
    DIST_TO_PREV,
    HOUR_COS,
    HOUR_SIN,
    LATITUDE,
    LONGITUDE,
    PERIOD,
    SPEED_TO_PREV,
    TID,
    TIME_TO_PREV,
    TRAJ_ID,
    TYPE_DASK,
    TYPE_PANDAS,
    UID,
)

In [3]:
list_data = [[39.984094, 116.319236, '2008-10-23 05:53:05', 1],
             [39.984198, 116.319322, '2008-10-23 05:53:06', 1],
             [39.984224, 116.319402, '2008-10-23 05:53:11', 2], 
             [39.984224, 116.319402, '2008-10-23 05:53:11', 2]]
move_df = MoveDataFrame(data=list_data, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
move_df

Unnamed: 0,lat,lon,datetime,id
0,39.984093,116.319237,2008-10-23 05:53:05,1
1,39.9842,116.319321,2008-10-23 05:53:06,1
2,39.984222,116.319405,2008-10-23 05:53:11,2
3,39.984222,116.319405,2008-10-23 05:53:11,2


In [4]:
list_data_test =[[39.984093, 116.319237, '2008-10-23 05:53:05', 1],
                 [39.984200, 116.319321, '2008-10-23 05:53:06', 1],
                 [39.984222, 116.319405, '2008-10-23 05:53:11', 1],
                 [39.984211, 116.319389,'2008-10-23 05:53:16', 1],
                 [39.984219, 116.319420, '2008-10-23 05:53:21', 1]]

In [5]:
def test_by_bbox():
    move_df_bbox = MoveDataFrame(data=list_data, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    bbox = (39.984193, 116.31924, 39.984222, 116.319405)
    
    filter_values = filters.by_bbox(move_df_bbox, bbox)
    assert_array_equal(filter_values, [[39.98419952392578, 116.31932067871094,
                                        pd.Timestamp('2008-10-23 05:53:06'), 1],
                                       [39.984222412109375, 116.31940460205078,
                                        pd.Timestamp('2008-10-23 05:53:11'), 2],
                                       [39.984222412109375, 116.31940460205078,
                                        pd.Timestamp('2008-10-23 05:53:11'), 2]])
    assert move_df_bbox.len() == 4
    filter_values = filters.by_bbox(move_df_bbox, bbox, filter_out=True)
    assert_array_equal(filter_values, [[39.984092712402344, 116.3192367553711,
                                        pd.Timestamp('2008-10-23 05:53:05'), 1]])
    assert move_df_bbox.len() == 4
    filters.by_bbox(move_df_bbox, bbox, inplace=True)
    assert_array_equal(move_df_bbox, [[39.98419952392578, 116.31932067871094,
                                        pd.Timestamp('2008-10-23 05:53:06'), 1],
                                       [39.984222412109375, 116.31940460205078,
                                        pd.Timestamp('2008-10-23 05:53:11'), 2],
                                       [39.984222412109375, 116.31940460205078,
                                        pd.Timestamp('2008-10-23 05:53:11'), 2]])
    assert move_df_bbox.len() == 3

In [6]:
test_by_bbox()

In [52]:
def test_by_datetime():
    filter_values_start = filters.by_datetime(move_df, start_datetime ='2008-10-23 05:53:06')
    assert_array_equal(filter_values_start, [[39.98419952392578, 116.31932067871094,
                                              pd.Timestamp('2008-10-23 05:53:06'), 1],
                                             [39.984222412109375, 116.31940460205078,
                                              pd.Timestamp('2008-10-23 05:53:11'), 2],
                                             [39.984222412109375, 116.31940460205078,
                                              pd.Timestamp('2008-10-23 05:53:11'), 2]])
    
    filter_values_end = filters.by_datetime(move_df, end_datetime ='2008-10-23 05:53:05')
    assert_array_equal(filter_values_end, [[39.984092712402344, 116.3192367553711,
                                            pd.Timestamp('2008-10-23 05:53:05'), 1]])
    
    filter_values = filters.by_datetime(move_df, start_datetime = '2008-10-23 05:53:04', end_datetime ='2008-10-23 05:53:06')
    assert_array_equal(filter_values, [[39.984092712402344, 116.3192367553711,
                                        pd.Timestamp('2008-10-23 05:53:05'), 1],
                                       [39.98419952392578, 116.31932067871094,
                                        pd.Timestamp('2008-10-23 05:53:06'), 1]])
    
    filter_out_values =filters.by_datetime(move_df, start_datetime = '2008-10-23 05:53:06', end_datetime ='2008-10-23 05:53:11', filter_out=True)
    print(move_df)

    assert_array_equal(filter_out_values, [[39.984092712402344, 116.3192367553711,
                                            pd.Timestamp('2008-10-23 05:53:05'), 1]])

In [53]:
test_by_datetime()

         lat         lon            datetime  id
0  39.984093  116.319237 2008-10-23 05:53:05   1
1  39.984200  116.319321 2008-10-23 05:53:06   1
2  39.984222  116.319405 2008-10-23 05:53:11   2
3  39.984222  116.319405 2008-10-23 05:53:11   2


In [9]:
def test_by_label():
    filter_values = filters.by_label(move_df, value = 1, label_name = 'id')
    assert_array_equal(filter_values, [[39.984092712402344, 116.3192367553711,
                                        pd.Timestamp('2008-10-23 05:53:05'), 1],
                                       [39.98419952392578, 116.31932067871094,
                                        pd.Timestamp('2008-10-23 05:53:06'), 1]])
    
    filter_out_values = filters.by_label(move_df, value = 1, label_name = 'id', filter_out=True)
    assert_array_equal(filter_out_values,[[39.984222412109375, 116.31940460205078,
                                            pd.Timestamp('2008-10-23 05:53:11'), 2],
                                           [39.984222412109375, 116.31940460205078,
                                            pd.Timestamp('2008-10-23 05:53:11'), 2]] )
    

In [10]:
test_by_label()

In [11]:
def test_by_id():
    filter_values = filters.by_id(move_df, 1)
    assert_array_equal(filter_values, [[39.984092712402344, 116.3192367553711,
                                        pd.Timestamp('2008-10-23 05:53:05'), 1],
                                       [39.98419952392578, 116.31932067871094,
                                        pd.Timestamp('2008-10-23 05:53:06'), 1]])


In [12]:
test_by_id()

In [13]:
def test_by_tid():
    filter_values = filters.by_tid(move_df, '12008102305')
    assert_array_equal(filter_values, [[39.984092712402344, 116.3192367553711,
                                        pd.Timestamp('2008-10-23 05:53:05'), 1, '12008102305'],
                                       [39.98419952392578, 116.31932067871094,
                                        pd.Timestamp('2008-10-23 05:53:06'), 1, '12008102305']])
    move_df.drop('tid', axis=1, inplace=True)

In [14]:
test_by_tid()


Creating or updating tid feature...

...Sorting by id and datetime to increase performance


...tid feature was created...



In [15]:
df = pd.read_csv('examples/geolife_sample.csv', parse_dates=['datetime'], nrows=5)
df_move = MoveDataFrame(data=df, latitude="latp", longitude="lon", datetime="datetime")
df_move

Unnamed: 0,lat,lon,datetime,id
0,39.984093,116.319237,2008-10-23 05:53:05,1
1,39.9842,116.319321,2008-10-23 05:53:06,1
2,39.984222,116.319405,2008-10-23 05:53:11,1
3,39.984211,116.319389,2008-10-23 05:53:16,1
4,39.984219,116.31942,2008-10-23 05:53:21,1


In [16]:
def test_outliers():
    df_move = MoveDataFrame(data=list_data_test, latitude="latp", longitude="lon", datetime="datetime")
    outliers = filters.outliers(move_data=df_move, jump_coefficient=1)
    assert_array_equal(outliers, [[1, 39.98421096801758, 116.31938934326172,
                                   pd.Timestamp('2008-10-23 05:53:16'), 1.6286216204832726,
                                   2.4484945931533275, 1.2242472060393084]]) 
    not_outliers = filters.outliers(move_data=df_move, jump_coefficient=1, filter_out=True)
    assert_array_equal(not_outliers.values.astype(str), [['1', '39.984092712402344', '116.3192367553711',
                                        '2008-10-23 05:53:05', 'nan', '14.015318782639952', 'nan'],
                                       ['1', '39.98419952392578', '116.31932067871094',
                                        '2008-10-23 05:53:06', '14.015318782639952',
                                        '7.345483960534693', '20.082061827224607'],
                                       ['1', '39.984222412109375', '116.31940460205078',
                                        '2008-10-23 05:53:11', '7.345483960534693',
                                        '1.6286216204832726', '5.929779944096936'],
                                       ['1', '39.98421859741211', '116.31941986083984',
                                        '2008-10-23 05:53:21', '2.4484945931533275', 'nan', 'nan']])
    df_move.set_index('id', inplace=True)
    outliers = filters.outliers(move_data=df_move, jump_coefficient=1)
    assert(move_df.index.name is None )

In [17]:
test_outliers()


Creating or updating distance features in meters...

...Sorting by id and datetime to increase performance

...Set id as index to increase attribution performance

(5/5) 100% in 00:00:00.001 - estimated end in 00:00:00.000
...Reset index

..Total Time: 0.004987001419067383
...Filtring jumps 

...Filtring jumps 

...Reset index for filtering

...Filtring jumps 



In [65]:
def test_clean_duplicates():
    df_move = MoveDataFrame(data=list_data, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    duplicates = filters.clean_duplicates(df_move)
    assert_array_equal(duplicates, [[39.984092712402344, 116.3192367553711,
                                 pd.Timestamp('2008-10-23 05:53:05'), 1],
                                [39.98419952392578, 116.31932067871094,
                                 pd.Timestamp('2008-10-23 05:53:06'), 1],
                                [39.984222412109375, 116.31940460205078,
                                 pd.Timestamp('2008-10-23 05:53:11'), 2]])    
    duplicates = filters.clean_duplicates(duplicates, subset='id')
    assert_array_equal(duplicates, [[39.984092712402344, 116.3192367553711,
                                 pd.Timestamp('2008-10-23 05:53:05'), 1],
                                [39.984222412109375, 116.31940460205078,
                                 pd.Timestamp('2008-10-23 05:53:11'), 2]]) 
    indexes = filters.clean_duplicates(df_move, subset='id', inplace = True)
    assert_array_equal(df_move, [[39.984092712402344, 116.3192367553711,
                                pd.Timestamp('2008-10-23 05:53:05'), 1],
                               [39.984222412109375, 116.31940460205078,
                                pd.Timestamp('2008-10-23 05:53:11'), 2]])
    assert(indexes is None)     

In [66]:
test_clean_duplicates()


Remove rows duplicates by subset
...Sorting by id and datetime to increase performance

...There are 1 GPS points duplicated

Remove rows duplicates by subset
...Sorting by id and datetime to increase performance

...There are 1 GPS points duplicated

Remove rows duplicates by subset
...Sorting by id and datetime to increase performance

...There are 2 GPS points duplicated


In [20]:
#def test_

In [21]:
#test_

In [22]:
#filters.clean_consecutive_duplicates(move_data=move_df, subset='id')

In [69]:
def test_clean_nan_value():
    move = MoveDataFrame(data=list_data, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    move.loc[3, 'id'] = np.nan
    assert(move.len() == 4)
    filters.clean_nan_values(move, inplace=True)
    print(move)
    assert(move.len() == 3)

In [70]:
test_clean_nan_value()

         lat         lon            datetime   id
0  39.984093  116.319237 2008-10-23 05:53:05  1.0
1  39.984200  116.319321 2008-10-23 05:53:06  1.0
2  39.984222  116.319405 2008-10-23 05:53:11  2.0


In [73]:
def test_clean_gps_jumps_by_distance():
    df_move = MoveDataFrame(data=list_data_test, latitude="latp", longitude="lon", datetime="datetime")
    filters.clean_gps_jumps_by_distance(df_move, jump_coefficient=1, threshold=0.5)
    print(df_move)
    assert_array_equal(df_move.values.astype(str), [['1', '39.984092712402344', '116.3192367553711',
                                                    '2008-10-23 05:53:05', 'nan', '14.015318782639952', 'nan'],
                                                   ['1', '39.98419952392578', '116.31932067871094',
                                                    '2008-10-23 05:53:06', '14.015318782639952',
                                                    '7.345483960534693', '20.082061827224607'],
                                                   ['1', '39.984222412109375', '116.31940460205078',
                                                    '2008-10-23 05:53:11', '7.345483960534693',
                                                    '1.6286216204832726', '5.929779944096936'],
                                                   ['1', '39.98421859741211', '116.31941986083984',
                                                    '2008-10-23 05:53:21', '2.4484945931533275', 'nan', 'nan']])

In [74]:
test_clean_gps_jumps_by_distance()


Creating or updating distance features in meters...

...Sorting by id and datetime to increase performance

...Set id as index to increase attribution performance



Generating distance, time and speed features: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1000.55it/s]


...Reset index

..Total Time: 0.023308992385864258

Cleaning gps jumps by distance to jump_coefficient 1...

...Filtering jumps 

...Dropping 1 rows of gps points

...Rows before: 5, Rows after:4, Sum drop:1

...Filtering jumps 

1 GPS points were dropped
         lat         lon            datetime  id
0  39.984093  116.319237 2008-10-23 05:53:05   1
1  39.984200  116.319321 2008-10-23 05:53:06   1
2  39.984222  116.319405 2008-10-23 05:53:11   1
3  39.984211  116.319389 2008-10-23 05:53:16   1
4  39.984219  116.319420 2008-10-23 05:53:21   1


AssertionError: 
Arrays are not equal

(shapes (5, 4), (4, 7) mismatch)
 x: array([['39.984092712402344', '116.3192367553711', '2008-10-23 05:53:05',
        '1'],
       ['39.98419952392578', '116.31932067871094', '2008-10-23 05:53:06',...
 y: array([['1', '39.984092712402344', '116.3192367553711',
        '2008-10-23 05:53:05', 'nan', '14.015318782639952', 'nan'],
       ['1', '39.98419952392578', '116.31932067871094',...

In [86]:
def test_clean_gps_nearby_points_by_distances():
    df_move = MoveDataFrame(data=list_data_test, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    df_move.set_index('id', inplace=True)
    filter_df = filters.clean_gps_nearby_points_by_distances(df_move, radius_area=10.0)
    print(filter_df)
    assert_array_equal(filter_df.values.astype(str), [['1', '39.984092712402344', '116.3192367553711',
                                 '2008-10-23 05:53:05', 'nan', '14.015318782639952', 'nan'],
                                 ['1', '39.98419952392578', '116.31932067871094',
                                 '2008-10-23 05:53:06', '14.015318782639952',
                                 '7.345483960534693', '20.082061827224607']])

    filters.clean_gps_nearby_points_by_distances(df_move, radius_area=10.0, inplace = True)
    assert_array_equal(df_move.values.astype(str), [['1', '39.984092712402344', '116.3192367553711',
                                 '2008-10-23 05:53:05', 'nan', '14.015318782639952', 'nan'],
                                 ['1', '39.98419952392578', '116.31932067871094',
                                 '2008-10-23 05:53:06', '14.015318782639952',
                                 '7.345483960534693', '20.082061827224607']])


In [87]:
test_clean_gps_nearby_points_by_distances()


Creating or updating distance features in meters...

...Sorting by id and datetime to increase performance



Generating distance, time and speed features: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 1003.66it/s]


...Reset index

..Total Time: 0.01793670654296875

Cleaning gps points from radius of 10.0 meters

...Dropping 3 gps points

...Rows before: 5, Rows after:2

3 GPS points were dropped
   id        lat         lon            datetime  dist_to_prev  dist_to_next  \
0   1  39.984093  116.319237 2008-10-23 05:53:05           NaN     14.015319   
1   1  39.984200  116.319321 2008-10-23 05:53:06     14.015319      7.345484   

   dist_prev_to_next  
0                NaN  
1          20.082062  

Creating or updating distance features in meters...

...Sorting by id and datetime to increase performance



Generating distance, time and speed features: 100%|█████████████████████████████████████| 1/1 [00:00<00:00, 992.73it/s]


...Reset index

..Total Time: 0.020562410354614258

Cleaning gps points from radius of 10.0 meters

...Dropping 3 gps points

...Rows before: 5, Rows after:2

3 GPS points were dropped


In [29]:
def test_clean_gps_nearby_points_by_speed():
    df_move = MoveDataFrame(data=list_data_test, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    filters.clean_gps_nearby_points_by_speed(df_move, speed_radius=20.0)
    assert_array_equal(df_move.values.astype(str), [['1', '39.984092712402344', '116.3192367553711',
                                                     '2008-10-23 05:53:05', 'nan', 'nan', 'nan']])

In [30]:
test_clean_gps_nearby_points_by_speed()


Creating or updating distance, time and speed features in meters by seconds

...Sorting by id and datetime to increase performance

...Set id as index to a higher peformance

(5/5) 100% in 00:00:00.008 - estimated end in 00:00:00.000
...Reset index...

..Total Time: 0.013

Cleaning gps points using 20.0 speed radius

...There are 4 gps points to drop

...Dropping 4 gps points

...Rows before: 5, Rows after:1


Cleaning gps points using 0.0 speed radius

...There are 0 gps points to drop



In [31]:
def test_clean_gps_speed_max_radius():
    df_move = MoveDataFrame(data=list_data_test, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    filters.clean_gps_speed_max_radius(df_move, speed_max=0.33)
    assert_array_equal(df_move.values.astype(str), [['1', '39.984092712402344', '116.3192367553711',
                                                    '2008-10-23 05:53:05', 'nan', 'nan', 'nan'],
                                                   ['1', '39.98421096801758', '116.31938934326172',
                                                    '2008-10-23 05:53:16', '1.6286216204832726', '5.0',
                                                    '0.3257243240966545']]) 

In [32]:
test_clean_gps_speed_max_radius()


Creating or updating distance, time and speed features in meters by seconds

...Sorting by id and datetime to increase performance

...Set id as index to a higher peformance

(5/5) 100% in 00:00:00.010 - estimated end in 00:00:00.000
...Reset index...

..Total Time: 0.016

Clean gps points with speed max > 0.33 meters by seconds
...There 3 gps points with speed_max > 0.33

...Dropping 3 rows of jumps by speed max

...Rows before: 5, Rows after:2


Clean gps points with speed max > 0.33 meters by seconds
...There 0 gps points with speed_max > 0.33



In [18]:
def test_clean_trajectories_with_few_points():
    df_move = MoveDataFrame(data=list_data_test, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    filters.clean_trajectories_with_few_points(df_move, min_points_per_trajectory=6)
    assert(df_move.len() == 0)

In [19]:
test_clean_trajectories_with_few_points()


Creating or updating tid feature...

...Sorting by id and datetime to increase performance


...tid feature was created...


...There are 1 ids with few points

...Tids before drop: 1

...Tids after drop: 0

...Shape - before drop: (5, 5) - after drop: (0, 5)

Creating or updating distance, time and speed features in meters by seconds

...Sorting by tid and datetime to increase performance

...Set tid as index to a higher peformance

...Reset index...

..Total Time: 0.012


In [108]:
def test_clean_id_by_time_max():
    df_move = MoveDataFrame(data=list_data, latitude="lat", longitude="lon", datetime="datetime", traj_id="id")
    df_move.generate_dist_time_speed_features()
    print(df_move)
    indexes = filters.clean_id_by_time_max(df_move, time_max =1.0)
    print(indexes['datetime'].values)
    assert_array_equal(indexes['datetime'].astype(str), ['2008-10-23 05:53:05', '2008-10-23 05:53:06'])    


In [109]:
test_clean_id_by_time_max()


Creating or updating distance, time and speed features in meters by seconds

...Sorting by id and datetime to increase performance

...Set id as index to a higher peformance



Generating distance features: 100%|████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1015.45it/s]


...Reset index...

..Total Time: 0.020
   id        lat         lon            datetime  dist_to_prev  time_to_prev  \
0   1  39.984093  116.319237 2008-10-23 05:53:05           NaN           NaN   
1   1  39.984200  116.319321 2008-10-23 05:53:06     14.015319           1.0   
2   2  39.984222  116.319405 2008-10-23 05:53:11           NaN           NaN   
3   2  39.984222  116.319405 2008-10-23 05:53:11      0.000000           0.0   

   speed_to_prev  
0            NaN  
1      14.015319  
2            NaN  
3            NaN  

Clean gps points with time max by id < 1.0 seconds
...Ids total: 2
Ids to drop:1
...Rows before drop: 4
 Rows after drop: 2
['2008-10-23T05:53:05.000000000' '2008-10-23T05:53:06.000000000']


In [110]:
move_df

Unnamed: 0,lat,lon,datetime,id
0,39.984093,116.319237,2008-10-23 05:53:05,1
1,39.9842,116.319321,2008-10-23 05:53:06,1
2,39.984222,116.319405,2008-10-23 05:53:11,2
3,39.984222,116.319405,2008-10-23 05:53:11,2


In [4]:
filters.clean_trajectories_short_and_few_points(move_df)


Remove short trajectories...


TypeError: clean_trajectories_with_few_points() takes from 1 to 4 positional arguments but 5 were given