diff --git a/docs/api/pymove.utils.rst b/docs/api/pymove.utils.rst index 90e467a0..b07889cd 100644 --- a/docs/api/pymove.utils.rst +++ b/docs/api/pymove.utils.rst @@ -84,6 +84,14 @@ pymove.utils.mem module :undoc-members: :show-inheritance: +pymove.utils.networkx module +---------------------------- + +.. automodule:: pymove.utils.networkx + :members: + :undoc-members: + :show-inheritance: + pymove.utils.trajectories module -------------------------------- diff --git a/pymove/tests/test_utils_data_augmentation.py b/pymove/tests/test_utils_data_augmentation.py index 9e99af15..648ac29e 100644 --- a/pymove/tests/test_utils_data_augmentation.py +++ b/pymove/tests/test_utils_data_augmentation.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd from numpy.testing import assert_array_almost_equal from pandas.testing import assert_frame_equal @@ -10,132 +9,152 @@ LATITUDE, LOCAL_LABEL, LONGITUDE, + PREV_LOCAL, START, TID, + TID_STAT, TRAJ_ID, ) from pymove.utils.data_augmentation import ( _augmentation, append_row, - augmentation_trajectories_df, - generate_destiny_feature, - generate_start_feature, - insert_points_in_df, + flatten_trajectories_dataframe, + generate_trajectories_df, + get_all_paths, instance_crossover_augmentation, + sliding_window, split_crossover, + transition_graph_augmentation_all_vertex, ) +from pymove.utils.networkx import build_transition_graph_from_df -list_data1 = [['abc-0000', 1, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:08:15'), 'abc-00002020010106'], - ['abc-0000', 2, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:16:51'), 'abc-00002020010106'], - ['abc-0000', 3, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:31:41'), 'abc-00002020010106'], - ['abc-0000', 4, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:45:25'), 'abc-00002020010106'], - ['abc-0000', 9, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:49:18'), 'abc-00002020010106'], - ['def-1111', 5, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 
09:10:15'), 'def-11112020010109'], - ['def-1111', 6, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:15:45'), 'def-11112020010109'], - ['def-1111', 7, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:25:34'), 'def-11112020010109'], - ['def-1111', 8, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:40:25'), 'def-11112020010109'], - ['def-1111', 9, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:52:53'), 'def-11112020010109']] +list_data1 = [[1, pd.Timestamp('2017-09-02 21:59:34'), 162, -3.8431323, -38.5933142, '12017090221'], + [1, pd.Timestamp('2017-09-02 22:00:27'), 85, -3.8347478, -38.5921890, '12017090222'], + [1, pd.Timestamp('2017-09-02 22:01:36'), 673, -3.8235834, -38.5903890, '12017090222'], + [1, pd.Timestamp('2017-09-02 22:03:08'), 394, -3.8138890, -38.5904445, '12017090222'], + [1, pd.Timestamp('2017-09-02 22:03:46'), 263, -3.9067654, -38.5907723, '12017090222'], + [1, pd.Timestamp('2017-09-02 22:07:19'), 224, -3.8857223, -38.5928892, '12017090222'], + [1, pd.Timestamp('2017-09-02 22:07:40'), 623, -3.8828723, -38.5929789, '12017090222']] list_data2 = { - TRAJ_ID: ['abc-0000', 'def-1111'], - LOCAL_LABEL: [[5, 7, 9], [2, 4, 6, 8, 9]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 06:20:40')], - [pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 09:25:30'), - pd.Timestamp('2020-01-01 09:30:17'), - pd.Timestamp('2020-01-01 09:45:16')]], - LATITUDE: [[3.1234567165374756, 3.1234567165374756, - 3.1234567165374756], - [3.1234567165374756, 3.1234567165374756, - 3.1234567165374756, 3.1234567165374756, - 3.1234567165374756]], - LONGITUDE: [[38.12345504760742, 38.12345504760742, - 38.12345504760742], - [38.12345504760742, 38.12345504760742, - 38.12345504760742, 38.12345504760742, - 38.12345504760742]], - TID: [['abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106'], - ['def-11112020010109', 'def-11112020010109', - 
'abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106']]} + TRAJ_ID: [[1, 1, 1, 1, 1, 1]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), + pd.Timestamp('2017-09-02 22:07:40')]], + LOCAL_LABEL: [[85, 673, 394, 263, 224, 623]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889, + -3.9067654, -3.8857223, -3.8828723]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445, + -38.5907723, -38.5928892, -38.5929789]], + TID: [['12017090222', '12017090222', '12017090222', + '12017090222', '12017090222', '12017090222']] +} + +list_data3 = { + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), + pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.7040900]], + TID: [['12017090222', '12017090222', '12017090222'], + ['22017090223', '22017090223', '22017090223', '22017090223']] +} + +list_data4 = { + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), + pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, 
-38.5928892, -38.5929789, -38.7040900]] +} def test_append_row(): - traj_df = pd.DataFrame( - data={ - TRAJ_ID: ['abc-0000'], - LOCAL_LABEL: [[1, 2]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15')]], - LATITUDE: [[3.1234567165374756]], - LONGITUDE: [[38.12345504760742]], - TID: [['abc-00002020010106']]}) + df = pd.DataFrame(list_data2) expected = pd.DataFrame( data={ - TRAJ_ID: ['abc-0000', 'def-1111'], - LOCAL_LABEL: [[1, 2], [5, 6]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15')], - [pd.Timestamp('2020-01-01 07:10:15')]], - LATITUDE: [[3.1234567165374756], [3.623471260070801]], - LONGITUDE: [[38.12345504760742], [38.397525787353516]], - TID: [['abc-00002020010106'], ['def-1111202001010']]}) + TRAJ_ID: [[1, 1, 1, 1, 1, 1], [2, 2, 2]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), + pd.Timestamp('2017-09-02 22:07:40')], + [pd.Timestamp('2017-09-03 14:10:15'), + pd.Timestamp('2017-09-03 14:20:30'), + pd.Timestamp('2017-09-03 14:30:45')]], + LOCAL_LABEL: [[85, 673, 394, 263, 224, 623], + [673, 263, 623]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889, + -3.9067654, -3.8857223, -3.8828723], + [-3.8235834, -3.9067654, -3.8828723]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445, + -38.5907723, -38.5928892, -38.5929789], + [-38.590389, -38.5907723, -38.5929789]], + TID: [['12017090222', '12017090222', '12017090222', + '12017090222', '12017090222', '12017090222'], + ['22017090314', '22017090314', '22017090314']]}) row = pd.Series( - data={TRAJ_ID: 'def-1111', - LOCAL_LABEL: [5, 6], - DATETIME: [pd.Timestamp('2020-01-01 7:10:15')], - LATITUDE: [3.6234712461], - LONGITUDE: [38.39752597257], - TID: ['def-1111202001010']}) - - append_row(traj_df, row=row) - assert_frame_equal(expected, traj_df) + data={ + TRAJ_ID: [2, 2, 2], + LOCAL_LABEL: [673, 263, 623], + DATETIME: [pd.Timestamp('2017-09-03 
14:10:15'), + pd.Timestamp('2017-09-03 14:20:30'), + pd.Timestamp('2017-09-03 14:30:45')], + LATITUDE: [-3.8235834, -3.9067654, -3.8828723], + LONGITUDE: [-38.590389, -38.5907723, -38.5929789], + TID: ['22017090314', '22017090314', '22017090314']}) + append_row(df, row) + assert_frame_equal(df, expected) -def test__augmentation(): - traj_df = pd.DataFrame(list_data2) - expected = pd.DataFrame( - data={ - TRAJ_ID: ['abc-0000_def-1111', 'def-1111_abc-0000'], - LOCAL_LABEL: [[5, 6, 8, 9], [2, 4, 7, 9]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 09:25:30'), - pd.Timestamp('2020-01-01 09:30:17'), - pd.Timestamp('2020-01-01 09:45:16')], - [pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 06:20:40')]], - LATITUDE: [[3.12345672, 3.12345672, 3.12345672, 3.12345672], - [3.12345672, 3.12345672, 3.12345672, 3.12345672]], - LONGITUDE: [[38.12345505, 38.12345505, 38.12345505, 38.12345505], - [38.12345505, 38.12345505, 38.12345505, 38.12345505]], - TID: [['abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106', 'abc-00002020010106'], - ['def-11112020010109', 'def-11112020010109', - 'abc-00002020010106', 'abc-00002020010106']] - } +def test_generate_trajectories_df(): + df = pd.DataFrame( + list_data1, + columns=[TRAJ_ID, DATETIME, LOCAL_LABEL, LATITUDE, LONGITUDE, TID] ) - aug_df = pd.DataFrame(columns=traj_df.columns) - - _augmentation(traj_df, aug_df) - assert_frame_equal(expected, aug_df) + expected = pd.DataFrame({ + TRAJ_ID: [[1, 1, 1, 1, 1, 1]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), + pd.Timestamp('2017-09-02 22:07:40')]], + LOCAL_LABEL: [[85, 673, 394, 263, 224, 623]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889, + -3.9067654, -3.8857223, -3.8828723]], + LONGITUDE: 
[[-38.592189, -38.590389, -38.5904445, + -38.5907723, -38.5928892, -38.5929789]], + TID: [['12017090222', '12017090222', '12017090222', + '12017090222', '12017090222', '12017090222']] + }) + + traj_df = generate_trajectories_df(df) + assert_frame_equal(traj_df, expected) def test_split_crossover(): @@ -151,229 +170,175 @@ def test_split_crossover(): assert_array_almost_equal(expected2, s2) -def test_insert_points_in_df(): - move_df = pd.DataFrame( - data=np.array(list_data1, dtype=object), - columns=[TRAJ_ID, LOCAL_LABEL, LATITUDE, LONGITUDE, DATETIME, TID] +def test_augmentation(): + df = pd.DataFrame(list_data3) + + expected = pd.DataFrame( + data = { + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2], [1, 2, 2], [2, 2, 1, 1]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), pd.Timestamp('2017-09-02 23:09:10')], + [pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 22:01:36'), pd.Timestamp('2017-09-02 22:03:08')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394], + [ 85, 623, 394], [263, 224, 673, 394]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834], + [-3.8347478, -3.8828723, -3.9939834], + [-3.9067654, -3.8857223, -3.8235834, -3.8138890]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409], + [-38.592189 , -38.5929789, -38.70409 ], + [-38.5907723, -38.5928892, -38.590389 , -38.5904445]], + TID: [['12017090222', '12017090222', '12017090222'], + ['22017090223', '22017090223', '22017090223', '22017090223'], + ['12017090222', '22017090223', '22017090223'], + ['22017090223', '22017090223', '12017090222', 
'12017090222']] + } ) - aug_df = pd.DataFrame( - data={ - TRAJ_ID: ['abc-0000_def-1111'], - LOCAL_LABEL: [[5, 6, 7, 9]], - LATITUDE: [[3.1234567165374756, 3.1234567165374756, - 3.1234567165374756, 3.1234567165374756]], - LONGITUDE: [[38.12345504760742, 38.12345504760742, - 38.12345504760742, 38.12345504760742]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 09:30:45'), - pd.Timestamp('2020-01-01 09:30:45')]], - TID: [['abc-0000202001010', 'abc-0000202001010', - 'abc-0000202001010', 'abc-0000202001010']], - DESTINY: [9]}) + aug_df = _augmentation(df, 0.5) + assert_frame_equal(aug_df, expected) - expected = pd.DataFrame( - data=np.array( - [['abc-0000', 1, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:08:15'), 'abc-00002020010106'], - ['abc-0000', 2, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:16:51'), 'abc-00002020010106'], - ['abc-0000', 3, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:31:41'), 'abc-00002020010106'], - ['abc-0000', 4, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:45:25'), 'abc-00002020010106'], - ['abc-0000', 9, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 06:49:18'), 'abc-00002020010106'], - ['def-1111', 5, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:10:15'), 'def-11112020010109'], - ['def-1111', 6, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:15:45'), 'def-11112020010109'], - ['def-1111', 7, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:25:34'), 'def-11112020010109'], - ['def-1111', 8, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:40:25'), 'def-11112020010109'], - ['def-1111', 9, 3.1234567, 38.1234567, - pd.Timestamp('2020-01-01 09:52:53'), 'def-11112020010109'], - ['abc-0000_def-1111', 5, 3.1234567165374756, 38.12345504760742, - pd.Timestamp('2020-01-01 06:08:15'), 'abc-0000202001010'], - ['abc-0000_def-1111', 6, 3.1234567165374756, 38.12345504760742, - pd.Timestamp('2020-01-01 09:15:45'), 'abc-0000202001010'], 
- ['abc-0000_def-1111', 7, 3.1234567165374756, 38.12345504760742, - pd.Timestamp('2020-01-01 09:30:45'), 'abc-0000202001010'], - ['abc-0000_def-1111', 9, 3.1234567165374756, 38.12345504760742, - pd.Timestamp('2020-01-01 09:30:45'), 'abc-0000202001010']], - dtype=object), - columns=[TRAJ_ID, LOCAL_LABEL, LATITUDE, LONGITUDE, DATETIME, TID]) - - insert_points_in_df(move_df, aug_df) - assert_frame_equal(expected, move_df) - - -def test_generate_start_feature(): - move_df = pd.DataFrame(list_data2) + +def test_flatten_trajectories_dataframe(): + traj_df = pd.DataFrame(list_data3) expected = pd.DataFrame( data={ - TRAJ_ID: ['abc-0000', 'def-1111'], - LOCAL_LABEL: [[5, 7, 9], [2, 4, 6, 8, 9]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 06:20:40')], - [pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 09:25:30'), - pd.Timestamp('2020-01-01 09:30:17'), - pd.Timestamp('2020-01-01 09:45:16')]], - LATITUDE: [[3.1234567165374756, 3.1234567165374756, - 3.1234567165374756], - [3.1234567165374756, 3.1234567165374756, - 3.1234567165374756, 3.1234567165374756, - 3.1234567165374756]], - LONGITUDE: [[38.12345504760742, 38.12345504760742, - 38.12345504760742], - [38.12345504760742, 38.12345504760742, - 38.12345504760742, 38.12345504760742, - 38.12345504760742]], - TID: [['abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106'], - ['def-11112020010109', 'def-11112020010109', - 'abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106']], - START: [5, 2] - }) + TRAJ_ID: [1, 1, 1, 2, 2, 2, 2], + DATETIME: [pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), pd.Timestamp('2017-09-02 23:03:46'), + pd.Timestamp('2017-09-02 23:07:19'), pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')], + LOCAL_LABEL: [85, 673, 394, 263, 224, 623, 394], + LATITUDE: [-3.8347478, 
-3.8235834, -3.813889, -3.9067654, -3.8857223, + -3.8828723, -3.9939834], + LONGITUDE: [-38.592189, -38.590389, -38.5904445, -38.5907723, -38.5928892, + -38.5929789, -38.70409], + TID: ['12017090222', '12017090222', '12017090222', '22017090223', '22017090223', + '22017090223', '22017090223'] + } + ) - generate_start_feature(move_df, label_trajectory=LOCAL_LABEL) - assert_frame_equal(move_df, expected) + df = flatten_trajectories_dataframe(traj_df) + assert_frame_equal(df, expected) -def test_generate_destiny_feature(): - move_df = pd.DataFrame(list_data2) +def test_instance_crossover_augmentation(): + traj_df = pd.DataFrame(list_data3) expected = pd.DataFrame( data={ - TRAJ_ID: ['abc-0000', 'def-1111'], - LOCAL_LABEL: [[5, 7, 9], [2, 4, 6, 8, 9]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 06:20:40')], - [pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 09:25:30'), - pd.Timestamp('2020-01-01 09:30:17'), - pd.Timestamp('2020-01-01 09:45:16')]], - LATITUDE: [[3.1234567165374756, 3.1234567165374756, - 3.1234567165374756], - [3.1234567165374756, 3.1234567165374756, - 3.1234567165374756, 3.1234567165374756, - 3.1234567165374756]], - LONGITUDE: [[38.12345504760742, 38.12345504760742, - 38.12345504760742], - [38.12345504760742, 38.12345504760742, - 38.12345504760742, 38.12345504760742, - 38.12345504760742]], - TID: [['abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106'], - ['def-11112020010109', 'def-11112020010109', - 'abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106']], - DESTINY: [9, 9] + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2], [1, 2, 2], [2, 2, 1, 1]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), pd.Timestamp('2017-09-02 
23:09:10')], + [pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 22:01:36'), pd.Timestamp('2017-09-02 22:03:08')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394], [ 85, 623, 394], [263, 224, 673, 394]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834], + [-3.8347478, -3.8828723, -3.9939834], + [-3.9067654, -3.8857223, -3.8235834, -3.813889 ]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409], + [-38.592189 , -38.5929789, -38.70409 ], + [-38.5907723, -38.5928892, -38.590389 , -38.5904445]], + TID: [['12017090222', '12017090222', '12017090222'], + ['22017090223', '22017090223', '22017090223', '22017090223'], + ['12017090222', '22017090223', '22017090223'], + ['22017090223', '22017090223', '12017090222', '12017090222']] }) - generate_destiny_feature(move_df, label_trajectory=LOCAL_LABEL) - assert_frame_equal(move_df, expected) + aug_df = instance_crossover_augmentation(traj_df) + assert_frame_equal(aug_df, expected) -def test_augmentation_trajectories_df(): - move_df = pd.DataFrame(list_data2) - - expected = pd.DataFrame( - data={ - TRAJ_ID: ['abc-0000_def-1111', 'def-1111_abc-0000'], - LOCAL_LABEL: [[5, 6, 8, 9], [2, 4, 7, 9]], - DATETIME: [[pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 09:25:30'), - pd.Timestamp('2020-01-01 09:30:17'), - pd.Timestamp('2020-01-01 09:45:16')], - [pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 06:20:40')]], - LATITUDE: [[3.12345672, 3.12345672, 3.12345672, 3.12345672], - [3.12345672, 3.12345672, 3.12345672, 3.12345672]], - LONGITUDE: [[38.12345505, 38.12345505, 38.12345505, 38.12345505], - [38.12345505, 38.12345505, 38.12345505, 38.12345505]], - 
TID: [['abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106', 'abc-00002020010106'], - ['def-11112020010109', 'def-11112020010109', - 'abc-00002020010106', 'abc-00002020010106']], - DESTINY: [[9], [9]]}) - - aug_df = augmentation_trajectories_df(move_df, label_trajectory=LOCAL_LABEL) - assert_frame_equal(expected, aug_df) - +def test_sliding_window(): + traj_df = pd.DataFrame(list_data2) -def test_instance_crossover_augmentation(): - move_df = pd.DataFrame( - list_data1, - columns=[TRAJ_ID, LOCAL_LABEL, LATITUDE, LONGITUDE, DATETIME, TID] - ) + expected = pd.DataFrame({ + 'id': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], + 'datetime': [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), pd.Timestamp('2017-09-02 22:03:46')], + [pd.Timestamp('2017-09-02 22:01:36'), pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), pd.Timestamp('2017-09-02 22:07:19')], + [pd.Timestamp('2017-09-02 22:03:08'), pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), pd.Timestamp('2017-09-02 22:07:40')]], + 'local_label': [[85, 673, 394, 263], [673, 394, 263, 224], [394, 263, 224, 623]], + 'lat': [[-3.8347478, -3.8235834, -3.813889, -3.9067654], + [-3.8235834, -3.813889, -3.9067654, -3.8857223], + [-3.813889, -3.9067654, -3.8857223, -3.8828723]], + 'lon': [[-38.592189, -38.590389, -38.5904445, -38.5907723], + [-38.590389, -38.5904445, -38.5907723, -38.5928892], + [-38.5904445, -38.5907723, -38.5928892, -38.5929789]], + 'tid': [['12017090222', '12017090222', '12017090222', '12017090222'], + ['12017090222', '12017090222', '12017090222', '12017090222'], + ['12017090222', '12017090222', '12017090222', '12017090222']] + }) + + sw_df = sliding_window(traj_df, size_window=4, size_jump=1) + assert_frame_equal(sw_df, expected) + + +def test_get_all_paths(): + traj_df = pd.DataFrame(list_data4) + graph = build_transition_graph_from_df(traj_df) expected = pd.DataFrame( data={ - 
TRAJ_ID: ['abc-0000', 'abc-0000', 'abc-0000', 'abc-0000', 'abc-0000', - 'def-1111', 'def-1111', 'def-1111', 'def-1111', 'def-1111', - 'abc-0000_def-1111', 'abc-0000_def-1111', 'abc-0000_def-1111', - 'abc-0000_def-1111', 'abc-0000_def-1111', 'def-1111_abc-0000', - 'def-1111_abc-0000', 'def-1111_abc-0000', 'def-1111_abc-0000', - 'def-1111_abc-0000'], - LOCAL_LABEL: [1.0, 2.0, 3.0, 4.0, 9.0, 5.0, 6.0, 7.0, 8.0, 9.0, - 1.0, 2.0, 7.0, 8.0, 9.0, 5.0, 6.0, 3.0, 4.0, 9.0], - LATITUDE: [3.1234567, 3.1234567, 3.1234567, 3.1234567, 3.1234567, - 3.1234567, 3.1234567, 3.1234567, 3.1234567, 3.1234567, - 3.1234567, 3.1234567, 3.1234567, 3.1234567, 3.1234567, - 3.1234567, 3.1234567, 3.1234567, 3.1234567, 3.1234567], - LONGITUDE: [38.1234567, 38.1234567, 38.1234567, 38.1234567, 38.1234567, - 38.1234567, 38.1234567, 38.1234567, 38.1234567, 38.1234567, - 38.1234567, 38.1234567, 38.1234567, 38.1234567, 38.1234567, - 38.1234567, 38.1234567, 38.1234567, 38.1234567, 38.1234567], - DATETIME: [pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 06:31:41'), - pd.Timestamp('2020-01-01 06:45:25'), - pd.Timestamp('2020-01-01 06:49:18'), - pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 09:25:34'), - pd.Timestamp('2020-01-01 09:40:25'), - pd.Timestamp('2020-01-01 09:52:53'), - pd.Timestamp('2020-01-01 06:08:15'), - pd.Timestamp('2020-01-01 06:16:51'), - pd.Timestamp('2020-01-01 09:25:34'), - pd.Timestamp('2020-01-01 09:40:25'), - pd.Timestamp('2020-01-01 09:52:53'), - pd.Timestamp('2020-01-01 09:10:15'), - pd.Timestamp('2020-01-01 09:15:45'), - pd.Timestamp('2020-01-01 06:31:41'), - pd.Timestamp('2020-01-01 06:45:25'), - pd.Timestamp('2020-01-01 06:49:18')], - TID: ['abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106', 'abc-00002020010106', - 'abc-00002020010106', 'def-11112020010109', - 'def-11112020010109', 'def-11112020010109', - 'def-11112020010109', 'def-11112020010109', 
- 'abc-00002020010106_def-11112020010109', - 'abc-00002020010106_def-11112020010109', - 'abc-00002020010106_def-11112020010109', - 'abc-00002020010106_def-11112020010109', - 'abc-00002020010106_def-11112020010109', - 'def-11112020010109_abc-00002020010106', - 'def-11112020010109_abc-00002020010106', - 'def-11112020010109_abc-00002020010106', - 'def-11112020010109_abc-00002020010106', - 'def-11112020010109_abc-00002020010106'] - } - ) + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2], [3, 3, 3]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), pd.Timestamp('2017-09-02 23:09:10')], + [pd.Timestamp('2017-09-02 23:07:19'), pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394], [224.0, 623.0, 394.0]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834], + [-3.8857223, -3.8828723, -3.9939834]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409], + [-38.5928892, -38.5929789, -38.70409]] + }) - instance_crossover_augmentation(move_df, label_trajectory=LOCAL_LABEL) - assert_frame_equal(move_df, expected) + get_all_paths(traj_df, graph, '224', '394') + assert_frame_equal(traj_df, expected) + + +def test_transition_graph_augmentation_all_vertex(): + traj_df = pd.DataFrame(list_data4) + + expected = pd.DataFrame({ + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2], [3, 3, 3], [4, 4, 4]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), pd.Timestamp('2017-09-02 23:09:10')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 
23:07:19'), + pd.Timestamp('2017-09-02 23:07:40')], + [pd.Timestamp('2017-09-02 23:07:19'), pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394], + [263.0, 224.0, 623.0], [224.0, 623.0, 394.0]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834], + [-3.9067654, -3.8857223, -3.8828723], + [-3.8857223, -3.8828723, -3.9939834]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409], + [-38.5907723, -38.5928892, -38.5929789], + [-38.5928892, -38.5929789, -38.70409]] + }) + + transition_graph_augmentation_all_vertex(traj_df) + assert_frame_equal(traj_df, expected) diff --git a/pymove/tests/test_utils_networkx.py b/pymove/tests/test_utils_networkx.py new file mode 100644 index 00000000..77a4896c --- /dev/null +++ b/pymove/tests/test_utils_networkx.py @@ -0,0 +1,228 @@ +import json +import os + +import pandas as pd +from networkx.classes.digraph import DiGraph +from networkx.testing import assert_graphs_equal +from numpy.testing import assert_equal + +from pymove.utils.constants import ( + DATETIME, + LATITUDE, + LOCAL_LABEL, + LONGITUDE, + PREV_LOCAL, + TID, + TRAJ_ID, +) +from pymove.utils.networkx import ( + _populate_graph, + build_transition_graph_from_df, + build_transition_graph_from_dict, + graph_to_dict, + read_graph_json, + save_graph_as_json, +) + +dict_graph = { + 'nodes': { + 'coords': { '85': (-3.8347478, -38.592189), '673': (-3.8235834, -38.590389), + '394': (-3.813889, -38.5904445), '263': (-3.9067654, -38.5907723), + '224': (-3.8857223, -38.5928892), '623': (-3.8828723, -38.5929789)}, + 'datetime': { '85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], '623': ['2017-09-02 22:07:40']}, + 'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 
'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}}, + 'edges': { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}} + +list_data1 = { + TRAJ_ID: [[1, 1, 1, 1, 1, 1]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), + pd.Timestamp('2017-09-02 22:07:40')]], + LOCAL_LABEL: [[85, 673, 394, 263, 224, 623]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889, + -3.9067654, -3.8857223, -3.8828723]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445, + -38.5907723, -38.5928892, -38.5929789]], + TID: [['12017090222', '12017090222', '12017090222', + '12017090222', '12017090222', '12017090222']] +} + + +def _transition_graph(): + expected_graph = DiGraph() + expected_graph.add_node('85', coords=(-3.8347478, -38.592189), + datetime=['2017-09-02 22:00:27'], freq_source=1, freq_target=0) + expected_graph.add_node('673', coords=(-3.8235834, -38.590389), + datetime=['2017-09-02 22:01:36'], freq_source=0, freq_target=0) + expected_graph.add_node('394', coords=(-3.813889, -38.5904445), + datetime=['2017-09-02 22:03:08'], freq_source=0, freq_target=0) + expected_graph.add_node('263', coords=(-3.9067654, -38.5907723), + datetime=['2017-09-02 22:03:46'], freq_source=0, freq_target=0) + expected_graph.add_node('224', coords=(-3.8857223, -38.5928892), + datetime=['2017-09-02 22:07:19'], freq_source=0, freq_target=0) + expected_graph.add_node('623', coords=(-3.8828723, -38.5929789), + datetime=['2017-09-02 22:07:40'], freq_source=0, freq_target=1) + expected_graph.add_edge( '85', '673', weight=1, mean_times='0 days 
00:01:09') + expected_graph.add_edge('673', '394', weight=1, mean_times='0 days 00:01:32') + expected_graph.add_edge('394', '263', weight=1, mean_times='0 days 00:00:38') + expected_graph.add_edge('263', '224', weight=1, mean_times='0 days 00:03:33') + expected_graph.add_edge('224', '623', weight=1, mean_times='0 days 00:00:21') + + return expected_graph + + +def test_populate_graph(): + row = pd.DataFrame(list_data1).loc[0] + + nodes = {'datetime': {}, 'coords': {}, 'freq_source': {}, 'freq_target': {}} + edges = {} + + expected_nodes = { + 'datetime': {'85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], '623': ['2017-09-02 22:07:40']}, + 'coords': {'85': (-3.8347478, -38.592189), '673': (-3.8235834, -38.590389), + '394': (-3.813889, -38.5904445), '263': (-3.9067654, -38.5907723), + '224': (-3.8857223, -38.5928892), '623': (-3.8828723, -38.5929789)}, + 'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}} + + expected_edges = { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}} + + _populate_graph(row, nodes, edges) + nodes, edges + + assert_equal(expected_nodes, nodes) + assert_equal(expected_edges, edges) + + +def test_build_transition_graph_from_dict(): + expected_graph = _transition_graph() + + graph = build_transition_graph_from_dict(dict_graph) + + assert_graphs_equal(expected_graph, graph) + + +def test_build_transition_graph_from_df(): + expected_graph = _transition_graph() + + traj_df = pd.DataFrame(list_data1) + + graph = build_transition_graph_from_df(traj_df) + + 
assert_graphs_equal(expected_graph, graph) + + +def test_graph_to_dict(): + graph = _transition_graph() + + expected_dict = { + 'nodes': { + 'coords': { '85': (-3.8347478, -38.592189), '673': (-3.8235834, -38.590389), + '394': (-3.813889, -38.5904445), '263': (-3.9067654, -38.5907723), + '224': (-3.8857223, -38.5928892), '623': (-3.8828723, -38.5929789)}, + 'datetime': { '85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], '623': ['2017-09-02 22:07:40']}, + 'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}}, + 'edges': { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}} + + dict_graph = graph_to_dict(graph) + + assert_equal(expected_dict, dict_graph) + + +def test_save_graph_as_json(tmpdir): + + expected = { + 'nodes': { + 'coords': { '85': (-3.8347478, -38.592189), '673': (-3.8235834, -38.590389), + '394': (-3.813889, -38.5904445), '263': (-3.9067654, -38.5907723), + '224': (-3.8857223, -38.5928892), '623': (-3.8828723, -38.5929789)}, + 'datetime': { '85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], '623': ['2017-09-02 22:07:40']}, + 'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}}, + 'edges': { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 
'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}} + + d = tmpdir.mkdir('utils') + + file_write_default = d.join('test_save_graph.json') + filename_write_default = os.path.join( + file_write_default.dirname, file_write_default.basename + ) + + graph = _transition_graph() + save_graph_as_json(graph, filename_write_default) + saved_graph = read_graph_json(filename_write_default) + + assert_equal(saved_graph, expected) + + +def test_read_graph_json(tmpdir): + + expected = { + 'nodes': { + 'coords': { '85': (-3.8347478, -38.592189), '673': (-3.8235834, -38.590389), + '394': (-3.813889, -38.5904445), '263': (-3.9067654, -38.5907723), + '224': (-3.8857223, -38.5928892), '623': (-3.8828723, -38.5929789)}, + 'datetime': { '85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], '623': ['2017-09-02 22:07:40']}, + 'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}}, + 'edges': { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}} + + d = tmpdir.mkdir('utils') + + file_write_default = d.join('test_read_graph.json') + filename_write_default = os.path.join( + file_write_default.dirname, file_write_default.basename + ) + + graph = _transition_graph() + + with open(filename_write_default, 'w') as f: + json.dump(graph_to_dict(graph), f) + + saved_graph = read_graph_json(filename_write_default) + + assert_equal(saved_graph, expected) diff --git 
a/pymove/tests/test_utils_trajectories.py b/pymove/tests/test_utils_trajectories.py index fd159718..86664334 100644 --- a/pymove/tests/test_utils_trajectories.py +++ b/pymove/tests/test_utils_trajectories.py @@ -1,6 +1,7 @@ import os import numpy as np +import pandas as pd from numpy.testing import assert_array_almost_equal, assert_array_equal, assert_equal from pandas import DataFrame from pandas.testing import assert_frame_equal @@ -9,11 +10,22 @@ from pymove.utils.constants import ( DATETIME, LATITUDE, + LOCAL_LABEL, LONGITUDE, + PREV_LOCAL, + TID, + TID_STAT, TRAJ_ID, TYPE_DASK, TYPE_PANDAS, ) +from pymove.utils.networkx import build_transition_graph_from_df +from pymove.utils.trajectories import ( + append_trajectory, + columns_to_array, + object_for_array, + split_trajectory, +) list_data = [ [39.984094, 116.319236, '2008-10-23 05:53:05', 1], @@ -23,6 +35,39 @@ [39.984217, 116.319422, '2008-10-23 05:53:21', 1], ] +list_data2 = { + TRAJ_ID: [[1, 1, 1, 1, 1, 1]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), + pd.Timestamp('2017-09-02 22:07:40')]], + LOCAL_LABEL: [[85, 673, 394, 263, 224, 623]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889, + -3.9067654, -3.8857223, -3.8828723]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445, + -38.5907723, -38.5928892, -38.5929789]], + TID: [['12017090222', '12017090222', '12017090222', + '12017090222', '12017090222', '12017090222']] +} + +list_data4 = { + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), + pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), + pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394]], + LATITUDE: [[-3.8347478, 
-3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.7040900]], +} + str_data_default = """ lat,lon,datetime,id 39.984094, 116.319236, 2008-10-23 05:53:05, 1 @@ -47,7 +92,7 @@ def test_read_csv(tmpdir): expected = _default_move_df() - d = tmpdir.mkdir('prepossessing') + d = tmpdir.mkdir('utils') file_default_columns = d.join('test_read_default.csv') file_default_columns.write(str_data_default) @@ -134,54 +179,89 @@ def test_fill_list_with_new_values(): assert_array_equal(original_list, exected) +def test_append_trajectory(): + traj_df = pd.DataFrame(list_data4) + graph = build_transition_graph_from_df(traj_df) + + expected = pd.DataFrame({ + TRAJ_ID: [[1, 1, 1], [2, 2, 2, 2], [3, 3, 3]], + DATETIME: [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 22:03:08')], + [pd.Timestamp('2017-09-02 23:03:46'), pd.Timestamp('2017-09-02 23:07:19'), + pd.Timestamp('2017-09-02 23:07:40'), pd.Timestamp('2017-09-02 23:09:10')], + [pd.Timestamp('2017-09-02 23:07:19'), pd.Timestamp('2017-09-02 23:07:40'), + pd.Timestamp('2017-09-02 23:09:10')]], + LOCAL_LABEL: [[85, 673, 394], [263, 224, 623, 394], [224.0, 623.0, 394.0]], + LATITUDE: [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834], + [-3.8857223, -3.8828723, -3.9939834]], + LONGITUDE: [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409], + [-38.5928892, -38.5929789, -38.70409]], + }) + + trajectory = [224, 623, 394] + append_trajectory(traj_df, trajectory, graph) + assert_frame_equal(traj_df, expected) + + +def test_split_trajectory(): + trajectory = pd.DataFrame(list_data2).loc[0] + + expected = pd.DataFrame({ + 'id': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], + 'datetime': [[pd.Timestamp('2017-09-02 22:00:27'), pd.Timestamp('2017-09-02 22:01:36'), + pd.Timestamp('2017-09-02 
22:03:08'), pd.Timestamp('2017-09-02 22:03:46')], + [pd.Timestamp('2017-09-02 22:01:36'), pd.Timestamp('2017-09-02 22:03:08'), + pd.Timestamp('2017-09-02 22:03:46'), pd.Timestamp('2017-09-02 22:07:19')], + [pd.Timestamp('2017-09-02 22:03:08'), pd.Timestamp('2017-09-02 22:03:46'), + pd.Timestamp('2017-09-02 22:07:19'), pd.Timestamp('2017-09-02 22:07:40')]], + 'local_label': [[85, 673, 394, 263], [673, 394, 263, 224], [394, 263, 224, 623]], + 'lat': [[-3.8347478, -3.8235834, -3.813889, -3.9067654], + [-3.8235834, -3.813889, -3.9067654, -3.8857223], + [-3.813889, -3.9067654, -3.8857223, -3.8828723]], + 'lon': [[-38.592189, -38.590389, -38.5904445, -38.5907723], + [-38.590389, -38.5904445, -38.5907723, -38.5928892], + [-38.5904445, -38.5907723, -38.5928892, -38.5929789]], + 'tid': [['12017090222', '12017090222', '12017090222', '12017090222'], + ['12017090222', '12017090222', '12017090222', '12017090222'], + ['12017090222', '12017090222', '12017090222', '12017090222']] + }) + + split = split_trajectory(trajectory, size_window=4, size_jump=1) + assert_frame_equal(split, expected) + + def test_object_for_array(): data_1 = '[1, 2, 3]' data_2 = '[1.5, 2.5, 3.5]' - data_3 = '[event, event]' + data_3 = "['event', 'event']" expected_data_1 = np.array([1., 2., 3.], dtype=np.float32) expected_data_2 = np.array([1.5, 2.5, 3.5], dtype=np.float32) expected_data_3 = np.array(['event', 'event'], dtype='object_') - assert_array_almost_equal(trajectories.object_for_array(data_1), expected_data_1) - assert_array_almost_equal(trajectories.object_for_array(data_2), expected_data_2) - assert_array_equal(trajectories.object_for_array(data_3), expected_data_3) - - -def test_column_to_array(): - list_data_1 = [ - '[1, 2, 3]', - '[5, 8]', - '[13, 21, 34, 55]', - '[89, 144]' - ] - - list_data_2 = [ - '[event]', - '[missa, culto]', - '[festa da cidade]' - ] - - df_1 = DataFrame(list_data_1, columns=['label']) - df_2 = DataFrame(list_data_2, columns=['label']) - - expected_data_1 = DataFrame( - 
data={'label': [[1, 2, 3], - [5, 8], - [13, 21, 34, 55], - [89, 144]]}, - index=[0, 1, 2, 3] - ) + assert_array_almost_equal(object_for_array(data_1), expected_data_1) + assert_array_almost_equal(object_for_array(data_2), expected_data_2) + assert_array_equal(object_for_array(data_3), expected_data_3) - expected_data_2 = DataFrame( - data={'label': [['event'], - ['missa', 'culto'], - ['festa da cidade']]}, - index=[0, 1, 2] - ) - df_1 = trajectories.column_to_array(df_1, 'label') - df_2 = trajectories.column_to_array(df_2, 'label') +def test_columns_to_array(): + df = DataFrame({ + 'ids': ['[1, 1, 1]', '[2, 2, 2]', '[3, 3, 3, 3]', '[4, 4]'], + 'descritions': ["['event', 'event', 'event']", "['bike', 'bike', 'bike']", + "['car', 'car', 'car', 'car']", "['house', 'house']"], + 'price': ['[10.5, 20.5, 13.5]', '[50.2, 33.4, 90.0]', + '[1.0, 2.9, 3.4, 8.4]', '[100.4, 150.5]'] + }) + + expected = DataFrame({ + 'ids': [[1, 1, 1], [2, 2, 2], [3, 3, 3, 3], [4, 4]], + 'descritions': [['event', 'event', 'event'], ['bike', 'bike', 'bike'], + ['car', 'car', 'car', 'car'], ['house', 'house']], + 'price': [[10.5, 20.5, 13.5], [50.2, 33.4, 90.0], + [1.0, 2.9, 3.4, 8.4], [100.4, 150.5]] + }) - assert_frame_equal(df_1, expected_data_1) - assert_frame_equal(df_2, expected_data_2) + columns_to_array(df) + assert_frame_equal(df, expected) diff --git a/pymove/utils/data_augmentation.py b/pymove/utils/data_augmentation.py index 1bbb2771..ace41852 100644 --- a/pymove/utils/data_augmentation.py +++ b/pymove/utils/data_augmentation.py @@ -3,25 +3,30 @@ append_row, generate_trajectories_df, -generate_start_feature, -generate_destiny_feature, split_crossover, -augmentation_trajectories_df, -insert_points_in_df, -instance_crossover_augmentation +_augmentation, +flatten_trajectories_dataframe, +instance_crossover_augmentation, +sliding_window, +get_all_paths, +transition_graph_augmentation_all_vertex """ from __future__ import annotations -from typing import TYPE_CHECKING +from typing import 
TYPE_CHECKING, Text +import networkx as nx import numpy as np import pandas as pd +from networkx.classes.digraph import DiGraph from pandas.core.frame import DataFrame from pandas.core.series import Series -from pymove.utils.constants import DESTINY, START, TID, TRAJECTORY +from pymove.utils.constants import DESTINY, LOCAL_LABEL, START, TID from pymove.utils.log import progress_bar +from pymove.utils.networkx import build_transition_graph_from_df +from pymove.utils.trajectories import append_trajectory, split_trajectory if TYPE_CHECKING: from pymove.core.dask import DaskMoveDataFrame @@ -45,6 +50,39 @@ def append_row( columns : dict, optional Dictionary containing the values to be added, by default None + Example + ------- + >>> from pymove.utils.data_augmentation import append_row + >>> + >>> df + id datetime local lat lon tid + 0 1 2017-09-02 21:59:34 162 -3.843132 -38.593314 12017090221 + 1 1 2017-09-02 22:00:27 85 -3.834748 -38.592189 12017090222 + 2 1 2017-09-02 22:01:36 673 -3.823583 -38.590389 12017090222 + 3 1 2017-09-02 22:03:08 394 -3.813889 -38.590444 12017090222 + 4 1 2017-09-02 22:03:46 263 -3.906765 -38.590772 12017090222 + 5 1 2017-09-02 22:07:19 224 -3.885722 -38.592889 12017090222 + >>> + >>> row + id 1 + datetime 2017-09-02 22:07:40 + local 623 + lat -3.88287 + lon -38.593 + tid 12017090222 + dtype: object + >>> + >>> append_row(df, row) + >>> df + id datetime local lat lon tid + 0 1 2017-09-02 21:59:34 162 -3.843132 -38.593314 12017090221 + 1 1 2017-09-02 22:00:27 85 -3.834748 -38.592189 12017090222 + 2 1 2017-09-02 22:01:36 673 -3.823583 -38.590389 12017090222 + 3 1 2017-09-02 22:03:08 394 -3.813889 -38.590444 12017090222 + 4 1 2017-09-02 22:03:46 263 -3.906765 -38.590772 12017090222 + 5 1 2017-09-02 22:07:19 224 -3.885722 -38.592889 12017090222 + 6 1 2017-09-02 22:07:40 623 -3.882872 -38.592979 12017090222 + """ if row is not None: keys = row.index.tolist() @@ -57,7 +95,9 @@ def append_row( def generate_trajectories_df( - data: 
'PandasMoveDataFrame' | 'DaskMoveDataFrame' + data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame', + label_tid: Text = TID, + min_points_traj: int = 3 ) -> DataFrame: """ Generates a dataframe with the sequence of location points of a trajectory. @@ -66,83 +106,52 @@ def generate_trajectories_df( ---------- data : DataFrame The input trajectory data. + label_tid: String, optional + Label referring to the ID of the trajectories, by default TID + min_points_traj: Number, optional + Minimum points per trajectory, by default 3 Return ------ DataFrame DataFrame of the trajectories - """ - if TID not in data: - data.generate_tid_based_on_id_datetime() - data.reset_index(drop=True, inplace=True) - - tids = data[TID].unique() - new_df = pd.DataFrame( - columns=data.columns - ) - - for tid in progress_bar(tids, total=len(tids)): - filter_ = data[data[TID] == tid] - filter_.reset_index(drop=True, inplace=True) - - if filter_.shape[0] > 4: - - values = [] - for col in filter_.columns: - if filter_[col].nunique() == 1: - values.append(filter_.at[0, col]) - else: - values.append( - np.array( - filter_[col], dtype=type(filter_.at[0, col]) - ).tolist() - ) - - row = pd.Series(values, filter_.columns) - append_row(new_df, row=row) - - return new_df - - -def generate_start_feature( - data: DataFrame, label_trajectory: str = TRAJECTORY -): - """ - Removes the last point from the trajectory and adds it in a new column 'destiny'. - - Parameters - ---------- - data : DataFrame - The input trajectory data. 
- label_trajectory : str, optional - Label of the points sequences, by default TRAJECTORY + Example + ------- + >>> from pymove.utils.data_augmentation import generate_trajectories_df + >>> + >>> df + id datetime local lat lon tid + 0 1 2017-09-02 21:59:34 162 -3.8431323 -38.5933142 12017090221 + 1 1 2017-09-02 22:00:27 85 -3.8347478 -38.5921890 12017090222 + 2 1 2017-09-02 22:01:36 673 -3.8235834 -38.5903890 12017090222 + 3 1 2017-09-02 22:03:08 394 -3.8138890 -38.5904445 12017090222 + 4 1 2017-09-02 22:03:46 263 -3.9067654 -38.5907723 12017090222 + 5 1 2017-09-02 22:07:19 224 -3.8857223 -38.5928892 12017090222 + 6 1 2017-09-02 22:07:40 623 -3.8828723 -38.5929789 12017090222 + >>> + >>> traj_df = generate_trajectories_df(df) + >>> traj_df.local + 0 [85, 673, 394, 263, 224, 623] + Name: local, dtype: object """ - if START not in data: - data[START] = data[label_trajectory].apply( - lambda x: np.int64(x[0]) + if label_tid not in data: + raise ValueError( + '{} not in DataFrame'.format(label_tid) ) + frames = [] + tids = data[label_tid].unique() -def generate_destiny_feature( - data: DataFrame, label_trajectory: str = TRAJECTORY -): - """ - Removes the first point from the trajectory and adds it in a new column 'start'. + desc = 'Gererating Trajectories DataFrame' + for tid in progress_bar(tids, desc=desc, total=len(tids)): + frame = data[data[label_tid] == tid] - Parameters - ---------- - data : DataFrame - The input trajectory data. - label_trajectory : str, optional - Label of the points sequences, by default 'trajectory' + if frame.shape[0] >= min_points_traj: + frames.append(frame.T.values.tolist()) - """ - if DESTINY not in data: - data[DESTINY] = data[label_trajectory].apply( - lambda x: np.int64(x[-1]) - ) + return pd.DataFrame(frames, columns=data.columns) def split_crossover( @@ -162,9 +171,20 @@ def split_crossover( Returns ------- - Tuple[List, List] + tuple[list, list] Arrays with the halves exchanged. 
+ Example + ------- + >>> from pymove.utils.data_augmentation import split_crossover + >>> + >>> sequence_a, sequence_b + ([0, 2, 4, 6, 8], [1, 3, 5, 7, 9]) + >>> + >>> sequence_a, sequence_b = split_crossover(sequence_a, sequence_b) + >>> sequence_a, sequence_b + ([0, 2, 5, 7, 9], [1, 3, 4, 6, 8]) + """ size_a = int(len(sequence_a) * frac) size_b = int(len(sequence_b) * frac) @@ -181,7 +201,9 @@ def split_crossover( return sequence_a, sequence_b -def _augmentation(data: DataFrame, aug_df: DataFrame, frac: float = 0.5): +def _augmentation( + traj_df: DataFrame, frac: float = 0.5 +): """ Generates new data with unobserved trajectories. @@ -189,185 +211,432 @@ def _augmentation(data: DataFrame, aug_df: DataFrame, frac: float = 0.5): ---------- data : DataFrame The input trajectories data. - aug_df : DataFrame - The dataframe with new trajectories - frac : number, optional + frac : float, optional Represents the percentage to be exchanged, by default 0.5 + Return + ------ + DataFrame + Increased data set. 
+ + Example + ------- + >>> from pymove.utils.data_augmentation import _augmentation + >>> + >>> traj_df + id local + 0 [1, 1, 1] [85, 673, 394] + 1 [2, 2, 2, 2] [263, 224, 623, 515] + >>> + >>> _augmentations(traj_df, frac=0.5) + id local + 0 [1, 1, 1] [85, 673, 394] + 1 [2, 2, 2, 2] [263, 224, 623, 515] + 2 [1, 2, 2] [85, 623, 515] + 3 [2, 2, 1, 1] [263, 224, 673, 394] + """ - data.reset_index(drop=True, inplace=True) - - for idx in range(data.shape[0] - 1): - for idx_ in range(idx + 1, data.shape[0]): - sequences1 = [] - sequences2 = [] - - columns = data.columns - - for col in columns: - if (isinstance( - data.at[idx, col], list - ) or isinstance( - data.at[idx, col], np.ndarray - )) and (isinstance( - data.at[idx_, col], list - ) or isinstance( - data.at[idx_, col], np.ndarray - )): - seq1, seq2 = split_crossover( - data.at[idx, col], - data.at[idx_, col], - frac=frac - ) - sequences1.append(seq1) - sequences2.append(seq2) - else: - value1 = data.at[idx, col] - value2 = data.at[idx_, col] - - if isinstance(value1, str) and isinstance(value2, str): - sequences1.append(value1 + '_' + value2) # type: ignore - sequences2.append(value2 + '_' + value1) # type: ignore - else: - sequences1.append(value1) - sequences2.append(value2) - - row = pd.Series(sequences1, index=columns) - append_row(aug_df, row=row) - - row = pd.Series(sequences2, index=columns) - append_row(aug_df, row=row) - - -def augmentation_trajectories_df( - data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame', + traj_df.reset_index(drop=True, inplace=True) + + frames = {} + for idx, row in traj_df.iterrows(): + if idx + 1 < traj_df.shape[0]: + series = {} + for column in traj_df.columns: + series[column] = pd.Series( + traj_df[idx + 1:][column].apply( + lambda x: split_crossover(row[column], x, frac) + ).values[0], name=column, + ) + frames[idx] = pd.concat([series[col] for col in traj_df.columns], axis=1) + + aug_df = pd.concat( + [frames[i] for i in range(len(frames))], axis=1 + ) + return 
pd.concat([traj_df, aug_df], ignore_index=True) + + +def flatten_trajectories_dataframe(traj_df: DataFrame) -> DataFrame: + """ + Extracts information from trajectories. + + Parameters + ---------- + traj_df : DataFrame + The input trajectories data + + Return + ------ + DataFrames + Flat trajectories. + + Example + ------- + >>> from pymove.utils.data_augmentation import flatten_trajectories_dataframe + >>> + >>> traj_df + id local + 0 [1, 1, 1] [85, 673, 394] + 1 [2, 2, 2, 2] [263, 224, 623, 515] + >>> + >>> flatten_trajectories_dataframe(traj_df) + id local + 0 1 85 + 1 1 673 + 2 1 394 + 3 2 263 + 4 2 224 + 5 2 623 + 6 2 515 + + """ + frames = {} + for idx, row in progress_bar(traj_df.iterrows(), total=traj_df.shape[0]): + frames[idx] = pd.DataFrame(row.to_dict()) + + return pd.concat([frames[i] for i in range(len(frames))], ignore_index=True) + + +def instance_crossover_augmentation( + data: DataFrame, restriction: str = 'destination only', - label_trajectory: str = TRAJECTORY, - insert_at_df: bool = False, + label_local: Text = LOCAL_LABEL, frac: float = 0.5, ) -> DataFrame: """ - Generates new data from unobserved trajectories, given a specific restriction. + Generates new data from unobserved trajectories, with a specific restriction. - By default, the algorithm uses the same route destination constraint. + By default, the algorithm uses the same destination constraint + as the route and inserts the points on the + original dataframe. Parameters ---------- data : DataFrame - The input trajectories data. + The input trajectories data restriction : str, optional Constraint used to generate new data, by default 'destination only' - label_trajectory : str, optional - Label of the points sequences, by default TRAJECTORY - insert_at_df : boolean, optional - Whether to return a new DataFrame, by default False - If True then value of copy is ignored. 
- frac : number, optional + label_local : str, optional + Label of the points sequences, by default LOCAL_LABEL + frac : float, optional Represents the percentage to be exchanged, by default 0.5 - Returns + Example ------- - DataFrame - Dataframe with the new data generated + >>> from pymove.utils.data_augmentation import instance_crossover_augmentation + >>> + >>> df + id local_label + 0 [1, 1, 1] [85, 673, 394] + 1 [2, 2, 2, 2] [85, 224, 623, 394] + 2 [3, 3, 3] [263, 673, 394] + >>> + >>> aug_df = instance_crossover_augmentation(df) + >>> aug_df + id local_label + 0 [1, 1, 1] [85, 673, 394] + 1 [2, 2, 2, 2] [85, 224, 623, 394] + 2 [3, 3, 3] [263, 673, 394] + 3 [1, 2, 2] [85, 623, 394] + 4 [2, 2, 1, 1] [85, 224, 673, 394] + 5 [2, 2, 3, 3] [85, 224, 673, 394] + 6 [3, 2, 2] [263, 623, 394] """ - if DESTINY not in data: - generate_destiny_feature(data, label_trajectory=label_trajectory) + df = data.copy() - if restriction == 'departure and destination': - generate_start_feature(data) + df[DESTINY] = df[label_local].apply(lambda x: x[-1]) + df[START] = df[label_local].apply(lambda x: x[0]) - if insert_at_df: - aug_df = data - else: - aug_df = pd.DataFrame(columns=data.columns) - - destinations = data[DESTINY].unique() - for dest in progress_bar(destinations, total=len(destinations)): - filter_ = data[data[DESTINY] == dest] + frames = {} + destinations = df[DESTINY].unique() + for idx, dest in progress_bar(enumerate(destinations), total=len(destinations)): + filter_ = df[df[DESTINY] == dest] if restriction == 'departure and destination': starts = filter_[START].unique() for st in progress_bar(starts, total=len(starts)): - f_filter_ = filter_[filter_[START] == st] - - if f_filter_.shape[0] >= 2: - _augmentation(f_filter_, aug_df, frac=frac) + f_filter_ = filter_[filter_[START] == st] + if f_filter_.shape[0] >= 2: + frames[idx] = _augmentation(f_filter_.iloc[:, :-2], frac=frac) else: if filter_.shape[0] >= 2: - _augmentation(filter_, aug_df, frac=frac) + frames[idx] = 
_augmentation(filter_.iloc[:, :-2], frac=frac) - return aug_df + return pd.concat([frames[i] for i in range(len(frames))], axis=0, ignore_index=True) -def insert_points_in_df(data: DataFrame, aug_df: DataFrame): +def sliding_window( + data: DataFrame, + size_window: int = 6, + size_jump: int = 3, + label_local: Text = LOCAL_LABEL, + columns: list = None, +) -> DataFrame: """ - Inserts the points of the generated trajectories to the original data sets. + Sliding window technique. + + Performs an increase in the trajectory data by sliding a window + Over each sequence to a specified size n, skipping m points. + This process inserts sub-trajectories in the data set. Parameters ---------- - data : DataFrame - The input trajectories data - aug_df : DataFrame - The data of unobserved trajectories + data: DataFrame + Trajectory data in sequence format + size_window: int, optional + Sliding window size, by default 6 + size_jump: int, optional + Size of the jump in the trajectory, by default 3 + label_local: str, optional + Name of the column referring to the trajectories, by default LOCAL_LABEL + columns: list, optional + Columns to which the split will be applied, by default None + + Return + ------ + DataFrame + Increased data set. 
+ + Example + ------- + >>> from pymove.utils.data_augmentation import sliding_window + >>> + >>> traj_df + id local + 0 [1, 1, 1, 1, 1, 1, 1, 1] [85, 673, 394, 85, 224, 623, 394, 263] + 1 [2, 2, 2, 2, 2, 2] [85, 224, 623, 394, 263, 673, 394] + 2 [3, 3, 3, 3, 3, 3, 3] [263, 673, 394, 85, 673, 394, 85, 224] + >>> + >>> sliding_window(traj_df, size_jump=1) + id local_label + 0 [1, 1, 1, 1, 1, 1] [ 85, 673, 394, 85, 224, 623] + 1 [1, 1, 1, 1, 1, 1] [673, 394, 85, 224, 623, 394] + 2 [1, 1, 1, 1, 1, 1] [394, 85, 224, 623, 394, 263] + 3 [2, 2, 2, 2, 2, 2] [85, 224, 623, 394, 263, 673] + 4 [3, 3, 3, 3, 3, 3] [263, 673, 394, 85, 673, 394] + 5 [3, 3, 3, 3, 3, 3] [673, 394, 85, 673, 394, 85] """ - for _, row in progress_bar(aug_df.iterrows(), total=aug_df.shape[0]): + if columns is None: + columns = data.columns - keys = row.index.tolist() - values = row.values.tolist() + frames = {} + desc = 'Sliding Window...' + for idx, row in progress_bar(data.iterrows(), desc=desc, total=data.shape[0]): + frames[idx] = split_trajectory(row, size_window, size_jump, label_local, columns) - row_df = pd.DataFrame() + return pd.concat([frame for frame in frames.values()], ignore_index=True) - for k, v in zip(keys, values): - if k in data: - if isinstance(v, list) or isinstance(v, np.ndarray): - row_df[k] = v - for k, v in zip(keys, values): - if k in data: - if not isinstance(v, list) and not isinstance(v, np.ndarray): - row_df[k] = v +def get_all_paths( + traj_df: DataFrame, graph: DiGraph, source: str, target: str, + min_path_size: int = 3, max_path_size: int = 6, + max_sampling_source: int = 100, + max_sampling_target: int = 100, + label_local: str = LOCAL_LABEL, + simple_paths: bool = False +): + """ + Generate All Paths. - for _, row_ in row_df.iterrows(): - append_row(data, row=row_) + Retrieves all paths in the graph between the past source and + destination, if any. 
The number of paths returned is limited + by the max_sampling_source and max_sampling_target + parameters, and the path size is limited by the + min_path_size and max_path_size parameters. + Parameters + ---------- + traj_df: DataFrame + Trajectory data in sequence format. + graph: DiGraph + Transition graph constructed from trajectory data. + source: Node + Sequence source node. + target: Node + Sequence destination node. + min_path_size: int, optional + Minimum number of points for the trajectory, by default 3 + max_path_size: int, optional + Maximum number of points for the trajectory, by default 6 + max_sampling_source: int, optional + Maximum number of paths to be returned, + considering the observed origin, by default 10 + max_sampling_target: int, optional + Maximum number of paths to be returned, + considering the observed destination, by default 10 + label_local: str, optional + Name of the column referring to the trajectories, by default LOCAL_LABEL + simple_paths: bool, optional + If true, use the paths with the most used sections + Otherwise, use paths with less used sections, by default False + + Example + ------- + >>> from pymove.utils.data_augmentation import get_all_paths + >>> + >>> traj_df.to_dict() + {'id': [[1, 1, 1], [2, 2, 2, 2]], + 'datetime': [['2017-09-02 22:00:27', '2017-09-02 22:01:36', + '2017-09-02 22:03:08'], + ['2017-09-02 23:03:46', '2017-09-02 23:07:19', + '2017-09-02 23:07:40', '2017-09-02 23:09:10']], + 'local_label': [[85, 673, 394], [263, 224, 623, 394]], + 'lat': [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + 'lon': [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409]]} + >>> + >>> graph = build_transition_graph_from_df(traj_df) + >>> + >>> get_all_paths(traj_df, graph, 224, 394) + [224.0, 623.0, 394.0] -def instance_crossover_augmentation( - data: DataFrame, - restriction: str = 'destination only', - label_trajectory: str = TRAJECTORY, - 
frac: float = 0.5 -): """ - Generates new data from unobserved trajectories, with a specific restriction. + if not nx.has_path(graph, source, target): + return [] - By default, the algorithm uses the same destination constraint - as the route and inserts the points on the - original dataframe. + param: int | None = None + + if simple_paths: + all_paths = nx.all_simple_paths + param = max_path_size - 1 + + else: + all_paths = nx.shortest_simple_paths + + for path in all_paths(graph, source, target, param): + freq_source = nx.get_node_attributes(graph, 'freq_source')[source] + freq_target = nx.get_node_attributes(graph, 'freq_target')[target] + + if freq_source >= max_sampling_source: + break + + if freq_target >= max_sampling_target: + break + + if len(path) > max_path_size and simple_paths is False: + break + + if len(path) >= min_path_size: + path_ = np.array(path, dtype='float32').tolist() + if path_ not in traj_df[label_local].values.tolist(): + + print(path_) + append_trajectory(traj_df, path, graph) + + freq_source += 1 + freq_target += 1 + + graph.add_node(source, freq_source=freq_source) + graph.add_node(target, freq_target=freq_target) + + +def transition_graph_augmentation_all_vertex( + traj_df: DataFrame, + graph: DiGraph | None = None, + min_path_size: int = 3, + max_path_size: int = 6, + max_sampling_source: int = 10, + max_sampling_target: int = 10, + source: dict | None = None, + target: dict | None = None, + label_local: Text = LOCAL_LABEL, + simple_paths: bool = False, + inplace: bool = True +) -> DataFrame: + """ + Transition Graph Data Augmentation. + + Performs the data increase from the transition graph. 
Parameters ---------- - data : DataFrame - The input trajectories data - restriction : str, optional - Constraint used to generate new data, by default 'destination only' - label_trajectory : str, optional - Label of the points sequences, by default 'trajectory' - frac : number, optional - Represents the percentage to be exchanged, by default 0.5 + traj_df: DataFrame + Trajectory data in sequence format + graph: DiGraph + Transition graph constructed from trajectory data + min_path_size: int, optional + Minimum number of points for the trajectory, by default 3 + max_path_size: int, optional + Maximum number of points for the trajectory, by default 6 + max_sampling_source: int, optional + Maximum number of paths to be returned, + considering the observed origin, by default 10 + max_sampling_target: int, optional + Maximum number of paths to be returned, + considering the observed destination, by default 10 + source: dict, optional + Degree of entry of each node in the graph, by default None + Example: {node: degree-of-entry} + target: dict, optional + Degree of output of each node in the graph, by default None + Example: {node: degree-of-output} + label_local: str, optional + Name of the column referring to the trajectories, by default LOCAL_LABEL + label_tid: str, optional + Column name for trajectory IDs, by default TID_STAT + simple_paths: boolean, optional + If true, use the paths with the most used sections + Otherwise, use paths with less used sections, by default False + inplace : boolean, optional + if set to true the original dataframe will be altered to contain the result + of the augmentation, otherwise a copy will be returned, by default True + + Return + ------ + DataFrame + Increased data set. 
+ + Example + ------- + >>> from pymove.utils.data_augmentation import ( + transition_graph_augmentation_all_vertex + ) + >>> + >>> traj_df.to_dict() + {'id': [[1, 1, 1], [2, 2, 2, 2]], + 'datetime': [['2017-09-02 22:00:27', '2017-09-02 22:01:36', + '2017-09-02 22:03:08'], + ['2017-09-02 23:03:46', '2017-09-02 23:07:19', + '2017-09-02 23:07:40', '2017-09-02 23:09:10']], + 'local_label': [[85, 673, 394], [263, 224, 623, 394]], + 'lat': [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + 'lon': [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409]]} + >>> + >>> transition_graph_augmentation_all_vertex(traj_df) + [263.0, 224.0, 623.0] + [224.0, 623.0, 394.0] """ - traj_df = generate_trajectories_df(data) + if inplace: + traj_df_ = traj_df + else: + traj_df_ = traj_df.copy() - generate_destiny_feature(traj_df, label_trajectory=label_trajectory) + if graph is None: + graph = build_transition_graph_from_df(traj_df_) - if restriction == 'departure and destination': - generate_start_feature(traj_df, label_trajectory=label_trajectory) + if source is None: + source = dict(graph.nodes) + source = {key: value['freq_source'] for key, value in source.items()} - aug_df = augmentation_trajectories_df( - traj_df, restriction=restriction, frac=frac - ) - insert_points_in_df(data, aug_df) + if target is None: + target = dict(graph.nodes) + target = {key: value['freq_target'] for key, value in target.items()} + + targets = sorted(target.items(), key=lambda x: x[1], reverse=True) + sources = sorted(source.items(), key=lambda x: x[1], reverse=True) + + [[get_all_paths( + traj_df_, graph, s, t, min_path_size, max_path_size, + max_sampling_source, max_sampling_target, label_local, simple_paths + ) for s, _ in sources] for t, _ in targets] + + if not inplace: + return traj_df_ diff --git a/pymove/utils/networkx.py b/pymove/utils/networkx.py new file mode 100644 index 00000000..7759d125 --- /dev/null +++ 
b/pymove/utils/networkx.py @@ -0,0 +1,484 @@ +""" +Graph operations. + +_populate_graph, +build_transition_graph_from_dict, +build_transition_graph_from_df, +graph_to_dict, +save_graph_as_json, +read_graph_json + +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import Dict, Text + +import networkx as nx +import pandas as pd +from networkx.classes.digraph import DiGraph +from pandas.core.frame import DataFrame +from pandas.core.series import Series + +from pymove.utils.constants import DATETIME, LATITUDE, LOCAL_LABEL, LONGITUDE +from pymove.utils.log import progress_bar + + +def _populate_graph( + row: Series, nodes: Dict, edges: Dict, + label_local: Text = LOCAL_LABEL +): + """ + Populate Transition Graph. + + Insert the nodes and edges in the transition graph with all the + necessary attributes for the execution of the search and + recovery operations of paths / trajectories. The required + parameters are: latitude, longitude and datetime. + + Parameters + ---------- + row: Series + Line of the trajectory dataframe. + nodes: dict + Attributes of the transition graph nodes. + edges: dict + Attributes of the transition graph edges. 
+ label_local: str, optional + Name of the column referring to the trajectories, by default LOCAL_LABEL + + Example + ------- + >>> from pymove.utils.networkx import _populate_graph + >>> + >>> trajectory + {"datetime": ["2017-09-02 22:00:27", "2017-09-02 22:01:36", "2017-09-02 22:03:08", + "2017-09-02 22:03:46", "2017-09-02 22:07:19", "2017-09-02 22:07:40"], + "local_label": [85, 673, 394, 263, 224, 623], + "lat": [-3.8347478, -3.8235834, -3.813889, -3.9067654, -3.8857223, -3.8828723], + "lon": [-38.592189, -38.590389, -38.59044, -38.590772, -38.592889, -38.592978]} + >>> + >>> nodes = {'datetime': {}, 'coords': {}, 'freq_source': {}, 'freq_target': {}} + >>> edges = {} + >>> + >>> _populate_graph(pd.Series(trajectory), nodes, edges) + >>> nodes, edges + ({"datetime": { 85: ["2017-09-02 22:00:27"], 673: ["2017-09-02 22:01:36"], + 394: ["2017-09-02 22:03:08"], 263: ["2017-09-02 22:03:46"], + 224: ["2017-09-02 22:07:19"], 623: ["2017-09-02 22:07:40"]}, + "coords": { 85: (-3.8347478, -38.592189), 673: (-3.8235834, -38.590389), + 394: (-3.8138891, -38.590441), 263: (-3.9067654, -38.590772), + 224: (-3.8857223, -38.592889), 623: (-3.8828723, -38.592978)}, + "freq_source": {85: 1, 673: 0, 394: 0, 263: 0, 224: 0, 623: 0}, + "freq_target": {85: 0, 673: 0, 394: 0, 263: 0, 224: 0, 623: 1}}, + { 85: {673: {"weight": 1, "mean_times": "0 days 00:01:09"}}, + 673: {394: {"weight": 1, "mean_times": "0 days 00:01:32"}}, + 394: {263: {"weight": 1, "mean_times": "0 days 00:00:38"}}, + 263: {224: {"weight": 1, "mean_times": "0 days 00:03:33"}}, + 224: {623: {"weight": 1, "mean_times": "0 days 00:00:21"}}}) + + """ + traj = row[label_local] + + for index, local in enumerate(traj): + + local_curr = str(local) + + dt = [str(row[DATETIME][index])] + fs = (index == 0) + ft = (index == len(traj) - 1) + + if local_curr in nodes['datetime']: + dt.extend(nodes['datetime'][local_curr]) + fs += nodes['freq_source'][local_curr] + ft += nodes['freq_target'][local_curr] + + 
nodes['datetime'][local_curr] = dt + nodes['freq_source'][local_curr] = fs + nodes['freq_target'][local_curr] = ft + nodes['coords'][local_curr] = (row[LATITUDE][index], row[LONGITUDE][index]) + + if index == len(traj) - 1: + break + + next_local = str(traj[index + 1]) + + weight = 1 + mean_times = pd.Timestamp( + row[DATETIME][index + 1] + ) - pd.Timestamp(row[DATETIME][index]) + + if local_curr not in edges: + edges[local_curr] = {next_local: {}} + edges[local_curr][next_local] = { + 'weight': 1, + 'mean_times': str(mean_times) + } + + elif next_local not in edges[local_curr]: + edges[local_curr] = {**edges[local_curr], **{next_local: {}}} + edges[local_curr][next_local] = { + 'weight': 1, + 'mean_times': str(mean_times) + } + else: + weight += edges[local_curr][next_local]['weight'] + mean_times = ( + mean_times + pd.Timedelta(edges[local_curr][next_local]['mean_times']) + ) / 2 + + edges[local_curr][next_local]['weight'] = weight + edges[local_curr][next_local]['mean_times'] = str(mean_times) + + +def build_transition_graph_from_dict(dict_graph: Dict) -> DiGraph: + """ + Built Graph from Dict. + + It builds a transition graph from a dictionary + with nodes and edges and all necessary parameters. + Example: {'nodes': nodes, 'edges': edges}. + + Parameters + ---------- + dict_graph: dict + Dictionary with the attributes of nodes and edges. + + Return + ------ + graph: DiGraph + Transition graph constructed from trajectory data. 
+ + Example + ------- + >>> from pymove.utils.networkx import build_transition_graph_from_dict + >>> + >>> dict_graph + {"nodes": { + "datetime": { 85: ["2017-09-02 22:00:27"], 673: ["2017-09-02 22:01:36"], + 394: ["2017-09-02 22:03:08"], 263: ["2017-09-02 22:03:46"], + 224: ["2017-09-02 22:07:19"], 623: ["2017-09-02 22:07:40"]}, + "coords": { 85: (-3.8347478, -38.592189), 673: (-3.8235834, -38.590389), + 394: (-3.8138891, -38.590441), 263: (-3.9067654, -38.590772), + 224: (-3.8857223, -38.592889), 623: (-3.8828723, -38.592978)}, + "freq_source": {85: 1, 673: 0, 394: 0, 263: 0, 224: 0, 623: 0}, + "freq_target": {85: 0, 673: 0, 394: 0, 263: 0, 224: 0, 623: 1}}, + "edges": { 85: {673: {"weight": 1, "mean_times": "0 days 00:01:09"}}, + 673: {394: {"weight": 1, "mean_times": "0 days 00:01:32"}}, + 394: {263: {"weight": 1, "mean_times": "0 days 00:00:38"}}, + 263: {224: {"weight": 1, "mean_times": "0 days 00:03:33"}}, + 224: {623: {"weight": 1, "mean_times": "0 days 00:00:21"}}}} + >>> + >>> graph = build_transition_graph_from_dict(dict_graph) + >>> + >>> graph.nodes + NodeView((85, 673, 394, 263, 224, 623)) + >>> + >>> graph.edges + OutEdgeView([(85, 673), (673, 394), (394, 263), (263, 224), (224, 623)]) + >>> + >>> graph.adj + AdjacencyView({ 85: {673: {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + 673: {394: {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + 394: {263: {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + 263: {224: {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + 224: {623: {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + 623: {}}) + + """ + graph = nx.DiGraph(dict_graph['edges']) + + for key in dict_graph['nodes']['coords']: + graph.add_node(key, coords=dict_graph['nodes']['coords'][key]) + graph.add_node(key, datetime=dict_graph['nodes']['datetime'][key]) + graph.add_node(key, freq_source=dict_graph['nodes']['freq_source'][key]) + graph.add_node(key, freq_target=dict_graph['nodes']['freq_target'][key]) + + return graph + + +def 
build_transition_graph_from_df(data: DataFrame) -> DiGraph: + """ + Build Graph from data. + + Constructs a Transition Graph from trajectory data. + + Parameters + ---------- + data: DataFrame + Trajectory data in sequence format. + + Return + ------ + graph: DiGraph + Transition graph constructed from trajectory data. + + Example + ------- + >>> from pymove.utils.networkx import build_transition_graph_from_df + >>> + >>> data + {"id": [[1, 1, 1], [2, 2, 2, 2]], + "datetime": [["2017-09-02 22:00:27", "2017-09-02 22:01:36", + "2017-09-02 22:03:08"], + ["2017-09-02 23:03:46", "2017-09-02 23:07:19", + "2017-09-02 23:07:40", "2017-09-02 23:09:10"]], + "local_label": [[85, 673, 394], [263, 224, 623, 394]], + "lat": [[-3.8347478, -3.8235834, -3.8138890], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + "lon": [[-38.5921890, -38.5903890, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.7040900]]} + >>> + >>> graph = build_transition_graph_from_df(pd.DataFrame(traj_df)) + >>> + >>> graph.nodes + NodeView(('85', '673', '394', '263', '224', '623')) + >>> + >>> graph.edges + OutEdgeView([( '85', '673'), ('673', '394'), ('394', '263'), + ('263', '224'), ('224', '623')]) + >>> + >>> graph.adj + AdjacencyView({ '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {'394': {'weight': 1, 'mean_times': '0 days 00:01:30'}}, + '394': {}}) + + """ + nodes: dict = {'datetime': {}, 'coords': {}, 'freq_source': {}, 'freq_target': {}} + edges: dict = {} + + desc = 'Building Transition Graph...' + for _, row in progress_bar(data.iterrows(), desc=desc, total=data.shape[0]): + _populate_graph(row, nodes, edges) + + return build_transition_graph_from_dict( + {'nodes': nodes, 'edges': edges} + ) + + +def graph_to_dict(graph: DiGraph) -> Dict: + """ + Graph to Dict. 
+ + Converts nodes and edges from the Transition Graph + with all your attributes in a dictionary. + + Parameters + ---------- + graph: DiGraph + Transition graph constructed from trajectory data. + + Return + ------ + dict + Dictionary with the attributes of nodes and edges. + + Example + ------- + >>> from pymove.utils.networkx import graph_to_dict + >>> + >>> graph = DiGraph() + >>> graph.add_node('85', coords=(-3.8347478, -38.592189), + datetime=['2017-09-02 22:00:27'], freq_source=1, freq_target=0) + >>> graph.add_node('673', coords=(-3.8235834, -38.590389), + datetime=['2017-09-02 22:01:36'], freq_source=0, freq_target=0) + >>> graph.add_node('394', coords=(-3.813889, -38.5904445), + datetime=['2017-09-02 22:03:08'], freq_source=0, freq_target=0) + >>> graph.add_node('263', coords=(-3.9067654, -38.5907723), + datetime=['2017-09-02 22:03:46'], freq_source=0, freq_target=0) + >>> graph.add_node('224', coords=(-3.8857223, -38.5928892), + datetime=['2017-09-02 22:07:19'], freq_source=0, freq_target=0) + >>> graph.add_node('623', coords=(-3.8828723, -38.5929789), + datetime=['2017-09-02 22:07:40'], freq_source=0, freq_target=1) + >>> graph.add_edge('85', '673', weight=1, mean_times='0 days 00:01:09') + >>> graph.add_edge('673', '394', weight=1, mean_times='0 days 00:01:09') + >>> graph.add_edge('394', '263', weight=1, mean_times='0 days 00:01:09') + >>> graph.add_edge('263', '224', weight=1, mean_times='0 days 00:01:09') + >>> graph.add_edge('224', '623', weight=1, mean_times='0 days 00:01:09') + >>> + >>> dict_graph = graph_to_dict(graph) + >>> dict_graph + {'nodes': { + 'coords': { '85': (-3.8347478, -38.5921890), '673': (-3.8235834, -38.5903890), + '394': (-3.8138890, -38.5904445), '263': (-3.9067654, -38.5907723), + '224': (-3.8857223, -38.5928892), '623': (-3.8828723, -38.5929789)}, + 'datetime': { '85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], 
'623': ['2017-09-02 22:07:40']}, + 'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}}, + 'edges': { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}} + """ + dict_graph: dict = {'nodes': {}, 'edges': {}} + + dict_graph['nodes']['coords'] = nx.get_node_attributes(graph, 'coords') + dict_graph['nodes']['datetime'] = nx.get_node_attributes(graph, 'datetime') + dict_graph['nodes']['freq_source'] = nx.get_node_attributes(graph, 'freq_source') + dict_graph['nodes']['freq_target'] = nx.get_node_attributes(graph, 'freq_target') + dict_graph['edges'] = nx.to_dict_of_dicts(graph) + + return dict_graph + + +def save_graph_as_json( + graph: DiGraph, + file_path: Path | str = 'graph.json' +): + """ + Save Graph as JSON. + + Saves the data extracted from the Transition Graph + into a JSON file. + + Parameters + ---------- + graph: DiGraph + Transition graph constructed from trajectory data. + file_path: str or path, optional + File name that will be saved with transition graph data, by default 'graph.json'. 
+ + Example + ------- + >>> from pymove.utils.networkx import save_graph_as_json + >>> + >>> graph.nodes + NodeView(('85', '673', '394', '263', '224', '623')) + >>> + >>> graph.edges + OutEdgeView([ + ('85', '673'), ('673', '394'), ('394', '263'), ('263', '224'), ('224', '623') + ]) + >>> + >>> graph.adj + AdjacencyView({ '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}) + >>> + >>> save_graph_as_json(graph, 'graph.json') + >>> + >>> with open('graph.json', 'r') as f: + >>> lines = f.readlines() + >>> print(lines) + ['{"nodes": { + "coords": { + "85": [-3.8347478, -38.592189], "673": [-3.8235834, -38.590389], + "394": [-3.813889, -38.5904445], "263": [-3.9067654, -38.5907723], + "224": [-3.8857223, -38.5928892], "623": [-3.8828723, -38.5929789]}, + "datetime": { + "85": ["2017-09-02 22:00:27"], "673": ["2017-09-02 22:01:36"], + "394": ["2017-09-02 22:03:08"], "263": ["2017-09-02 22:03:46"], + "224": ["2017-09-02 22:07:19"], "623": ["2017-09-02 22:07:40"]}, + "freq_source": { + "85": 1, "673": 0, "394": 0, "263": 0, "224": 0, "623": 0}, + "freq_target": { + "85": 0, "673": 0, "394": 0, "263": 0, "224": 0, "623": 1}}, + "edges": { + "85": {"673": {"weight": 1, "mean_times": "0 days 00:01:09"}}, + "673": {"394": {"weight": 1, "mean_times": "0 days 00:01:32"}}, + "394": {"263": {"weight": 1, "mean_times": "0 days 00:00:38"}}, + "263": {"224": {"weight": 1, "mean_times": "0 days 00:03:33"}}, + "224": {"623": {"weight": 1, "mean_times": "0 days 00:00:21"}}, + "623": {}}}'] + """ + dict_graph = graph_to_dict(graph) + + path = Path(file_path) + + if path.suffix != '.json': + raise ValueError( + f'Unsupported file extension {path.suffix},' + f'Expected extension = json' + ) + + with 
open(path, 'w') as f: + json.dump(dict_graph, f) + + +def read_graph_json(file_path: Path | str): + """ + Read Graph from JSON file. + + You load a Transition Graph from a file in JSON format. + + Parameters + ---------- + file_path: str or path + Name of the JSON file to be read + + Return + ------ + dict + Dictionary with the attributes of nodes and edges + + Example + ------- + >>> from pymove.utils.networkx import read_graph_json + >>> + >>> with open('graph.json', 'r') as f: + >>> lines = f.readlines() + >>> print(lines) + ['{"nodes": { + "coords": { + "85": [-3.8347478, -38.592189], "673": [-3.8235834, -38.590389], + "394": [-3.813889, -38.5904445], "263": [-3.9067654, -38.5907723], + "224": [-3.8857223, -38.5928892], "623": [-3.8828723, -38.5929789]}, + "datetime": { + "85": ["2017-09-02 22:00:27"], "673": ["2017-09-02 22:01:36"], + "394": ["2017-09-02 22:03:08"], "263": ["2017-09-02 22:03:46"], + "224": ["2017-09-02 22:07:19"], "623": ["2017-09-02 22:07:40"]}, + "freq_source": { + "85": 1, "673": 0, "394": 0, "263": 0, "224": 0, "623": 0}, + "freq_target": { + "85": 0, "673": 0, "394": 0, "263": 0, "224": 0, "623": 1}}, + "edges": { + "85": {"673": {"weight": 1, "mean_times": "0 days 00:01:09"}}, + "673": {"394": {"weight": 1, "mean_times": "0 days 00:01:32"}}, + "394": {"263": {"weight": 1, "mean_times": "0 days 00:00:38"}}, + "263": {"224": {"weight": 1, "mean_times": "0 days 00:03:33"}}, + "224": {"623": {"weight": 1, "mean_times": "0 days 00:00:21"}}, + "623": {}}}'] + >>> + >>> read_graph_json('graph.json') + {'nodes': { + 'coords': {'85': [-3.8347478, -38.592189], '673': [-3.8235834, -38.590389], + '394': [-3.813889, -38.5904445], '263': [-3.9067654, -38.5907723], + '224': [-3.8857223, -38.5928892], '623': [-3.8828723, -38.5929789]}, + 'datetime': {'85': ['2017-09-02 22:00:27'], '673': ['2017-09-02 22:01:36'], + '394': ['2017-09-02 22:03:08'], '263': ['2017-09-02 22:03:46'], + '224': ['2017-09-02 22:07:19'], '623': ['2017-09-02 22:07:40']}, + 
'freq_source': {'85': 1, '673': 0, '394': 0, '263': 0, '224': 0, '623': 0}, + 'freq_target': {'85': 0, '673': 0, '394': 0, '263': 0, '224': 0, '623': 1}}, + 'edges': { + '85': {'673': {'weight': 1, 'mean_times': '0 days 00:01:09'}}, + '673': {'394': {'weight': 1, 'mean_times': '0 days 00:01:32'}}, + '394': {'263': {'weight': 1, 'mean_times': '0 days 00:00:38'}}, + '263': {'224': {'weight': 1, 'mean_times': '0 days 00:03:33'}}, + '224': {'623': {'weight': 1, 'mean_times': '0 days 00:00:21'}}, + '623': {}}} + """ + path = Path(file_path) + + if path.suffix != '.json': + raise ValueError( + f'Unsupported file extension {path.suffix},' + f'Expected extension = json' + ) + + with open(file_path, 'r') as f: + dict_graph = json.load(f) + + return dict_graph diff --git a/pymove/utils/trajectories.py b/pymove/utils/trajectories.py index 5ccd621e..670ae13a 100644 --- a/pymove/utils/trajectories.py +++ b/pymove/utils/trajectories.py @@ -7,24 +7,36 @@ flatten_columns, shift, fill_list_with_new_values, +append_trajectory, +split_trajectory, object_for_array, column_to_array """ from __future__ import annotations +from ast import literal_eval from itertools import chain -from typing import Any +from typing import Any, Generator, Text import numpy as np +import pandas as pd +from networkx.classes.digraph import DiGraph from numpy import ndarray from pandas import DataFrame, Series from pandas import read_csv as _read_csv from pandas._typing import FilePathOrBuffer from pymove.core.dataframe import MoveDataFrame -from pymove.utils.constants import DATETIME, LATITUDE, LONGITUDE, TRAJ_ID, TYPE_PANDAS -from pymove.utils.math import is_number +from pymove.utils.constants import ( + DATETIME, + LATITUDE, + LOCAL_LABEL, + LONGITUDE, + TRAJ_ID, + TYPE_PANDAS, +) +from pymove.utils.networkx import graph_to_dict def read_csv( @@ -314,13 +326,152 @@ def fill_list_with_new_values(original_list: list, new_list_values: list): original_list[:n] = new_list_values +def append_trajectory( + data: 
DataFrame, + trajectory: list, + graph: DiGraph +): + """ + Inserts a trajectory in the data set. + + Inserts the trajectory retrieved from the + transition graph in the trajectory data set. + + Parameters + ---------- + data: DataFrame + Trajectory data in sequence format + trajectory: list + Trajectory recovered from the transition graph + graph: DiGraph + Transition graph constructed from trajectory data + + Example + ------- + >>> from pymove.utils.data_augmentation import append_trajectory + >>> + >>> traj_df.to_dict() + {'id': [[1, 1, 1], [2, 2, 2, 2]], + 'datetime': [['2017-09-02 22:00:27', '2017-09-02 22:01:36', + '2017-09-02 22:03:08'], + ['2017-09-02 23:03:46', '2017-09-02 23:07:19', + '2017-09-02 23:07:40', '2017-09-02 23:09:10']], + 'local_label': [[85, 673, 394], [263, 224, 623, 394]], + 'lat': [[-3.8347478, -3.8235834, -3.813889], + [-3.9067654, -3.8857223, -3.8828723, -3.9939834]], + 'lon': [[-38.592189, -38.590389, -38.5904445], + [-38.5907723, -38.5928892, -38.5929789, -38.70409]]} + >>> + >>> trajectory = [263, 224, 623] + >>> graph = build_transition_graph_from_df(traj_df) + >>> + >>> append_trajectory(traj_df, trajectory, graph) + >>> traj_df.iloc[-1] + id [3, 3, 3] + datetime [2017-09-02 23:03:46, 2017-09-02 23:07:19, 2017-09-02 23:07:40] + local_label [263, 224, 623] + lat [-3.9067654, -3.8857223, -3.8828723] + lon [-38.5907723, -38.5928892, -38.5929789] + Name: 2, dtype: object + + """ + source = str(trajectory[0]) + dict_graph = graph_to_dict(graph) + + dt = np.random.choice(dict_graph['nodes']['datetime'][source]) + datetimes = [pd.Timestamp(str(dt))] + + coords = dict_graph['nodes']['coords'] + lats, lons = [coords[source][0]], [coords[source][1]] + + for idx, edge in enumerate(zip(trajectory[:-1], trajectory[1:])): + u, v = str(edge[0]), str(edge[1]) + mean_times = dict_graph['edges'][u][v]['mean_times'] + + datetime = pd.Timestamp(str(datetimes[idx])) + pd.Timedelta(mean_times) + datetimes.append(datetime) + + lats.append(coords[v][0]) + 
lons.append(coords[v][1]) + + prev_id = data.loc[data.shape[0] - 1, TRAJ_ID][0] + ids = np.full(len(trajectory), prev_id + 1, dtype=np.int32).tolist() + + path = np.array(trajectory, dtype=np.float32).tolist() + + data.loc[data.shape[0], [ + DATETIME, TRAJ_ID, LOCAL_LABEL, LATITUDE, LONGITUDE + ]] = [datetimes, ids, path, lats, lons] + + +def split_trajectory( + row: Series, + size_window: int = 6, + size_jump: int = 3, + label_local: Text = LOCAL_LABEL, + columns: list = None +) -> Generator[Series, None, None]: + """ + It breaks the trajectory in stretches. + + Extracts all possible sub-trajectories, according to the specified + window size and jump. + + Parameters + ---------- + row: Series + Line of the trajectory dataframe + size_window: int, optional + Sliding window size, by default 6 + size_jump: int, optional + Size of the jump in the trajectory, by default 3 + label_local: str, optional + Name of the column referring to the trajectories, by default LOCAL_LABEL + columns: list, optional + Columns to which the split will be applied, by default None + + Return + ------ + Generator of Series + Series with the stretches recovered from the observed trajectory. 
+ + Example + ------- + >>> from pymove.utils.trajectories import split_trajectory + >>> + >>> trajectory + id [1, 1, 1, 1, 1, 1, 1, 1] + local_label [85, 673, 394, 85, 224, 623, 394, 263] + dtype: object + >>> + >>> split = split_trajectory(trajectory, size_jump=1) + >>> split + + id local_label + 0 [1, 1, 1, 1, 1, 1] [ 85, 673, 394, 85, 224, 623] + 1 [1, 1, 1, 1, 1, 1] [673, 394, 85, 224, 623, 394] + 2 [1, 1, 1, 1, 1, 1] [394, 85, 224, 623, 394, 263] + + """ + if columns is None: + columns = row.index + + size_t = len(row[label_local]) + + return pd.concat([ + pd.Series( + {col: row[col][i:i + size_window] for col in columns} + ) for i in range(0, size_t, size_jump) if (size_t - i) > size_window - 1 + ], axis=1).T + + def object_for_array(object_: str) -> ndarray: """ Transforms an object into an array. Parameters ---------- - object : str + object_ : str object representing a list of integers or strings Returns @@ -328,64 +479,61 @@ array object converted to a list - Examples - -------- + Example + ------- >>> from pymove.utils.trajectories import object_for_array - >>> list_str = '[1,2,3,4,5]' - >>> object_for_array(list_str) - array([1., 2., 3., 4., 5.], dtype=float32) + >>> + >>> object_1, object_2, object_3 + ('[1, 2, 3]', '[1.5, 2.5, 3.5]', "['event', 'event']") + >>> + >>> object_for_array(object_1) + [1, 2, 3] + >>> object_for_array(object_2) + [1.5, 2.5, 3.5] + >>> object_for_array(object_3) + ['event', 'event' ] """ if object_ is None: return object_ - conv = np.array([*map(str.strip, object_[1:-1].split(','))]) - - if is_number(conv[0]): - return conv.astype(np.float32) - else: - return conv.astype('object_') + return literal_eval('[' + object_ + ']')[0] -def column_to_array(data: DataFrame, column: str) -> DataFrame: +def columns_to_array( + traj_df: DataFrame, + columns: list | None = None +): """ Transforms all columns values to list.
Parameters ---------- - data : dataframe - The input trajectory data - - column : str - Label of data referring to the column for conversion - - Returns - ------- - dataframe - Dataframe with the selected column converted to list + traj_df : DataFrame + The input trajectory data. + columns : list, optional + List of the columns for conversion. Example ------- - >>> from pymove.utils.trajectories import column_to_array - >>> move_df - lat lon datetime id list_column - 0 39.984094 116.319236 2008-10-23 05:53:05 1 '[1,2]' - 1 39.984198 116.319322 2008-10-23 05:53:06 1 '[3,4]' - 2 39.984224 116.319402 2008-10-23 05:53:11 1 '[5,6]' - 3 39.984211 116.319389 2008-10-23 05:53:16 1 '[7,8]' - 4 39.984217 116.319422 2008-10-23 05:53:21 1 '[9,10]' - >>> column_to_array(move_df, column='list_column') - lat lon datetime id list_column - 0 39.984094 116.319236 2008-10-23 05:53:05 1 [1.0,2.0] - 1 39.984198 116.319322 2008-10-23 05:53:06 1 [3.0,4.0] - 2 39.984224 116.319402 2008-10-23 05:53:11 1 [5.0,6.0] - 3 39.984211 116.319389 2008-10-23 05:53:16 1 [7.0,8.0] - 4 39.984217 116.319422 2008-10-23 05:53:21 1 [9.0,10.0] + >>> from pymove.utils.trajectories import columns_to_array + >>> + >>> traj_df + ids descritions price + 0 '[1, 1, 1]' "['event', 'event', 'event']" '[10.5, 20.5, 13.5]' + 1 '[2, 2, 2]' "['bike', 'bike', 'bike']" '[50.2, 33.4, 90.0]' + 2 '[3, 3, 3, 3]' "['car', 'car', 'car', 'car']" '[1.0, 2.9, 3.4, 8.4]' + 3 '[4, 4]' "['house', 'house']" '[100.4, 150.5]' + >>> + >>> columns_to_array(traj_df) + >>> traj_df + ids descritions price + 0 [1, 1, 1] [event, event, event] [10.5, 20.5, 13.5] + 1 [2, 2, 2] [bike, bike, bike] [50.2, 33.4, 90.0] + 2 [3, 3, 3, 3] [car, car, car, car] [1.0, 2.9, 3.4, 8.4] + 3 [4, 4] [house, house] [100.4, 150.5] """ - data = data.copy() - if column not in data: - raise KeyError( - 'Dataframe must contain a %s column' % column - ) + if columns is None: + columns = list(traj_df.columns) - data[column] = data[column].apply(object_for_array) - 
return data + f = {col: object_for_array for col in columns} + traj_df[columns] = traj_df[columns].agg(f) diff --git a/requirements.txt b/requirements.txt index ebe72608..2757dcb4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ ipywidgets joblib matplotlib mplleaflet +networkx numpy pandas>=1.1.0 psutil diff --git a/setup.cfg b/setup.cfg index 648286f1..a9b5dbf3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ files = pymove multi_line_output = 3 include_trailing_comma = True line_length = 90 -known_third_party = IPython,branca,dask,dateutil,folium,geohash2,holidays,ipywidgets,joblib,matplotlib,numpy,pandas,psutil,scipy,setuptools,shapely,sklearn,tqdm +known_third_party = IPython,branca,dask,dateutil,folium,geohash2,holidays,ipywidgets,joblib,matplotlib,networkx,numpy,pandas,psutil,scipy,setuptools,shapely,sklearn,tqdm [tool:pytest] addopts =