In [1]:
import pandas as pd
import numpy as np
from utils import trajutils

# TRAJUTILS

In [2]:
filename = '/home/nicksson/Git/2_Doutorado/PyRoad/data.csv'

dict_type = {'lat' : np.float32,
             'lon' : np.float32
            }

usecols=["id", "datetime", "lat", "lon"]
df = pd.read_csv(filename, dtype=dict_type, parse_dates=['datetime'], usecols=usecols)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2737386 entries, 0 to 2737385
Data columns (total 4 columns):
id          object
datetime    datetime64[ns]
lat         float32
lon         float32
dtypes: datetime64[ns](1), float32(2), object(1)
memory usage: 62.7+ MB


## 1 GENERAL FUCTIONS

####  show trajectories

In [4]:
trajutils.show_trajectories_info(df)



Number of Rows: 2737386
Number of Ids: 290 
Trajectories between 2019-02-01 00:00:02 and 2019-02-28 23:59:53
Bounding Box: (-25.53447, -49.34629, -3.42381, -37.97867)




#### format labels

In [5]:
dic = trajutils.format_labels(df, current_id='id', current_lat='lat', current_lon='lon', current_datetime='datetime')

#### bbox fuctions

In [6]:
trajutils.get_bbox(df)

(-25.53447, -49.34629, -3.42381, -37.97867)

In [7]:
trajutils.bbox_split(trajutils.get_bbox(df), 2)

const_lat: 11.055330276489258
const_lon: 5.683811187744141


Unnamed: 0,lat_min,lon_min,lat_max,lon_max
0,-25.53447,-49.346291,-3.42381,-43.662479
1,-25.53447,-43.662479,-3.42381,-37.978668


#### filter

In [8]:
trajutils.filter_by_datetime(df, startDatetime='2019-02-01 00:33:26', endDatetime='2019-02-25 00:36:26', filter_out=False).head()

Unnamed: 0,id,datetime,lat,lon
0,M41859,2019-02-02 00:31:26,-3.69871,-38.586281
1,M41859,2019-02-02 00:32:26,-3.69871,-38.586281
2,M41859,2019-02-02 00:33:26,-3.69871,-38.586281
3,M41859,2019-02-02 00:34:26,-3.69871,-38.586281
4,M41859,2019-02-02 00:35:26,-3.69871,-38.586288


In [9]:
trajutils.filter_by_id(df, 'M41859', filter_out=False).head()

Unnamed: 0,id,datetime,lat,lon
0,M41859,2019-02-02 00:31:26,-3.69871,-38.586281
1,M41859,2019-02-02 00:32:26,-3.69871,-38.586281
2,M41859,2019-02-02 00:33:26,-3.69871,-38.586281
3,M41859,2019-02-02 00:34:26,-3.69871,-38.586281
4,M41859,2019-02-02 00:35:26,-3.69871,-38.586288


# 2. FUCTIONS TO LAT AND LONG COORDINATES 

In [10]:
trajutils.lon2XSpherical(-38.501597)

-4285978.172767829

In [11]:
trajutils.lat2YSpherical(-3.797864)

-423086.2213610324

In [12]:
trajutils.x2LonSpherical(-4285978.17)

-38.50159697513617

In [13]:
trajutils.y2LatSpherical(-423086.2213610324)

-3.7978639999999944

In [14]:
trajutils.show_trajectories_info(df)



Number of Rows: 2737386
Number of Ids: 290 
Trajectories between 2019-02-01 00:00:02 and 2019-02-28 23:59:53
Bounding Box: (-25.53447, -49.34629, -3.42381, -37.97867)




# 3. CREATING FEATURES BASED ON DATETIME

In [15]:
%%time
trajutils.create_update_tid_based_on_id_datatime(df)


Creating or updating tid feature...


...tid feature was created...

CPU times: user 3.67 s, sys: 145 ms, total: 3.82 s
Wall time: 3.32 s


In [16]:
%%time
trajutils.create_update_day_of_the_week_features(df)


Creating or updating day of the week feature...

...the day of the week feature was created...

CPU times: user 662 ms, sys: 43.6 ms, total: 705 ms
Wall time: 701 ms


In [17]:
%%time
trajutils.create_update_time_of_day_features(df)


Creating or updating period feature
...early morning from 0H to 6H
...morning from 6H to 12H
...afternoon from 12H to 18H
...evening from 18H to 24H

...the period of day feature was created
CPU times: user 3.04 s, sys: 276 ms, total: 3.32 s
Wall time: 973 ms


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2737386 entries, 0 to 2737385
Data columns (total 7 columns):
id          object
datetime    datetime64[ns]
lat         float32
lon         float32
tid         object
day         object
period      object
dtypes: datetime64[ns](1), float32(2), object(4)
memory usage: 125.3+ MB


# 4. CREATING FEATURES BASED ON LAT AND LONG COORDINATES

In [19]:
%%time
trajutils.create_update_dist_features(df,label_dtype=np.float32, sort=True)


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(1439/2737386) 0% in 00:00:00.213 - estimated end in 00:06:45.788
(572347/2737386) 20% in 00:00:00.370 - estimated end in 00:00:01.402
(1105939/2737386) 40% in 00:00:00.457 - estimated end in 00:00:00.675
(1643031/2737386) 60% in 00:00:00.546 - estimated end in 00:00:00.363
(2193264/2737386) 80% in 00:00:00.634 - estimated end in 00:00:00.157
(2737386/2737386) 100% in 00:00:00.727 - estimated end in 00:00:00.000
...Reset index

CPU times: user 4.27 s, sys: 301 ms, total: 4.57 s
Wall time: 2.86 s


In [20]:
%%time
trajutils.create_update_dist_time_speed_features(df, label_dtype=np.float64)


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(1439/2737386) 0% in 00:00:00.255 - estimated end in 00:08:05.510
(572347/2737386) 20% in 00:00:00.355 - estimated end in 00:00:01.346
(1105939/2737386) 40% in 00:00:00.445 - estimated end in 00:00:00.657
(1643031/2737386) 60% in 00:00:00.560 - estimated end in 00:00:00.373
(2193264/2737386) 80% in 00:00:00.672 - estimated end in 00:00:00.166
(2737386/2737386) 100% in 00:00:00.776 - estimated end in 00:00:00.000
...Reset index


Creating or updating time and speed features in meters by seconds

...Set id as index to a higher peformance

(1439/2737386) 0% in 00:00:00.261 - estimated end in 00:08:17.400
(572347/2737386) 20% in 00:00:00.285 - estimated end in 00:00:01.078
(1105939/2737386) 40% in 00:00:00.306 - estimated end in 00:00:00.451
(1643031/2737386) 60% in 00:00:00.333 - estimated end in 00:00:00.222
(2193264/2737386)

In [21]:
%%time
trajutils.create_update_move_and_stop_by_radius(df, radius=0, target_label='speed_to_prev')


Creating or updating features MOVE and STOPS...


....There are 1535040 stops to this parameters

CPU times: user 3.53 s, sys: 788 ms, total: 4.31 s
Wall time: 2.34 s


# 5. DATA CLEANING BY ID

In [22]:
%%time
trajutils.clean_duplicates(df,['id', 'datetime'], inplace=True)


Remove rows duplicates by subset
...There are no GPS points duplicated
CPU times: user 518 ms, sys: 106 µs, total: 518 ms
Wall time: 515 ms


In [23]:
%%time
trajutils.clean_NaN_values(df, inplace=False)

CPU times: user 2.85 s, sys: 164 ms, total: 3.01 s
Wall time: 1.74 s


In [24]:
%%time
trajutils.clean_gps_nearby_points(df)


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(1439/2737386) 0% in 00:00:00.220 - estimated end in 00:07:00.084
(572347/2737386) 20% in 00:00:00.311 - estimated end in 00:00:01.180
(1105939/2737386) 40% in 00:00:00.394 - estimated end in 00:00:00.581
(1643031/2737386) 60% in 00:00:00.487 - estimated end in 00:00:00.324
(2193264/2737386) 80% in 00:00:00.577 - estimated end in 00:00:00.143
(2737386/2737386) 100% in 00:00:00.667 - estimated end in 00:00:00.000
...Reset index


Cleaning gps points from radius of 10.0 meters

...There are 2459010 gps points to drop

...Dropping 2459010 gps points

...Rows before: 2737386, Rows after:278376


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(237/278376) 0% in 00:00:00.023 - estimated end in 00:00:27.769
(56171/2

(214/250229) 0% in 00:00:00.352 - estimated end in 00:06:51.588
(50888/250229) 20% in 00:00:00.378 - estimated end in 00:00:01.483
(100493/250229) 40% in 00:00:00.405 - estimated end in 00:00:00.603
(151234/250229) 60% in 00:00:00.432 - estimated end in 00:00:00.282
(200823/250229) 80% in 00:00:00.457 - estimated end in 00:00:00.112
(250229/250229) 100% in 00:00:00.484 - estimated end in 00:00:00.000
...Reset index


Cleaning gps points from radius of 10.0 meters

...There are 1 gps points to drop

...Dropping 1 gps points

...Rows before: 250229, Rows after:250228


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(214/250228) 0% in 00:00:00.022 - estimated end in 00:00:26.589
(50888/250228) 20% in 00:00:00.049 - estimated end in 00:00:00.195
(100492/250228) 40% in 00:00:00.076 - estimated end in 00:00:00.113
(151233/250228) 60% in 00:00:00.103 - estimated end in 00:00:00

In [25]:
%%time
trajutils.clean_gps_jumps_by_distance(df, jump_coefficient=3.0, threshold=1, label_dtype=np.float64)


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(214/250225) 0% in 00:00:00.023 - estimated end in 00:00:28.037
(50888/250225) 20% in 00:00:00.052 - estimated end in 00:00:00.206
(100489/250225) 40% in 00:00:00.080 - estimated end in 00:00:00.119
(151230/250225) 60% in 00:00:00.107 - estimated end in 00:00:00.070
(200819/250225) 80% in 00:00:00.133 - estimated end in 00:00:00.032
(250225/250225) 100% in 00:00:00.160 - estimated end in 00:00:00.000
...Reset index


Cleaning gps jumps by distance to jump_coefficient 3.0...

...There are 13851 gps points to drop 

...Dropping 13851 rows of gps points

...Rows before: 250225, Rows after:236374


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(198/236374) 0% in 00:00:00.023 - estimated end in 00:00:28.133
(4775

In [26]:
%%time
trajutils.clean_gps_speed_max_radius(df,  speed_max=50.0, label_dtype=np.float64)


Creating or updating distance features in meters...

...Set id as index to increase attribution performance

...Sorting by id and datetime to increase performance

(197/236088) 0% in 00:00:00.023 - estimated end in 00:00:27.728
(47691/236088) 20% in 00:00:00.049 - estimated end in 00:00:00.196
(95206/236088) 40% in 00:00:00.077 - estimated end in 00:00:00.115
(142537/236088) 60% in 00:00:00.113 - estimated end in 00:00:00.074
(189119/236088) 80% in 00:00:00.150 - estimated end in 00:00:00.037
(236088/236088) 100% in 00:00:00.177 - estimated end in 00:00:00.000
...Reset index


Creating or updating time and speed features in meters by seconds

...Set id as index to a higher peformance

(197/236088) 0% in 00:00:00.019 - estimated end in 00:00:23.369
(47691/236088) 20% in 00:00:00.036 - estimated end in 00:00:00.145
(95206/236088) 40% in 00:00:00.057 - estimated end in 00:00:00.084
(142537/236088) 60% in 00:00:00.073 - estimated end in 00:00:00.048
(189119/236088) 80% in 00:00:00.089 - e

# 6. DATA CLEANING BY TID

In [27]:
%%time
trajutils.clean_trajectories_with_few_points(df, label_id='tid', min_points_per_trajectory=500, label_dtype=np.float64)


Creating or updating distance features in meters...

...Set tid as index to increase attribution performance

...Sorting by tid and datetime to increase performance

(197/236080) 0% in 00:00:00.056 - estimated end in 00:01:07.210
...id:M118902019-02-13, must have at least 2 GPS points

...id:M129682019-02-24, must have at least 2 GPS points

(47412/236080) 20% in 00:00:00.185 - estimated end in 00:00:00.736
(94500/236080) 40% in 00:00:00.329 - estimated end in 00:00:00.493
...id:M516432019-02-25, must have at least 2 GPS points

(141776/236080) 60% in 00:00:00.429 - estimated end in 00:00:00.285
...id:M647572019-02-13, must have at least 2 GPS points

...id:M688302019-02-07, must have at least 2 GPS points

...id:M703642019-02-07, must have at least 2 GPS points

...id:M703642019-02-13, must have at least 2 GPS points

...id:M774782019-02-14, must have at least 2 GPS points

(188993/236080) 80% in 00:00:00.560 - estimated end in 00:00:00.139
...Reset index


Creating or updating time 

In [28]:
%%time 
trajutils.clean_short_and_few_points_trajectories(df, min_trajectory_distance=100, min_points_per_trajectory=2)


Remove short trajectories...

Creating or updating distance features in meters...

...Set tid as index to increase attribution performance

...Sorting by tid and datetime to increase performance

(3633/3633) 100% in 00:00:00.212 - estimated end in 00:00:00.000
...Reset index


Creating or updating time and speed features in meters by seconds

...Set tid as index to a higher peformance

(3633/3633) 100% in 00:00:00.002 - estimated end in 00:00:00.000
...Reset index...


...there are 0 ids with few points

Creating or updating distance features in meters...

...Set tid as index to increase attribution performance

...Sorting by tid and datetime to increase performance

(3633/3633) 100% in 00:00:00.003 - estimated end in 00:00:00.000
...Reset index


Creating or updating time and speed features in meters by seconds

...Set tid as index to a higher peformance

(3633/3633) 100% in 00:00:00.002 - estimated end in 00:00:00.000
...Reset index...


...Dropping unnecessary trajectories...

...s

# 7. SPLIT TRAJECTORIES

In [29]:
%%time
trajutils.split_trajectories(df, label_id='tid', max_dist_between_adj_points=1000,
    max_time_between_adj_points=120000, label_new_id='tid_part')


Split trajectories
...max_time_between_adj_points: 120000
...max_dist_between_adj_points: 1000
...max_speed: 25
...setting tid as index
(3633/3633) 100% in 00:00:00.002 - estimated end in 00:00:00.000
no trajs with only one point - nothing to change: (3633, 16)
CPU times: user 13.2 ms, sys: 77 µs, total: 13.2 ms
Wall time: 11.8 ms


  (df_.at[idx, dic_features_label['dist_to_prev']] > max_dist_between_adj_points) | \
  (df_.at[idx, dic_features_label['speed_to_prev']] > max_speed)


# 8. TRANFORMATION

### Distance

In [30]:
trajutils.transform_dist_from_meters_to_kilometers(df, label_distance='dist_to_prev')

In [31]:
trajutils.transform_dist_from_to_kilometers_to_meters(df, label_distance='dist_to_prev', new_label='dist_m')

### Time

In [32]:
trajutils.transform_time_from_seconds_to_minutes(df, label_time='time_to_prev')

In [33]:
trajutils.transform_time_from_minute_to_hours(df, label_time='time_to_prev')

In [34]:
trajutils.transform_time_from_hours_to_minute(df, label_time='time_to_prev')

In [35]:
trajutils.transform_time_from_minute_to_seconds(df, label_time='time_to_prev')

In [36]:
trajutils.transform_time_from_seconds_to_hours(df, label_time='time_to_prev')

In [37]:
trajutils.transform_time_from_hours_to_seconds(df, label_time='time_to_prev')

In [38]:
trajutils.transform_time_from_seconds_to_minutes(df, label_time='time_to_prev', new_label='time_m')

### Speed

In [39]:
trajutils.transform_speed_from_ms_to_kmh(df, label_speed='speed_to_prev')

In [40]:
trajutils.transform_speed_from_ms_to_kmh(df, label_speed='speed_to_prev', new_label='speed_ms')