In [1]:
# imports
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import os
import folium
import random
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
from shapely import wkt


# from inrix_data_science_utils.api.trajectories import TrajectoryAPI
# set cwd to src
os.chdir('../src')
from trajectories import TrajectoryAPI
from utils import linestring_to_geojson

## Use TrajectoryAPI from DataScience utils to query trajectories

In [2]:
# traj_api = TrajectoryAPI(
#     region_name = 'us-west-2',
#     profile = 'analytics',
#     table_name = 'trajectories.trajectories_restricted',
#     # table_name = 'tel_tapp.trajectories_restricted',
#     s3_staging_dir = 's3://aws-athena-query-results-861914951438-us-west-2/data-science/',
# )

In [3]:
# qk_list = ['023013202100232', '023013202100233', '023013202102010', '023013202102011']

# query = traj_api.create_partitions(
#     mapversion='20220601',
#     region='na',
#     years=['2023'],
#     months=['01'],
#     days=[str.zfill(str(i), 2) for i in range(1, 3)],
#     providers=['292', '362', '407', '460', '461'],
#     qks=list(set([qk[:8] for qk in qk_list])),  # Irvine Spectrum Center
#     auto_run=True
# )
# print(query)

In [4]:
# osm_segs = []
# start_time_utc = '2023-01-01 00:00:00'
# end_time_utc = '2023-01-02 00:00:00'
# map_version = '20240601'
# region = 'na'

# providers = ['292', '362', '407', '460', '461']
# qk_list = ['023013202100232', '023013202100233', '023013202102010', '023013202102011']
# qks = list(set([qk[:8] for qk in qk_list]))
# print(qks)

# columns = {'trip': [
#                 'trip_id',
#                 'device_id'
#             ],
#            'traj': [
#                 'traj_idx',
#                 'raw_points',
#                 'traj_raw_distance_m',
#                 'traj_raw_duration_millis'
#             ],
#            'seg': [
#                'point_id',
#                'point_idx',
#                'utc_ts',
#                'speed_kph',
#                'raw_speed_kph',
#            ]}


In [5]:

# trajectories = traj_api.trajectories_on_segments(
#     osm_segs,
#     start_time_utc,
#     end_time_utc,
#     map_version,
#     region,
#     providers=providers,
#     qks=qks,
#     columns=columns
# )

# trajectories.head()

## Manually downloaded from Amazon Athena

In [6]:
trajs = pd.read_csv('../data/trajectories_2023-01-01.csv')
trajs['timestamp'] = pd.to_datetime(trajs['utc_ts'], unit='ms')
trajs = trajs.drop(columns=['utc_ts'])
print(trajs.shape)
trajs['geometry'] = trajs.apply(lambda row: Point(row['raw_lon'], row['raw_lat']), axis=1)
trajs = gpd.GeoDataFrame(trajs, geometry='geometry')
print(trajs.shape)
display(trajs.head(1))

parking_df = pd.read_csv('../data/parking_Irvine_2023-01-01_2023-01-31.csv')
parking_lots = parking_df.groupby('pk_lot').first()
parking_lots['geometry'] = parking_lots['geometry'].apply(lambda x: wkt.loads(x))
parking_lots = gpd.GeoDataFrame(parking_lots, geometry='geometry')
# filter out airport
westmost = parking_lots['geometry'].bounds['minx'].min()
eastmost = parking_lots['geometry'].bounds['maxx'].max()
midpoint = (westmost + eastmost) / 2
is_east = parking_lots['geometry'].centroid.x > midpoint
parking_lots['is_east'] = is_east
parking_lots = parking_lots[parking_lots['is_east']]
parking_lots = parking_lots.drop(columns=['is_east'])
print(parking_lots.shape)
display(parking_lots.head(1))

# filter trajectories to only those that end in parking lots
trajs = gpd.sjoin(trajs, parking_lots, predicate='within', how='inner')
print(trajs.shape)

(337651, 11)
(337651, 12)


Unnamed: 0,trip_id,device_id,end_qk,provider_id,traj_idx,traj_raw_distance_m,traj_raw_duration_millis,point_idx,raw_lat,raw_lon,timestamp,geometry
0,348e92995ff1201d0dec41afa1b9ad15,f23592f65cbab194f71217de10d34411,23013202100232320,80,0,3985.748866,602000,0,33.62875,-117.71876,2023-01-01 01:49:15,POINT (-117.71876 33.62875)


(6, 9)


Unnamed: 0_level_0,dt_start_date,dt_end_date,i_avail,i_occ,f_pct_occ,pk_lot_alias,f_occupancy_rank,location,geometry
pk_lot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
329825,2023-01-31 07:20:01.586735+00:00,2023-01-31 11:20:01.586735+00:00,183,,24.0,fe03f4c4-ddc5-4cdc-b4cc-6171ea5584c5,3.0,POINT(-117.742706 33.6479646),"MULTIPOLYGON (((-117.74335 33.64835, -117.7420..."


(18331, 21)


In [7]:
def plot_parking_lots(parking_lots):
    m = folium.Map(location=[33.6846, -117.8265], zoom_start=14)
    for idx, row in parking_lots.iterrows():
        folium.GeoJson(row['geometry']).add_to(m)
    return m

In [8]:
# print the unique number of trip_ids and device_ids
for att in ['trip_id', 'device_id', 'end_qk', 'traj_idx', 'point_idx']:
    print(f'Number of unique {att}: {trajs[att].nunique()}')

Number of unique trip_id: 926
Number of unique device_id: 919
Number of unique end_qk: 32
Number of unique traj_idx: 9
Number of unique point_idx: 1491


In [9]:
agg_dict = {
    'raw_lat': list,
    'raw_lon': list,
    'timestamp': list,
    'point_idx': 'max',
    'provider_id': 'first',
    'traj_raw_distance_m': 'first',
    'traj_raw_duration_millis': 'first',
}
trips_df = trajs.groupby(['trip_id', 'traj_idx']).agg(agg_dict).reset_index()
# filter out trips with only one point
trips_df = trips_df[trips_df['raw_lat'].apply(lambda x: len(x)) > 1]
trips_df['geometry'] = trips_df.apply(lambda row: LineString([Point(lon, lat) for lon, lat in zip(row['raw_lon'], row['raw_lat'])]), axis=1)
trips_gpd = gpd.GeoDataFrame(trips_df, geometry='geometry')

trips_gpd['end_time'] = trips_gpd['timestamp'].apply(lambda lst: max([pd.to_datetime(x) for x in lst]))
trips_gpd['start_time'] = trips_gpd['timestamp'].apply(lambda lst: min([pd.to_datetime(x) for x in lst]))
trips_gpd['hunting_time'] = trips_gpd['end_time'] - trips_gpd['start_time']
trips_gpd['hunting_time'] = trips_gpd['hunting_time'].apply(lambda x: x.total_seconds())

print(trips_gpd.shape)
display(trips_gpd.head(1))

(626, 13)


Unnamed: 0,trip_id,traj_idx,raw_lat,raw_lon,timestamp,point_idx,provider_id,traj_raw_distance_m,traj_raw_duration_millis,geometry,end_time,start_time,hunting_time
1,004c864d2c1480d7a1ffaa5ccfade70c,0,"[33.651582, 33.651582, 33.651582, 33.651582, 3...","[-117.743051, -117.743051, -117.743051, -117.7...","[2023-01-01 22:34:25, 2023-01-01 22:34:28, 202...",312,470,7277.93716,993000,"LINESTRING (-117.74305 33.65158, -117.74305 33...",2023-01-01 22:40:09,2023-01-01 22:34:25,344.0


In [10]:
# map the points from a single trip
trip_id = random.choice(trips_gpd['trip_id'].unique())
df = trips_gpd[trips_gpd['trip_id'] == trip_id].copy()

hunting_time = df['hunting_time'].iloc[0]
print(f'Hunting time: {hunting_time}')


m = plot_parking_lots(parking_lots)
# m = folium.Map(location=[33.6489, -117.7479], zoom_start=12)
N = 1  # every Nth point
for i in range(0, len(df), N):
    row = df.iloc[i]
    t = f"trip_id: {row['trip_id']}\n traj_idx: {row['traj_idx']}"
    gjson_str = linestring_to_geojson(row['geometry'], color='black')
    # gjson = ... # make it a geojson object with the color black
    folium.GeoJson(gjson_str).add_to(m)
m.fit_bounds(m.get_bounds())
display(m)

Hunting time: 53.0


In [11]:
# # map the points from a single trip
# trip_id = random.choice(trajs['trip_id'].unique())
# df = trajs[trajs['trip_id'] == trip_id].copy()
# traj_id = random.choice(df['traj_idx'].unique())
# df = df[df['traj_idx'] == traj_id]

# start_time = df['timestamp'].min()
# end_time = df['timestamp'].max()
# hunting_time = end_time - start_time
# print(f'Hunting time: {hunting_time}')


# m = plot_parking_lots(parking_lots)
# # m = folium.Map(location=[33.6489, -117.7479], zoom_start=12)
# N = 1  # every Nth point
# for i in range(0, len(df), N):
#     t = f"trip_id: {df.iloc[i]['trip_id']}\n device_id: {df.iloc[i]['device_id']}\n traj_idx: {df.iloc[i]['traj_idx']}\n point_idx: {df.iloc[i]['point_idx']}"
#     folium.Circle([df.iloc[i]['raw_lat'], df.iloc[i]['raw_lon']], tooltip=t, radius=3, color='black').add_to(m)
# m.fit_bounds(m.get_bounds())
# display(m)

In [12]:
print(f"Good trip_id: {trip_id}, traj_idx: {traj_id}")

NameError: name 'traj_id' is not defined

In [None]:
print(f"Good trip_id: {trip_id}, traj_idx: {traj_id}")

In [None]:
print(f"Good trip_id: {trip_id}, traj_idx: {traj_id}, hunting time: {hunting_time}")