In [None]:
import os
import ijson
import pandas as pd

In [None]:
def get_traj_stats(input_filename):
    '''
    This function reads the input file and calculates the travel distance, travel time and travel speed for each westbound trajectory from INCEPTION dataset.

    The file is expected to hold one top-level JSON array of trajectory documents; it is parsed
    element by element with ijson (for memory conservation). Only documents with
    direction == -1 (Westbound) and length > 0 contribute a row.

    :param input_filename: the input file name
    :type input_filename: str
    :return: the dataframe containing the trajectory statistics with columns traj_id, travel_distance, travel_time, travel_speed;
             travel_speed is NaN for a trajectory whose first and last timestamps are equal
    :rtype: pandas.DataFrame

    example: get_traj_stats('/.../I24M_INCEPTION_v1/11-22-2022/637c399add50d54aa5af0cf4__post2.json')
    '''
    # Store stats in a list of tuples first.
    traj_stats_list = []
    # Open in binary mode: ijson parses bytes, and text-mode file objects are deprecated by ijson.
    with open(input_filename, 'rb') as input_file:
        # Read the input file by parsing the json array items iteratively (for memory conservation).
        for doc in ijson.items(input_file, 'item'):
            # Keep only the trajectories with direction = -1 (Westbound) and length > 0.
            # `and` short-circuits, so length is only parsed for westbound documents, and
            # float() (rather than int()) keeps lengths between 0 and 1 from truncating to 0.
            if not (int(doc['direction']) == -1 and float(doc['length']) > 0):
                continue
            # Westbound x decreases along the trip, hence start minus end for the distance.
            travel_distance = float(doc['starting_x']) - float(doc['ending_x'])
            travel_time = float(doc['last_timestamp']) - float(doc['first_timestamp'])
            # A zero-duration trajectory has no defined speed; record NaN instead of
            # letting one such document abort the whole file with ZeroDivisionError.
            if travel_time != 0:
                travel_speed = travel_distance / travel_time
            else:
                travel_speed = float('nan')
            traj_stats_list.append((doc['_id']['$oid'], travel_distance, travel_time, travel_speed))
    # Turn the list of tuples into a DataFrame
    traj_stats = pd.DataFrame(traj_stats_list, columns=['traj_id', 'travel_distance', 'travel_time', 'travel_speed'])
    return traj_stats

In [None]:
# Location of the INCEPTION file to analyse. Fill in the three parts below before running
# the following cells; while they are left empty the joined path is the empty string.
inception_root = ''    # the root directory of the INCEPTION dataset
date_dir = ''    # the date directory, like '11-22-2022'
file_name = ''    # the file name, like '637c399add50d54aa5af0cf4__post2.json' for '11-22-2022'
# Full path handed to get_traj_stats: <inception_root>/<date_dir>/<file_name>
input_filename = os.path.join(inception_root, date_dir, file_name)

In [None]:
# Compute the per-trajectory statistics for the selected file, then show the
# resulting DataFrame as the cell output.
traj_stats = get_traj_stats(input_filename)
traj_stats

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Three stacked histograms of the trajectory statistics: distance, duration and speed.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 8))

# Keep only the values below each column's 99.5th percentile (presumably so that a few
# extreme trajectories do not stretch the x-axis). Both the histogram and the mean line
# of a panel are drawn from the same trimmed sample.
distances = traj_stats.loc[traj_stats['travel_distance'] < traj_stats['travel_distance'].quantile(.995), 'travel_distance']
times = traj_stats.loc[traj_stats['travel_time'] < traj_stats['travel_time'].quantile(.995), 'travel_time']
speeds = traj_stats.loc[traj_stats['travel_speed'] < traj_stats['travel_speed'].quantile(.995), 'travel_speed']

# (axes, trimmed values, bar color, x-axis label) for each panel
panels = (
    (ax1, distances, 'c', 'trajectory distance (ft)'),
    (ax2, times, 'm', 'trajectory duration (s)'),
    (ax3, speeds, 'y', 'trajectory speed (ft/s)'),
)
for ax, values, color, xlabel in panels:
    ax.hist(values, bins=40, color=color, edgecolor='k', alpha=0.7)
    # Dashed vertical line marks the mean of the plotted (trimmed) values.
    ax.axvline(values.mean(), color='k', linestyle='dashed', linewidth=2)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('number of trajectories')

plt.tight_layout()
plt.show()

In [None]:
# Save the DataFrame of trajectory stats to a csv file as needed.
# index=False leaves the DataFrame's row index out of the file; 'temp.csv' is a
# placeholder relative path and should be replaced with the desired output file.
traj_stats.to_csv('temp.csv', index=False)    # Fill in the filename.