In [1]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import os
import json
import warnings
warnings.filterwarnings('ignore')

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Directory containing the event CSV files that are read further below.
output_path = "./../output/population/"

# Loading events into separate tables
* agents: people
* vehicles:
    * cars
    * subway
    * buses
    * trams
    * trains

In [2]:
# Explicit dtypes for the event CSV columns. Identifier columns are read as
# strings so that ids are never reinterpreted as numbers.
dtypes = {
    "time": np.float64,
    "type": str,
    "driverId": str,
    "vehicleId": str,
    "transitLineId": str,
    "transitRouteId": str,
    "departureId": str,
    "person": str,
    "link": str,
    "legMode": 'category',
    "vehicle": str,
    "networkMode": str,  # original note: candidate for 'category'
    "relativePosition": np.float64,
    "facility": str,
    "delay": np.float64,
    "x": np.float64,
    "y": np.float64,
    "actType": str,
}

# Reading all events at once, will be an issue for 100k+ population.
# The directory listing is sorted so the row order of `events` is reproducible
# (os.listdir returns entries in arbitrary order).
event_frames = [
    pd.read_csv(os.path.join(output_path, csv_name), dtype=dtypes)
    for csv_name in sorted(os.listdir(output_path))
]

# Concatenate once instead of appending inside the loop: DataFrame.append was
# removed in pandas 2.0 and copied the whole accumulated table on every call.
events = (
    pd.concat(event_frames, ignore_index=True) if event_frames else pd.DataFrame()
)

# Drop the per-file frames so only the combined table stays in memory.
del event_frames

In [3]:
# Print the column names and dtypes of the combined events table.
events.info()
#events.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10639855 entries, 0 to 10639854
Data columns (total 25 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Unnamed: 0                int64  
 1   time                      float64
 2   type                      object 
 3   driverId                  object 
 4   vehicleId                 object 
 5   transitLineId             object 
 6   transitRouteId            object 
 7   departureId               object 
 8   person                    object 
 9   link                      object 
 10  legMode                   object 
 11  vehicle                   object 
 12  networkMode               object 
 13  relativePosition          float64
 14  facility                  object 
 15  delay                     float64
 16  x                         float64
 17  y                         float64
 18  actType                   object 
 19  computationalRoutingMode  object 
 20  distance              

In [None]:
# Number of event rows per legMode value.
events.legMode.value_counts()
#events.facility.value_counts()

In [None]:
# Positions of the rows whose person id is the string "8".
np.where(events.person == "8")

In [None]:
# Events of a single person. The `person` column is read as strings (see the
# dtypes passed to read_csv), so the id must be compared as a string; comparing
# against the float 1.0 never matches and always yields an empty selection.
person = events.loc[events.person == "1"]
#display(person.head(10))
#person.iloc[np.where(person.legMode == "walk")]

## Extracting people and saving them to separate files

In [6]:
# Keep only the events whose `person` id parses as a number; non-numeric ids
# are dropped (per the original note, this removes drivers).
has_numeric_person = pd.to_numeric(events['person'], errors='coerce').notna()
agents = events[has_numeric_person]

# Distinct vehicle ids referenced by agent events.
agents.vehicle.unique()

array([nan, '2451', '1083', ..., 'veh_2194_bus', 'veh_12613_bus',
       'veh_23236_tram'], dtype=object)

In [13]:
def save_agents(persons, json_file, chunk_size=-1):
    """Write the events of every person to JSON Lines files (one person per line).

    Each output line has the shape::

        {"id": <person id>,
         "events": [{"event_id": ..., "time": ..., "type": ..., "link": ...,
                     "vehicle_id": ..., "delay": ..., "actType": ...,
                     "legMode": ..., "coords_x": ..., "coords_y": ...},
                    ...]}

    The events of one person are ordered by ``time``. ``event_id`` is the value
    of the first column of the person's frame.

    Parameters
    ----------
    persons : iterable of (person_id, DataFrame)
        Pairs as produced by ``DataFrame.groupby("person")``. Each frame must
        contain the columns time, type, link, vehicle, delay, actType,
        legMode, x and y.
    json_file : str
        Output directory (despite the name). Created if it does not exist.
    chunk_size : int, default -1
        ``-1`` writes everything to ``<json_file>/agent.json``. A positive
        value writes ``<json_file>/agent/<i>.json`` files holding at most
        ``chunk_size`` people each.

    Raises
    ------
    ValueError
        If ``chunk_size`` is neither ``-1`` nor a positive integer.
    """
    if chunk_size != -1 and chunk_size < 1:
        raise ValueError("chunk_size must be -1 (single file) or a positive integer")

    def write_records(records, path):
        # Create the target directory on demand so the final (or only) write
        # does not fail when no earlier chunk has been flushed.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        frame = pd.DataFrame(records, columns=["id", "events"])
        frame.to_json(path, lines=True, orient='records')

    # Plain list of rows: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame for every person.
    records = []
    chunk_i = 0
    for agent_id, group in persons:
        agent = pd.DataFrame(group).sort_values("time")
        events = [
            {
                # Explicit positional access; row[0] relied on a deprecated fallback.
                "event_id": row.iloc[0],
                "time": row["time"],
                "type": row["type"],
                "link": row["link"],
                "vehicle_id": row["vehicle"],
                "delay": row["delay"],
                "actType": row["actType"],
                "legMode": row["legMode"],
                "coords_x": row["x"],
                "coords_y": row["y"],
            }
            for _, row in agent.iterrows()
        ]
        records.append({"id": agent_id, "events": events})

        # Flush as soon as the chunk is full, so every file holds exactly
        # chunk_size people (the last one may hold fewer).
        if chunk_size != -1 and len(records) == chunk_size:
            write_records(records, json_file + "/agent/" + str(chunk_i) + ".json")
            records = []
            chunk_i += 1

    if chunk_size == -1:
        write_records(records, json_file + "/agent.json")
    elif records or chunk_i == 0:
        # Write the remainder; with no input at all, still emit one empty file.
        write_records(records, json_file + "/agent/" + str(chunk_i) + ".json")

    print("People save to:", json_file)

In [14]:
save_agents(agents.groupby("person"), json_file = "./../output/events", chunk_size=500)

People save to: ./../output/events


## Saving vehicle events to separate files

- cars
- subway
- buses
- trams
- trains
- ferry
- funicular


In [15]:
def _vehicle_event_record(row, vehicle_type):
    """Convert one event row (a Series) into the dict stored for a vehicle."""
    event = {
        # Explicit positional access; row[0] relied on a deprecated fallback.
        "event_id": row.iloc[0],
        "time": row["time"],
        "type": row["type"],
        "link": row["link"],
        "person_id": row["person"],
        "delay": row["delay"],
        "facility": row["facility"],
    }
    # When a facility id is present, the part after its last ':' replaces the link.
    facility = row["facility"]
    if isinstance(facility, str):
        event["link"] = facility.split(":")[-1]
    event["networkMode"] = row["networkMode"]
    event["relativePosition"] = row["relativePosition"]
    event["actType"] = row["actType"]
    event["legMode"] = row["legMode"]
    event["coords_x"] = row["x"]
    event["coords_y"] = row["y"]

    # Every vehicle type other than "car" also carries the transit columns.
    if vehicle_type != "car":
        if event["type"] == "TransitDriverStarts":
            event["transitLine"] = row["transitLineId"]
            event["transitRoute"] = row["transitRouteId"]
        event["departure"] = row["departureId"]
        event["atStop"] = row["atStop"]
        event["destinationStop"] = row["destinationStop"]
    return event


def save_vehicle(vehicle_events, json_file, vehicle_type = "", chunk_size=-1):
    """Write the events of every vehicle to JSON Lines files (one vehicle per line).

    Each output line is ``{"id": <vehicle id>, "events": [...]}`` with the
    vehicle's events ordered by ``time``. For ``vehicle_type == "car"`` the id
    is converted to ``int``; otherwise it is written as found in the
    ``vehicle`` column.

    Parameters
    ----------
    vehicle_events : iterable of (key, DataFrame)
        Pairs as produced by ``DataFrame.groupby("vehicle")``. The vehicle id
        is taken from the first ``vehicle`` value of each frame.
    json_file : str
        Output directory (despite the name). Created if it does not exist.
    vehicle_type : str, default ""
        Used for the output file/directory name. Any value other than "car"
        additionally requires the columns transitLineId, transitRouteId,
        departureId, atStop and destinationStop.
    chunk_size : int, default -1
        ``-1`` writes everything to ``<json_file>/<vehicle_type>.json``. A
        positive value writes ``<json_file>/<vehicle_type>/<i>.json`` files
        holding at most ``chunk_size`` vehicles each.

    Raises
    ------
    ValueError
        If ``chunk_size`` is neither ``-1`` nor a positive integer.
    """
    if chunk_size != -1 and chunk_size < 1:
        raise ValueError("chunk_size must be -1 (single file) or a positive integer")

    def write_records(records, path):
        # Create the target directory on demand so the final (or only) write
        # does not fail when no earlier chunk has been flushed.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        frame = pd.DataFrame(records, columns=["id", "events"])
        frame.to_json(path, lines=True, orient='records')

    # Plain list of rows: DataFrame.append was removed in pandas 2.0 and
    # re-copied the accumulated frame for every vehicle.
    records = []
    chunk_i = 0
    for _, group in vehicle_events:
        vehicle = pd.DataFrame(group).sort_values("time")
        first_vehicle = vehicle["vehicle"].iloc[0]
        vehicle_id = int(first_vehicle) if vehicle_type == "car" else first_vehicle

        events = [
            _vehicle_event_record(row, vehicle_type)
            for _, row in vehicle.iterrows()
        ]
        records.append({"id": vehicle_id, "events": events})

        # Flush as soon as the chunk is full, so every file holds exactly
        # chunk_size vehicles (the last one may hold fewer).
        if chunk_size != -1 and len(records) == chunk_size:
            write_records(records, json_file + "/" + vehicle_type + "/" + str(chunk_i) + ".json")
            records = []
            chunk_i += 1

    if chunk_size == -1:
        write_records(records, json_file + "/" + vehicle_type + ".json")
    elif records or chunk_i == 0:
        # Write the remainder; with no input at all, still emit one empty file.
        write_records(records, json_file + "/" + vehicle_type + "/" + str(chunk_i) + ".json")


In [16]:
vehicle_types = ["car","funicular","ferry","subway","rail","tram","bus"]

def filter_vehicle_events(events, vehicle_type, json_path="./../output/events"):
    """Select the events of one vehicle type and write them with ``save_vehicle``.

    For ``vehicle_type == 'car'`` the rows whose ``vehicle`` id parses as a
    number are selected. For any other type the rows whose ``vehicle`` id
    contains ``vehicle_type`` (case-insensitive) are selected, together with
    the rows whose ``vehicleId`` contains it; for the latter rows ``vehicle``
    is set to the ``vehicleId`` value so both kinds group under the same key.

    Parameters
    ----------
    events : DataFrame
        Event table with at least the ``vehicle`` and ``vehicleId`` columns
        plus the columns ``save_vehicle`` reads.
    vehicle_type : str
        Vehicle type to extract; also used as the output name.
    json_path : str
        Output directory handed to ``save_vehicle``.

    Returns
    -------
    None
    """
    from datetime import datetime

    print("Started filtering:",vehicle_type, "at", datetime.now())
    if vehicle_type == 'car':
        vehicles = events[pd.to_numeric(events['vehicle'], errors='coerce').notnull()]
    else:
        # na=False: rows without an id count as "no match" instead of yielding
        # NA in the mask, which a plain object column cannot be indexed with.
        rides_mask = events['vehicle'].str.contains(vehicle_type, case=False, na=False)
        driver_mask = events['vehicleId'].str.contains(vehicle_type, case=False, na=False)

        # assign() returns a new frame, so the column is not written onto a
        # slice of `events` (which triggers SettingWithCopyWarning).
        driver_events = events.loc[driver_mask].assign(
            vehicle=lambda frame: frame['vehicleId']
        )
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        vehicles = pd.concat([events.loc[rides_mask], driver_events])

    save_vehicle(vehicles.groupby("vehicle"), json_file = json_path, vehicle_type=vehicle_type, chunk_size=500)
    print("Saved vehicle type:",vehicle_type, "to", json_path, "at", datetime.now())
        

In [18]:
# Cast once, before the loop: the conversion does not depend on the vehicle
# type, and repeating it per iteration re-converts the whole column each time.
events["vehicle"] = events["vehicle"].astype("string")

for veh_type in ['bus',"car","funicular"]: #tram, subway
    filter_vehicle_events(events, veh_type)

Started filtering: bus at 2021-11-16 12:06:26.124477


KeyboardInterrupt: 