In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
import os
import shutil
import json
import warnings
import gc
from memory_profiler import profile
from multiprocessing import Pool

# Blanket filter: every warning raised for the rest of the session is hidden.
warnings.filterwarnings("ignore")

# Do not truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

# Directory that is scanned for population event CSV files.
output_path = "./../output/population/"
# Root directory under which the per-agent / per-vehicle JSON files are written.
agents_path = "./../output/agents/"

In [3]:
# Columns parsed as 64-bit floats.
_FLOAT_COLUMNS = [
    "Unnamed: 0",
    "time",
    "relativePosition",
    "delay",
    "x",
    "y",
    "distance",
]

# Columns kept as plain strings.
_TEXT_COLUMNS = [
    "type",
    "driverId",
    "vehicleId",
    "transitLineId",
    "transitRouteId",
    "departureId",
    "person",
    "link",
    "vehicle",
    "networkMode",
    "facility",
    "actType",
    "computationalRoutingMode",
    "mode",
    "agent",
    "atStop",
]

# Column -> dtype mapping handed to ``pd.read_csv`` for the event files.
dtypes = {column: np.float64 for column in _FLOAT_COLUMNS}
dtypes.update({column: str for column in _TEXT_COLUMNS})
# The only column currently stored as a pandas categorical.
dtypes["legMode"] = "category"

# Vehicle kinds that get their own output folder and grouping pass.
# For a quick debugging run this list can be narrowed, e.g. to ["subway"].
vehicle_types = ["bus", "car", "funicular", "subway", "tram"]


In [4]:
def read_csv(path):
    """Read the CSV file at ``path`` into a DataFrame, applying the module-level ``dtypes`` mapping."""
    frame = pd.read_csv(path, dtype=dtypes)
    return frame

def clear_directory(directory):
    """Leave ``directory`` existing and empty.

    An existing directory is removed together with everything inside it
    and then created again; a missing one is simply created.
    """
    if os.path.exists(directory):
        # Wipe the old content first; the directory is recreated below.
        shutil.rmtree(directory)
    os.makedirs(directory)


# Collect the population event files to convert.
# * Only ``.csv`` entries are kept, so stray directory entries (for example
#   ``.ipynb_checkpoints``) are never handed to the CSV reader.
# * The list is sorted because ``os.listdir`` returns entries in arbitrary
#   order, which would make the processing order differ between runs.
args = sorted(
    output_path + name
    for name in os.listdir(output_path)
    if name.lower().endswith(".csv")
)

print("Files prepared:", len(args), "files")

Files prepared: 3 files


In [5]:

def load_agent_events(row):
    """Convert one event row of a person into a plain dict.

    Parameters
    ----------
    row : pandas.Series
        A single row, as yielded by ``DataFrame.iterrows()``. The value in
        its first position is used as the event id; all other values are
        looked up by column label (``time``, ``type``, ``link``,
        ``vehicle``, ``delay``, ``actType``, ``legMode``, ``x``, ``y``).

    Returns
    -------
    dict
        The selected values under the output keys ``event_id``, ``time``,
        ``type``, ``link``, ``vehicle_id``, ``delay``, ``actType``,
        ``legMode``, ``coords_x`` and ``coords_y`` (in that order).
    """
    return {
        # Positional access has to go through ``.iloc``: ``row[0]`` on a
        # Series with string labels relies on a deprecated integer fallback.
        "event_id": row.iloc[0],
        "time": row["time"],
        "type": row["type"],
        "link": row["link"],
        "vehicle_id": row["vehicle"],
        "delay": row["delay"],
        "actType": row["actType"],
        "legMode": row["legMode"],
        "coords_x": row["x"],
        "coords_y": row["y"],
    }

def load_vehicle_events(row, vehicle_type):
    """Convert one event row of a vehicle into a plain dict.

    Parameters
    ----------
    row : pandas.Series
        A single row, as yielded by ``DataFrame.iterrows()``. The value in
        its first position is used as the event id; everything else is
        looked up by column label.
    vehicle_type : str
        Kind of vehicle the row belongs to. For every value other than
        ``"car"`` the transit-related fields are added to the result.

    Returns
    -------
    dict
        Event fields keyed by their output names. ``link`` is replaced by
        the last ``":"``-separated part of ``facility`` whenever
        ``facility`` is a string.
    """
    event = {
        # Positional access has to go through ``.iloc``: ``row[0]`` on a
        # Series with string labels relies on a deprecated integer fallback.
        "event_id": row.iloc[0],
        "time": row["time"],
        "type": row["type"],
        "link": row["link"],
        "person_id": row["person"],
        "delay": row["delay"],
        "facility": row["facility"],
    }

    # A missing facility is not a str (NaN / None), so ``link`` is kept then.
    if isinstance(row["facility"], str):
        event["link"] = row["facility"].split(":")[-1]

    event["networkMode"] = row["networkMode"]
    event["relativePosition"] = row["relativePosition"]
    event["actType"] = row["actType"]
    event["legMode"] = row["legMode"]
    event["coords_x"] = row["x"]
    event["coords_y"] = row["y"]

    if vehicle_type != "car":
        # Line and route are only recorded on the event that starts the run.
        if event["type"] == "TransitDriverStarts":
            event["transitLine"] = row["transitLineId"]
            event["transitRoute"] = row["transitRouteId"]
        event["departure"] = row["departureId"]
        event["atStop"] = row["atStop"]
        event["destinationStop"] = row["destinationStop"]
    return event


In [6]:
def save_chunk(path, file, chunk):
    """Write ``chunk`` to the JSON file ``path + file``, appending if it exists.

    Parameters
    ----------
    path : str
        Directory of the target file; created when missing.
    file : str
        File name that is concatenated directly onto ``path``.
    chunk : dict
        Data to store; must provide an ``"events"`` list.

    When the target file already exists, its stored ``"events"`` list is
    extended with ``chunk["events"]`` and the file is rewritten; otherwise
    ``chunk`` is written as-is.
    """
    # ``exist_ok=True`` instead of an exists-then-create check: several pool
    # workers may try to create the same directory at the same time.
    os.makedirs(path, exist_ok=True)

    # Plain concatenation is deliberate: ``file`` is passed with a leading
    # "/" and ``os.path.join`` would then discard ``path`` entirely.
    target = path + file

    if os.path.isfile(target):
        # Load what is already stored and append the new events to it.
        with open(target, 'r') as f:
            payload = json.load(f)
        payload["events"].extend(chunk["events"])
    else:
        payload = chunk

    with open(target, 'w') as f:
        json.dump(payload, f)

    return

def process_agent(person):
    """Serialise all events of one person and store them via ``save_chunk``.

    Parameters
    ----------
    person : pandas.DataFrame
        Event rows of a single person. The first unique value of its
        ``person`` column is used as the id and as the output file name.
    """
    agent_id = person.person.unique()[0]
    # Stable sort, as in ``process_vehicle``: events that share a timestamp
    # keep the order they had in the input instead of being shuffled.
    ordered = person.sort_values("time", kind="stable")

    chunk = {
        "id" : agent_id,
        "events" : [load_agent_events(row) for _, row in ordered.iterrows()]
    }
    save_chunk(agents_path+"/agent","/"+str(agent_id)+".json", chunk)
    return

def process_vehicle(args):
    """Serialise all events of one vehicle and store them via ``save_chunk``.

    Parameters
    ----------
    args : sequence
        A ``(vehicle_df, vehicle_type)`` pair: the event rows of a single
        vehicle and the name of its vehicle type. The type selects the
        output sub-folder and whether the id is converted to ``int``.
    """
    frame, vehicle_type = args
    ordered = frame.sort_values("time", kind="stable")

    # Ids of cars are stored as integers, all other ids are kept unchanged.
    raw_id = ordered.vehicle.unique()[0]
    vehicle_id = int(raw_id) if vehicle_type == "car" else raw_id

    records = [load_vehicle_events(entry, vehicle_type) for _, entry in ordered.iterrows()]
    chunk = {"id": vehicle_id, "events": records}

    save_chunk(agents_path+"/"+vehicle_type,"/"+str(vehicle_id)+".json", chunk)
    return


def save_agents_parallel(persons, cpus):
    """Run ``process_agent`` for every entry of ``persons``.

    Parameters
    ----------
    persons : list
        One item per person; each item is passed to ``process_agent``.
    cpus : int or None
        Number of worker processes. With exactly ``1`` the items are
        processed sequentially in the current process; any other value is
        handed to ``multiprocessing.Pool``.
    """
    print("Number of agents in loaded chunk:", len(persons))
    print("processing:")
    if cpus == 1:
        print("Seq solution")
        for person in persons:
            process_agent(person)
        return

    with Pool(cpus) as pool:
        pool.map(process_agent, persons)
        # Shut the workers down while the pool is still alive. Leaving the
        # ``with`` block terminates the pool, so close()/join() placed after
        # it would have no effect.
        pool.close()
        pool.join()
    return


def save_vehicles_parallel(args, cpus):
    """Run ``process_vehicle`` for every entry of ``args``.

    Parameters
    ----------
    args : list
        One item per vehicle; each item is passed to ``process_vehicle``
        unchanged (that function unpacks it into a frame and a type name).
    cpus : int or None
        Number of worker processes. With exactly ``1`` the items are
        processed sequentially in the current process; any other value is
        handed to ``multiprocessing.Pool``.
    """
    print("Number of vehicles in loaded chunk:", len(args))
    print("processing:")
    if cpus == 1:
        print("Seq solution")
        for arg in args:
            process_vehicle(arg)
        return

    with Pool(cpus) as pool:
        pool.map(process_vehicle, args)
        # Shut the workers down while the pool is still alive. Leaving the
        # ``with`` block terminates the pool, so close()/join() placed after
        # it would have no effect.
        pool.close()
        pool.join()
    return


def _select_vehicle_events(events, veh_type):
    """Return the rows of ``events`` that belong to vehicles of ``veh_type``."""
    if veh_type == 'car':
        # Cars are the rows whose ``vehicle`` value parses as a number.
        return events[pd.to_numeric(events['vehicle'], errors='coerce').notnull()]

    # Other types are matched by name inside the id (case-insensitive);
    # ``na=False`` makes rows without an id count as "no match".
    vehicles = events.loc[events['vehicle'].str.contains(veh_type, case=False, na=False)]

    # Rows that reference the vehicle through ``vehicleId`` instead. They are
    # copied so that the column assignment below works on an independent
    # frame rather than on a slice of ``events``.
    driver_events = events[events['vehicleId'].str.contains(veh_type, case=False, na=False)].copy()
    driver_events['vehicle'] = driver_events['vehicleId']
    print("\t routes:",len(driver_events.transitRouteId.unique()))

    # ``DataFrame.append`` no longer exists in pandas 2.x; ``pd.concat``
    # stacks the two frames in the same order.
    return pd.concat([vehicles, driver_events])


def load_agents_from_population(path):
    """Split one population event CSV into per-person and per-vehicle JSON files.

    The file at ``path`` is read with the module-level ``dtypes`` mapping,
    grouped by ``person`` and, for every entry of ``vehicle_types``, by
    ``vehicle``. The resulting groups are passed to
    ``save_vehicles_parallel`` and ``save_agents_parallel`` together with
    the value of ``os.cpu_count()``.

    Parameters
    ----------
    path : str
        Location of the CSV file to convert.
    """
    print("Loading file:", path)
    events = pd.read_csv(path, dtype=dtypes)
    print("Parsing events:")

    print("\t Grouping agents:")
    # Keep only rows whose ``person`` value is numeric (this removes drivers).
    agents = events[pd.to_numeric(events['person'], errors='coerce').notnull()]
    dfs = [x for _, x in agents.groupby("person")] # each person in its own dataframe
    del agents
    gc.collect()
    print("\t agents # ",len(dfs))

    events['vehicle'] = events['vehicle'].astype("string")

    # One [frame, type] pair per vehicle, in ``vehicle_types`` order.
    args = []
    for veh_type in vehicle_types:
        print("\t Grouping",veh_type,":")
        vehicles = _select_vehicle_events(events, veh_type)
        vehs = [x for _, x in vehicles.groupby("vehicle")]
        args.extend([df, veh_type] for df in vehs)
        print("\t",veh_type,"# ",len(vehs))

    cpu_available = os.cpu_count()

    # The full event table is no longer needed once the groups exist.
    del events
    gc.collect()
    save_vehicles_parallel(args, cpu_available)
    save_agents_parallel(dfs, cpu_available)

    return

In [7]:

# Start from empty output folders: one for persons, one per vehicle type.
clear_directory(agents_path + "/agent")
for veh_type in vehicle_types:
    clear_directory(agents_path + "/" + veh_type)

# Convert every prepared population file, one after another.
for csv in args:
    load_agents_from_population(csv)

Loading file: ./../output/population/0.csv
Parsing events:
	 Grouping agents:
	 agents #  10000
	 Grouping bus :
	 routes: 1153
	 bus #  9717
	 Grouping car :
	 car #  5160
	 Grouping funicular :
	 routes: 2
	 funicular #  48
	 Grouping subway :
	 routes: 77
	 subway #  925
	 Grouping tram :
	 routes: 356
	 tram #  3232
Number of vehicles in loaded chunk: 19082
processing:
Number of agents in loaded chunk: 10000
processing:
Loading file: ./../output/population/1.csv
Parsing events:
	 Grouping agents:
	 agents #  8682
	 Grouping bus :
	 routes: 1151
	 bus #  11447
	 Grouping car :
	 car #  4560
	 Grouping funicular :
	 routes: 2
	 funicular #  74
	 Grouping subway :
	 routes: 41
	 subway #  874
	 Grouping tram :
	 routes: 460
	 tram #  3389
Number of vehicles in loaded chunk: 20344
processing:
