In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [10]:
import os
import shutil
import json
import warnings
import gc
from memory_profiler import profile
from multiprocessing import Pool

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
warnings.filterwarnings('ignore')

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Directory that is listed to find the input event CSV files.
output_path = "./../output/population/"
# Root directory under which the per-agent and per-vehicle JSON files are written.
agents_path = "./../output/agents/"

In [11]:
# Column dtypes enforced when the event CSVs are read, declared per target type.
_float_columns = (
    "Unnamed: 0", "time", "relativePosition", "delay", "x", "y", "distance",
)
_text_columns = (
    "type", "driverId", "vehicleId", "transitLineId", "transitRouteId",
    "departureId", "person", "link", "vehicle",
    "networkMode",  # could also be read as a category
    "facility", "actType", "computationalRoutingMode", "mode", "agent", "atStop",
)

dtypes = {column: np.float64 for column in _float_columns}
dtypes.update({column: str for column in _text_columns})
dtypes["legMode"] = 'category'

# Vehicle kinds; each one gets its own output sub-directory.
vehicle_types = "bus car funicular subway tram".split()


In [12]:
def read_csv(path):
    """Read the CSV at `path` into a DataFrame using the module-level `dtypes` mapping."""
    frame = pd.read_csv(path, dtype=dtypes)
    return frame

def clear_directory(directory):
    """Leave `directory` existing and empty.

    An existing directory is removed together with its contents and then
    recreated; a missing one is simply created (including parents).
    """
    if os.path.exists(directory):
        shutil.rmtree(directory)
    os.makedirs(directory)


# Paths of the event CSV files to process.
# os.listdir() returns entries in arbitrary order and also yields non-CSV
# entries (e.g. notebook checkpoint folders), so keep only *.csv files and
# sort them to make the processing order reproducible between runs.
args = sorted(
    os.path.join(output_path, name)
    for name in os.listdir(output_path)
    if name.lower().endswith(".csv")
)

print("Files prepared:", len(args), "files")

Files prepared: 3 files


In [13]:

def load_agent_events(row):
    """Convert one event row into the dict stored in an agent's JSON file.

    Parameters
    ----------
    row : pandas.Series
        One event row (as yielded by ``DataFrame.iterrows``). It must carry the
        labels ``time``, ``type``, ``link``, ``vehicle``, ``delay``,
        ``actType``, ``legMode``, ``x`` and ``y``.

    Returns
    -------
    dict
        The selected fields under their output names. ``event_id`` is the
        first field of the row by position.

    Raises
    ------
    KeyError
        If one of the required labels is missing from ``row``.
    """
    return {
        # Positional access must go through .iloc: plain row[0] on a Series
        # with string labels relies on a deprecated positional fallback.
        "event_id": row.iloc[0],
        "time": row["time"],
        "type": row["type"],
        "link": row["link"],
        "vehicle_id": row["vehicle"],
        "delay": row["delay"],
        "actType": row["actType"],
        "legMode": row["legMode"],
        "coords_x": row["x"],
        "coords_y": row["y"],
    }

def load_vehicle_events(row, vehicle_type):
    """Convert one event row into the dict stored in a vehicle's JSON file.

    Parameters
    ----------
    row : pandas.Series
        One event row (as yielded by ``DataFrame.iterrows``).
    vehicle_type : str
        Kind of vehicle the row belongs to. For anything other than ``"car"``
        the transit-related fields are added as well, which additionally
        requires the labels ``departureId``, ``atStop`` and
        ``destinationStop`` (plus ``transitLineId`` / ``transitRouteId`` for
        ``TransitDriverStarts`` events).

    Returns
    -------
    dict
        The selected fields under their output names. ``event_id`` is the
        first field of the row by position.

    Raises
    ------
    KeyError
        If a required label is missing from ``row``.
    """
    event = {
        # Positional access must go through .iloc: plain row[0] on a Series
        # with string labels relies on a deprecated positional fallback.
        "event_id": row.iloc[0],
        "time": row["time"],
        "type": row["type"],
        "link": row["link"],
        "person_id": row["person"],
        "delay": row["delay"],
        "facility": row['facility'],
    }

    # When a facility string is present, the part after its last ':' replaces
    # the link value (missing facilities are not str and are left alone).
    if isinstance(row['facility'], str):
        event['link'] = row['facility'].split(":")[-1]

    event["networkMode"] = row['networkMode']
    event["relativePosition"] = row['relativePosition']
    event["actType"] = row["actType"]
    event["legMode"] = row["legMode"]
    event["coords_x"] = row["x"]
    event["coords_y"] = row["y"]

    if vehicle_type != "car":
        # Line and route are only attached to the event that starts the run.
        if event["type"] == "TransitDriverStarts":
            event["transitLine"] = row['transitLineId']
            event["transitRoute"] = row['transitRouteId'] ## add to output
        event["departure"] = row['departureId']
        event["atStop"] = row["atStop"]
        event["destinationStop"] = row["destinationStop"]
    return event


In [14]:
def save_chunk(path, file, chunk):
    """Write `chunk` to the JSON file `path + file`, appending to earlier events.

    If the target file already holds a saved chunk, the events of `chunk` are
    appended to the stored ``"events"`` list and the merged document is written
    back; otherwise `chunk` is written as-is. The previous implementation
    opened the file in ``'w'`` mode before checking its size, which truncated
    it and silently discarded everything saved earlier for the same id.

    Parameters
    ----------
    path : str
        Target directory; created if it does not exist.
    file : str
        File name that is concatenated to `path` verbatim (callers pass it
        with a leading ``"/"``).
    chunk : dict
        Document with at least an ``"events"`` list.
    """
    os.makedirs(path, exist_ok=True)
    # Plain concatenation on purpose: `file` starts with "/" and
    # os.path.join would treat it as absolute and drop `path`.
    target = path + file

    # Read the existing content first; only afterwards reopen for writing.
    if os.path.exists(target) and os.path.getsize(target) > 0:
        with open(target) as f:
            saved = json.load(f)
        saved["events"].extend(chunk["events"])
    else:
        saved = chunk

    with open(target, 'w') as f:
        json.dump(saved, f)
    return

def process_agent(person):
    """Serialise all events of one agent to its JSON file.

    `person` is a DataFrame holding the rows of a single agent; the id is taken
    from the first distinct value of its ``person`` column. Rows are ordered by
    ``time``, converted with `load_agent_events` and handed to `save_chunk`.
    """
    agent_id = person.person.unique()[0]
    ordered = person.sort_values("time")
    chunk = {
        "id" : agent_id,
        "events" : [load_agent_events(row) for _, row in ordered.iterrows()],
    }
    save_chunk(agents_path+"/agent","/"+str(agent_id)+".json", chunk)
    return

def process_vehicle(args):
    """Serialise all events of one vehicle to its JSON file.

    `args` is a ``(vehicle_df, vehicle_type)`` pair: the rows of a single
    vehicle and the kind of vehicle they belong to. Rows are ordered by
    ``time``, converted with `load_vehicle_events` and handed to `save_chunk`
    inside the sub-directory named after the vehicle type.
    """
    vehicle_df, vehicle_type = args
    ordered = vehicle_df.sort_values("time")

    # Car ids are stored as integers, every other id keeps its original value.
    raw_id = ordered.vehicle.unique()[0]
    vehicle_id = int(raw_id) if vehicle_type == "car" else raw_id

    chunk = {
        "id" : vehicle_id,
        "events" : [
            load_vehicle_events(row, vehicle_type)
            for _, row in ordered.iterrows()
        ],
    }
    save_chunk(agents_path+"/"+vehicle_type,"/"+str(vehicle_id)+".json", chunk)
    return



def save_agents_parallel(persons, cpus):
    """Run `process_agent` over every per-agent DataFrame in a process pool.

    Parameters
    ----------
    persons : list
        One DataFrame per agent.
    cpus : int or None
        Number of worker processes passed to ``multiprocessing.Pool``.
    """
    print("Number of agents in loaded chunk:", len(persons))
    print("processing:")
    with Pool(cpus) as pool:
        pool.map(process_agent, persons)
        # map() blocks until every task has finished. Shut the workers down
        # gracefully here: leaving the with-block calls terminate(), after
        # which close()/join() would only act on an already-dead pool.
        pool.close()
        pool.join()
    return


def save_vehicles_parallel(args, cpus):
    """Run `process_vehicle` over every (DataFrame, vehicle type) pair in a process pool.

    Parameters
    ----------
    args : list
        One ``[vehicle_df, vehicle_type]`` pair per vehicle.
    cpus : int or None
        Number of worker processes passed to ``multiprocessing.Pool``.
    """
    print("Number of vehicles in loaded chunk:", len(args))

    print("processing:")
    with Pool(cpus) as pool:
        pool.map(process_vehicle, args)
        # map() blocks until every task has finished. Shut the workers down
        # gracefully here: leaving the with-block calls terminate(), after
        # which close()/join() would only act on an already-dead pool.
        pool.close()
        pool.join()
    return


def load_agents_from_population(path):
    """Split one event CSV into per-agent and per-vehicle groups and save them.

    The file is read with the module-level `dtypes`. Rows whose ``person``
    value parses as a number are grouped per person. For every entry of
    `vehicle_types` the matching rows are grouped per vehicle: ``"car"`` takes
    rows whose ``vehicle`` value parses as a number, every other type takes
    rows whose ``vehicle`` contains the type name (case-insensitive) plus the
    rows whose ``vehicleId`` contains it, the latter with ``vehicle`` set to
    ``vehicleId``. Vehicles are written first, then agents, each through a
    process pool sized by ``os.cpu_count()``.

    Parameters
    ----------
    path : str
        Path of the event CSV file to process.
    """
    print("Loading file:", path)
    events = pd.read_csv(path, dtype=dtypes) #.fillna(np.nan)
    print("Parsing events:")

    print("\t Grouping agents:")
    # removes drivers: only rows with a numeric person id are kept
    agents = events[pd.to_numeric(events['person'], errors='coerce').notnull()]
    dfs = [x for _, x in agents.groupby("person")] #each person in own dataframe
    del agents
    gc.collect()
    print("\t agents # ",len(dfs))

    events.vehicle = events.vehicle.astype("string")
    args = []

    for veh_type in vehicle_types:
        print("\t Grouping",veh_type,":")

        if veh_type == 'car':
            vehicles = events[pd.to_numeric(events['vehicle'], errors='coerce').notnull()]
        else:
            # na=False makes rows with a missing id an explicit non-match.
            by_vehicle = events.loc[
                events['vehicle'].str.contains(veh_type, case=False, na=False)
            ]
            by_vehicle_id = events.loc[
                events['vehicleId'].str.contains(veh_type, case=False, na=False)
            ]
            # assign() works on a copy, so no column is written into a slice
            # of `events` (the former chained assignment).
            driver_events = by_vehicle_id.assign(vehicle=by_vehicle_id['vehicleId'])
            # DataFrame.append no longer exists in pandas 2.x; concat is the
            # equivalent row-wise combination.
            vehicles = pd.concat([by_vehicle, driver_events])

        vehs = [x for _, x in vehicles.groupby("vehicle")]
        args.extend([df, veh_type] for df in vehs)
        print("\t",veh_type,"# ",len(vehs))

    cpu_available = os.cpu_count()

    # Drop the full frame before the pools start working on the groups.
    del events
    gc.collect()
    save_vehicles_parallel(args, cpu_available)
    save_agents_parallel(dfs, cpu_available)

    return

In [15]:

# Start from empty output folders: one for agents and one per vehicle type.
for folder in ["agent", *vehicle_types]:
    clear_directory(agents_path+"/"+folder)

# Export every prepared CSV file in turn.
for csv in list(args):
    load_agents_from_population(csv)
    

Loading file: ./../output/population/0.csv
Parsing events:
	 Grouping agents:
	 agents #  10000
	 Grouping bus :
	 bus #  9717
	 Grouping car :
	 car #  5160
	 Grouping funicular :
	 funicular #  48
	 Grouping subway :
	 subway #  925
	 Grouping tram :
	 tram #  3232
Number of vehicles in loaded chunk: 19082
processing:
Number of agents in loaded chunk: 10000
processing:
Loading file: ./../output/population/1.csv
Parsing events:
	 Grouping agents:
	 agents #  8682
	 Grouping bus :
	 bus #  11447
	 Grouping car :
	 car #  4560
	 Grouping funicular :
	 funicular #  74
	 Grouping subway :
	 subway #  874
	 Grouping tram :
	 tram #  3389
Number of vehicles in loaded chunk: 20344
processing:
Number of agents in loaded chunk: 8682
processing:
Loading file: ./../output/population/2.csv
Parsing events:
	 Grouping agents:
	 agents #  3
	 Grouping bus :
	 bus #  3008
	 Grouping car :
	 car #  0
	 Grouping funicular :
	 funicular #  0
	 Grouping subway :
	 subway #  6
	 Grouping tram :
	 tram #  

In [8]:




    for i,event in tqdm(events.iterrows()):
        #person event
        if event['person'] == event['person'] and event.person.isnumeric():
            try:
                loaded = load_agent_events(event)
                agent_id = event["person"]

                if(agent_id in agents.keys()):
                    agents[agent_id]["events"].append(loaded)
                    if len(agents[agent_id]["events"]) >= 50000:
                        #append to file
                        save_chunk(agents_path+"/agent","/"+str(agent_id)+".json", agents[agent_id])
                        agents[agent_id]["events"] = []
    
                else:
                    agents[agent_id] = { "id": agent_id, "events":[] }
            except KeyError:
                print(event, path, event.type)

        #vehicle event
        if event.vehicle == event.vehicle: # or (event.person == event.person and not event.person.isnumeric()):
            #print(event)
            vehicle_type = ""
            if event.vehicle == event.vehicle  and (event.vehicle.isnumeric()):
                vehicle_type = "car"
            else:
                vehicle_type = event["vehicle"].split('_')[-1]


            vehicle_id = event["vehicle"]
            try:
                loaded = load_vehicle_events(event, vehicle_type)
                if(not vehicle_type in vehicles.keys()):
                    vehicles[vehicle_type] = {}

                if(vehicle_id in vehicles[vehicle_type].keys()):
                    vehicles[vehicle_type][vehicle_id]["events"].append(loaded)
                    if len(vehicles[vehicle_type][vehicle_id]["events"]) >= 50000:
                        #append to file
                        save_chunk(agents_path+"/"+vehicle_type,"/"+str(vehicle_id)+".json", vehicles[vehicle_type][vehicle_id])
                        vehicles[vehicle_type][vehicle_id]["events"] = []

                else:
                    vehicles[vehicle_type][vehicle_id] = { "id": vehicle_id, "events":[] }
                    
            except KeyError:
                print(event, path)
                #return

    del events
    gc.collect()
    
    for agent_id in agents.keys():
        save_chunk(agents_path+"/agent","/"+str(agent_id)+".json", agents[agent_id])

    for vehicle_type in vehicles.keys():
        for vehicle_id in vehicles[vehicle_type].keys():
            save_chunk(agents_path+"/"+vehicle_type,"/"+str(vehicle_id)+".json", vehicles[vehicle_type][vehicle_id])

    

NameError: name 'events' is not defined