In [8]:
import pandas as pd
import re
import numpy as np
import math
import datetime
import decimal

from glob import glob
from google.cloud import storage

# Google Cloud Storage client and the bucket that the cells below read from and write to.
client = storage.Client()
bucket = client.get_bucket('datamining-ulb')

# NOTE(review): `prec` is the number of significant digits kept by Decimal
# arithmetic, not the number of decimal places - confirm that 3 is intended.
decimal.getcontext().prec = 3
# Conversion factor between seconds and milliseconds.
TO_MS = decimal.Decimal("1000")

In [9]:
'''
 Helper functions
'''

def epoch_ms_to_datetime(epoch_ms: int) -> datetime.datetime:
    """Convert a Unix timestamp expressed in milliseconds to a datetime.

    Args:
        epoch_ms: Milliseconds elapsed since the Unix epoch.

    Returns:
        The naive datetime produced by ``datetime.datetime.fromtimestamp``
        for the equivalent number of seconds.
    """
    ms_per_second = int(TO_MS)
    epoch_seconds = epoch_ms / ms_per_second
    return datetime.datetime.fromtimestamp(epoch_seconds)

In [23]:
# Extract actual stops
actual_df = pd.read_csv(f"gs://{bucket.name}/data/actual_stops.csv")
size = actual_df.shape[0]

# Keep only the digits of every stop id. The pattern is a raw string because
# a backslash-D inside a plain literal is an invalid escape sequence.
actual_df["stop_id"] = [re.sub(r"\D", "", x.strip()) for x in actual_df["stop_id"]]

# The line/stop attributes are used as string keys further down.
actual_df = actual_df.astype({"succession": str, "numero_lig": str, "variante": str})

# Distinct (digits-only) stop ids present in the stop file.
actual_stops = set(actual_df["stop_id"])

# Extract Line stop sequence
#
# extracted_lines maps a line number to a dict holding
#   * one entry per stop id: {direction: succession}
#   * a "direction" entry:   {direction: set of stop ids seen for that direction}
extracted_lines = {}

# Each row of actual_df is packed as "numero_lig_succession_variante_stop_id".
delimiters = np.array(["_"] * size)
stops_info = (
    actual_df["numero_lig"].to_numpy() + delimiters
    + actual_df["succession"].to_numpy() + delimiters
    + actual_df["variante"].to_numpy() + delimiters
    + actual_df["stop_id"].to_numpy()
).tolist()

# The rows are scanned from last to first. Whenever the succession goes up
# compared with the previously scanned row, the current stop id becomes the
# direction label (presumably the last stop of a variant - verify against the data).
prev_succession = -999999
direction = ""
for packed_row in reversed(stops_info):
    fields = packed_row.split("_")
    line_number, stop_succession, stop_id = fields[0], fields[1], fields[3]

    if int(stop_succession) > int(prev_succession):
        direction = stop_id

    line_entry = extracted_lines.setdefault(line_number, {})

    # Succession of this stop when travelling towards `direction`.
    line_entry.setdefault(stop_id, {})[direction] = stop_succession

    # Stops visited for this direction.
    line_entry.setdefault("direction", {}).setdefault(direction, set()).add(stop_id)

    prev_succession = stop_succession

# Pair each vehicle position with the succession (stop order) of its stop:
#  1. Look up the position's pointID among the stops extracted for its lineID.
#  2. If that stop has a succession recorded for the position's directionID, use it.
#  3. Otherwise set the succession to UNKNOWN_SUCCESSION (999999).
# NOTE(review): the per-direction stop sets stored under the "direction" key of
# extracted_lines are not consulted here.

UNKNOWN_SUCCESSION = 999999


def _line_name(object_name):
    """Return the leading token of a bucket object name.

    The directory part, everything from the first "_" and any file extension
    are dropped, e.g. "ordered_lines/Line71_ordered.csv" -> "Line71".
    """
    base_name = object_name.strip().rsplit("/", 1)[-1]
    return base_name.split("_")[0].split(".")[0]


# Dont iterate over complete data: lines that already have an ordered file are skipped.
names = {_line_name(blob.name) for blob in bucket.list_blobs(prefix='ordered_lines/Line')}

lines = []
for blob in bucket.list_blobs(prefix='Lines_vehiclePositions/Line'):
    file = f"gs://{blob.bucket.name}/{blob.name}"
    # The output name comes from the file being processed, so every line gets
    # its own ordered file.
    # NOTE(review): assumes the objects are named "Line<id>[_...].<ext>" - confirm.
    name = _line_name(blob.name)

    if "null" in file or name in names:
        continue

    df = pd.read_csv(file)

    succession_arr = []
    for line_id, direction_id, point_id in zip(
        df["lineID"].astype(str),
        df["directionID"].astype(str),
        df["pointID"].astype(str),
    ):
        # An unknown line, stop or direction falls back to the sentinel
        # instead of raising KeyError.
        by_direction = extracted_lines.get(line_id, {}).get(point_id, {})
        succession_arr.append(int(by_direction.get(direction_id, UNKNOWN_SUCCESSION)))

    df["succession"] = np.array(succession_arr, dtype=int)

    for column in ("lineID", "directionID", "pointID"):
        df[column] = df[column].astype(int)

    # The index is written too; it shows up as "Unnamed: 0" when the file is read back.
    df.sort_values(["directionID", "time", "succession"]).to_csv(
        f"gs://{bucket.name}/ordered_lines/{name}_ordered.csv"
    )
    

AttributeError: 'str' object has no attribute 'name'

In [25]:
# Normalize the distance.
# The algorithm works as follows:
#  1. For every position with a non-zero distance, the timestamp is moved back by
#     the time needed to cover that distance, i.e. to the moment the vehicle was
#     at distance 0 of the point.
#  2. The bus/tram average speed is used to turn the distance into that travel time.

AVERAGE_SPEED = decimal.Decimal("4.667") # in metre/second
LINE_NAME = "Line71"

line_df = pd.read_csv(f"gs://{bucket.name}/ordered_lines/{LINE_NAME}_ordered.csv")

columns = ["time", "lineID", "directionID", "distancefromPoint", "pointID", "succession"]
missing_columns = [column for column in columns if column not in line_df.columns]
if missing_columns:
    raise KeyError(f"missing columns in ordered file: {missing_columns}")

# Remove outlier (rows whose succession could not be resolved). The copy makes
# the columns added below land in an independent frame, not in a slice.
line_df = line_df[line_df["succession"] != 999999].copy()

normalized_timestamp = []
normalized_distance = []
for raw_time, raw_distance in zip(line_df["time"], line_df["distancefromPoint"]):
    timestamp = int(raw_time)
    distance = decimal.Decimal(str(raw_distance))

    if distance != 0:
        # Time needed to cover `distance` at the average speed, in seconds.
        time_passed = distance / AVERAGE_SPEED
        timestamp -= int(time_passed * TO_MS)
        # NOTE(review): distance - speed * (distance / speed) is only the
        # rounding residue of the Decimal context - confirm this is the value wanted.
        distance = math.floor(abs(distance - (AVERAGE_SPEED * time_passed)))

    normalized_timestamp.append(epoch_ms_to_datetime(timestamp))
    normalized_distance.append(distance)

line_df["normalized_timestamp"] = np.array(normalized_timestamp)
line_df["normalized_distance"] = np.array(normalized_distance)

line_df["time"] = line_df["time"].apply(epoch_ms_to_datetime)

# "Unnamed: 0" is the index column of the ordered file; ignore it when absent.
line_df = line_df.drop(columns="Unnamed: 0", errors="ignore")

line_df.sort_values(["directionID", "succession", "normalized_timestamp"]).to_csv(
    f"gs://{bucket.name}/normalized_lines/{LINE_NAME}_normalized.csv"
)