In [3]:
import numpy as np
import pandas as pd
from os.path import dirname
import warnings
import geopandas as gpd

## from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Pipelines
from sklearn.pipeline import make_pipeline

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from joblib import dump

# Install a blanket "ignore" filter: every warning raised after this point in
# the session is silenced, including pandas deprecation/performance warnings.
warnings.filterwarnings("ignore")

In [65]:
# Deployment switch; it is set here but not read by any of the visible cells.
is_for_deploy = False

# Grouping keys used by the per-trip lag computation further down this cell.
same_trip_cols = ["serviceDate", "routeId", "directionId", "tripId"]

runtime_path = "C:/Users/huyh/Documents/Penn/Spring 2023/Cloud Computing/cloud-computing-bus-bunching/server/raw-data/runtimeDf.gzip"
selected_routes = ["21", "33", "47"]

# Load the raw runtime table and keep only the selected routes.
runtime = pd.read_parquet(runtime_path)
runtime = runtime[runtime["routeId"].isin(selected_routes)].copy()

# Join source: only rows that carry an instanceId can be looked up later.
toJoinFrom = runtime.dropna(subset=["instanceId"])

# instanceId / prevInstanceId pairs, restricted to rows where both are present.
id_pairs = toJoinFrom[["instanceId", "prevInstanceId"]]
toJoin = id_pairs.dropna(subset=["instanceId", "prevInstanceId"])

# Working frame that the lag columns get attached to.
runtimeDf = runtime.copy()

for lagSteps in range(1, 22):
    lag_id_col = f"lag{lagSteps}InstanceId"
    lag_prev_col = f"lag{lagSteps}PrevInstanceId"

    # Step 1: order rows by stop sequence inside each trip group, then take the
    # instanceId found `lagSteps` rows earlier within the same group.
    runtimeDf = runtimeDf.sort_values(same_trip_cols + ["toStopSequence"])
    instance_by_trip = runtimeDf.groupby(same_trip_cols)["instanceId"]
    runtimeDf[lag_id_col] = instance_by_trip.shift(lagSteps)

    # Step 2: look up the prevInstanceId recorded for that lagged instance.
    thisToJoin = toJoin.rename(
        columns={"instanceId": lag_id_col, "prevInstanceId": lag_prev_col}
    )
    runtimeDf = runtimeDf.merge(thisToJoin, how="left", on=lag_id_col)

print("Produced lag IDs")

# Visit every cell of the frame and swap pd.NA for np.nan; any other value is
# returned unchanged. This runs element by element over all columns.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of
# DataFrame.map — confirm against the pinned pandas version before upgrading.
runtimeDf = runtimeDf.applymap(lambda x: np.nan if x is pd.NA else x)

print("Converted NAs to np.nan")

# Variables copied from each lagged instance (and from the lagged instance's
# previous bus) onto the current row.
lag_vars = ["headway", "speed", "late"]

# The lookup slice is the same for every lag step, so take it once instead of
# copying the whole source frame on each iteration.
lag_source = toJoinFrom[["instanceId"] + lag_vars]

for lagSteps in range(1, 22):
    # (join key column, prefix for the attached feature columns).
    # Order matters: the lagged instance's own values are attached first,
    # then the values of the lagged instance's previous bus.
    join_specs = [
        (f"lag{lagSteps}InstanceId", ""),
        (f"lag{lagSteps}PrevInstanceId", "prevBus_"),
    ]

    for key_col, prefix in join_specs:
        # Derive the rename map from lag_vars so the two cannot drift apart:
        # e.g. "headway" -> "headwayLag3" / "prevBus_headwayLag3".
        renames = {var: f"{prefix}{var}Lag{lagSteps}" for var in lag_vars}
        renames["instanceId"] = key_col

        thisToJoin = lag_source.rename(columns=renames).dropna(subset=[key_col])

        # Left join keeps every row of runtimeDf; rows without a matching
        # lagged instance get NaN in the attached columns.
        runtimeDf = runtimeDf.merge(thisToJoin, how="left", on=key_col)

# Step-to-step change of each lag variable: value at lag k minus value at
# lag k+1, for the bus itself and for its previous bus ("prevBus_" columns).
# The 120 new columns are collected first and attached with a single concat;
# inserting them one at a time fragments the frame's internal blocks.
diff_columns = {}

for lagSteps in range(1, 21):
    for var in lag_vars:
        # Keep the original column order: own diff first, then prevBus diff.
        for prefix in ("", "prevBus_"):
            near_col = f"{prefix}{var}Lag{lagSteps}"
            far_col = f"{prefix}{var}Lag{lagSteps+1}"
            diff_columns[f"{near_col}Diff{lagSteps+1}"] = (
                runtimeDf[near_col] - runtimeDf[far_col]
            )

# Every Series above shares runtimeDf's index, so the column-wise concat
# lines up row for row and appends the columns in insertion order.
runtimeDf = pd.concat([runtimeDf, pd.DataFrame(diff_columns)], axis=1)

# stops = gpd.read_file(f"{server_dir}/raw-data/stops/stopsGeographyProcessed.shp")
# stops = stops.rename(columns={"directionI": "directionId", "StopId": "stopId"}).drop(
#     "geography", axis=1
# )

# stops.routeId = stops.routeId.astype(str)
# stops.directionId = stops.directionId.astype(str)
# stops.stopId = stops.stopId.astype(str)

# stops = stops.drop_duplicates(subset=["routeId", "directionId", "stopId"])

# runtimeDf = runtimeDf.merge(
#     stops[["routeId", "directionId", "stopId", "centerCity"]],
#     how="left",
#     left_on=["routeId", "directionId", "toStopId"],
#     right_on=["routeId", "directionId", "stopId"],
# )




Produced lag IDs
Converted NAs to np.nan


In [57]:
stop_level_path = "C:/Users/huyh/Documents/Penn/Spring 2023/Cloud Computing/cloud-computing-bus-bunching/server/raw-data/stops_spatial_lag.csv"

# Load the stop-level table, drop its stop-sequence column, and cast the three
# key columns to strings. directionId keeps only the text before the first "."
# (e.g. "1.0" -> "1").
stop_level = (
    pd.read_csv(stop_level_path, index_col=False)
    .drop(columns="toStopSequence")
    .astype({"routeId": str, "toStopId": str})
    .assign(
        directionId=lambda frame: frame["directionId"]
        .astype(str)
        .map(lambda text: text.split(".")[0])
    )
)

In [66]:
# Attach the stop-level columns to every runtime row; the left join keeps all
# rows of runtimeDf whether or not a matching stop exists.
# NOTE(review): this rebinds runtimeDf, so running the cell twice merges
# stop_level in a second time — rebuild runtimeDf before re-running.
stop_join_keys = ["routeId", "directionId", "toStopId"]
runtimeDf = pd.merge(runtimeDf, stop_level, how="left", on=stop_join_keys)

In [69]:
# Print the dtype of every column in the merged frame as a quick sanity check.
print(runtimeDf.dtypes)

DoW                     object
serviceDate     datetime64[ns]
routeId                 object
directionId             object
blockId                  int64
                     ...      
sumComm_15             float64
sumComm_20             float64
pctSignal_15           float64
pctSignal_10           float64
pctSignal_20           float64
Length: 360, dtype: object


In [89]:
# Stop-level columns whose missing values are filled with the column mean in
# the next cell.
fill_na_col = [
    "sumRiders_10",
    "sumRiders_20",
    "sumComm_10",
    "sumComm_20",
    "pctSignal_10",
    "pctSignal_20",
    "pop",
    "popDen",
    "riders",
    "commuter",
    "comm_count",
]

# One mean per listed column, computed over the current rows of runtimeDf.
mean = runtimeDf[fill_na_col].mean()

In [90]:
# Replace missing values in the selected columns with that column's mean
# (`mean` is indexed by column name, so each column gets its own fill value).
filled_stop_vars = runtimeDf[fill_na_col].fillna(mean)
runtimeDf[fill_na_col] = filled_stop_vars

In [93]:
# Export subset: the mean-imputed stop variables followed by the three key
# columns. The variable list is reused from fill_na_col rather than retyped,
# so the exported columns always match the ones that were imputed.
stop_key_cols = ['routeId', 'toStopId', 'directionId']
runtimeDf_variables = runtimeDf[fill_na_col + stop_key_cols]

In [94]:
# Persist the stop-level variables as a parquet file.
# NOTE(review): this path is relative to the kernel's working directory, while
# the reads in this notebook use an absolute path — confirm both point at the
# same file.
# NOTE(review): no compression argument is passed, so the ".gzip" suffix does
# not by itself select gzip; pandas' default codec applies — confirm intended.
stop_level_var_path = "server/raw-data/stop_level_var.gzip"
runtimeDf_variables.to_parquet(stop_level_var_path)

In [6]:
# Read the exported stop-level variables back in to inspect what was written.
exported_path = "C:/Users/huyh/Documents/Penn/Spring 2023/Cloud Computing/cloud-computing-bus-bunching/server/raw-data/stop_level_var.gzip"
z = pd.read_parquet(exported_path)

In [None]:
# Show the column dtypes of the reloaded file.
print(z.dtypes)