In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_parquet("../../data/predictive/Taxi_Trips_Sampled_Predictive.parquet")
df.head()

Unnamed: 0,datetime,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,h3_07,h3_08,h3_09,centroid
0,2015-01-01 00:00:00,0,0,1,1,1,3,False,-7.0115,0,872664c11ffffff,882664c115fffff,892664c114fffff,POINT (-87.631717366 41.914616286)
1,2015-01-01 00:30:00,0,0,1,1,1,3,False,-7.0115,0,872664ca5ffffff,882664c165fffff,892664ca597ffff,POINT (-87.676182496 41.950545696)
2,2015-01-01 00:15:00,0,0,1,1,1,3,False,-7.0115,0,872664c1affffff,882664c1a9fffff,892664c1a8bffff,POINT (-87.632746489 41.880994471)
3,2015-01-01 00:00:00,0,0,1,1,1,3,False,-7.0115,0,872664c13ffffff,882664c13bfffff,892664c13afffff,POINT (-87.63576009 41.90749193)
4,2015-01-01 00:30:00,0,0,1,1,1,3,False,-7.0115,0,872664c1effffff,882664c1e3fffff,892664c1e2fffff,POINT (-87.620992913 41.884987192)


In [3]:
time_related_columns = ["hour", "4_hour_block", "day", "week", "month", "day_of_week", "is_weekday"]
wheather_related_columns = ["temp", "precip"]
location_cols = ["h3_07", "h3_08", "h3_09", "centroid"]

In [4]:
print(f"h3 07: {df['h3_07'].nunique()}")
print(f"h3 08: {df['h3_08'].nunique()}")
print(f"h3 09: {df['h3_09'].nunique()}")
print(f"centroid: {df['centroid'].nunique()}")

h3 07: 49
h3 08: 167
h3 09: 263
centroid: 267


In [5]:
time_bucket_length = 1
location_bucket = "h3_07"

# create time bucket

bucket_df = df.copy()
bucket_df["time_bucket_floored"] = bucket_df["datetime"].dt.floor(
        f"{time_bucket_length}H"
)
bucket_df.drop(columns=["datetime"], inplace=True)

# create demand per time and location bucket

demand_df = bucket_df.groupby(["time_bucket_floored", location_bucket]).size().to_frame("demand")
features_df = bucket_df[["time_bucket_floored", location_bucket] + time_related_columns + wheather_related_columns].groupby(["time_bucket_floored", location_bucket]).mean()
## HERE ABOVE time related mean or rather from datetime start floored
demand_feat_df = features_df.merge(demand_df, left_index=True, right_index=True)
# wird nicht mehr gebraucht ausser time related info davon entnehmen
#demand_feat_df["datetime_start_floored"] = demand_feat_df.index.get_level_values(0)
demand_feat_df[location_bucket] = demand_feat_df.index.get_level_values(1)
demand_feat_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,h3_07
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01,872664c10ffffff,0.0,0.0,1.0,1.0,1.0,3.0,0.0,-6.731667,0.0,3,872664c10ffffff
2015-01-01,872664c11ffffff,0.0,0.0,1.0,1.0,1.0,3.0,0.0,-7.0115,0.0,2,872664c11ffffff
2015-01-01,872664c12ffffff,0.0,0.0,1.0,1.0,1.0,3.0,0.0,-6.731667,0.0,3,872664c12ffffff
2015-01-01,872664c13ffffff,0.0,0.0,1.0,1.0,1.0,3.0,0.0,-6.941542,0.0,4,872664c13ffffff
2015-01-01,872664c16ffffff,0.0,0.0,1.0,1.0,1.0,3.0,0.0,-7.0115,0.0,3,872664c16ffffff


In [6]:
# scale features

scaler = StandardScaler()
# scale demand y data as well????
demand_feat_df[time_related_columns + wheather_related_columns] = scaler.fit_transform(demand_feat_df[time_related_columns + wheather_related_columns])
demand_feat_df

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,h3_07
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c10ffffff
2015-01-01 00:00:00,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,872664c11ffffff
2015-01-01 00:00:00,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c12ffffff
2015-01-01 00:00:00,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,872664c13ffffff
2015-01-01 00:00:00,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,872664c16ffffff
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 11:00:00,872664c1bffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1bffffff
2015-12-31 11:00:00,872664c1effffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.121803,-0.138869,6,872664c1effffff
2015-12-31 11:00:00,872664ca9ffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.089096,-0.138869,1,872664ca9ffffff
2015-12-31 12:00:00,872664c1effffff,-0.196166,0.029041,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1effffff


In [7]:
# get one hot encoding for location bucket

demand_feat_df_one_hot = pd.get_dummies(demand_feat_df, columns=[location_bucket], dtype=int)
demand_feat_df_one_hot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,...,h3_07_872664d8bffffff,h3_07_872664d8cffffff,h3_07_872664d8dffffff,h3_07_872664d8effffff,h3_07_872664d98ffffff,h3_07_872664d99ffffff,h3_07_872664d9bffffff,h3_07_87275934cffffff,h3_07_87275934effffff,h3_07_87275936bffffff
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-01-01,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# split data into train and test

X = demand_feat_df_one_hot.drop(columns=["demand"])
y = demand_feat_df_one_hot["demand"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)

In [9]:
# train model

model = SVR(C=1.0, epsilon=0.2)
model.fit(X_train, y_train)

In [10]:
# evaluate model

y_pred = model.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R^2 Score:', r2_score(y_test, y_pred))

Mean Absolute Error: 1.2764227270630981
Mean Squared Error: 4.862735610109737
R^2 Score: 0.720296889529022


In [15]:
def evaluate_hyperparameters (time_bucket_length, location_bucket, c=1.0, epsilon=0.2):

    # create time bucket
    bucket_df = df.copy()
    bucket_df["time_bucket_floored"] = bucket_df["datetime"].dt.floor(
            f"{time_bucket_length}H"
    )
    bucket_df.drop(columns=["datetime"], inplace=True)

    # create demand per time and location bucket
    demand_df = bucket_df.groupby(["time_bucket_floored", location_bucket]).size().to_frame("demand")
    features_df = bucket_df[["time_bucket_floored", location_bucket] + time_related_columns + wheather_related_columns].groupby(["time_bucket_floored", location_bucket]).mean()
    demand_feat_df = features_df.merge(demand_df, left_index=True, right_index=True)
    demand_feat_df[location_bucket] = demand_feat_df.index.get_level_values(1)

    # scale features
    scaler = StandardScaler()
    demand_feat_df[time_related_columns + wheather_related_columns] = scaler.fit_transform(demand_feat_df[time_related_columns + wheather_related_columns])

    # get one hot encoding for location bucket
    demand_feat_df_one_hot = pd.get_dummies(demand_feat_df, columns=[location_bucket], dtype=int)

    # split data into train and test
    X = demand_feat_df_one_hot.drop(columns=["demand"])
    y = demand_feat_df_one_hot["demand"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)

    # train model
    model = SVR(C=c, epsilon=epsilon)
    model.fit(X_train, y_train)

    # evaluate model
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred).round(2)
    mse = mean_squared_error(y_test, y_pred).round(2)
    r2 = r2_score(y_test, y_pred).round(4)

    return mae, mse, r2

In [16]:
results = []
for time_bucket_length in [1, 2, 6, 24]:
    for location_bucket in ["h3_07"]: # "h3_08", "h3_09", "centroid"
        print(f"evaluate time bucket {time_bucket_length} and location bucket: {location_bucket}")
        mae, mse, r2 = evaluate_hyperparameters(time_bucket_length, location_bucket)
        print(f"mae: {mae}, mse: {mse}, r2: {r2}")
        results.append({"time_bucket_length": time_bucket_length, "location_bucket": location_bucket, "mae": mae, "mse": mse, "r2": r2})

evaluate time bucket 1 and location bucket: h3_07
mae: 1.28, mse: 4.86, r2: 0.7203
evaluate time bucket 2 and location bucket: h3_07
mae: 1.97, mse: 14.1, r2: 0.7456
evaluate time bucket 6 and location bucket: h3_07
mae: 4.36, mse: 106.56, r2: 0.6846
evaluate time bucket 24 and location bucket: h3_07
mae: 18.0, mse: 2447.02, r2: 0.1562


In [17]:
pd.DataFrame(results)

Unnamed: 0,time_bucket_length,location_bucket,mae,mse,r2
0,1,h3_07,1.28,4.86,0.7203
1,2,h3_07,1.97,14.1,0.7456
2,6,h3_07,4.36,106.56,0.6846
3,24,h3_07,18.0,2447.02,0.1562
