In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import random

import warnings
warnings.filterwarnings('ignore')

In this notebook we run multiple SVM models to predict the taxi demand for different time and location resolutions. Because there are many hexagons, each of which is split into its own one-hot feature, a full grid search over all hexagons is not feasible with the computing power we have available. Therefore, we sample the data by using 10 random hexagons for the hyperparameter optimization.

## Define Functions for Training SVM Models

In [2]:
def scale_data_and_get_dummies(data, location_bucket):
    """Standardize the feature columns and one-hot encode the location bucket.

    Every column except ``location_bucket`` and ``"demand"`` is transformed by
    a freshly fitted ``StandardScaler``; afterwards the location bucket column
    is expanded into integer dummy columns via ``pd.get_dummies``.

    The caller's frame is never modified: all work happens on a copy, so a
    filtered slice of a larger frame can be passed in safely (the previous
    in-place assignment wrote into such slices).

    Note that the scaler is fitted on every row passed in, i.e. before any
    train/test split the caller performs afterwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``location_bucket`` and a ``"demand"`` column.
    location_bucket : str
        Name of the categorical location column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        New frame with scaled features, the untouched ``"demand"`` column and
        one dummy column per distinct location value.

    Raises
    ------
    ValueError
        If ``location_bucket`` or ``"demand"`` is not a column of ``data``
        (raised by ``list.remove``).
    """
    # work on a private copy so the input (often a slice) stays unchanged
    data = data.copy()

    # scale data except location bucket and demand
    col_scale_list = data.columns.to_list()
    col_scale_list.remove(location_bucket)
    col_scale_list.remove("demand")
    if col_scale_list:  # nothing to scale if only location + demand are present
        scaler = StandardScaler()
        data[col_scale_list] = scaler.fit_transform(data[col_scale_list])

    # create one hot encoding for location bucket feature
    return pd.get_dummies(data, columns=[location_bucket], dtype=int)

def split_train_test(data, target_col="demand"):
    """Separate the target column from the features and hold out 20 % of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns plus ``target_col``.
    target_col : str, default "demand"
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = data.drop(labels=[target_col], axis="columns")
    target = data[target_col]
    # fixed random_state -> the same partition on every run
    partitions = train_test_split(features, target, random_state=4711, test_size=0.2)
    return tuple(partitions)

In [3]:
def grid_search_linear_svr(X_train, y_train, param_grid):
    """Run a 5-fold grid search over ``param_grid`` for a ``LinearSVR`` model.

    Re-enabled because the single-run test cell for the 24h / h3_07 data set
    still calls this helper; with the definition commented out that cell
    fails with a NameError on a fresh kernel.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed straight to ``GridSearchCV.fit``.
    param_grid : dict
        Candidate values per ``LinearSVR`` hyperparameter.

    Returns
    -------
    GridSearchCV
        The fitted search object (scored with negative mean squared error).
    """
    linear_svr_model = LinearSVR()
    grid_search_linear = GridSearchCV(linear_svr_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    grid_search_linear.fit(X_train, y_train)
    return grid_search_linear

# Search spaces handed to the grid search, one dictionary per SVR kernel.
# The trailing "# 1" markers are kept from the original grid; presumably a
# gamma candidate of 1 was dropped from the search — TODO confirm.

param_grid_linear = dict(
    C=[0.1, 1, 10, 100],
    epsilon=[0.1, 0.2, 0.3, 0.4, 0.5],
)

param_grid_poly = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.001, 0.01, 0.1],  # 1
    epsilon=[0.1, 0.2, 0.3, 0.4, 0.5],
    degree=[2, 3, 4, 5],
)

param_grid_rbf = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto', 0.001, 0.01, 0.1],  # 1
    epsilon=[0.1, 0.2, 0.3, 0.4, 0.5],
)

def grid_search_kernel_svr(X_train, y_train, param_grid, kernel):
    """Grid-search the hyperparameters of an ``SVR`` with the given kernel.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    param_grid : dict
        Candidate values per ``SVR`` hyperparameter.
    kernel : str
        Kernel name forwarded to ``SVR(kernel=...)``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_settings = {
        "cv": 5,                              # 5-fold cross-validation
        "scoring": 'neg_mean_squared_error',  # scores are negated MSE values
        "verbose": 0,
        "n_jobs": -1,
    }
    searcher = GridSearchCV(SVR(kernel=kernel), param_grid, **search_settings)
    # fit() returns the search object itself
    return searcher.fit(X_train, y_train)

## Run Grid Search

In [4]:
# Temporal resolutions to evaluate (bucket length in hours).
time_bucket_lengths = [
    1,
    2,
    6,
    24,
]

# Spatial resolutions to evaluate. "centroid" is still missing because it can
# only be aggregated via latitude/longitude.
# TODO centroid implementation in function
location_buckets = [
    "h3_07",
    "h3_08",
    "h3_09",
]

In [5]:
results = []
grid_dfs = []

# Sampling configuration for the hyperparameter search.
N_SAMPLED_HEXAGONS = 10   # number of location values drawn per data set
MAX_SAMPLED_ROWS = 10000  # row cap applied after the hexagon sample

# Dedicated, seeded RNG so the hexagon sample is the same on every
# Restart & Run All (the row subsample below already uses a fixed seed).
hexagon_rng = random.Random(4711)


def _search_and_tag(X_train, y_train, param_grid, kernel, model_name, location_bucket, time_bucket_length):
    """Grid-search one SVR kernel and label its ``cv_results_`` table.

    Returns the fitted search object together with a DataFrame of its
    ``cv_results_`` extended by the columns ``location_bucket``,
    ``time_bucket_length`` and ``model``.
    """
    grid_result = grid_search_kernel_svr(X_train, y_train, param_grid, kernel)
    grid_df = pd.DataFrame(grid_result.cv_results_)
    grid_df["location_bucket"] = location_bucket
    grid_df["time_bucket_length"] = time_bucket_length
    grid_df["model"] = model_name
    return grid_result, grid_df


for time_bucket_length in time_bucket_lengths:
    for location_bucket in location_buckets:

        print(f"Time bucket length: {time_bucket_length}h")
        print(f"Location bucket: {location_bucket}")

        # read in data
        data = pd.read_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")
        print(data.shape)

        # create sample from data with 10 hexagons (or all of them if the
        # data set contains fewer, which would otherwise raise ValueError)
        hexagons = data[location_bucket].unique().tolist()
        sampled_hexagons = hexagon_rng.sample(hexagons, min(N_SAMPLED_HEXAGONS, len(hexagons)))
        # .copy() detaches the sample from `data` before it is processed further
        data_sampled = data[data[location_bucket].isin(sampled_hexagons)].copy()
        print(data_sampled.shape)
        if data_sampled.shape[0] >= MAX_SAMPLED_ROWS:
            data_sampled = data_sampled.sample(MAX_SAMPLED_ROWS, random_state=4711)
            print(data_sampled.shape)

        # scale data and create one hot encoding for location bucket feature
        scaled_data = scale_data_and_get_dummies(data_sampled, location_bucket)

        # split data into train and test
        X_train, X_test, y_train, y_test = split_train_test(scaled_data)

        # The linear model is an SVR with a linear kernel (not LinearSVR), so
        # the progress message names it accordingly.
        print("run SVR linear kernel")
        grid_result_linear, grid_df_linear = _search_and_tag(
            X_train, y_train, param_grid_linear, "linear", "SVR_linear", location_bucket, time_bucket_length
        )

        print("run SVR poly kernel")
        grid_result_poly, grid_df_poly = _search_and_tag(
            X_train, y_train, param_grid_poly, "poly", "SVR_poly", location_bucket, time_bucket_length
        )

        print("run SVR rbf kernel")
        grid_result_rbf, grid_df_rbf = _search_and_tag(
            X_train, y_train, param_grid_rbf, "rbf", "SVR_rbf", location_bucket, time_bucket_length
        )

        print("concat results")
        grid_df = pd.concat([grid_df_linear, grid_df_poly, grid_df_rbf])
        grid_dfs.append(grid_df)

        # best_score_ is the mean cross-validated negative MSE of the winner
        print("save best params and scores")
        results.append({
            "time_bucket_length": time_bucket_length,
            "location_bucket": location_bucket,
            "linear_svr_best_params": grid_result_linear.best_params_,
            "linear_svr_best_score": grid_result_linear.best_score_,
            "poly_svr_best_params": grid_result_poly.best_params_,
            "poly_svr_best_score": grid_result_poly.best_score_,
            "rbf_svr_best_params": grid_result_rbf.best_params_,
            "rbf_svr_best_score": grid_result_rbf.best_score_
        })

        print()

Time bucket length: 1h
Location bucket: h3_07
(163534, 13)
(20266, 13)
(10000, 13)
run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores

Time bucket length: 1h
Location bucket: h3_08
(410624, 13)
(8326, 13)
run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores

Time bucket length: 1h
Location bucket: h3_09
(572255, 13)
(19636, 13)
(10000, 13)
run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores

Time bucket length: 2h
Location bucket: h3_07
(94064, 13)
(10889, 13)
(10000, 13)
run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores

Time bucket length: 2h
Location bucket: h3_08
(245400, 13)
(14997, 13)
(10000, 13)
run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores

Time bucket length: 2h
Location bucket: h3_09
(349848, 13)
(16518, 13)
(10000, 13)
run LinearSVR
run SVR poly kern

In [6]:
# One row per (time bucket, location bucket) run, holding the best parameters
# and best cross-validation score (negative MSE) of each kernel.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,time_bucket_length,location_bucket,linear_svr_best_params,linear_svr_best_score,poly_svr_best_params,poly_svr_best_score,rbf_svr_best_params,rbf_svr_best_score
0,1,h3_07,"{'C': 100, 'epsilon': 0.2}",-2683.614,"{'C': 100, 'degree': 5, 'epsilon': 0.5, 'gamma...",-611.778,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-750.3008
1,1,h3_08,"{'C': 10, 'epsilon': 0.5}",-29.00646,"{'C': 10, 'degree': 4, 'epsilon': 0.5, 'gamma'...",-22.5403,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-20.40815
2,1,h3_09,"{'C': 100, 'epsilon': 0.2}",-2711.028,"{'C': 100, 'degree': 4, 'epsilon': 0.1, 'gamma...",-1217.768,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-1200.616
3,2,h3_07,"{'C': 100, 'epsilon': 0.5}",-1705.956,"{'C': 100, 'degree': 5, 'epsilon': 0.1, 'gamma...",-840.9947,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-813.1737
4,2,h3_08,"{'C': 0.1, 'epsilon': 0.4}",-2220.011,"{'C': 100, 'degree': 5, 'epsilon': 0.5, 'gamma...",-652.3096,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-681.5047
5,2,h3_09,"{'C': 100, 'epsilon': 0.5}",-240.9235,"{'C': 100, 'degree': 4, 'epsilon': 0.5, 'gamma...",-123.9221,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-126.1452
6,6,h3_07,"{'C': 100, 'epsilon': 0.4}",-36.53835,"{'C': 100, 'degree': 4, 'epsilon': 0.5, 'gamma...",-22.69115,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-19.16565
7,6,h3_08,"{'C': 100, 'epsilon': 0.5}",-1817.358,"{'C': 100, 'degree': 4, 'epsilon': 0.1, 'gamma...",-1197.17,"{'C': 100, 'epsilon': 0.4, 'gamma': 0.1}",-1258.397
8,6,h3_09,"{'C': 100, 'epsilon': 0.1}",-5170.598,"{'C': 100, 'degree': 4, 'epsilon': 0.5, 'gamma...",-2809.077,"{'C': 100, 'epsilon': 0.4, 'gamma': 0.1}",-2796.105
9,24,h3_07,"{'C': 100, 'epsilon': 0.1}",-6806615.0,"{'C': 100, 'degree': 3, 'epsilon': 0.5, 'gamma...",-40288970.0,"{'C': 100, 'epsilon': 0.1, 'gamma': 'scale'}",-33084600.0


In [7]:
# Save the per-run summary table as CSV, without the index column.
results_df.to_csv("../../data/predictive/SVR_results_10000.csv", index=False)

In [8]:
# Stack the per-run cv_results_ tables into one long frame.
# `grid_dfs` starts out as a list of frames and is rebound to the combined
# frame here, which is what the following cells work with. The isinstance
# guard keeps this cell idempotent: without it, a second execution would
# iterate over the column names of the already combined frame and fail.
if isinstance(grid_dfs, list):
    grid_dfs_list = [pd.DataFrame(grid_df) for grid_df in grid_dfs]
    grid_dfs = pd.concat(grid_dfs_list)
grid_dfs

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,location_bucket,time_bucket_length,model,param_degree,param_gamma
0,1.195531,0.021982,0.172250,0.010931,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-2435.798738,-2477.138606,-2899.176847,-2775.698457,-2929.982427,-2703.559015,208.662294,20,h3_07,1,SVR_linear,,
1,1.212312,0.017108,0.165603,0.006043,0.1,0.2,"{'C': 0.1, 'epsilon': 0.2}",-2435.059616,-2477.299054,-2900.424151,-2776.187793,-2928.558586,-2703.505840,208.777925,19,h3_07,1,SVR_linear,,
2,1.159882,0.038872,0.152435,0.005850,0.1,0.3,"{'C': 0.1, 'epsilon': 0.3}",-2435.039216,-2478.186174,-2901.201697,-2774.830519,-2928.184305,-2703.488382,208.563924,18,h3_07,1,SVR_linear,,
3,1.124687,0.014562,0.164284,0.012034,0.1,0.4,"{'C': 0.1, 'epsilon': 0.4}",-2434.045221,-2477.874833,-2901.091165,-2774.358448,-2927.041831,-2702.882300,208.588027,17,h3_07,1,SVR_linear,,
4,1.132837,0.047351,0.158863,0.006159,0.1,0.5,"{'C': 0.1, 'epsilon': 0.5}",-2433.626702,-2475.913500,-2899.664414,-2773.555688,-2927.310093,-2702.014079,208.851817,16,h3_07,1,SVR_linear,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.057677,0.012900,0.016842,0.003803,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'scale'}",-5691.867983,-7336.505832,-5772.244408,-14331.273929,-9339.759472,-8494.330325,3205.893496,10,h3_09,24,SVR_rbf,,scale
96,0.045034,0.003562,0.014678,0.002735,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'auto'}",-6481.226224,-9518.222542,-7803.165247,-16852.264457,-12042.364632,-10539.448621,3663.597105,11,h3_09,24,SVR_rbf,,auto
97,0.051141,0.005935,0.015599,0.001454,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.001}",-22236.740661,-26347.641633,-27413.781342,-39630.441323,-31969.879847,-29519.696961,5929.060900,37,h3_09,24,SVR_rbf,,0.001
98,0.042238,0.003319,0.017249,0.004109,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.01}",-11453.046495,-16378.269971,-15500.099624,-26023.435244,-21292.389053,-18129.448077,5037.333920,19,h3_09,24,SVR_rbf,,0.01


In [9]:
# Save the combined grid-search results as CSV, without the index column.
grid_dfs.to_csv("../../data/predictive/SVR_results_grids_10000.csv", index=False)

In [13]:
# Five parameter combinations with the highest mean CV score over all runs.
# Scores are negative MSE values, so the largest value is the smallest error.
# NOTE(review): every run is fitted on a different data set, so this ranking
# across runs presumably favours data sets with small demand values — confirm
# that a cross-run comparison is intended.
grid_dfs.sort_values(by="mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,location_bucket,time_bucket_length,model,param_degree,param_gamma
99,0.229452,0.017303,0.022747,0.000105,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-15.605104,-17.596066,-21.187871,-31.485896,-9.953329,-19.165653,7.153821,1,h3_07,6,SVR_rbf,,0.1
94,0.26859,0.025119,0.030578,0.004737,100,0.4,"{'C': 100, 'epsilon': 0.4, 'gamma': 0.1}",-15.609267,-17.643275,-21.219598,-31.526877,-9.958436,-19.191491,7.165942,2,h3_07,6,SVR_rbf,,0.1
89,0.269812,0.013806,0.031574,0.003462,100,0.3,"{'C': 100, 'epsilon': 0.3, 'gamma': 0.1}",-15.593028,-17.691254,-21.226299,-31.566384,-9.983308,-19.212055,7.173097,3,h3_07,6,SVR_rbf,,0.1
84,0.268372,0.017917,0.037493,0.009532,100,0.2,"{'C': 100, 'epsilon': 0.2, 'gamma': 0.1}",-15.62055,-17.646275,-21.200667,-31.647745,-10.024995,-19.228046,7.188217,4,h3_07,6,SVR_rbf,,0.1
79,0.287352,0.01534,0.032555,0.003408,100,0.1,"{'C': 100, 'epsilon': 0.1, 'gamma': 0.1}",-15.636599,-17.803072,-21.346347,-31.619358,-10.041383,-19.289352,7.174076,5,h3_07,6,SVR_rbf,,0.1


In [15]:
# Five best-scoring parameter combinations for the h3_07 / 1h run only.
is_h3_07 = grid_dfs['location_bucket'] == "h3_07"
is_hourly = grid_dfs['time_bucket_length'] == 1
grid_dfs.loc[is_h3_07 & is_hourly].sort_values(by="mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,location_bucket,time_bucket_length,model,param_degree,param_gamma
399,17.499868,2.405999,0.114595,0.001206,100,0.5,"{'C': 100, 'degree': 5, 'epsilon': 0.5, 'gamma...",-653.759187,-497.214731,-601.344,-633.8204,-672.751502,-611.777964,61.967195,1,h3_07,1,SVR_poly,5,0.1
394,24.541117,2.502294,0.138578,0.010246,100,0.4,"{'C': 100, 'degree': 5, 'epsilon': 0.4, 'gamma...",-653.18561,-497.655822,-602.05713,-636.39098,-677.659662,-613.389841,62.874731,2,h3_07,1,SVR_poly,5,0.1
389,26.298891,3.216299,0.152619,0.008072,100,0.3,"{'C': 100, 'degree': 5, 'epsilon': 0.3, 'gamma...",-653.161319,-497.81684,-601.737152,-638.765698,-680.300306,-614.356263,63.546752,3,h3_07,1,SVR_poly,5,0.1
384,24.14312,2.896626,0.154922,0.010605,100,0.2,"{'C': 100, 'degree': 5, 'epsilon': 0.2, 'gamma...",-653.523069,-497.874032,-601.497614,-641.470417,-685.972331,-616.067493,64.987523,4,h3_07,1,SVR_poly,5,0.1
379,25.278293,2.048827,0.16024,0.003886,100,0.1,"{'C': 100, 'degree': 5, 'epsilon': 0.1, 'gamma...",-653.431155,-497.942052,-602.638633,-642.914845,-686.655165,-616.71637,65.163321,5,h3_07,1,SVR_poly,5,0.1


In [17]:
# Summary statistics of the raw target column for one data set.
# NOTE(review): time_bucket_length and location_bucket are not assigned in
# this cell; it uses whatever values an earlier cell left behind (e.g. the
# last iteration of the grid-search loop), so the file that is loaded depends
# on execution order — consider setting both explicitly here.
orig_data = pd.read_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")
orig_data["demand"].describe()

count    62570.000000
mean       273.951990
std        825.839861
min          1.000000
25%          4.000000
50%         20.000000
75%        126.000000
max      11690.000000
Name: demand, dtype: float64

### Test for time bucket 24 and location h3_07

In [29]:
# Configuration of the single-run test below: 24h buckets at location resolution h3_07.
time_bucket_length = 24
location_bucket = "h3_07"

In [30]:
# read in data
data = pd.read_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")
print(data.shape)

# create sample from data with 10 hexagons
# - the column is taken from `location_bucket` instead of a hard-coded
#   "h3_07", so the cell keeps working when the configuration is changed
# - a seeded RNG makes the sample reproducible across kernel restarts
# - at most as many hexagons as exist are drawn (random.sample would raise
#   ValueError otherwise)
hexagons = data[location_bucket].unique().tolist()
sampled_hexagons = random.Random(4711).sample(hexagons, min(10, len(hexagons)))
# .copy() detaches the sample from `data` before it is processed further
data_sampled = data[data[location_bucket].isin(sampled_hexagons)].copy()
print(data_sampled.shape)

(12947, 12)
(1200, 12)


In [31]:
# Prepare the sampled data: scaling + one-hot encoding, then an 80/20 split.
scaled_data = scale_data_and_get_dummies(data_sampled, location_bucket)
X_train, X_test, y_train, y_test = split_train_test(scaled_data)

# Columns identifying this run; appended to every cv_results_ table below.
run_tags = {"location_bucket": location_bucket, "time_bucket_length": time_bucket_length}

print("run LinearSVR")
grid_result_linear = grid_search_linear_svr(X_train, y_train, param_grid_linear)
grid_df_linear = pd.DataFrame(grid_result_linear.cv_results_).assign(**run_tags, model="LinearSVR")

print("run SVR poly kernel")
grid_result_poly = grid_search_kernel_svr(X_train, y_train, param_grid_poly, "poly")
grid_df_poly = pd.DataFrame(grid_result_poly.cv_results_).assign(**run_tags, model="SVR_poly")

print("run SVR rbf kernel")
grid_result_rbf = grid_search_kernel_svr(X_train, y_train, param_grid_rbf, "rbf")
grid_df_rbf = pd.DataFrame(grid_result_rbf.cv_results_).assign(**run_tags, model="SVR_rbf")

# Stack the three tables into one frame and display it.
print("concat results")
grid_df = pd.concat([grid_df_linear, grid_df_poly, grid_df_rbf])
grid_df

run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,location_bucket,time_bucket_length,model,param_degree,param_gamma
0,0.001387,0.000198,0.000668,0.000086,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-1.991368e+06,-1.833703e+06,-1.992102e+06,-1.739407e+06,-1.980829e+06,-1.907482e+06,103217.917918,16,h3_07,24,LinearSVR,,
1,0.001328,0.000115,0.000674,0.000050,0.1,0.2,"{'C': 0.1, 'epsilon': 0.2}",-1.991057e+06,-1.834297e+06,-1.992129e+06,-1.739324e+06,-1.980946e+06,-1.907550e+06,103130.981265,19,h3_07,24,LinearSVR,,
2,0.001283,0.000083,0.000636,0.000015,0.1,0.3,"{'C': 0.1, 'epsilon': 0.3}",-1.991082e+06,-1.834124e+06,-1.992226e+06,-1.739177e+06,-1.980993e+06,-1.907520e+06,103230.151667,17,h3_07,24,LinearSVR,,
3,0.001107,0.000059,0.000617,0.000018,0.1,0.4,"{'C': 0.1, 'epsilon': 0.4}",-1.991015e+06,-1.834331e+06,-1.992393e+06,-1.738828e+06,-1.981080e+06,-1.907529e+06,103343.494192,18,h3_07,24,LinearSVR,,
4,0.001352,0.000210,0.000663,0.000033,0.1,0.5,"{'C': 0.1, 'epsilon': 0.5}",-1.991243e+06,-1.834669e+06,-1.991756e+06,-1.739155e+06,-1.981239e+06,-1.907612e+06,103144.514960,20,h3_07,24,LinearSVR,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.017634,0.000532,0.008767,0.005668,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'scale'}",-1.548818e+05,-1.321544e+05,-1.498288e+05,-9.914900e+04,-1.123002e+05,-1.296628e+05,21360.834811,10,h3_07,24,SVR_rbf,,scale
96,0.019223,0.003701,0.006141,0.000139,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'auto'}",-1.667854e+05,-1.378105e+05,-1.718236e+05,-1.008956e+05,-1.141381e+05,-1.382906e+05,27994.218743,15,h3_07,24,SVR_rbf,,auto
97,0.015855,0.002687,0.006041,0.000193,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.001}",-9.612838e+05,-8.427799e+05,-9.606465e+05,-8.169635e+05,-9.455249e+05,-9.054397e+05,62493.964148,37,h3_07,24,SVR_rbf,,0.001
98,0.018552,0.007774,0.006033,0.000133,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.01}",-2.696548e+05,-2.075265e+05,-2.896555e+05,-2.130689e+05,-2.281191e+05,-2.416050e+05,32413.487873,16,h3_07,24,SVR_rbf,,0.01


In [9]:
# train model
# RBF-kernel SVR with fixed hyperparameters, fitted on the current training split.
model = SVR(C=10, epsilon=0.1, kernel="rbf")
model.fit(X_train, y_train)

In [10]:
# evaluate model
# Predictions are made for the whole test split, but the metrics below only
# use the first 100 rows.
# NOTE(review): restricting the metrics to 100 rows looks like a leftover from
# quick experimentation — confirm whether the full test split should be scored.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test[:100], y_pred[:100]).round(2)
mse = mean_squared_error(y_test[:100], y_pred[:100]).round(2)
r2 = r2_score(y_test[:100], y_pred[:100]).round(4)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")

: 

In [9]:
# train model
# NOTE(review): X_train_sampled and y_train_sampled are not assigned in any
# cell visible in this part of the notebook; on a fresh kernel this cell
# presumably fails with a NameError — confirm where they are meant to come from.
model = SVR(C=10, epsilon=0.1, kernel="rbf")
model.fit(X_train_sampled, y_train_sampled)

# full version of training models

In [7]:
# scale features
# NOTE(review): demand_feat_df, time_related_columns and
# wheather_related_columns are not assigned in any cell visible in this part
# of the notebook — confirm where they are defined.

scaler = StandardScaler()
# scale demand y data as well????
# The scaled values are written back into the same columns of demand_feat_df,
# so re-running this cell scales already scaled data again.
demand_feat_df[time_related_columns + wheather_related_columns] = scaler.fit_transform(demand_feat_df[time_related_columns + wheather_related_columns])
demand_feat_df

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,h3_07
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c10ffffff
2015-01-01 00:00:00,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,872664c11ffffff
2015-01-01 00:00:00,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c12ffffff
2015-01-01 00:00:00,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,872664c13ffffff
2015-01-01 00:00:00,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,872664c16ffffff
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 11:00:00,872664c1bffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1bffffff
2015-12-31 11:00:00,872664c1effffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.121803,-0.138869,6,872664c1effffff
2015-12-31 11:00:00,872664ca9ffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.089096,-0.138869,1,872664ca9ffffff
2015-12-31 12:00:00,872664c1effffff,-0.196166,0.029041,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1effffff


In [8]:
# get one hot encoding for location bucket
# Every distinct value of the location_bucket column becomes its own integer
# dummy column; the result is stored in a new frame.

demand_feat_df_one_hot = pd.get_dummies(demand_feat_df, columns=[location_bucket], dtype=int)
demand_feat_df_one_hot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,...,h3_07_872664d8bffffff,h3_07_872664d8cffffff,h3_07_872664d8dffffff,h3_07_872664d8effffff,h3_07_872664d98ffffff,h3_07_872664d99ffffff,h3_07_872664d9bffffff,h3_07_87275934cffffff,h3_07_87275934effffff,h3_07_87275936bffffff
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-01-01,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# split data into train and test
# "demand" is the target; every other column is used as a feature.

X = demand_feat_df_one_hot.drop(columns=["demand"])
y = demand_feat_df_one_hot["demand"]

# 20 % of the rows are held out; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)

In [10]:
# train model
# No kernel is specified here, so SVR falls back to its library default.

model = SVR(C=1.0, epsilon=0.2)
model.fit(X_train, y_train)

In [10]:
# evaluate model

y_pred = model.predict(X_test)

# Print each hold-out metric under its display label.
for metric_label, metric_fn in (
    ('Mean Absolute Error:', mean_absolute_error),
    ('Mean Squared Error:', mean_squared_error),
    ('R^2 Score:', r2_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Mean Absolute Error: 1.2764227270630981
Mean Squared Error: 4.862735610109737
R^2 Score: 0.720296889529022


In [None]:
#def train_and_evaluate_SVR_model (X_train, X_test, y_train, y_test, c=1.0, epsilon=0.2, kernel="linear"):
#    # train model
#    model = SVR(C=c, epsilon=epsilon, kernel=kernel)
#    model.fit(X_train, y_train)
#
#    # evaluate model
#    y_pred = model.predict(X_test)
#    mae = mean_absolute_error(y_test, y_pred).round(2)
#    mse = mean_squared_error(y_test, y_pred).round(2)
#    r2 = r2_score(y_test, y_pred).round(4)
#
#    return mae, mse, r2


#mae, mse, r2 = train_and_evaluate_SVR_model(X_train, X_test, y_train, y_test, c=1.0, epsilon=0.2, kernel="linear")

#print(f"MAE: {mae}")
#print(f"MSE: {mse}")
#print(f"R2: {r2}")

In [None]:
# Candidate hyperparameter values for a wider SVR search.
regularization_c = [
    0.01,
    0.1,
    1.0,
    10.0,
    100.0,
]
possible_kernels = [
    "linear",
    "poly",
    "rbf",
    "sigmoid",
    "precomputed",
]
# only for poly kernel: polynomial degrees 2 through 10
degrees_poly_kerne = list(range(2, 11))
# only for poly, rbf, sigmoid kernel
gamma = [
    "scale",
    "auto",
]

In [16]:
# Collect MAE / MSE / R2 for every time resolution (h3_07 only for now).
# NOTE(review): evaluate_hyperparameters is not defined in any cell visible in
# this part of the notebook — confirm where it comes from.
results = []
for time_bucket_length in [1, 2, 6, 24]:
    for location_bucket in ["h3_07"]: # "h3_08", "h3_09", "centroid"
        print(f"evaluate time bucket {time_bucket_length} and location bucket: {location_bucket}")
        mae, mse, r2 = evaluate_hyperparameters(time_bucket_length, location_bucket)
        print(f"mae: {mae}, mse: {mse}, r2: {r2}")
        run_summary = dict(
            time_bucket_length=time_bucket_length,
            location_bucket=location_bucket,
            mae=mae,
            mse=mse,
            r2=r2,
        )
        results.append(run_summary)

evaluate time bucket 1 and location bucket: h3_07
mae: 1.28, mse: 4.86, r2: 0.7203
evaluate time bucket 2 and location bucket: h3_07
mae: 1.97, mse: 14.1, r2: 0.7456
evaluate time bucket 6 and location bucket: h3_07
mae: 4.36, mse: 106.56, r2: 0.6846
evaluate time bucket 24 and location bucket: h3_07
mae: 18.0, mse: 2447.02, r2: 0.1562


In [17]:
# Show the collected evaluation metrics as a table, one row per run.
pd.DataFrame(results)

Unnamed: 0,time_bucket_length,location_bucket,mae,mse,r2
0,1,h3_07,1.28,4.86,0.7203
1,2,h3_07,1.97,14.1,0.7456
2,6,h3_07,4.36,106.56,0.6846
3,24,h3_07,18.0,2447.02,0.1562
