In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import random

import warnings
warnings.filterwarnings('ignore')

In [2]:
def scale_data_and_get_dummies(data, location_bucket):
    # scale data except location bucket and demand
    col_scale_list = data.columns.to_list()
    col_scale_list.remove(location_bucket)
    col_scale_list.remove("demand")
    scaler = StandardScaler()
    data[col_scale_list] = scaler.fit_transform(data[col_scale_list])
    # create one hot encoding for location bucket feature
    data = pd.get_dummies(data, columns=[location_bucket], dtype=int)
    return data

def split_train_test(data, target_col="demand"):
    # split data into train and test
    X = data.drop(columns=[target_col])
    y = data[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)
    return X_train, X_test, y_train, y_test 

In [3]:
param_grid_linear = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
}

def grid_search_linear_svr(X_train, y_train, param_grid):
    linear_svr_model = LinearSVR()
    grid_search_linear = GridSearchCV(linear_svr_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    grid_search_linear.fit(X_train, y_train)
    return grid_search_linear

param_grid_poly = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1], # 1
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
    'degree': [2, 3, 4, 5]
}

param_grid_rbf = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1], # 1
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5] 
}

def grid_search_kernel_svr(X_train, y_train, param_grid, kernel):
    svr = SVR(kernel=kernel)
    grid_search_kernel = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
    grid_search_kernel.fit(X_train, y_train)
    return grid_search_kernel

In [4]:
time_bucket_lengths = [1, 2, 6, 24]
location_buckets = ["h3_07", "h3_08", "h3_09"] # "centroid" fehlt noch weil nur über langitude longitude aggregierbar 
# TODO centroid implementation in function

In [10]:
results = []
grid_dfs = []

#for time_bucket_length in time_bucket_lengths:
#    for location_bucket in location_buckets:

for time_bucket_length in [6, 24]:
    for location_bucket in ["h3_07", "h3_08"]:
        print(f"Time bucket length: {time_bucket_length}h")
        print(f"Location bucket: {location_bucket}")

        # read in data
        data = pd.read_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")
        print(data.shape)

        # create sample from data with 10 hexagons
        sampled_hexagons =  random.sample(data[location_bucket].unique().tolist(), 10)
        data_sampled = data[data[location_bucket].isin(sampled_hexagons)]
        print(data_sampled.shape)
        if data_sampled.shape[0] >= 1000:
            data_sampled = data_sampled.sample(1000, random_state=4711)
            print(data_sampled.shape)
        
        # scale data and create one hot encoding for location bucket feature
        scaled_data = scale_data_and_get_dummies(data_sampled, location_bucket)

        # split data into train and test
        X_train, X_test, y_train, y_test = split_train_test(scaled_data)

        print("run LinearSVR")
        grid_result_linear = grid_search_linear_svr(X_train, y_train, param_grid_linear)
        grid_df_linear = pd.DataFrame(grid_result_linear.cv_results_)
        grid_df_linear["location_bucket"] = location_bucket
        grid_df_linear["time_bucket_length"] = time_bucket_length
        grid_df_linear["model"] = "LinearSVR"

        print("run SVR poly kernel")
        grid_result_poly = grid_search_kernel_svr(X_train, y_train, param_grid_poly, "poly")
        grid_df_poly = pd.DataFrame(grid_result_poly.cv_results_)
        grid_df_poly["location_bucket"] = location_bucket
        grid_df_poly["time_bucket_length"] = time_bucket_length
        grid_df_poly["model"] = "SVR_poly"

        print("run SVR rbf kernel")
        grid_result_rbf = grid_search_kernel_svr(X_train, y_train, param_grid_rbf, "rbf")
        grid_df_rbf = pd.DataFrame(grid_result_rbf.cv_results_)
        grid_df_rbf["location_bucket"] = location_bucket
        grid_df_rbf["time_bucket_length"] = time_bucket_length
        grid_df_rbf["model"] = "SVR_rbf"

        print("concat results")
        grid_df = pd.concat([grid_df_linear, grid_df_poly, grid_df_rbf])
        grid_dfs.append(grid_df)

        print("save best params and scores")
        results.append({
            "time_bucket_length": [time_bucket_length],
            "location_bucket": [location_bucket],
            "linear_svr_best_params": [grid_result_linear.best_params_],
            "linear_svr_best_score": [grid_result_linear.best_score_],
            "poly_svr_best_params": [grid_result_poly.best_params_],
            "poly_svr_best_score": [grid_result_poly.best_score_],
            "rbf_svr_best_params": [grid_result_rbf.best_params_],
            "rbf_svr_best_score": [grid_result_rbf.best_score_]
        })

Time bucket length: 6h
Location bucket: h3_07
(39374, 13)
(3722, 13)
(1000, 13)
run LinearSVR
run SVR poly kernel




run SVR rbf kernel
concat results
save best params and scores
Time bucket length: 6h
Location bucket: h3_08
(109406, 13)
(4947, 13)
(1000, 13)
run LinearSVR
run SVR poly kernel




run SVR rbf kernel
concat results
save best params and scores
Time bucket length: 24h
Location bucket: h3_07
(12947, 12)
(1916, 12)
(1000, 12)
run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores
Time bucket length: 24h
Location bucket: h3_08
(40493, 12)
(1524, 12)
(1000, 12)
run LinearSVR




run SVR poly kernel
run SVR rbf kernel
concat results
save best params and scores


In [12]:
pd.DataFrame(results)

Unnamed: 0,time_bucket_length,location_bucket,linear_svr_best_params,linear_svr_best_score,poly_svr_best_params,poly_svr_best_score,rbf_svr_best_params,rbf_svr_best_score
0,[6],[h3_07],"[{'C': 100, 'epsilon': 0.4}]",[-82962.245876716],"[{'C': 100, 'degree': 5, 'epsilon': 0.5, 'gamm...",[-24569.486748481977],"[{'C': 100, 'epsilon': 0.1, 'gamma': 0.1}]",[-27628.842137289874]
1,[6],[h3_08],"[{'C': 10, 'epsilon': 0.2}]",[-37371.84881595118],"[{'C': 100, 'degree': 4, 'epsilon': 0.4, 'gamm...",[-11768.94173019694],"[{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}]",[-11364.21265086244]
2,[24],[h3_07],"[{'C': 100, 'epsilon': 0.2}]",[-3221670.1869080802],"[{'C': 100, 'degree': 5, 'epsilon': 0.5, 'gamm...",[-9832486.043438274],"[{'C': 100, 'epsilon': 0.3, 'gamma': 'scale'}]",[-39092803.205945596]
3,[24],[h3_08],"[{'C': 100, 'epsilon': 0.4}]",[-1390.5297306582568],"[{'C': 100, 'degree': 3, 'epsilon': 0.1, 'gamm...",[-526.3713367904045],"[{'C': 100, 'epsilon': 0.1, 'gamma': 'scale'}]",[-488.7082836962771]


In [21]:
grid_dfs_list = [pd.DataFrame(grid_df) for grid_df in grid_dfs]
pd.concat(grid_dfs_list)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,location_bucket,time_bucket_length,model,param_degree,param_gamma
0,0.001407,0.000071,0.000687,0.000081,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-181299.477246,-236327.597289,-239648.289894,-184683.757864,-235005.574211,-215392.939301,26520.389760,16,h3_07,6,LinearSVR,,
1,0.001506,0.000390,0.000834,0.000214,0.1,0.2,"{'C': 0.1, 'epsilon': 0.2}",-181443.977674,-236365.604707,-239654.971637,-184682.511357,-235207.071169,-215470.827309,26520.674110,18,h3_07,6,LinearSVR,,
2,0.001338,0.000177,0.000651,0.000057,0.1,0.3,"{'C': 0.1, 'epsilon': 0.3}",-181399.685950,-236429.555203,-239647.870848,-184688.750911,-235189.096902,-215470.991963,26536.717075,19,h3_07,6,LinearSVR,,
3,0.001377,0.000427,0.000766,0.000201,0.1,0.4,"{'C': 0.1, 'epsilon': 0.4}",-181354.957734,-236456.099497,-239556.600457,-184696.318132,-235171.289488,-215447.053062,26531.394509,17,h3_07,6,LinearSVR,,
4,0.001374,0.000139,0.000659,0.000059,0.1,0.5,"{'C': 0.1, 'epsilon': 0.5}",-181438.818223,-236424.690142,-239689.986457,-184694.622936,-235230.610285,-215495.745609,26538.388626,20,h3_07,6,LinearSVR,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.055610,0.006837,0.012705,0.002065,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'scale'}",-406.588287,-571.933875,-281.924131,-476.370510,-712.357937,-489.834948,146.010219,5,h3_08,24,SVR_rbf,,scale
96,0.049890,0.003381,0.013563,0.002213,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'auto'}",-417.988887,-604.523105,-265.880312,-477.504743,-707.602455,-494.699900,152.267719,11,h3_08,24,SVR_rbf,,auto
97,0.035140,0.003688,0.014116,0.002400,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.001}",-1388.834206,-1923.898072,-898.873600,-1824.523236,-2343.093144,-1675.844452,492.933136,53,h3_08,24,SVR_rbf,,0.001
98,0.043979,0.002664,0.012433,0.000919,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.01}",-489.015050,-872.315415,-263.610928,-768.464572,-1012.771878,-681.235569,270.331574,35,h3_08,24,SVR_rbf,,0.01


### Test for time bucket 24 and location h3_07

In [29]:
time_bucket_length = 24
location_bucket = "h3_07"

In [30]:
# read in data
data = pd.read_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")
print(data.shape)

# create sample from data with 10 hexagons
sampled_hexagons =  random.sample(data["h3_07"].unique().tolist(), 10)
data_sampled = data[data["h3_07"].isin(sampled_hexagons)]
print(data_sampled.shape)

(12947, 12)
(1200, 12)


In [31]:
scaled_data = scale_data_and_get_dummies(data_sampled, location_bucket)
X_train, X_test, y_train, y_test = split_train_test(scaled_data)

print("run LinearSVR")
grid_result_linear = grid_search_linear_svr(X_train, y_train, param_grid_linear)
grid_df_linear = pd.DataFrame(grid_result_linear.cv_results_)
grid_df_linear["location_bucket"] = location_bucket
grid_df_linear["time_bucket_length"] = time_bucket_length
grid_df_linear["model"] = "LinearSVR"

print("run SVR poly kernel")
grid_result_poly = grid_search_kernel_svr(X_train, y_train, param_grid_poly, "poly")
grid_df_poly = pd.DataFrame(grid_result_poly.cv_results_)
grid_df_poly["location_bucket"] = location_bucket
grid_df_poly["time_bucket_length"] = time_bucket_length
grid_df_poly["model"] = "SVR_poly"

print("run SVR rbf kernel")
grid_result_rbf = grid_search_kernel_svr(X_train, y_train, param_grid_rbf, "rbf")
grid_df_rbf = pd.DataFrame(grid_result_rbf.cv_results_)
grid_df_rbf["location_bucket"] = location_bucket
grid_df_rbf["time_bucket_length"] = time_bucket_length
grid_df_rbf["model"] = "SVR_rbf"

print("concat results")
grid_df = pd.concat([grid_df_linear, grid_df_poly, grid_df_rbf])
grid_df

run LinearSVR
run SVR poly kernel
run SVR rbf kernel
concat results


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,location_bucket,time_bucket_length,model,param_degree,param_gamma
0,0.001387,0.000198,0.000668,0.000086,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-1.991368e+06,-1.833703e+06,-1.992102e+06,-1.739407e+06,-1.980829e+06,-1.907482e+06,103217.917918,16,h3_07,24,LinearSVR,,
1,0.001328,0.000115,0.000674,0.000050,0.1,0.2,"{'C': 0.1, 'epsilon': 0.2}",-1.991057e+06,-1.834297e+06,-1.992129e+06,-1.739324e+06,-1.980946e+06,-1.907550e+06,103130.981265,19,h3_07,24,LinearSVR,,
2,0.001283,0.000083,0.000636,0.000015,0.1,0.3,"{'C': 0.1, 'epsilon': 0.3}",-1.991082e+06,-1.834124e+06,-1.992226e+06,-1.739177e+06,-1.980993e+06,-1.907520e+06,103230.151667,17,h3_07,24,LinearSVR,,
3,0.001107,0.000059,0.000617,0.000018,0.1,0.4,"{'C': 0.1, 'epsilon': 0.4}",-1.991015e+06,-1.834331e+06,-1.992393e+06,-1.738828e+06,-1.981080e+06,-1.907529e+06,103343.494192,18,h3_07,24,LinearSVR,,
4,0.001352,0.000210,0.000663,0.000033,0.1,0.5,"{'C': 0.1, 'epsilon': 0.5}",-1.991243e+06,-1.834669e+06,-1.991756e+06,-1.739155e+06,-1.981239e+06,-1.907612e+06,103144.514960,20,h3_07,24,LinearSVR,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.017634,0.000532,0.008767,0.005668,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'scale'}",-1.548818e+05,-1.321544e+05,-1.498288e+05,-9.914900e+04,-1.123002e+05,-1.296628e+05,21360.834811,10,h3_07,24,SVR_rbf,,scale
96,0.019223,0.003701,0.006141,0.000139,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 'auto'}",-1.667854e+05,-1.378105e+05,-1.718236e+05,-1.008956e+05,-1.141381e+05,-1.382906e+05,27994.218743,15,h3_07,24,SVR_rbf,,auto
97,0.015855,0.002687,0.006041,0.000193,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.001}",-9.612838e+05,-8.427799e+05,-9.606465e+05,-8.169635e+05,-9.455249e+05,-9.054397e+05,62493.964148,37,h3_07,24,SVR_rbf,,0.001
98,0.018552,0.007774,0.006033,0.000133,100,0.5,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.01}",-2.696548e+05,-2.075265e+05,-2.896555e+05,-2.130689e+05,-2.281191e+05,-2.416050e+05,32413.487873,16,h3_07,24,SVR_rbf,,0.01


In [10]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
}

linear_svr_model = LinearSVR()
grid_search_linear = GridSearchCV(linear_svr_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
grid_search_linear.fit(X_train, y_train)

print(grid_search_linear.best_params_)
pd.DataFrame(grid_search_linear.cv_results_)

{'C': 100, 'epsilon': 0.2}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.012667,0.00197,0.001828,5.9e-05,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-17696390.0,-16903010.0,-16947320.0,-16378720.0,-18134030.0,-17211890.0,623642.879263,17
1,0.011684,0.000271,0.001884,8.9e-05,0.1,0.2,"{'C': 0.1, 'epsilon': 0.2}",-17697860.0,-16903100.0,-16949520.0,-16379280.0,-18132270.0,-17212410.0,623009.509271,20
2,0.01192,0.000284,0.001921,0.000129,0.1,0.3,"{'C': 0.1, 'epsilon': 0.3}",-17696770.0,-16902460.0,-16949480.0,-16378790.0,-18131420.0,-17211780.0,622785.448475,16
3,0.012105,0.000521,0.002098,0.000689,0.1,0.4,"{'C': 0.1, 'epsilon': 0.4}",-17698020.0,-16903340.0,-16947560.0,-16379540.0,-18131910.0,-17212080.0,623000.644147,18
4,0.012668,0.000466,0.002022,0.00023,0.1,0.5,"{'C': 0.1, 'epsilon': 0.5}",-17697230.0,-16903130.0,-16949210.0,-16378880.0,-18132310.0,-17212150.0,623053.667192,19
5,0.01354,0.001264,0.002369,0.0007,1.0,0.1,"{'C': 1, 'epsilon': 0.1}",-15428150.0,-14756470.0,-14757820.0,-14305100.0,-15847780.0,-15019060.0,547916.754389,14
6,0.013139,0.00123,0.001948,0.000176,1.0,0.2,"{'C': 1, 'epsilon': 0.2}",-15427490.0,-14750030.0,-14756170.0,-14308460.0,-15846440.0,-15017720.0,547323.301597,11
7,0.012916,0.001088,0.001822,7.7e-05,1.0,0.3,"{'C': 1, 'epsilon': 0.3}",-15427280.0,-14757370.0,-14762400.0,-14305840.0,-15847020.0,-15019980.0,546845.767035,15
8,0.012411,0.000551,0.002042,0.000276,1.0,0.4,"{'C': 1, 'epsilon': 0.4}",-15422730.0,-14757600.0,-14758940.0,-14304840.0,-15844920.0,-15017810.0,546098.549852,13
9,0.012325,0.000726,0.001939,2.8e-05,1.0,0.5,"{'C': 1, 'epsilon': 0.5}",-15429760.0,-14755030.0,-14756010.0,-14305450.0,-15842450.0,-15017740.0,546769.875224,12


In [11]:
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
    'degree': [2, 3, 4, 5]
}

svr = SVR(kernel='poly')
grid_search_rbf = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search_rbf.fit(X_train, y_train)

print(grid_search_rbf.best_params_)
pd.DataFrame(grid_search_rbf.cv_results_)

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV] END ..........C=0.1, degree=2, epsilon=0.1, gamma=scale; total time=   6.5s
[CV] END ..........C=0.1, degree=2, epsilon=0.1, gamma=scale; total time=   6.5s
[CV] END ...........C=0.1, degree=2, epsilon=0.1, gamma=auto; total time=   6.6s
[CV] END ..........C=0.1, degree=2, epsilon=0.1, gamma=scale; total time=   6.6s
[CV] END ...........C=0.1, degree=2, epsilon=0.1, gamma=auto; total time=   6.6s
[CV] END ..........C=0.1, degree=2, epsilon=0.1, gamma=scale; total time=   6.7s
[CV] END ..........C=0.1, degree=2, epsilon=0.1, gamma=scale; total time=   6.8s
[CV] END ...........C=0.1, degree=2, epsilon=0.1, gamma=auto; total time=   6.7s
[CV] END ..........C=0.1, degree=2, epsilon=0.1, gamma=0.001; total time=   6.3s
[CV] END ...........C=0.1, degree=2, epsilon=0.1, gamma=auto; total time=   6.4s
[CV] END ...........C=0.1, degree=2, epsilon=0.1, gamma=auto; total time=   6.5s
[CV] END ..........C=0.1, degree=2, epsilon=0

KeyboardInterrupt: 

In [11]:
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5] 
}

svr = SVR(kernel='rbf')
grid_search_rbf = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search_rbf.fit(X_train, y_train)

print(grid_search_rbf.best_params_)
pd.DataFrame(grid_search_rbf.cv_results_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.4s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.5s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.6s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.5s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.5s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.2s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.2s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.2s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.9s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   3.0s
[CV] END .....................C=0.1, epsilon=0.1, gamma=0.01; total time=   3.4s
[CV] END ....................C=0.1, epsilon=0.

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.057108,0.033167,0.452695,0.022089,0.1,0.1,scale,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale'}",-0.096264,-0.101123,-0.100918,-0.076815,-0.083240,-0.091672,0.009877,63
1,1.665568,0.217492,0.818984,0.142899,0.1,0.1,auto,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 'auto'}",-0.613850,-0.591233,-0.588738,-0.566889,-0.631267,-0.598396,0.022166,114
2,3.131293,0.188271,1.347271,0.041865,0.1,0.1,0.001,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.001}",-0.973774,-0.928967,-0.931601,-0.900396,-0.997965,-0.946541,0.034778,120
3,2.283277,0.066994,0.909661,0.140965,0.1,0.1,0.01,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.01}",-0.611244,-0.588808,-0.586149,-0.564258,-0.628107,-0.595713,0.021996,113
4,1.329329,0.110026,0.527014,0.079493,0.1,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.1}",-0.093888,-0.097600,-0.098772,-0.074186,-0.080935,-0.089076,0.009776,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.320548,0.010006,0.086122,0.012855,100,0.5,auto,"{'C': 100, 'epsilon': 0.5, 'gamma': 'auto'}",-0.089577,-0.089951,-0.099691,-0.078177,-0.077927,-0.087065,0.008204,56
116,0.337959,0.015809,0.114287,0.008912,100,0.5,0.001,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.001}",-0.114420,-0.115445,-0.122472,-0.111958,-0.107808,-0.114421,0.004810,75
117,0.317436,0.008777,0.081873,0.005419,100,0.5,0.01,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.01}",-0.089496,-0.089777,-0.099602,-0.078227,-0.077868,-0.086994,0.008162,55
118,0.310481,0.022905,0.074324,0.011964,100,0.5,0.1,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-0.071325,-0.072288,-0.087281,-0.072014,-0.076261,-0.075834,0.005979,45


create data for time bucket 1 hour and hexagon resolution 7 and a sampled dataframe with only ten hexagons

In [7]:
import random

sampled_hexagons =  random.sample(df["h3_07"].unique().tolist(), 10)
df_sampled = df[df["h3_07"].isin(sampled_hexagons)]

data_sampled = create_spatio_temporal_df(df_sampled, 1, "h3_07")
print(data_sampled.shape)

(22876, 20)


In [8]:
# split into train and test
X_train, X_test, y_train, y_test = split_train_test(data)
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = split_train_test(data_sampled)

run not sampled data to test length

In [9]:
# train model
model = SVR(C=10, epsilon=0.1, kernel="rbf")
model.fit(X_train, y_train)

In [10]:
# evaluate model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test[:100], y_pred[:100]).round(2)
mse = mean_squared_error(y_test[:100], y_pred[:100]).round(2)
r2 = r2_score(y_test[:100], y_pred[:100]).round(4)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")

: 

In [9]:
# train model
model = SVR(C=10, epsilon=0.1, kernel="rbf")
model.fit(X_train_sampled, y_train_sampled)

# full version of training models

In [7]:
# scale features

scaler = StandardScaler()
# scale demand y data as well????
demand_feat_df[time_related_columns + wheather_related_columns] = scaler.fit_transform(demand_feat_df[time_related_columns + wheather_related_columns])
demand_feat_df

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,h3_07
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c10ffffff
2015-01-01 00:00:00,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,872664c11ffffff
2015-01-01 00:00:00,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c12ffffff
2015-01-01 00:00:00,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,872664c13ffffff
2015-01-01 00:00:00,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,872664c16ffffff
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 11:00:00,872664c1bffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1bffffff
2015-12-31 11:00:00,872664c1effffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.121803,-0.138869,6,872664c1effffff
2015-12-31 11:00:00,872664ca9ffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.089096,-0.138869,1,872664ca9ffffff
2015-12-31 12:00:00,872664c1effffff,-0.196166,0.029041,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1effffff


In [8]:
# get one hot encoding for location bucket

demand_feat_df_one_hot = pd.get_dummies(demand_feat_df, columns=[location_bucket], dtype=int)
demand_feat_df_one_hot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,...,h3_07_872664d8bffffff,h3_07_872664d8cffffff,h3_07_872664d8dffffff,h3_07_872664d8effffff,h3_07_872664d98ffffff,h3_07_872664d99ffffff,h3_07_872664d9bffffff,h3_07_87275934cffffff,h3_07_87275934effffff,h3_07_87275936bffffff
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-01-01,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# split data into train and test

X = demand_feat_df_one_hot.drop(columns=["demand"])
y = demand_feat_df_one_hot["demand"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)

In [10]:
# train model

model = SVR(C=1.0, epsilon=0.2)
model.fit(X_train, y_train)

In [10]:
# evaluate model

y_pred = model.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R^2 Score:', r2_score(y_test, y_pred))

Mean Absolute Error: 1.2764227270630981
Mean Squared Error: 4.862735610109737
R^2 Score: 0.720296889529022


In [None]:
#def train_and_evaluate_SVR_model (X_train, X_test, y_train, y_test, c=1.0, epsilon=0.2, kernel="linear"):
#    # train model
#    model = SVR(C=c, epsilon=epsilon, kernel=kernel)
#    model.fit(X_train, y_train)
#
#    # evaluate model
#    y_pred = model.predict(X_test)
#    mae = mean_absolute_error(y_test, y_pred).round(2)
#    mse = mean_squared_error(y_test, y_pred).round(2)
#    r2 = r2_score(y_test, y_pred).round(4)
#
#    return mae, mse, r2


#mae, mse, r2 = train_and_evaluate_SVR_model(X_train, X_test, y_train, y_test, c=1.0, epsilon=0.2, kernel="linear")

#print(f"MAE: {mae}")
#print(f"MSE: {mse}")
#print(f"R2: {r2}")

In [None]:
regularization_c = [0.01, 0.1, 1.0, 10.0, 100.0]
possible_kernels = ["linear", "poly", "rbf", "sigmoid", "precomputed"]
degrees_poly_kerne = [2, 3, 4, 5, 6, 7, 8, 9, 10] # only for poly kernel
gamma = ["scale", "auto"] # only for poly, rbf, sigmoid kernel

In [16]:
results = []
for time_bucket_length in [1, 2, 6, 24]:
    for location_bucket in ["h3_07"]: # "h3_08", "h3_09", "centroid"
        print(f"evaluate time bucket {time_bucket_length} and location bucket: {location_bucket}")
        mae, mse, r2 = evaluate_hyperparameters(time_bucket_length, location_bucket)
        print(f"mae: {mae}, mse: {mse}, r2: {r2}")
        results.append({"time_bucket_length": time_bucket_length, "location_bucket": location_bucket, "mae": mae, "mse": mse, "r2": r2})

evaluate time bucket 1 and location bucket: h3_07
mae: 1.28, mse: 4.86, r2: 0.7203
evaluate time bucket 2 and location bucket: h3_07
mae: 1.97, mse: 14.1, r2: 0.7456
evaluate time bucket 6 and location bucket: h3_07
mae: 4.36, mse: 106.56, r2: 0.6846
evaluate time bucket 24 and location bucket: h3_07
mae: 18.0, mse: 2447.02, r2: 0.1562


In [17]:
pd.DataFrame(results)

Unnamed: 0,time_bucket_length,location_bucket,mae,mse,r2
0,1,h3_07,1.28,4.86,0.7203
1,2,h3_07,1.97,14.1,0.7456
2,6,h3_07,4.36,106.56,0.6846
3,24,h3_07,18.0,2447.02,0.1562
