In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVR, SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
time_bucket_lengths = [1, 2, 6, 24]
location_buckets = ["h3_07", "h3_08", "h3_09"] # "centroid" fehlt noch weil nur über langitude longitude aggregierbar 
# TODO centroid implementation in function

time_bucket_length = 24
location_bucket = "h3_07"

In [3]:
# read in data
data = pd.read_parquet(f"../../data/predictive/Taxi_Trips_Spatio_Temporal_{time_bucket_length}_{location_bucket}.parquet")
data

Unnamed: 0,h3_07,temp,precip,demand,day_of_week,is_weekday,month,sustenance_poi,public_transport_poi,education_poi,arts_and_culture_poi,sports_poi
0,872664521ffffff,-3.609411,0.0,406,3,0,1,60.0,38.0,10.0,0.0,0.0
1,872664c10ffffff,-5.542004,0.0,3699,3,0,1,699.0,281.0,32.0,14.0,26.0
2,872664c11ffffff,-5.211486,0.0,1288,3,0,1,987.0,346.0,21.0,15.0,17.0
3,872664c12ffffff,-5.696185,0.0,1285,3,0,1,1179.0,435.0,45.0,17.0,43.0
4,872664c13ffffff,-4.900690,0.0,2833,3,0,1,1540.0,486.0,48.0,18.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...
12942,872664d8cffffff,-3.389802,0.0,3,3,0,12,216.0,170.0,9.0,10.0,8.0
12943,872664d8dffffff,-2.874378,0.0,9,3,0,12,257.0,150.0,11.0,11.0,7.0
12944,872664d99ffffff,-2.470000,0.0,2,3,0,12,204.0,112.0,11.0,5.0,5.0
12945,872664d9bffffff,-2.190000,0.0,1,3,0,12,143.0,59.0,13.0,6.0,7.0


In [4]:
# scale data and get one hot encoding for location bucket feature
col_list = data.columns.to_list()
col_list.remove(location_bucket)

scaler = StandardScaler()
data[col_list] = scaler.fit_transform(data[col_list])

data = pd.get_dummies(data, columns=[location_bucket], dtype=int)
data

Unnamed: 0,temp,precip,demand,day_of_week,is_weekday,month,sustenance_poi,public_transport_poi,education_poi,arts_and_culture_poi,...,h3_07_872664d9dffffff,h3_07_872664d9effffff,h3_07_872759340ffffff,h3_07_87275934cffffff,h3_07_87275934dffffff,h3_07_87275934effffff,h3_07_872759369ffffff,h3_07_87275936bffffff,h3_07_87275936cffffff,h3_07_87275936dffffff
0,-1.165312,-0.161232,-0.233116,-0.005897,-0.631361,-1.596232,-1.162694,-1.301891,-1.142317,-1.821897,...,0,0,0,0,0,0,0,0,0,0
1,-1.338693,-0.161232,0.603152,-0.005897,-0.631361,-1.596232,0.222104,-0.356233,0.299297,0.906606,...,0,0,0,0,0,0,0,0,0,0
2,-1.309040,-0.161232,-0.009130,-0.005897,-0.631361,-1.596232,0.846239,-0.103279,-0.421510,1.101499,...,0,0,0,0,0,0,0,0,0,0
3,-1.352525,-0.161232,-0.009891,-0.005897,-0.631361,-1.596232,1.262328,0.243074,1.151159,1.491285,...,0,0,0,0,0,0,0,0,0,0
4,-1.281158,-0.161232,0.383228,-0.005897,-0.631361,-1.596232,2.044663,0.441545,1.347743,1.686178,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12942,-1.145610,-0.161232,-0.335460,-0.005897,-0.631361,1.617875,-0.824621,-0.788200,-1.207845,0.127033,...,0,0,0,0,0,0,0,0,0,0
12943,-1.099369,-0.161232,-0.333936,-0.005897,-0.631361,1.617875,-0.735769,-0.866032,-1.076789,0.321927,...,0,0,0,0,0,0,0,0,0,0
12944,-1.063090,-0.161232,-0.335714,-0.005897,-0.631361,1.617875,-0.850627,-1.013913,-1.076789,-0.847432,...,0,0,0,0,0,0,0,0,0,0
12945,-1.037970,-0.161232,-0.335968,-0.005897,-0.631361,1.617875,-0.982822,-1.220168,-0.945734,-0.652539,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def split_train_test(data, target_col="demand"):
    # split data into train and test
    X = data.drop(columns=[target_col])
    y = data[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)
    return X_train, X_test, y_train, y_test    

In [6]:
X_train, X_test, y_train, y_test = split_train_test(data)

In [9]:
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5],
}

linear_svr_model = LinearSVR()
grid_search_linear = GridSearchCV(linear_svr_model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=0, n_jobs=-1)
grid_search_linear.fit(X_train, y_train)

print(grid_search_linear.best_params_)
pd.DataFrame(grid_search_linear.cv_results_)



{'C': 1, 'epsilon': 0.1}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.035453,0.001449,0.001492,0.000111,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1}",-0.07215,-0.0782,-0.083821,-0.077941,-0.066048,-0.075632,0.006049,2
1,0.024197,0.001303,0.001481,6.9e-05,0.1,0.2,"{'C': 0.1, 'epsilon': 0.2}",-0.079842,-0.086222,-0.091954,-0.085898,-0.072743,-0.083332,0.006536,6
2,0.019049,0.000535,0.001514,7.3e-05,0.1,0.3,"{'C': 0.1, 'epsilon': 0.3}",-0.093596,-0.095996,-0.102237,-0.097492,-0.084681,-0.0948,0.005794,10
3,0.016205,0.000906,0.001583,0.000127,0.1,0.4,"{'C': 0.1, 'epsilon': 0.4}",-0.110682,-0.110109,-0.120302,-0.111773,-0.100053,-0.110584,0.006434,12
4,0.016346,0.001573,0.001498,7.5e-05,0.1,0.5,"{'C': 0.1, 'epsilon': 0.5}",-0.136879,-0.134309,-0.144608,-0.135286,-0.126003,-0.135417,0.005941,17
5,0.165674,0.015351,0.001448,0.000161,1.0,0.1,"{'C': 1, 'epsilon': 0.1}",-0.072282,-0.076876,-0.084507,-0.079543,-0.06428,-0.075497,0.006865,1
6,0.104189,0.009203,0.001462,0.000144,1.0,0.2,"{'C': 1, 'epsilon': 0.2}",-0.076057,-0.081593,-0.087969,-0.083183,-0.067248,-0.07921,0.007088,4
7,0.062731,0.00432,0.001998,0.001321,1.0,0.3,"{'C': 1, 'epsilon': 0.3}",-0.081932,-0.086661,-0.091809,-0.087434,-0.072379,-0.084043,0.006621,7
8,0.047297,0.008901,0.001427,5.5e-05,1.0,0.4,"{'C': 1, 'epsilon': 0.4}",-0.090278,-0.096438,-0.099803,-0.092778,-0.082149,-0.092289,0.006013,9
9,0.055346,0.010657,0.001609,0.000216,1.0,0.5,"{'C': 1, 'epsilon': 0.5}",-0.111326,-0.115777,-0.118716,-0.108403,-0.104,-0.111644,0.005218,13


In [11]:
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
    'epsilon': [0.1, 0.2, 0.3, 0.4, 0.5] 
}

svr = SVR(kernel='rbf')
grid_search_rbf = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search_rbf.fit(X_train, y_train)

print(grid_search_rbf.best_params_)
pd.DataFrame(grid_search_rbf.cv_results_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.4s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.5s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.6s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.5s
[CV] END ....................C=0.1, epsilon=0.1, gamma=scale; total time=   1.5s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.2s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.2s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.2s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   2.9s
[CV] END .....................C=0.1, epsilon=0.1, gamma=auto; total time=   3.0s
[CV] END .....................C=0.1, epsilon=0.1, gamma=0.01; total time=   3.4s
[CV] END ....................C=0.1, epsilon=0.

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_epsilon,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.057108,0.033167,0.452695,0.022089,0.1,0.1,scale,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 'scale'}",-0.096264,-0.101123,-0.100918,-0.076815,-0.083240,-0.091672,0.009877,63
1,1.665568,0.217492,0.818984,0.142899,0.1,0.1,auto,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 'auto'}",-0.613850,-0.591233,-0.588738,-0.566889,-0.631267,-0.598396,0.022166,114
2,3.131293,0.188271,1.347271,0.041865,0.1,0.1,0.001,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.001}",-0.973774,-0.928967,-0.931601,-0.900396,-0.997965,-0.946541,0.034778,120
3,2.283277,0.066994,0.909661,0.140965,0.1,0.1,0.01,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.01}",-0.611244,-0.588808,-0.586149,-0.564258,-0.628107,-0.595713,0.021996,113
4,1.329329,0.110026,0.527014,0.079493,0.1,0.1,0.1,"{'C': 0.1, 'epsilon': 0.1, 'gamma': 0.1}",-0.093888,-0.097600,-0.098772,-0.074186,-0.080935,-0.089076,0.009776,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.320548,0.010006,0.086122,0.012855,100,0.5,auto,"{'C': 100, 'epsilon': 0.5, 'gamma': 'auto'}",-0.089577,-0.089951,-0.099691,-0.078177,-0.077927,-0.087065,0.008204,56
116,0.337959,0.015809,0.114287,0.008912,100,0.5,0.001,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.001}",-0.114420,-0.115445,-0.122472,-0.111958,-0.107808,-0.114421,0.004810,75
117,0.317436,0.008777,0.081873,0.005419,100,0.5,0.01,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.01}",-0.089496,-0.089777,-0.099602,-0.078227,-0.077868,-0.086994,0.008162,55
118,0.310481,0.022905,0.074324,0.011964,100,0.5,0.1,"{'C': 100, 'epsilon': 0.5, 'gamma': 0.1}",-0.071325,-0.072288,-0.087281,-0.072014,-0.076261,-0.075834,0.005979,45


create data for time bucket 1 hour and hexagon resolution 7 and a sampled dataframe with only ten hexagons

In [7]:
import random

sampled_hexagons =  random.sample(df["h3_07"].unique().tolist(), 10)
df_sampled = df[df["h3_07"].isin(sampled_hexagons)]

data_sampled = create_spatio_temporal_df(df_sampled, 1, "h3_07")
print(data_sampled.shape)

(22876, 20)


In [8]:
# split into train and test
X_train, X_test, y_train, y_test = split_train_test(data)
X_train_sampled, X_test_sampled, y_train_sampled, y_test_sampled = split_train_test(data_sampled)

run not sampled data to test length

In [9]:
# train model
model = SVR(C=10, epsilon=0.1, kernel="rbf")
model.fit(X_train, y_train)

In [10]:
# evaluate model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test[:100], y_pred[:100]).round(2)
mse = mean_squared_error(y_test[:100], y_pred[:100]).round(2)
r2 = r2_score(y_test[:100], y_pred[:100]).round(4)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2: {r2}")

: 

In [9]:
# train model
model = SVR(C=10, epsilon=0.1, kernel="rbf")
model.fit(X_train_sampled, y_train_sampled)

# full version of training models

In [7]:
# scale features

scaler = StandardScaler()
# scale demand y data as well????
demand_feat_df[time_related_columns + wheather_related_columns] = scaler.fit_transform(demand_feat_df[time_related_columns + wheather_related_columns])
demand_feat_df

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,h3_07
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c10ffffff
2015-01-01 00:00:00,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,872664c11ffffff
2015-01-01 00:00:00,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,872664c12ffffff
2015-01-01 00:00:00,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,872664c13ffffff
2015-01-01 00:00:00,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,872664c16ffffff
...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-31 11:00:00,872664c1bffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1bffffff
2015-12-31 11:00:00,872664c1effffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.121803,-0.138869,6,872664c1effffff
2015-12-31 11:00:00,872664ca9ffffff,-0.345404,-0.580651,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.089096,-0.138869,1,872664ca9ffffff
2015-12-31 12:00:00,872664c1effffff,-0.196166,0.029041,1.767271,1.847726,1.696905,-0.105023,-0.6953,-1.154511,-0.138869,1,872664c1effffff


In [8]:
# get one hot encoding for location bucket

demand_feat_df_one_hot = pd.get_dummies(demand_feat_df, columns=[location_bucket], dtype=int)
demand_feat_df_one_hot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,hour,4_hour_block,day,week,month,day_of_week,is_weekday,temp,precip,demand,...,h3_07_872664d8bffffff,h3_07_872664d8cffffff,h3_07_872664d8dffffff,h3_07_872664d8effffff,h3_07_872664d98ffffff,h3_07_872664d99ffffff,h3_07_872664d9bffffff,h3_07_87275934cffffff,h3_07_87275934effffff,h3_07_87275936bffffff
time_bucket_floored,h3_07,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2015-01-01,872664c10ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c11ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,2,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c12ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.402264,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c13ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.420562,-0.138869,4,...,0,0,0,0,0,0,0,0,0,0
2015-01-01,872664c16ffffff,-1.987019,-1.800036,-1.677834,-1.704113,-1.575115,-0.105023,-0.6953,-1.426662,-0.138869,3,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# split data into train and test

X = demand_feat_df_one_hot.drop(columns=["demand"])
y = demand_feat_df_one_hot["demand"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4711)

In [10]:
# train model

model = SVR(C=1.0, epsilon=0.2)
model.fit(X_train, y_train)

In [10]:
# evaluate model

y_pred = model.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R^2 Score:', r2_score(y_test, y_pred))

Mean Absolute Error: 1.2764227270630981
Mean Squared Error: 4.862735610109737
R^2 Score: 0.720296889529022


In [None]:
#def train_and_evaluate_SVR_model (X_train, X_test, y_train, y_test, c=1.0, epsilon=0.2, kernel="linear"):
#    # train model
#    model = SVR(C=c, epsilon=epsilon, kernel=kernel)
#    model.fit(X_train, y_train)
#
#    # evaluate model
#    y_pred = model.predict(X_test)
#    mae = mean_absolute_error(y_test, y_pred).round(2)
#    mse = mean_squared_error(y_test, y_pred).round(2)
#    r2 = r2_score(y_test, y_pred).round(4)
#
#    return mae, mse, r2


#mae, mse, r2 = train_and_evaluate_SVR_model(X_train, X_test, y_train, y_test, c=1.0, epsilon=0.2, kernel="linear")

#print(f"MAE: {mae}")
#print(f"MSE: {mse}")
#print(f"R2: {r2}")

In [None]:
regularization_c = [0.01, 0.1, 1.0, 10.0, 100.0]
possible_kernels = ["linear", "poly", "rbf", "sigmoid", "precomputed"]
degrees_poly_kerne = [2, 3, 4, 5, 6, 7, 8, 9, 10] # only for poly kernel
gamma = ["scale", "auto"] # only for poly, rbf, sigmoid kernel

In [16]:
results = []
for time_bucket_length in [1, 2, 6, 24]:
    for location_bucket in ["h3_07"]: # "h3_08", "h3_09", "centroid"
        print(f"evaluate time bucket {time_bucket_length} and location bucket: {location_bucket}")
        mae, mse, r2 = evaluate_hyperparameters(time_bucket_length, location_bucket)
        print(f"mae: {mae}, mse: {mse}, r2: {r2}")
        results.append({"time_bucket_length": time_bucket_length, "location_bucket": location_bucket, "mae": mae, "mse": mse, "r2": r2})

evaluate time bucket 1 and location bucket: h3_07
mae: 1.28, mse: 4.86, r2: 0.7203
evaluate time bucket 2 and location bucket: h3_07
mae: 1.97, mse: 14.1, r2: 0.7456
evaluate time bucket 6 and location bucket: h3_07
mae: 4.36, mse: 106.56, r2: 0.6846
evaluate time bucket 24 and location bucket: h3_07
mae: 18.0, mse: 2447.02, r2: 0.1562


In [17]:
pd.DataFrame(results)

Unnamed: 0,time_bucket_length,location_bucket,mae,mse,r2
0,1,h3_07,1.28,4.86,0.7203
1,2,h3_07,1.97,14.1,0.7456
2,6,h3_07,4.36,106.56,0.6846
3,24,h3_07,18.0,2447.02,0.1562
