## Chosen model: XGBRegressor
### Fine-tuning hyperparameters

In [73]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error
import pandas as pd

# Load the cleaned dataset and one-hot encode it with pandas.get_dummies
# (drop_first=True drops the first level of every encoded column).
df_original = pd.read_csv("../src/taxipred/data/cleaned_data.csv")
df_encoded = pd.get_dummies(df_original, drop_first=True)

# Separate the target column from the feature matrix.
y = df_encoded["Trip_Price"]
X = df_encoded.drop(columns=["Trip_Price"])

# Baseline regressor: every hyperparameter left at the library default.
model = XGBRegressor()
def data_handler(test_size=0.1, val_size=0.11, random_state=42):
    """Split the module-level ``X`` / ``y`` into train, validation and test sets.

    The data is split in two steps: first a test set is held out from the
    whole dataset, then a validation set is held out from what remains.
    With the defaults, ``val_size=0.11`` of the remaining 90 % is roughly
    9.9 % of all rows, so validation and test sets are about the same size.

    Parameters
    ----------
    test_size : float, default 0.1
        Passed as ``test_size`` to the first ``train_test_split`` call.
    val_size : float, default 0.11
        Passed as ``test_size`` to the second ``train_test_split`` call,
        i.e. it is relative to the data left after the test split.
    random_state : int, default 42
        Seed used for both splits so repeated calls return identical splits.

    Returns
    -------
    tuple
        ``(X_train, X_test, X_val, y_train, y_test, y_val,
        X_train_full, y_train_full)`` in exactly that order.
    """
    # X_train_full / y_train_full (train + validation) are meant to be used
    # once all candidate models have been evaluated.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=val_size, random_state=random_state
    )

    return X_train, X_test, X_val, y_train, y_test, y_val, X_train_full, y_train_full

def regressor_model(model, data_for_training, y_true_for_training, data_to_predict, y_true):
    """Fit ``model`` and score its predictions against ``y_true``.

    The model is fitted in place on ``data_for_training`` /
    ``y_true_for_training`` and then used to predict ``data_to_predict``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``mae``, ``mse`` and ``rmse``.
    """
    model.fit(data_for_training, y_true_for_training)
    predictions = model.predict(data_to_predict)

    # One record -> one row; the dict keys become the column names.
    scores = {
        "mae": mean_absolute_error(y_true, predictions),
        "mse": mean_squared_error(y_true, predictions),
        "rmse": root_mean_squared_error(y_true, predictions),
    }
    return pd.DataFrame([scores])
# Unpack the eight splits, then fit the current `model` on the training split
# and score it on the validation split.
(
    X_train, X_test, X_val,
    y_train, y_test, y_val,
    X_train_full, y_train_full,
) = data_handler()
regressor_model(
    model,
    data_for_training=X_train,
    y_true_for_training=y_train,
    data_to_predict=X_val,
    y_true=y_val,
)

Unnamed: 0,mae,mse,rmse
0,5.051979,48.240022,6.945504


## Evaluate parameters
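### Evaluate under/overfitting
- Try increasing max_depth: if RMSE on the validation set gets worse, the model is overfitting; if RMSE is bad on both the training and validation sets, it is underfitting.
- Check reg_alpha: if the model underfits on the validation data, lower it to the 0–1 range.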

In [74]:
# Tuned regressor. "default" in each comment is the library default value.
model = XGBRegressor(
    # --- Boosting schedule ---
    n_estimators=1000,      # number of trees built (default 100)
    learning_rate=0.02,     # how much each new tree affects the final model; lower = more stable but needs more trees (default 0.3)
    # --- Tree complexity ---
    max_depth=3,            # larger = risk of overfitting, smaller = risk of underfitting (default 6)
    min_child_weight=2,     # minimum sum of instance weights required in a child node; higher = more conservative (default 1)
    gamma=0,                # minimum loss reduction required to make a split; higher = fewer splits (default 0)
    # --- Row / column sampling ---
    subsample=0.7,          # fraction of rows used per tree; < 1 adds randomness and reduces overfitting (default 1)
    colsample_bytree=0.9,   # fraction of features used per tree (default 1)
    # --- Regularisation ---
    reg_alpha=4,            # L1 regularisation, makes the model sparser (default 0)
    reg_lambda=1,           # L2 regularisation, penalises large weights and stabilises the model (default 1)
    # --- Reproducibility ---
    random_state=42,
)

# objective="reg:squarederror"  # Standard objective for regression.
# n_jobs=None  # Runs with all available CPU threads.
# colsample_bytree=1.0  # Fraction of features used per tree.
# colsample_bynode=1.0  # Fraction of features used per split.
# Re-create the splits and score the tuned model on the validation split.
(
    X_train, X_test, X_val,
    y_train, y_test, y_val,
    X_train_full, y_train_full,
) = data_handler()
# Best so far: 6.21; depth = 3: 5.68, 5.60
regressor_model(
    model,
    data_for_training=X_train,
    y_true_for_training=y_train,
    data_to_predict=X_val,
    y_true=y_val,
)

Unnamed: 0,mae,mse,rmse
0,3.698067,31.390557,5.602728


## Final check with X_test; if it looks good, export the model

In [75]:
# Final check: fit on train + validation (the "full" training set) and
# score on the held-out test split.
(
    X_train, X_test, X_val,
    y_train, y_test, y_val,
    X_train_full, y_train_full,
) = data_handler()
regressor_model(
    model,
    data_for_training=X_train_full,
    y_true_for_training=y_train_full,
    data_to_predict=X_test,
    y_true=y_test,
)

Unnamed: 0,mae,mse,rmse
0,3.31847,23.493376,4.846997


In [76]:
import joblib

# Refit on the complete dataset (X, y) before persisting the model.
model.fit(X, y)

# Write an xz-compressed (level 3) dump using pickle protocol 5.
# Left as the last expression so the returned list of file names is displayed.
joblib.dump(
    value=model,
    filename="../src/taxipred/models/taxi_XBGRegressor.joblib",
    compress=("xz", 3),
    protocol=5,
)


['../src/taxipred/models/taxi_XBGRegressor.joblib']