In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_log_error

In [2]:
sns.set_theme(style="whitegrid")

In [3]:
X_train = np.load('data/X_train.npy')
y_train = np.load('data/y_train.npy')
X_val = np.load('data/X_val.npy')
y_val = np.load('data/y_val.npy')

In [27]:
def model_train_and_tune(model, param_grid, X_train, y_train):
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="neg_root_mean_squared_log_error", n_jobs=-1, verbose=2)

    grid_search.fit(X_train, y_train)

    return grid_search.best_score_, grid_search.best_params_, grid_search.best_estimator_

In [14]:
model = LinearRegression()
param_grid = {
    "fit_intercept": [True, False]
}
score, params, model = model_train_and_tune(model, param_grid, X_train, y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits




In [15]:
def print_results(score, params):
    print(f"Best score: {score}")
    print(f"Best params: {params}")

In [16]:
print_results(score, params)

Best score: nan
Best params: {'fit_intercept': True}


In [20]:
def custom_root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.sum(np.square(np.log1p(1 + y_pred) - np.log1p(1 + y_true))) / len(y_true))

In [21]:
custom_root_mean_squared_log_error(y_val, model.predict(X_val))

0.1525064342164683

In [24]:
model = RandomForestRegressor()
param_grid = {
    "n_estimators": [200, 300, 400],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
score, params, model = model_train_and_tune(model, param_grid, X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [25]:
print_results(score, params)

Best score: -0.1509465863210951
Best params: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 400}


In [26]:
custom_root_mean_squared_log_error(y_val, model.predict(X_val))

0.13978866449684338

In [29]:
model = SVR()
param_grid = {
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [2, 3, 4],
    "C": [0.1, 1, 10],
}
score, params, model = model_train_and_tune(model, param_grid, X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [None]:
print_results(score, params)

In [None]:
custom_root_mean_squared_log_error(y_val, model.predict(X_val))