In [5]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

train_csv_filename = "../data/processed/train_data.csv"
test_csv_filename = "../data/processed/test_data.csv"
df_train = pd.read_csv(train_csv_filename)
df_test = pd.read_csv(test_csv_filename)

df_train_x = df_train[["temperature", "relativehumidity", "dewpoint", "surface_pressure", "cloudcover", "windspeed", "winddirection",
         "pm25"]]

df_train_y = df_train["pm10"]

df_test_x = df_test[["temperature", "relativehumidity", "dewpoint", "surface_pressure", "cloudcover", "windspeed", "winddirection",
         "pm25"]]

df_test_y = df_test["pm10"]

x_train = np.array(df_train_x)
x_test = np.array(df_test_x)
y_train = np.array(df_train_y)
y_test = np.array(df_test_y)

In [2]:
pipe = Pipeline([
        ('imputer', SimpleImputer()),
        ('regressor', RandomForestRegressor())
    ])

param_grid = {
    'regressor__n_estimators': [5, 10, 25, 50, 100, 200],
    'regressor__max_features': ['sqrt', 'log2'],
    'regressor__max_depth': [3, 5, 10, 20],
    'regressor__min_samples_split': [2, 5, 10, 20, 35],
    'regressor__min_samples_leaf': [1, 2, 4, 8, 20],
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y_train)

print(grid_search.best_params_)

{'regressor__max_depth': 10, 'regressor__max_features': 'log2', 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 20, 'regressor__n_estimators': 50}


In [6]:
#model = RandomForestRegressor(**grid_search.best_params_)
train_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('regressor', RandomForestRegressor())
])

train_pipe.set_params(**grid_search.best_params_)

train_pipe.fit(x_train, y_train)

predictions = train_pipe.predict(x_test)

mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions))
mape = np.mean(np.abs((y_test - predictions) / np.abs(predictions)))
acc = round(100 * (1 - mape), 2)

print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', rmse)
print('Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
print('Accuracy:', acc)

Mean Absolute Error (MAE): 2.718555482632967
Mean Squared Error (MSE): 14.025143305265827
Root Mean Squared Error (RMSE): 3.7450157950622622
Mean Absolute Percentage Error (MAPE): 22.45
Accuracy: 77.55
