In [None]:
import math
import os

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Directory that the relative data paths below are resolved against.
# `__file__` only exists when this code runs as a script/module; inside a
# Jupyter kernel it is undefined, so fall back to the current working directory.
try:
    file_location = os.path.dirname(__file__)
except NameError:
    file_location = os.getcwd()

# Comma-separated list of the CSV columns used by the models. Any one of them
# can be chosen as the prediction target; the rest are used as features.
atributes = "temperature_2m,relativehumidity_2m,dewpoint_2m,apparent_temperature,pressure_msl," \
            "surface_pressure,precipitation,weathercode,cloudcover,cloudcover_low,cloudcover_mid,cloudcover_high," \
            "windspeed_10m,winddirection_10m"

# Cities to train models for; each name is also the data sub-directory name.
city_names = [
    "Maribor",
    "Ljubljana",
    "Kranj",
    "Koper",
    "Celje",
    "Novo_Mesto",
    "Ptuj",
    "Murska_Sobota"
]

In [None]:
def prepare_data(city, y_atribute):
    """Load one city's train/test CSVs and split each into features and target.

    Parameters
    ----------
    city : str
        Name of the city sub-directory under ``../data/processed/city/final/``
        (resolved relative to ``file_location``).
    y_atribute : str
        Column to predict. It is removed from the feature frames and returned
        separately as the target.

    Returns
    -------
    tuple
        ``(df_train_x, df_test_x, df_train_y, df_test_y)`` where the ``x``
        items are DataFrames restricted to the columns listed in ``atributes``
        (minus ``y_atribute``) and the ``y`` items are the target Series.
    """
    train_path = os.path.join(file_location, "../data/processed/city/final/" + city + "/train_data.csv")
    test_path = os.path.join(file_location, "../data/processed/city/final/" + city + "/test_data.csv")
    train_frame, test_frame = pd.read_csv(train_path), pd.read_csv(test_path)

    model_columns = atributes.split(",")

    def features_of(frame):
        # Keep only the modelled columns, then take the target out of them.
        return frame[model_columns].drop(columns=y_atribute)

    train_features = features_of(train_frame)
    train_target = train_frame[y_atribute]
    test_features = features_of(test_frame)
    test_target = test_frame[y_atribute]

    return train_features, test_features, train_target, test_target

In [None]:
def get_best_params(x_train, y_train, cv=5, n_jobs=-1, random_state=42):
    """Grid-search random-forest hyper-parameters for an impute + regress pipeline.

    The grid below has 5 * 2 * 5 * 4 * 5 = 1000 combinations, so with the
    default ``cv=5`` the search performs 5000 fits. They are therefore run in
    parallel by default, and the forest is seeded so that repeated runs select
    the same parameters.

    Parameters
    ----------
    x_train : array-like
        Training features; missing values are filled by ``SimpleImputer``.
    y_train : array-like
        Training target values.
    cv : int, optional
        Number of cross-validation folds passed to ``GridSearchCV``.
    n_jobs : int or None, optional
        Parallelism passed to ``GridSearchCV``; ``-1`` uses all processors,
        ``None`` runs serially.
    random_state : int or None, optional
        Seed for the ``RandomForestRegressor`` used during the search.
        ``None`` leaves the forest unseeded.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. Keys carry the ``regressor__``
        prefix, so the dict can be passed straight to ``Pipeline.set_params``.
    """
    pipe = Pipeline([
        ('imputer', SimpleImputer()),
        ('regressor', RandomForestRegressor(random_state=random_state))
    ])

    param_grid = {
        'regressor__n_estimators': [10, 20, 50, 100, 200],
        'regressor__max_features': ['sqrt', 'log2'],
        'regressor__max_depth': [3, 5, 10, 20, 40],
        'regressor__min_samples_split': [2, 5, 10, 20],
        'regressor__min_samples_leaf': [1, 2, 4, 8, 16],
    }

    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    grid_search.fit(x_train, y_train)

    return grid_search.best_params_

In [None]:
def train_model(city, y_atribute, x_train, x_test, y_train, y_test):
    """Tune, fit and evaluate a random-forest pipeline for one city/target pair.

    Hyper-parameters are chosen with ``get_best_params`` on the training data,
    a fresh impute + regress pipeline is fitted with them, and error metrics
    on the test data are printed.

    Parameters
    ----------
    city : str
        City name; used only in the printed messages.
    y_atribute : str
        Name of the predicted column; used only in the printed messages.
    x_train, x_test : array-like
        Training / test features.
    y_train, y_test : array-like
        Training / test target values.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    train_pipe = Pipeline([
        ('imputer', SimpleImputer()),
        ('regressor', RandomForestRegressor())
    ])

    print("Start training model for", city, "(", y_atribute, ")")

    best_params = get_best_params(x_train, y_train)

    train_pipe.set_params(**best_params)

    train_pipe.fit(x_train, y_train)

    predictions = train_pipe.predict(x_test)

    mae = metrics.mean_absolute_error(y_test, predictions)
    mse = metrics.mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    print('(', city, ')Mean Absolute Error (MAE):', mae)
    print('(', city, ')Mean Squared Error (MSE):', mse)
    print('(', city, ')Root Mean Squared Error (RMSE):', rmse)

    # MAPE is the error relative to the *actual* values, so the denominator is
    # y_test, not the predictions. Samples whose actual value is exactly zero
    # have no defined percentage error and are left out of the mean.
    nonzero_actual = y_test != 0
    if nonzero_actual.any():
        relative_errors = (y_test[nonzero_actual] - predictions[nonzero_actual]) / y_test[nonzero_actual]
        mape = np.mean(np.abs(relative_errors))
    else:
        mape = float("nan")

    # Only report percentage metrics when they are a real number (not NaN/inf).
    if math.isfinite(mape):
        acc = round(100 * (1 - mape), 2)
        print('(', city, ')Mean Absolute Percentage Error (MAPE):', round(mape * 100, 2))
        print('(', city, ')Accuracy:', acc)

In [None]:
# Train and evaluate one model per (city, target) pair: for every city,
# temperature first, then precipitation.
for city in city_names:
    for target in ("temperature_2m", "precipitation"):
        x_train, x_test, y_train, y_test = prepare_data(city, target)
        train_model(city, target, x_train, x_test, y_train, y_test)