In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Wczytanie danych

In [None]:
# Read the labelled training table and the unlabelled test table.
# Forward slashes are used as the separator because they resolve on Windows as
# well as on POSIX systems; a literal backslash is a separator on Windows only.
train = pd.read_csv('transform/train.csv')
test = pd.read_csv('transform/test.csv')

### Podział na zbiór treningowy i walidacyjny

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target column 'pm2_5'.
X, y = train.drop(columns=['pm2_5']), train['pm2_5']

# Keep the ids of the test rows; save_to_csv pairs them with the predictions.
test_ids = test['id']

# Hold out 30% of the labelled rows for validation (fixed seed for repeatability).
# Note: X_test / y_test are this validation split, NOT the separate `test` frame
# loaded from test.csv. train_test_split returns DataFrames/Series when it is
# given DataFrames/Series, so the outputs need no re-wrapping.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

### Zapis predykcji

In [None]:
import os


def save_to_csv(y_pred, save_as):
    """Write predictions next to the test ids as a CSV file in ``result/``.

    Parameters
    ----------
    y_pred : array-like
        Predicted 'pm2_5' values, one per row of the module-level ``test_ids``
        and in the same order.
    save_as : str
        File name (not a full path) of the CSV created inside ``result``.

    The ``result`` directory is created in the current working directory when
    it does not exist yet. The output has two columns: the ids and 'pm2_5'.
    """
    # exist_ok avoids the separate "does it exist?" listing of the directory.
    os.makedirs('result', exist_ok=True)
    # concat(axis=1) aligns rows on the index, so put the ids on a fresh
    # positional index to pair them with the predictions row by row.
    ids = test_ids.reset_index(drop=True)
    final_df = pd.concat([ids, pd.DataFrame.from_dict({'pm2_5': y_pred})], axis=1)
    # os.path.join picks the right separator; a hard-coded backslash would
    # become part of the file name on non-Windows systems.
    final_df.to_csv(os.path.join('result', save_as), index=False)

In [None]:
### <center>Optuna + RandomForestRegressor</center>

In [None]:
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.metrics import root_mean_squared_error


def define_rf_model(trial):
    """Build an unfitted RandomForestRegressor from the values a trial suggests.

    ``trial`` must provide ``suggest_int`` and ``suggest_float``. Six
    hyperparameters are requested from it (n_estimators, max_depth,
    min_samples_split, min_samples_leaf, ccp_alpha, max_samples); the remaining
    settings are fixed.

    NOTE(review): min_samples_leaf >= 50 already means a node needs at least
    100 samples to be split, so min_samples_split (at most 70) probably never
    binds -- confirm whether it is worth tuning.
    """
    # Settings that are the same for every trial.
    fixed_settings = dict(
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        random_state=4,
    )
    # Search space; ccp_alpha and max_samples are sampled on a log scale.
    searched_settings = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 700),
        max_depth=trial.suggest_int('max_depth', 4, 12),
        min_samples_split=trial.suggest_int('min_samples_split', 20, 70),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 50, 200),
        ccp_alpha=trial.suggest_float('ccp_alpha', 1e-4, 1e-1, log=True),
        max_samples=trial.suggest_float('max_samples', 1e-1, 1, log=True),
    )
    return RandomForestRegressor(**fixed_settings, **searched_settings)

def objective_rf(trial):
    """Optuna objective: RMSE on the validation split for one trial's forest.

    Fits the model built by ``define_rf_model`` on (X_train, y_train) and
    returns the root mean squared error of its predictions for X_test against
    y_test.
    """
    fitted = define_rf_model(trial).fit(X_train, y_train)
    validation_pred = fitted.predict(X_test)
    return root_mean_squared_error(y_test, validation_pred)

In [None]:
# Seed the TPE sampler so that the sequence of suggested hyperparameters is
# repeatable when the notebook is re-run (trials are executed sequentially
# here and the forest itself uses a fixed random_state).
study_rf = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithRandomForest',
    sampler=optuna.samplers.TPESampler(seed=4),
)
# Run 200 trials, minimising the validation RMSE returned by objective_rf.
study_rf.optimize(objective_rf, n_trials=200)

In [None]:
### Zdefiniowanie najlepszego RandomForestRegressor

In [None]:
# Names of the tuned hyperparameters; reused by the Optuna slice plot.
rf_params = [
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'ccp_alpha',
    'max_samples',
]

# Rebuild the forest from the best trial's stored values and refit it on the
# training split (fit returns the estimator itself).
rf_model = define_rf_model(study_rf.best_trial).fit(X_train, y_train)

# Validation RMSE of the refitted model, displayed as the cell output.
rf_pred = rf_model.predict(X_test)
root_mean_squared_error(y_test, rf_pred)

### Wykres przedstawiający każdy <i>trial</i> w procesie nauki

In [None]:
# Plot the objective value (validation RMSE) of the trials in study_rf.
optuna.visualization.plot_optimization_history(study_rf)

### Wizualizacja przekroju parametrów

In [None]:
# Slice plot of the study, restricted to the hyperparameters listed in rf_params.
optuna.visualization.plot_slice(study_rf, params=rf_params)

### Wpływ poszczególnych parametrów na proces nauki modelu

In [None]:
# Plot Optuna's estimate of how much each hyperparameter influenced the objective.
optuna.visualization.plot_param_importances(study_rf)

### Najlepsze parametry

In [None]:
# Hyperparameter values of the best trial found by the study.
study_rf.best_params

### Predykcje RandomForest

In [None]:
# Predict on the held-out test file and write the submission to result/rf.csv.
# Select exactly the columns the forest was fitted on, in the fitted order, so
# that extra or re-ordered columns in `test` cannot break or misfeed the model.
save_to_csv(rf_model.predict(test[rf_model.feature_names_in_]), 'rf.csv')