In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Wczytanie danych

In [None]:
# Forward slashes are accepted on Windows as well as POSIX systems, so the
# notebook no longer depends on the Windows-only '\\' separator.
train = pd.read_csv('transform/train.csv')
test = pd.read_csv('transform/test.csv')

### Podział na zbiór treningowy i walidacyjny

In [None]:
from sklearn.model_selection import train_test_split

# The target is the 'pm2_5' column; every other column of `train` is a feature.
y = train['pm2_5']
X = train.drop(columns=['pm2_5'])

# Ids of the rows in the test file, kept for writing predictions later.
test_ids = test['id']

# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable.
# Note: X_test / y_test are the held-out part of `train`, not the `test` file.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Rebuild both feature frames with the column layout of X.
X_train, X_test = (
    pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test)
)

### Zapis predykcji

In [None]:
import os


def save_to_csv(y_pred, save_as):
    """Write predictions next to the test ids as a CSV file inside ``result``.

    Args:
        y_pred: One-dimensional sequence of predicted ``pm2_5`` values, in the
            same row order as the module-level ``test_ids``.
        save_as: File name (for example ``'xgb.csv'``) created inside the
            ``result`` directory of the current working directory.
    """
    # exist_ok=True replaces the listdir check: no error when the directory
    # is already there and no gap between checking and creating it.
    os.makedirs('result', exist_ok=True)

    # concat(axis=1) aligns on index labels; resetting the ids to a positional
    # index guarantees row i of the ids is paired with prediction i.
    ids = test_ids.reset_index(drop=True)
    final_df = pd.concat([ids, pd.DataFrame({'pm2_5': y_pred})], axis=1)

    # os.path.join picks the right separator for the running platform.
    final_df.to_csv(os.path.join('result', save_as), index=False)

### <center>Optuna + XGBoost</center>

In [None]:
import xgboost as xgb
import optuna
from sklearn.metrics import root_mean_squared_error


def define_xgb_model(trial):
    """Build an ``XGBRegressor`` configured from the given Optuna trial.

    Args:
        trial: Object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical``; each searched hyperparameter is
            requested from it under its own name.

    Returns:
        An unfitted ``xgb.XGBRegressor``.
    """
    # Settings that are the same for every trial.
    fixed = {
        'objective': 'reg:squarederror',
        'verbosity': 0,
        'enable_categorical': True,
        'n_jobs': -1,
    }

    # Searched hyperparameters, requested in the same order as before.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 750, 1100),
        'subsample': trial.suggest_float('subsample', 0.1, 0.6, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1, log=True),
        'tree_method': trial.suggest_categorical('tree_method', ['hist', 'approx']),
    }

    return xgb.XGBRegressor(**fixed, **searched)

def objective_xgb(trial):
    """Optuna objective: RMSE of a trial-configured model on the held-out split.

    Fits a fresh model from ``define_xgb_model(trial)`` on
    ``X_train``/``y_train`` and scores its predictions for ``X_test``
    against ``y_test``.
    """
    regressor = define_xgb_model(trial)
    regressor.fit(X_train, y_train)
    held_out_pred = regressor.predict(X_test)
    return root_mean_squared_error(y_test, held_out_pred)

In [None]:
# Seeding the TPE sampler makes the sequence of suggested hyperparameters
# repeatable across kernel restarts (same seed value as the data split).
study_xgboost = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithXGBoost',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_xgboost.optimize(objective_xgb, n_trials=100)

### Zdefiniowanie najlepszego XGBoost

In [None]:
# Names of the numeric hyperparameters, reused by the slice plot below.
xgboost_params = [
    'n_estimators',
    'subsample',
    'reg_lambda',
    'learning_rate',
    'max_depth',
    'colsample_bytree',
]

# Build the regressor from the study's best trial and fit it on the training split.
xgb_model = define_xgb_model(study_xgboost.best_trial)
xgb_model.fit(X_train, y_train)

# RMSE on the held-out split; shown as the cell output.
xgb_pred = xgb_model.predict(X_test)
root_mean_squared_error(y_test, xgb_pred)

### Krzywa uczenia dla XGBoost

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Learning curve of the tuned model on the full labelled data (X, y),
# using 20-fold cross-validation and negative RMSE as the score.
# NOTE(review): scikit-learn documents random_state as used only when
# shuffle=True — confirm whether shuffling was intended here.
LearningCurveDisplay.from_estimator(
    xgb_model,
    X,
    y,
    cv=20,
    scoring='neg_root_mean_squared_error',
    random_state=4,
    n_jobs=-1,
)

### Wykres przedstawiający każdy <i>trial</i> w procesie nauki

In [None]:
# Plot the optimisation history of the XGBoost study.
optuna.visualization.plot_optimization_history(study_xgboost)

### Wizualizacja przekroju parametrów

In [None]:
# Slice plot restricted to the hyperparameters named in xgboost_params.
optuna.visualization.plot_slice(study_xgboost, params=xgboost_params)

### Wpływ poszczególnych hiperparametrów na wynik optymalizacji

In [None]:
# Plot the hyperparameter importances computed by Optuna for this study.
optuna.visualization.plot_param_importances(study_xgboost)

### Najlepsze parametry

In [None]:
# Hyperparameter values of the study's best trial; shown as the cell output.
study_xgboost.best_params

### Predykcje XGBoost

In [None]:
# Predict for the rows of the test file and write them out as 'xgb.csv'.
# NOTE(review): `test` is passed whole, including its 'id' column — confirm
# the model was fitted on the same set of columns.
save_to_csv(xgb_model.predict(test), 'xgb.csv')