## <center>Analiza poziomu PM2.5 w afrykańskich miastach</center>
### Zespół:
<ol>
    <li style='font-size: 20px'>Hubert Kłosowski 242424</li>
    <li style='font-size: 20px'>Krzysztof Kolanek 242425</li>
    <li style='font-size: 20px'>Kamil Małecki 242464</li>
</ol>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### Wczytanie danych

In [None]:
# Read the train and test tables from the local 'transform' directory.
train, test = (
    pd.read_csv(os.path.join('transform', csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Columns that the (currently disabled) casts below would turn into pandas
# categoricals; with the casts commented out the dtypes from read_csv are kept.
categorical = ['month', 'week', 'dayofweek']
# train[categorical] = train[categorical].astype('category')
# test[categorical] = test[categorical].astype('category')

In [None]:
# Print a summary of the test frame (columns, dtypes, non-null counts).
test.info()

### Podział na zbiór treningowy i walidacyjny

In [None]:
from sklearn.model_selection import train_test_split


# Separate the features from the regression target ('pm2_5').
X, y = train.drop(columns=['pm2_5']), train['pm2_5']
# Identifiers of the test rows; save_to_csv() writes them next to the predictions.
test_ids = test['id']
# Hold out 30% of the labelled rows for validation; the fixed seed keeps the
# split reproducible. train_test_split already returns DataFrames carrying
# X's columns, so the results need no re-wrapping.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

### Zapis predykcji

In [None]:
def save_to_csv(y_pred, save_as):
    """Write test-set predictions to ``result/<save_as>`` as a CSV file.

    The file contains the identifiers from the module-level ``test_ids``
    series followed by a ``pm2_5`` column holding the predictions.

    Args:
        y_pred: One-dimensional sequence of predictions, one per test row and
            in the same order as ``test_ids``.
        save_as: Name of the CSV file created inside the ``result`` directory.
    """
    # exist_ok makes reruns a no-op and avoids the list-then-create race.
    os.makedirs('result', exist_ok=True)
    # Pair ids and predictions by position: both sides get a fresh 0..n-1
    # index, so concat cannot misalign rows if ``test`` was ever re-indexed.
    ids = test_ids.reset_index(drop=True)
    predictions = pd.DataFrame({'pm2_5': y_pred})
    final_df = pd.concat([ids, predictions], axis=1)
    final_df.to_csv(os.path.join('result', save_as), index=False)

### <center>Optuna + lightGBM</center>

In [None]:
import lightgbm as lgb
import optuna
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, GroupKFold


def define_lightgbm_model(trial):
    """Build an unfitted ``LGBMRegressor`` configured from ``trial``.

    Args:
        trial: Object exposing Optuna's ``suggest_int`` / ``suggest_float``
            methods; each call supplies one hyperparameter value.

    Returns:
        A new ``lgb.LGBMRegressor`` combining the suggested values with the
        settings that are the same for every trial.
    """
    # Search space; values are requested in this exact order.
    tuned = {
        'max_bin': trial.suggest_int('max_bin', 10, 200),
        'num_leaves': trial.suggest_int('num_leaves', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 9e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 700),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.8, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 250),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 1, log=True),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
    # Settings shared by every candidate model.
    fixed = {
        'objective': 'root_mean_squared_error',
        'boosting_type': 'gbdt',
        'tree_learner': 'voting',
        'device': 'cpu',
        'n_jobs': -1,
        'random_state': 4,
        'verbosity': -1,
    }
    return lgb.LGBMRegressor(**fixed, **tuned)

def objective_lightgbm(trial):
    """Optuna objective: mean cross-validated RMSE of the LightGBM candidate.

    Args:
        trial: Optuna trial used to sample the model's hyperparameters.

    Returns:
        Mean RMSE over the folds (lower is better).
    """
    day_groups = X['dayofweek']
    # One fold per distinct 'dayofweek' value, with rows grouped by that column.
    splitter = GroupKFold(n_splits=day_groups.nunique())
    neg_rmse = cross_val_score(
        define_lightgbm_model(trial),
        X,
        y,
        groups=day_groups,
        cv=splitter,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    # The scorer yields negated RMSE; flip the sign to get a value to minimize.
    return -neg_rmse.mean()

In [None]:
# Seed the TPE sampler with the same seed used for the split and the models,
# so the sequence of proposed trials is repeatable for identical objective values.
study_lightgbm = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithLightGBM',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_lightgbm.optimize(objective_lightgbm, n_trials=200)

### Zdefiniowanie najlepszego lightgbm

In [None]:
# Hold-out check: fit the best configuration on the training split only, so the
# X_test rows are unseen when the RMSE is computed. (Fitting on all of X first
# would score the model on rows it was trained on.)
lgb_model = define_lightgbm_model(study_lightgbm.best_trial)
lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)
lgb_holdout_rmse = root_mean_squared_error(y_test, lgb_pred)

# Refit on every labelled row; this is the model the later cells plot and
# use for the test-set predictions.
lgb_model.fit(X, y)

# Hyperparameters displayed in the slice plot of the LightGBM study.
lightgbm_params = ['max_bin', 'num_leaves', 'max_depth', 'learning_rate', 'n_estimators', 'bagging_fraction', 'colsample_bytree', 'min_data_in_leaf']

# NOTE: the Optuna search cross-validated on all of X, so this score is still
# somewhat optimistic with respect to hyperparameter selection.
lgb_holdout_rmse

### Krzywa nauki dla lightGBM

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Plot learning curves for the tuned LightGBM configuration with 10-fold CV.
# Scores are negated RMSE, so values closer to zero are better.
LearningCurveDisplay.from_estimator(lgb_model, X, y, cv=10, n_jobs=-1, random_state=4, scoring='neg_root_mean_squared_error')

In [None]:
# params_12 =  {'num_leaves': 25, 
#            'max_depth': 15, 
#            'learning_rate': 0.01982093884782807, 
#            'n_estimators': 1042, 
#            'tree_learner': 'voting', 
#            'subsample': 0.8572357579881347, 
#            'colsample_bytree': 0.8692866219741755, 
#            'min_data_in_leaf': 57,
#            'bagging_freq': 1,
#             'device': 'cpu',
#             'n_jobs': -1,
#             'random_state': 4,
#             'verbosity': -1,
#            'objective': 'root_mean_squared_error',
#             'boosting_type': 'gbdt',
#            }
# best_lgbm = lgb.LGBMRegressor(**params_12)
# best_lgbm.fit(X, y)
# best_lgbm_pred = best_lgbm.predict(X_test)
# root_mean_squared_error(y_test, best_lgbm_pred)
# save_to_csv(best_lgbm.predict(test), 'check.csv')

### Wykres przedstawiający każdy <i>trial</i> w procesie nauki

In [None]:
# Objective value of every trial of the LightGBM study (matplotlib backend).
optuna.visualization.matplotlib.plot_optimization_history(study_lightgbm)

### Wizualizacja przekroju parametrów

In [None]:
# Slice plot of the LightGBM study, restricted to the names in lightgbm_params.
optuna.visualization.matplotlib.plot_slice(study_lightgbm, params=lightgbm_params)

### Wpływ poszczególnych parametrów na proces nauki modelu

In [None]:
# Relative importance of each hyperparameter in the LightGBM study.
optuna.visualization.matplotlib.plot_param_importances(study_lightgbm)

### Najlepsze parametry

In [None]:
# Show the hyperparameter values of the best LightGBM trial.
study_lightgbm.best_params

### Znaczenie poszczególnych kolumn

In [None]:
# Feature importances of the fitted LightGBM model, drawn on a large high-DPI figure.
lgb.plot_importance(lgb_model, figsize=(20, 12), dpi=200)

### Drzewo decyzyjne dla lightGBM

In [None]:
# Draw one tree of the fitted LightGBM model, top-down, annotating nodes with
# the share of data they cover. No tree index is passed, so presumably the
# library default (the first tree) is shown — TODO confirm.
lgb.plot_tree(lgb_model, precision=2, figsize=(20, 12), show_info=['data_percentage'], dpi=200, orientation='vertical')

### Predykcje z lightGBM

In [None]:
# Predict on the test features (every column except 'id') and export the result.
lgb_final_pred = lgb_model.predict(test.drop(columns=['id']))
save_to_csv(y_pred=lgb_final_pred, save_as='lightgbm.csv')

### <center>Optuna + CatBoost</center>

In [None]:
from catboost import CatBoostRegressor


def define_cat_model(trial):
    """Build an unfitted ``CatBoostRegressor`` configured from ``trial``.

    Args:
        trial: Object exposing Optuna's ``suggest_int`` / ``suggest_float``
            methods; each call supplies one hyperparameter value.

    Returns:
        A new ``CatBoostRegressor`` with the suggested hyperparameters, the
        'Depthwise' grow policy and a fixed random seed.
    """
    params = {
        'iterations': trial.suggest_int('iterations', 200, 700),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 5e-2, log=True),
        # CatBoost documents 16 as the maximum tree depth on CPU; deeper
        # values would only produce failing trials, so the range stops there.
        'depth': trial.suggest_int('depth', 11, 16),
        'grow_policy': 'Depthwise',
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 5e-2, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 200),
        'random_state': 4,
    }
    return CatBoostRegressor(**params)

def objective_cat(trial):
    """Optuna objective: mean cross-validated RMSE of the CatBoost candidate.

    Args:
        trial: Optuna trial used to sample the model's hyperparameters.

    Returns:
        Mean RMSE over the folds (lower is better).
    """
    day_groups = X['dayofweek']
    # One fold per distinct 'dayofweek' value, with rows grouped by that column.
    splitter = GroupKFold(n_splits=day_groups.nunique())
    neg_rmse = cross_val_score(
        define_cat_model(trial),
        X,
        y,
        groups=day_groups,
        cv=splitter,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    # The scorer yields negated RMSE; flip the sign to get a value to minimize.
    return -neg_rmse.mean()

In [None]:
# Seed the TPE sampler with the same seed used for the split and the models,
# so the sequence of proposed trials is repeatable for identical objective values.
study_cat = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithCatBoost',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_cat.optimize(objective_cat, n_trials=100)

In [None]:
# Hold-out check: fit the best configuration on the training split only, so the
# X_test rows are unseen when the RMSE is computed. (Fitting on all of X first
# would score the model on rows it was trained on.)
cat_model = define_cat_model(study_cat.best_trial)
cat_model.fit(X_train, y_train)
cat_pred = cat_model.predict(X_test)
cat_holdout_rmse = root_mean_squared_error(y_test, cat_pred)

# Refit on every labelled row; this is the model used for the test-set
# predictions and the ensembles.
cat_model.fit(X, y)

# CatBoost hyperparameter names. Stored under their own name so the LightGBM
# list (`lightgbm_params`) used by the LightGBM slice plot is not overwritten.
cat_params = ['iterations', 'learning_rate', 'depth', 'l2_leaf_reg', 'subsample', 'min_data_in_leaf']

# NOTE: the Optuna search cross-validated on all of X, so this score is still
# somewhat optimistic with respect to hyperparameter selection.
cat_holdout_rmse

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Plot learning curves for the tuned CatBoost configuration with 10-fold CV.
# Scores are negated RMSE, so values closer to zero are better.
LearningCurveDisplay.from_estimator(cat_model, X, y, cv=10, n_jobs=-1, random_state=4, scoring='neg_root_mean_squared_error')

In [None]:
# Show the hyperparameter values of the best CatBoost trial.
study_cat.best_params

In [None]:
# Predict on the test features (every column except 'id') and export the result.
cat_final_pred = cat_model.predict(test.drop(columns=['id']))
save_to_csv(y_pred=cat_final_pred, save_as='catboost.csv')

### <center>Optuna + XGBoost</center>

In [None]:
import xgboost as xgb


def define_xgb_model(trial):
    """Build an unfitted ``XGBRegressor`` configured from ``trial``.

    Args:
        trial: Object exposing Optuna's ``suggest_int`` / ``suggest_float``
            methods; each call supplies one hyperparameter value.

    Returns:
        A new ``xgb.XGBRegressor`` combining the suggested values with the
        settings that are the same for every trial.
    """
    # Search space; values are requested in this exact order.
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 750, 1100),
        'subsample': trial.suggest_float('subsample', 0.1, 0.6, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1, log=True),
    }
    # Settings shared by every candidate model.
    fixed = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'verbosity': 0,
        # 'enable_categorical': True,
        'device': 'cpu',
        'n_jobs': -1,
        'random_state': 4,
    }
    return xgb.XGBRegressor(**fixed, **tuned)

def objective_xgb(trial):
    """Optuna objective: mean cross-validated RMSE of the XGBoost candidate.

    Args:
        trial: Optuna trial used to sample the model's hyperparameters.

    Returns:
        Mean RMSE over the folds (lower is better).
    """
    day_groups = X['dayofweek']
    # One fold per distinct 'dayofweek' value, with rows grouped by that column.
    splitter = GroupKFold(n_splits=day_groups.nunique())
    neg_rmse = cross_val_score(
        define_xgb_model(trial),
        X,
        y,
        groups=day_groups,
        cv=splitter,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    # The scorer yields negated RMSE; flip the sign to get a value to minimize.
    return -neg_rmse.mean()

In [None]:
# Seed the TPE sampler with the same seed used for the split and the models,
# so the sequence of proposed trials is repeatable for identical objective values.
study_xgb = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithXGBoost',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_xgb.optimize(objective_xgb, n_trials=100)

### Zdefiniowanie najlepszego XGBoost

In [None]:
# Hold-out check: fit the best configuration on the training split only, so the
# X_test rows are unseen when the RMSE is computed. (Fitting on all of X first
# would score the model on rows it was trained on.)
xgb_model = define_xgb_model(study_xgb.best_trial)
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
xgb_holdout_rmse = root_mean_squared_error(y_test, xgb_pred)

# Refit on every labelled row; this is the model used for the test-set
# predictions and the ensembles.
xgb_model.fit(X, y)

# Hyperparameters displayed in the slice plot of the XGBoost study.
xgb_params = ['n_estimators', 'subsample', 'reg_lambda', 'learning_rate', 'max_depth', 'colsample_bytree']

# NOTE: the Optuna search cross-validated on all of X, so this score is still
# somewhat optimistic with respect to hyperparameter selection.
xgb_holdout_rmse

### Krzywa nauki dla XGBoost

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Plot learning curves for the tuned XGBoost configuration with 5-fold CV.
# Scores are negated RMSE, so values closer to zero are better.
LearningCurveDisplay.from_estimator(xgb_model, X, y, cv=5, n_jobs=-1, random_state=4, scoring='neg_root_mean_squared_error')

### Wykres przedstawiający każdy <i>trial</i> w procesie nauki

In [None]:
# Objective value of every trial of the XGBoost study (matplotlib backend).
optuna.visualization.matplotlib.plot_optimization_history(study_xgb)

### Wizualizacja przekroju parametrów

In [None]:
# Slice plot of the XGBoost study, restricted to the names in xgb_params.
optuna.visualization.matplotlib.plot_slice(study_xgb, params=xgb_params)

### Wpływ poszczególnych parametrów na proces nauki modelu

In [None]:
# Relative importance of each hyperparameter in the XGBoost study.
optuna.visualization.matplotlib.plot_param_importances(study_xgb)

### Najlepsze parametry

In [None]:
# Show the hyperparameter values of the best XGBoost trial.
study_xgb.best_params

### Predykcje XGBoost

In [None]:
# Predict on the test features (every column except 'id') and export the result.
xgb_final_pred = xgb_model.predict(test.drop(columns=['id']))
save_to_csv(y_pred=xgb_final_pred, save_as='xgb.csv')

### <center>Stacking</center>

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression


# Stack the three tuned boosters under a linear meta-learner. With
# passthrough=False the meta-learner is trained on the base models'
# predictions only, not on the original features.
stack_model = StackingRegressor(
    final_estimator=LinearRegression(),
    estimators=[('lgb', lgb_model), ('cat', cat_model), ('xgb', xgb_model)],
    passthrough=False,
    n_jobs=-1,
    cv=10,
)
# fit() returns the estimator itself, so rebinding keeps the same object.
stack_model = stack_model.fit(X, y)

### Stacking predykcja

In [None]:
# Predict on the test features (every column except 'id') and export the result.
stack_pred = stack_model.predict(test.drop(columns=['id']))
save_to_csv(y_pred=stack_pred, save_as='stack_xgb_cat_lgb.csv')

### <center>Voting</center>

In [None]:
from sklearn.ensemble import VotingRegressor


# Combine the predictions of the three tuned boosters; no weights are passed.
voting_model = VotingRegressor(
    n_jobs=-1,
    estimators=[('lgb', lgb_model), ('cat', cat_model), ('xgb', xgb_model)],
)
# fit() returns the estimator itself, so rebinding keeps the same object.
voting_model = voting_model.fit(X, y)

### Voting predykcja

In [None]:
# Predict on the test features (every column except 'id') and export the result.
voting_pred = voting_model.predict(test.drop(columns=['id']))
save_to_csv(y_pred=voting_pred, save_as='voting_xgb_cat_lgb.csv')

### Dodatkowe informacje
<ol>
    <li>The 15km SO2 band is ingested only when solar_zenith_angle < 70.</li>
    <li>Because of noise in the data, negative vertical column values are often observed, in particular over clean regions or for low SO2 emissions. It is recommended not to filter these values, except for outliers, i.e. vertical columns lower than -0.001 mol/m^2.</li>
    <li>The effective cloud fraction is the radiometric equivalent cloud fraction of a satellite pixel assuming a fixed cloud albedo, usually 0.8. By definition the effective cloud fraction times the assumed cloud albedo plus the cloud-free surface and atmosphere contributions yields a TOA reflectance that agrees with the observed TOA reflectance.</li>
</ol>