## <center>Analiza poziomu PM2.5 w afrykańskich miastach</center>
### Zespół:
<ol>
    <li style='font-size: 20px'>Hubert Kłosowski 242424</li>
    <li style='font-size: 20px'>Krzysztof Kolanek 242425</li>
    <li style='font-size: 20px'>Kamil Małecki 242464</li>
</ol>

### Potrzebne importy

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Wczytanie danych

In [None]:
# Forward slashes are accepted on Windows, Linux and macOS alike; the original
# backslash-separated literals only resolved on Windows.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
train.head()

### Rozbicie daty na składowe

In [None]:
def change_date(dataframe):
    """Parse ``date`` and turn the calendar columns into categoricals.

    The frame is modified in place: ``date`` becomes a datetime column, a new
    ``dayofweek`` column is derived from it, and the existing ``month`` and
    ``hour`` columns are cast to the ``category`` dtype.

    Args:
        dataframe: Frame that contains ``date``, ``month`` and ``hour`` columns.

    Returns:
        The same ``dataframe`` object that was passed in.
    """
    parsed = pd.to_datetime(dataframe['date'])
    dataframe['date'] = parsed
    dataframe['dayofweek'] = parsed.dt.dayofweek.astype('category')
    for calendar_column in ('month', 'hour'):
        dataframe[calendar_column] = dataframe[calendar_column].astype('category')
    return dataframe


train, test = change_date(train), change_date(test)

### Wykres przedstawiający jakość powietrza w krajach afrykańskich

In [None]:
# Line plot of pm2_5 against the parsed date for the training frame.
# NOTE(review): the title mentions a split by country, but no ``hue`` is
# passed to the plot - confirm which of the two is intended.
date_axes = sns.lineplot(x='date', y='pm2_5', data=train)
date_axes.set_title('Jakość powietrza z podziałem na kraje')

### Wykres przedstawiający wartość pm2_5 w zarejestrowanych godzinach

In [None]:
# Bar plot of pm2_5 for every value of the ``hour`` column.
# NOTE(review): the title mentions a split by country, but no ``hue`` is
# passed to the plot - confirm which of the two is intended.
hour_axes = sns.barplot(x='hour', y='pm2_5', data=train)
hour_axes.set_title('Jakość powietrza w poszczególnych godzinach z podziałem na kraje')

### Wykres przedstawiający wartość pm2_5 w zależności od dnia tygodnia

In [None]:
# Bar plot of pm2_5 for every value of the ``dayofweek`` column.
# NOTE(review): the title mentions a split by country, but no ``hue`` is
# passed to the plot - confirm which of the two is intended.
weekday_axes = sns.barplot(x='dayofweek', y='pm2_5', data=train)
weekday_axes.set_title('Jakość powietrza w każdym dniu tygodnia z podziałem na kraje')

### Wykres przedstawiający wartość pm2_5 w zależności od miesiąca

In [None]:
# Bar plot of pm2_5 for every value of the ``month`` column.
sns.barplot(data=train, x='month', y='pm2_5')
# The title previously repeated the weekday caption ("w każdym dniu tygodnia")
# although the x axis is the month; it now names the month.
plt.title('Jakość powietrza w każdym miesiącu z podziałem na kraje')

### Korelacja wybranych kolumn z pm2_5

In [None]:
# Reduce every site identifier to its first two characters and store the
# result as a categorical, identically for the training and the test frame.
for frame in (train, test):
    frame['site_id'] = frame['site_id'].apply(lambda x: x[:2]).astype('category')

# Annotated greyscale heat map of the pairwise correlations of the selected
# columns with each other and with the pm2_5 target.
heatmap_columns = [
    'month',
    'dayofweek',
    'hour',
    'site_latitude',
    'site_longitude',
    'cloud_surface_albedo',
    'site_id',
    'pm2_5',
]
sns.heatmap(train[heatmap_columns].corr(), annot=True, cmap='Greys')

## <center>Czyszczenie danych</center>

### 1. Imputacja, usuwanie kolumn oraz inne cuda

In [None]:
from sklearn.impute import KNNImputer


def fill_X(column_name='site_latitude'):
    """Impute missing values of the global ``X`` group by group, in place.

    Rows of ``X`` are grouped by the distinct values of ``column_name``. For
    each group and each prefix in the global ``starts_with`` list, the columns
    whose names start with that prefix are selected, the imputer at the same
    position in the global ``imputers`` list is fitted on that slice, and its
    output is written back into ``X``.

    When that step raises ``ValueError``, the rows of the slice are removed
    from both ``X`` and the global ``y`` and both indexes are reset.

    Args:
        column_name: Column whose distinct values define the row groups.

    Returns:
        None. ``X`` and ``y`` are mutated in place.
    """
    column_values = X[column_name].unique()
    # NOTE: the loop variable is named ``date`` but holds one distinct value
    # of ``column_name``.
    for date in column_values:
        for i, column in enumerate(starts_with):
            similar_columns = [col for col in X.columns if col.startswith(column)]
            df = X.loc[X[column_name] == date, similar_columns].copy()
            if not df.empty:
                try:
                    # ``fit_transform`` runs once per group, so the imputer is
                    # re-fitted on every group it is applied to.
                    # NOTE(review): the full ``y`` is passed although ``df`` is
                    # only a slice - presumably the imputer ignores it; confirm.
                    X.loc[X[column_name] == date, similar_columns] = imputers[i].fit_transform(df, y)
                except ValueError:
                    # Give up on this group: remove its rows from features and
                    # target, then renumber both so they stay aligned.
                    X.drop(index=df.index, inplace=True)
                    y.drop(index=df.index, inplace=True)
                    X.reset_index(drop=True, inplace=True)
                    y.reset_index(drop=True, inplace=True)

def fill_test(column_name='site_latitude'):
    """Fill the global ``test`` frame in place with the existing imputers.

    Rows are grouped by the distinct values of ``column_name``. For each group
    and each prefix in the global ``starts_with`` list, the columns whose names
    start with that prefix are passed through ``transform`` of the imputer at
    the same position in the global ``imputers`` list, and the result is
    written back. Nothing is fitted here.

    Args:
        column_name: Column whose distinct values define the row groups.

    Returns:
        None. ``test`` is mutated in place.
    """
    for group_value in test[column_name].unique():
        for position, prefix in enumerate(starts_with):
            prefixed = [name for name in test.columns if name.startswith(prefix)]
            rows = test[column_name] == group_value
            block = test.loc[rows, prefixed].copy()
            if block.empty:
                continue
            test.loc[rows, prefixed] = imputers[position].transform(block)

def drop_high_nans(dataframe, threshold=0.9):
    """Drop, in place, every column with a high share of NaN values.

    Args:
        dataframe: Frame to clean. It is modified in place.
        threshold: Fraction of missing values (inclusive) from which a column
            is removed. Defaults to 0.9, the value previously hard-coded.

    Returns:
        The same ``dataframe`` object, so the call can be re-assigned.
    """
    # One vectorised pass over all columns instead of a per-column loop.
    nan_share = dataframe.isna().sum() / len(dataframe)
    sparse_columns = nan_share.index[nan_share >= threshold]
    dataframe.drop(columns=sparse_columns, inplace=True)
    return dataframe

def drop_high_correlated_columns(dataframe=None, threshold=0.99):
    """List the columns that are almost perfectly correlated with an earlier one.

    Despite the name, nothing is dropped here: the function only returns the
    candidate column names and leaves the frame untouched.

    Args:
        dataframe: Frame to inspect. When omitted, the notebook-level feature
            frame ``X`` is used, which is what the original version always did.
        threshold: Absolute correlation above which a column is reported.
            Defaults to 0.99, the value previously hard-coded.

    Returns:
        A list of column names whose absolute correlation with at least one
        column positioned before them exceeds ``threshold``.
    """
    if dataframe is None:
        dataframe = X
    matrix = dataframe.corr(numeric_only=True).abs()
    # Keep only the strictly upper triangle so that each pair is inspected
    # once and the later column of the pair is the one reported.
    upper_t = matrix.where(np.triu(np.ones_like(matrix, dtype=np.bool_), k=1))
    return [col for col in upper_t.columns if (upper_t[col] > threshold).any()]

def drop_low_correlated_columns_to_pm2_5():
    """Return the labels of columns barely correlated with ``pm2_5``.

    The correlation matrix of the global ``train`` frame is computed and the
    labels whose correlation with ``pm2_5`` lies strictly between -0.01 and
    0.01 are returned. Nothing is dropped by this function itself.

    Returns:
        An array with the selected column labels.
    """
    # NOTE(review): unlike ``drop_high_correlated_columns`` this call does not
    # pass ``numeric_only=True`` - confirm that is intended.
    target_corr = train.corr()['pm2_5']
    near_zero = target_corr.abs() < 0.01
    return target_corr[near_zero].index.values

def subract_azimuth_zenith(dataframe):
    """Replace each zenith/azimuth column pair with one difference column.

    For every column whose name contains ``zenith``, the partner column is the
    one with the same name where ``zenith`` is replaced by ``azimuth``. The
    difference ``zenith - azimuth`` is stored in a new column named after the
    first two ``_``-separated tokens of the zenith column plus ``_diff``, and
    both source columns are removed.

    Pairing is done by name, not by list position, so a missing partner (for
    example a column removed earlier for having too many NaN values) can no
    longer shift the remaining pairs or raise ``IndexError``. Columns without
    a partner are left untouched.

    Args:
        dataframe: Frame to transform. It is modified in place.

    Returns:
        The same ``dataframe`` object.
    """
    # Snapshot the names first: columns are added inside the loop.
    zenith_columns = [name for name in dataframe.columns if 'zenith' in name]
    consumed = []
    for zenith in zenith_columns:
        azimuth = zenith.replace('zenith', 'azimuth')
        if azimuth not in dataframe.columns:
            continue
        diff_name = '_'.join(zenith.split('_')[:2]) + '_diff'
        dataframe[diff_name] = dataframe[zenith] - dataframe[azimuth]
        consumed += [zenith, azimuth]
    dataframe.drop(columns=consumed, inplace=True)
    return dataframe


# Keep the test identifiers before the ``id`` column is removed below; they
# are needed again when the submission files are written.
test_ids = test['id']
# Remove the identifier, location-name and raw date columns from both frames.
train.drop(columns=['id', 'city', 'country', 'date'], inplace=True)
test.drop(columns=['id', 'city', 'country', 'date'], inplace=True)
# Collect the distinct first ``_``-separated token of every remaining training
# column name; these prefixes define the column groups used by the helpers.
starts_with = train.columns.str.split('_', expand=True).levels[0].to_frame()
# Exclude the 'month', 'hour', 'pm2' and 'site' prefixes from that list.
# NOTE(review): 'dayofweek' (added by ``change_date``) is not excluded here -
# confirm that is intended.
starts_with.drop(['month', 'hour', 'pm2', 'site'], inplace=True)
starts_with = starts_with[0].tolist()

# One separate KNN imputer per remaining prefix.
imputers = [KNNImputer(n_neighbors=15, weights='distance') for _ in range(len(starts_with))]
# Drop the columns that are almost entirely NaN from both frames.
train, test = drop_high_nans(train), drop_high_nans(test)
# The steps below are disabled.
# NOTE(review): no ``fill_train`` is defined in the visible notebook; the
# training-side helper is called ``fill_X``.
# fill_train(), fill_test()
# to_drop = drop_low_correlated_columns_to_pm2_5()
# train, test = train.drop(columns=to_drop, axis=1), test.drop(columns=to_drop, axis=1)
# train, test = subract_azimuth_zenith(train), subract_azimuth_zenith(test)
# Split the training frame into the feature matrix and the pm2_5 target.
X, y = train.drop(['pm2_5'], axis=1), train['pm2_5']

In [None]:
X.info()

### Wykresy pudełkowe wskazujące wartości odstające

In [None]:
from sympy import divisors


def plot_boxplots():
    """Show one figure of box plots per column prefix in ``starts_with``.

    For every prefix, the columns of the global ``train`` frame that start
    with it are collected; prefixes with at most one such column are skipped.
    The subplot grid uses the two middle divisors of the column count (or the
    single middle divisor twice when the count is a perfect square), so the
    grid holds exactly one cell per column.
    """
    for column_group in starts_with:
        similar_columns = [col for col in train.columns if col.startswith(column_group)]
        if len(similar_columns) <= 1:
            continue
        divs = divisors(len(similar_columns))
        middle = len(divs) // 2
        cols = divs[middle]
        rows = divs[middle - 1] if len(divs) % 2 == 0 else divs[middle]
        fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(40, 30), squeeze=False)
        fig.suptitle(column_group, fontsize=25)
        for j, column in enumerate(similar_columns):
            # divmod maps the running index to its (row, column) grid cell.
            train[column].plot(kind='box', ax=ax[divmod(j, cols)], fontsize=15)
        plt.show()


vertical_columns = [col for col in X.columns if 'number_density' in col]

# plot_boxplots()

### 2. Usunięcie wartości odstających

In [None]:
from scipy.stats import zscore


def del_outliers():
    """Remove the most extreme rows from the globals ``X`` and ``y`` in place.

    Every non-categorical column of ``X`` is z-scored (NaN values are ignored
    for the statistics and then treated as 0), the absolute z-scores are
    averaged per row, and rows whose average lies above the 0.999 quantile of
    those averages are dropped from ``X`` and ``y``. Both indexes are reset
    afterwards.

    Returns:
        None. ``X`` and ``y`` are mutated in place.
    """
    zscores = zscore(X.select_dtypes(exclude='category').values, nan_policy='omit')
    np.nan_to_num(zscores, copy=False)
    zscores = np.absolute(zscores)
    result = np.mean(zscores, axis=1)
    # The previous condition ``q1 < el > q2`` is a chained comparison, i.e.
    # ``q1 < el and el > q2``; because q1 <= q2 it always reduced to
    # ``el > q2`` and the lower quantile was never used. The effective
    # upper-tail rule is kept and made explicit.
    # NOTE(review): a low mean |z| marks the most typical rows, so upper-tail
    # trimming looks like the intent - confirm if two-sided was wanted.
    upper = np.quantile(result, 0.999)
    # Translate row positions into index labels so the drop is also correct
    # when ``X`` does not carry a default 0..n-1 index.
    labels_to_drop = X.index[result > upper]
    X.drop(index=labels_to_drop, inplace=True)
    y.drop(index=labels_to_drop, inplace=True)
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)


# del_outliers()

X.info()

In [None]:
X.head()

## <center>Selekcja cech</center>

In [None]:
from sklearn.feature_selection import RFECV, RFE, SelectKBest, mutual_info_regression, f_regression
from sklearn.ensemble import RandomForestRegressor


def plot_feature_importance(sc, num_of_features):
    """Plot the best ``num_of_features`` features of a fitted selector.

    For ``RFE``/``RFECV`` selectors the ``ranking_`` attribute is plotted,
    otherwise the ``scores_`` attribute.

    Args:
        sc: Fitted selector exposing ``feature_names_in_`` and either
            ``ranking_`` or ``scores_``.
        num_of_features: Number of features to show.

    Returns:
        None. A bar chart is drawn on a new matplotlib figure.
    """
    uses_ranking = isinstance(sc, (RFECV, RFE))
    values = sc.ranking_ if uses_ranking else sc.scores_
    scores = dict(zip(sc.feature_names_in_, values))
    # A ranking of 1 marks the selected (best) features, so rankings are
    # sorted ascending; scores are "higher is better" and stay descending.
    # Sorting rankings descending, as before, plotted the worst features.
    scores = sorted(scores.items(), key=lambda x: x[1], reverse=not uses_ranking)[:num_of_features]
    scores_df = pd.DataFrame(scores, columns=['Feature', 'Score'])

    scores_df.plot(kind='bar', x='Feature', y='Score', figsize=(10, 6), rot=90, title='Oceny wybranych cech')
    plt.xlabel('Cecha')
    plt.ylabel('Ocena')


# selector = RFE(
#     estimator=RandomForestRegressor(
#         n_estimators=700, 
#         max_depth=7, 
#         random_state=4, 
#         n_jobs=-1, 
#         oob_score=True,
#         warm_start=True
#     ),
#     n_features_to_select=k,
# )
# k = 17
# selector = RFECV(
#     estimator=RandomForestRegressor(
#         n_estimators=400, 
#         max_depth=10, 
#         random_state=4, 
#         n_jobs=-1, 
#         oob_score=True, 
#         warm_start=True, 
#         ccp_alpha=1e-4
#     ),
#     min_features_to_select=k, 
#     cv=10, 
#     scoring='neg_root_mean_squared_error',
#     n_jobs=-1
# )
# selector.fit(X, y)
# X, test = selector.transform(X), selector.transform(test)
# 
# plot_feature_importance(selector, k)

## <center>Transformacja danych</center>

### 1. Standaryzacja danych

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

# Bez kategorycznych
# categorical = ['hour', 'month', 'dayofweek', 'site_id']
# scale_columns = X.columns.difference(categorical)
# 
# scaler = make_column_transformer((StandardScaler(), scale_columns))
# 
# X_cat, test_cat = X[categorical], test[categorical]
# 
# X = pd.concat([pd.DataFrame(scaler.fit_transform(X[scale_columns]), columns=scaler.feature_names_in_), X_cat], axis=1)
# test = pd.concat([pd.DataFrame(scaler.transform(test[scale_columns]), columns=scaler.feature_names_in_), test_cat], axis=1)
# Wszystkie kolumny
scaler = StandardScaler()

# Fit the scaler on the training features, then apply the same scaling to the
# test frame; both results are wrapped back into frames that carry the column
# names seen during fitting.
# NOTE(review): fitting happens before the train/validation split, so the
# validation rows contribute to the scaling statistics - confirm acceptable.
X = pd.DataFrame(scaler.fit_transform(X, y), columns=scaler.feature_names_in_)
test = pd.DataFrame(scaler.transform(test), columns=scaler.feature_names_in_)

### 2. Podział na zbiór walidacyjny i treningowy

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows for validation. ``X`` is already a DataFrame at
# this point, so the split parts are DataFrames with the same columns and the
# former ``pd.DataFrame(..., columns=X.columns)`` re-wrapping was a no-op.
# NOTE: ``X_test``/``y_test`` are the validation part of the training data,
# not the competition test frame (which is ``test``).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

## <center>Część obliczeniowa</center>

### Otrzymanie najlepszych parametrów

In [None]:
import os


def save_to_csv(y_pred, save_as):
    """Write a submission file with the test ids and the predicted pm2_5.

    Args:
        y_pred: Predictions for the test rows, in the order of ``test_ids``.
        save_as: File name (for example ``'xgb.csv'``) created inside the
            ``result`` directory of the current working directory.

    Returns:
        None. The CSV file is written without the index column.
    """
    # Creates the directory when missing and is a no-op otherwise, without
    # listing the whole working directory first.
    os.makedirs('result', exist_ok=True)
    final_df = pd.concat([test_ids, pd.DataFrame.from_dict({'pm2_5': y_pred})], axis=1)
    # ``os.path.join`` picks the platform's separator; the former hard-coded
    # backslash only produced a path inside ``result`` on Windows.
    final_df.to_csv(os.path.join('result', save_as), index=False)

### <center>Optuna + lightGBM</center>

In [None]:
import lightgbm as lgb
import optuna
from sklearn.metrics import root_mean_squared_error

def define_lightgbm_model(trial):
    """Build a LightGBM regressor from the hyper-parameters of ``trial``.

    Tuned values are requested from ``trial`` in the order the keyword
    arguments are written; the remaining settings are fixed.

    Args:
        trial: Optuna trial (or a finished trial) providing ``suggest_int``
            and ``suggest_float``.

    Returns:
        An unfitted ``lgb.LGBMRegressor``.
    """
    suggest_int, suggest_float = trial.suggest_int, trial.suggest_float
    # NOTE(review): LightGBM documents ``subsample`` as an alias of
    # ``bagging_fraction``; both are tuned here - confirm which one is used.
    return lgb.LGBMRegressor(
        objective='root_mean_squared_error',
        boosting_type='gbdt',
        num_leaves=suggest_int('num_leaves', 20, 100),
        max_depth=suggest_int('max_depth', 10, 17),
        learning_rate=suggest_float('learning_rate', 8e-3, 5e-2, log=True),
        n_estimators=suggest_int('n_estimators', 900, 1100),
        tree_learner='voting',
        bagging_fraction=suggest_float('bagging_fraction', 0.7, 0.9),
        subsample=suggest_float('subsample', 0.5, 1),
        colsample_bytree=suggest_float('colsample_bytree', 0.8, 1, log=True),
        min_data_in_leaf=suggest_int('min_data_in_leaf', 50, 150),
        reg_lambda=suggest_float('reg_lambda', 1e-4, 1, log=True),
        bagging_freq=1,
        device='cpu',
        n_jobs=-1,
        random_state=4,
        verbosity=-1,
    )

def objective_lightgbm(trial):
    """Optuna objective: validation RMSE of a LightGBM model for ``trial``.

    The model is fitted on ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``.

    Args:
        trial: Optuna trial that supplies the hyper-parameters.

    Returns:
        The root mean squared error on the hold-out part.
    """
    candidate = define_lightgbm_model(trial)
    candidate.fit(X_train, y_train)
    predictions = candidate.predict(X_test)
    return root_mean_squared_error(y_test, predictions)

In [None]:
# Seed the sampler so that re-running the notebook proposes the same sequence
# of trials; 4 matches the ``random_state`` used elsewhere in this notebook.
study_lightgbm = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithLightGBM',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_lightgbm.optimize(objective_lightgbm, n_trials=200)

### Zdefiniowanie najlepszego lightgbm

In [None]:
# Hyper-parameters kept from an earlier run; only the disabled lines below
# use them.
params_12 = {
    'num_leaves': 25,
    'max_depth': 15,
    'learning_rate': 0.01982093884782807,
    'n_estimators': 1042,
    'tree_learner': 'voting',
    'bagging_fraction': 0.863457680863147,
    'subsample': 0.8572357579881347,
    'colsample_bytree': 0.8692866219741755,
    'min_data_in_leaf': 57,
    'bagging_freq': 1,
    'device': 'cpu',
    'n_jobs': -1,
    'random_state': 4,
    'verbosity': -1,
    'objective': 'root_mean_squared_error',
    'boosting_type': 'gbdt',
}
# best_lgbm = lgb.LGBMRegressor(**params_12)
# best_lgbm.fit(X, y)
# best_lgbm_pred = best_lgbm.predict(X_test)

# Names of the tuned parameters shown later in the slice plot.
lightgbm_params = [
    'num_leaves',
    'max_depth',
    'learning_rate',
    'n_estimators',
    'subsample',
    'colsample_bytree',
    'min_data_in_leaf',
    'bagging_fraction',
]

# Rebuild the model from the best trial of the study, refit it on the
# training part and show its RMSE on the hold-out part.
lgbm = define_lightgbm_model(study_lightgbm.best_trial)
lgbm.fit(X_train, y_train)
lgb_pred = lgbm.predict(X_test)
root_mean_squared_error(y_test, lgb_pred)

### <center>Optuna + XGBoost</center>

In [None]:
import xgboost as xgb

def define_xgb_model(trial):
    """Build an XGBoost regressor from the hyper-parameters of ``trial``.

    Tuned values are requested from ``trial`` in the order the keyword
    arguments are written; the remaining settings are fixed.

    Args:
        trial: Optuna trial (or a finished trial) providing ``suggest_int``,
            ``suggest_float`` and ``suggest_categorical``.

    Returns:
        An unfitted ``xgb.XGBRegressor``.
    """
    suggest_int, suggest_float = trial.suggest_int, trial.suggest_float
    return xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=suggest_int('n_estimators', 750, 1100),
        subsample=suggest_float('subsample', 0.1, 0.6, log=True),
        reg_lambda=suggest_float('reg_lambda', 1e-4, 1, log=True),
        learning_rate=suggest_float('learning_rate', 1e-3, 1, log=True),
        max_depth=suggest_int('max_depth', 6, 10),
        colsample_bytree=suggest_float('colsample_bytree', 0.1, 1, log=True),
        tree_method=trial.suggest_categorical('tree_method', ['hist', 'approx']),
        verbosity=0,
        enable_categorical=True,
        n_jobs=-1,
    )

def objective_xgb(trial):
    """Optuna objective: validation RMSE of an XGBoost model for ``trial``.

    The model is fitted on ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``.

    Args:
        trial: Optuna trial that supplies the hyper-parameters.

    Returns:
        The root mean squared error on the hold-out part.
    """
    candidate = define_xgb_model(trial)
    candidate.fit(X_train, y_train)
    predictions = candidate.predict(X_test)
    return root_mean_squared_error(y_test, predictions)

In [None]:
# Seed the sampler so that re-running the notebook proposes the same sequence
# of trials; 4 matches the ``random_state`` used elsewhere in this notebook.
study_xgboost = optuna.create_study(
    direction='minimize',
    study_name='AirQualityWithXGBoost',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_xgboost.optimize(objective_xgb, n_trials=100)

### Zdefiniowanie najlepszego XGBoost

In [None]:
# Names of the tuned parameters shown later in the slice plot.
xgboost_params = [
    'n_estimators',
    'subsample',
    'reg_lambda',
    'learning_rate',
    'max_depth',
    'colsample_bytree',
]

# Rebuild the model from the best trial of the study, refit it on the
# training part and show its RMSE on the hold-out part.
xgbm = define_xgb_model(study_xgboost.best_trial)
xgbm.fit(X_train, y_train)
xgb_pred = xgbm.predict(X_test)
root_mean_squared_error(y_test, xgb_pred)

### <center>Optuna + MLPRegressor</center>

### Wykres przedstawiający każdy <i>trial</i> w procesie nauki

In [None]:
optuna.visualization.plot_optimization_history(study_lightgbm)

In [None]:
optuna.visualization.plot_optimization_history(study_xgboost)

### Wizualizacja przekroju parametrów

In [None]:
optuna.visualization.plot_slice(study_lightgbm, params=lightgbm_params)

In [None]:
optuna.visualization.plot_slice(study_xgboost, params=xgboost_params)

### Wpływ poszczególnych parametrów na proces nauki modelu

In [None]:
optuna.visualization.plot_param_importances(study_lightgbm)

In [None]:
optuna.visualization.plot_param_importances(study_xgboost)

### Najlepsze parametry

In [None]:
study_lightgbm.best_params

In [None]:
study_xgboost.best_params

### Znaczenie poszczególnych kolumn

In [None]:
lgb.plot_importance(lgbm, figsize=(20, 12), dpi=200)

### Drzewo decyzyjne

In [None]:
lgb.plot_tree(lgbm, precision=2, figsize=(20, 12), show_info=['data_percentage'], dpi=200, orientation='vertical')

### <center>Stacking</center>

In [None]:
from sklearn.linear_model import LinearRegression
# Meta-features: one column per base model, holding its predictions for the
# hold-out rows.
X_data = np.column_stack([lgb_pred, xgb_pred])

# The linear meta-model is fitted on those predictions against the hold-out
# target.
meta_model = LinearRegression()
meta_model.fit(X_data, y_test)

# Base-model predictions for the competition test frame, stacked in the same
# column order as the meta-features above.
lgbm_final = lgbm.predict(test)
xgbm_final = xgbm.predict(test)

X_final_test = np.column_stack([lgbm_final, xgbm_final])

In [None]:
X_data

## <center>Do wysłania</center>

### lightGBM

In [None]:
save_to_csv(lgbm_final, 'lightgbm.csv')

### XGBoost

In [None]:
save_to_csv(xgbm_final, 'xgb.csv')

### Stack lightGBM + XGBoost

In [None]:
save_to_csv(meta_model.predict(X_final_test), 'stack_lgb_xgb.csv')

### Dodatkowe informacje
<ol>
    <li>The 15km SO2 band is ingested only when solar_zenith_angle < 70.</li>
    <li>Because of noise on the data, negative vertical column values are often observed in particular over clean regions or for low SO2 emissions. It is recommended not to filter these values except for outliers, i.e. for vertical columns lower than -0.001 mol/m^2.</li>
    <li>The effective cloud fraction is the radiometric equivalent cloud fraction of a satellite pixel assuming a fixed cloud albedo, usually 0.8. By definition the effective cloud fraction times the assumed cloud albedo plus the cloud-free surface and atmosphere contributions yields a TOA reflectance that agrees with the observed TOA reflectance.</li>
</ol>