### <center>Zadanie 6</center>

Jakość powietrza w Ugandzie

In [None]:
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, TimeSeriesSplit

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')

#### Wczytanie danych

In [None]:
# Read the training set and discard the 'ID' and 'device' columns right away.
train = pd.read_csv(os.path.join('data', 'train.csv')).drop(columns=['ID', 'device'])

# Column dtypes and non-null counts of the loaded frame.
train.info()

In [None]:
# Preview the first rows of the loaded frame.
train.head()

#### Rozbicie daty na składowe

In [None]:
# Parse the date once, derive calendar features from it, order the rows
# chronologically and finally drop the raw date column.
parsed_date = pd.to_datetime(train['date'], format='%Y-%m-%d')
train = (
    train
    .assign(
        date=parsed_date,
        year=parsed_date.dt.year,
        month=parsed_date.dt.month,
        dayofweek=parsed_date.dt.dayofweek,
    )
    .sort_values(by='date')
    .drop(columns=['date'])
)

#### Procent brakujących wartości w każdej kolumnie

In [None]:
# Missing values per column, most affected first, expressed as a percentage of all rows.
missing_counts = train.isnull().sum().sort_values(ascending=False)
missing_counts / train.shape[0] * 100

#### Usuń kolumny, które mają co najmniej 40% NaN

In [None]:
# Percentage of missing values in every column.
nan_percentage = train.isnull().sum() / train.shape[0] * 100

# Columns at or above the 40% threshold are removed from the frame.
to_many_nans = train.columns[nan_percentage >= 40.0]
train = train.drop(columns=to_many_nans)

#### Uzupełnienie NaN przy wykorzystaniu IterativeImputer

Dla każdej kolumny, która posiada braki, zostaną one uzupełnione wartościami oszacowanymi przez `IterativeImputer` na podstawie pozostałych cech.

In [None]:
# Target vector and predictor matrix.
y = train['pm2_5']
X = train.drop(columns=['pm2_5'])

# Fit the iterative imputer on the predictors, then fill their gaps.
imputer = IterativeImputer(random_state=42)
imputer.fit(X)
train_after_impute = imputer.transform(X)

# Write the imputed values back into `train` and rebuild X from the imputed
# array (this gives X a fresh 0..n-1 index).
train[X.columns] = train_after_impute
X = pd.DataFrame(data=train_after_impute, columns=X.columns)

#### Macierz korelacji dla całego zbioru

In [None]:
# Heatmap of the pairwise correlations between the feature columns of X.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(
    data=X.corr(),
    ax=ax,
    cmap='coolwarm',
    annot=False,
    square=True,
    cbar=True,
    linewidths=0.1,
    linecolor='black',
    xticklabels=True,
    yticklabels=True,
)
ax.set_title('Macierz korelacji dla zbioru treningowego', fontsize=16)
plt.show()

#### Usunięcie kolumn nisko skorelowanych z pm2_5

In [None]:
def drop_low_correlated_columns_to_pm2_5(data=None, target='pm2_5', threshold=0.01):
    """Return the names of columns that are almost uncorrelated with the target.

    Despite its name, the function does not drop anything: it only reports the
    column names, and the caller decides what to remove.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing the features and the target column. When omitted, the
        notebook-level ``train`` frame is used (the original behaviour).
    target : str, default 'pm2_5'
        Name of the target column.
    threshold : float, default 0.01
        A column is reported when the absolute value of its correlation with
        ``target`` is strictly below this value.

    Returns
    -------
    numpy.ndarray
        Names of the weakly correlated columns. Columns whose correlation is
        NaN are never reported, because the comparison is False for NaN.
    """
    frame = train if data is None else data
    target_corr = frame.corr(numeric_only=True)[target]
    return target_corr[target_corr.abs() < threshold].index.to_numpy()

# Remove from X the features whose correlation with pm2_5 is negligible.
low_correlated = drop_low_correlated_columns_to_pm2_5()
X = X.drop(columns=low_correlated)

# Show which columns were removed.
low_correlated

#### Usunięcie kolumn wysoko skorelowanych z innymi

In [None]:
def drop_high_correlated_columns(data=None, threshold=0.99):
    """Return the names of columns that nearly duplicate an earlier column.

    Despite its name, the function does not drop anything: it only reports the
    column names, and the caller decides what to remove.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with the feature columns. When omitted, the notebook-level ``X``
        frame is used (the original behaviour).
    threshold : float, default 0.99
        A column is reported when its absolute correlation with any column
        that precedes it in the frame is strictly above this value.

    Returns
    -------
    list
        Column names, in frame order. Of every highly correlated pair only the
        later column is reported, so the earlier one is kept.
    """
    frame = X if data is None else data
    matrix = frame.corr(numeric_only=True).abs()
    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is never compared with itself.
    upper_mask = np.triu(np.ones(matrix.shape, dtype=np.bool_), k=1)
    upper_t = matrix.where(upper_mask)
    return [col for col in upper_t.columns if (upper_t[col] > threshold).any()]

# Remove from X the features that almost duplicate another feature.
high_correlated = drop_high_correlated_columns()
X = X.drop(columns=high_correlated)

# Show which columns were removed.
high_correlated

#### <center>Wizualizacja danych</center>

#### Rozkład wartości kolumny pm2_5

In [None]:
# Histogram (with a KDE overlay) of the target values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    x=y,
    ax=ax,
    bins=50,
    kde=True,
    line_kws={'linewidth': 2},
    color='skyblue',
    alpha=0.5,
)
ax.set_xticks(range(0, 451, 50))
ax.set_title('Rozkład wartości PM2.5', fontsize=16)
ax.set_xlabel('Wartość PM2.5', fontsize=14)
ax.set_ylabel('Liczebność', fontsize=14)

#### <center>Proces nauki i testowania</center>

In [None]:
# The rows were sorted by date before the date column was dropped, and every
# model below is validated with TimeSeriesSplit. A shuffled split would mix
# later observations into the training part and make those time-ordered folds
# meaningless, so the hold-out set is the most recent 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Give all four parts a clean 0..n-1 index so features and targets stay aligned by label.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

#### Usunięcie outlierów przy wykorzystaniu metody z-score

In [None]:
# Mean and standard deviation of the training target.
pm2_5_mean = y_train.mean()
pm2_5_std = y_train.std()

# Translate z-scores 0, 0.25, ..., 5 into the matching pm2_5 values.
zscore_grid = np.linspace(0, 5, 21)
detect_outliers = pd.DataFrame(
    data={
        'zscore': zscore_grid,
        'pm2_5': zscore_grid * pm2_5_std + pm2_5_mean,
    }
)

detect_outliers

In [None]:
# z-score = (value - mean) / std, so "z-score above 5" means
# value > mean + 5 * std. The mean must be part of the cut-off; comparing
# against 5 * std alone places the threshold too low and removes extra rows.
outlier_threshold = pm2_5_mean + 5 * pm2_5_std
outliers_indexes = y_train[y_train > outlier_threshold].index

# Drop the same rows from the features and the target so they stay aligned.
X_train = X_train.drop(index=outliers_indexes)
y_train = y_train.drop(index=outliers_indexes)

X_train.info()

#### <center>Regresja liniowa</center>

In [None]:
# Linear baseline: fit on the training part, predict the hold-out part.
linear = LinearRegression(n_jobs=-1)
linear.fit(X_train, y_train)
linear_pred = linear.predict(X_test)

# Hold-out RMSE of the baseline.
root_mean_squared_error(y_true=y_test, y_pred=linear_pred)

#### Inne metryki

In [None]:
# (R^2, MAE) of the linear baseline on the hold-out set.
(
    r2_score(y_test, linear_pred),
    mean_absolute_error(y_test, linear_pred),
)

#### <center>Las losowy</center>

In [None]:
def define_rf(trial):
    """Build an unfitted RandomForestRegressor from the values suggested by `trial`.

    `trial` must expose Optuna's `suggest_int` / `suggest_float` methods.
    """
    # Hyperparameters proposed by the trial. The order of the suggest calls is
    # kept fixed because a seeded sampler draws values sequentially.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 80, 250),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 5, 20),
        'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 1e-3, 1, log=True),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1),
    }
    # Settings shared by every trial.
    fixed = {
        'criterion': 'squared_error',
        'bootstrap': True,
        'random_state': 42,
        'n_jobs': -1,
    }
    return RandomForestRegressor(**searched, **fixed)

def objective_rf(trial):
    """Return the mean cross-validated RMSE of the forest proposed for `trial`."""
    fold_scores = cross_val_score(
        estimator=define_rf(trial),
        X=X_train,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=TimeSeriesSplit(n_splits=5),
        n_jobs=-1,
    )
    # The scorer yields negated RMSE; flip the sign so that lower is better.
    return -fold_scores.mean()

# Seeded TPE search (100 trials) minimising the cross-validated RMSE of the random forest.
rf_sampler = optuna.samplers.TPESampler(seed=42)
study_rf = optuna.create_study(
    study_name='RegressionRandomForest',
    direction='minimize',
    sampler=rf_sampler,
)
study_rf.optimize(objective_rf, n_trials=100)

#### Wizualizacja hiperparametryzacji lasu losowego

In [None]:
# Objective value (mean cross-validated RMSE) across the trials of the random-forest study.
optuna.visualization.plot_optimization_history(study_rf)

In [None]:
# Slice plot of the random-forest study: objective value against each searched hyperparameter.
optuna.visualization.plot_slice(study_rf)

In [None]:
# Hyperparameter importances estimated by Optuna for the random-forest study.
optuna.visualization.plot_param_importances(study_rf)

#### <center>LightGBM</center>

In [None]:
def define_lgb(trial):
    """Build an unfitted LGBMRegressor from the values suggested by `trial`.

    `trial` must expose Optuna's `suggest_int` / `suggest_float` methods.
    """
    params = {
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 30, 100),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        # LightGBM only applies `subsample` (bagging_fraction) when the bagging
        # frequency is non-zero. With the default of 0 the tuned `subsample`
        # value would have no effect, so re-sample rows at every iteration.
        'subsample_freq': 1,
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 2),
        'max_bin': trial.suggest_int('max_bin', 64, 256),
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1,
    }
    return lgb.LGBMRegressor(**params)

def objective_lgb(trial):
    """Return the mean cross-validated RMSE of the LightGBM model proposed for `trial`."""
    fold_scores = cross_val_score(
        estimator=define_lgb(trial),
        X=X_train,
        y=y_train,
        scoring='neg_root_mean_squared_error',
        cv=TimeSeriesSplit(n_splits=5),
        n_jobs=-1,
    )
    # The scorer yields negated RMSE; flip the sign so that lower is better.
    return -fold_scores.mean()

# TPE search (100 trials) minimising the cross-validated RMSE of LightGBM.
# The sampler is seeded, like the one used for the random-forest study, so a
# fresh run of the notebook explores the same sequence of trials.
study_lgb = optuna.create_study(
    direction='minimize',
    study_name='RegressionLightGBM',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(objective_lgb, n_trials=100)

#### Wizualizacja hiperparametryzacji LightGBM

In [None]:
# Objective value (mean cross-validated RMSE) across the trials of the LightGBM study.
optuna.visualization.plot_optimization_history(study_lgb)

In [None]:
# Slice plot of the LightGBM study: objective value against each searched hyperparameter.
optuna.visualization.plot_slice(study_lgb)

In [None]:
# Hyperparameter importances estimated by Optuna for the LightGBM study.
optuna.visualization.plot_param_importances(study_lgb)

#### Wybrane modele

In [None]:
# Rebuild the forest from the best trial, fit it and predict the hold-out set.
rf = define_rf(study_rf.best_trial).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Hold-out RMSE of the tuned forest.
root_mean_squared_error(y_true=y_test, y_pred=rf_pred)

#### Inne metryki

In [None]:
# (R^2, MAE) of the tuned random forest on the hold-out set.
(
    r2_score(y_test, rf_pred),
    mean_absolute_error(y_test, rf_pred),
)

#### Istotność cech dla lasu losowego

In [None]:
# Rank the features by `rf.feature_importances_` and keep the top ten.
rf_importance_table = pd.DataFrame(
    {'feature': rf.feature_names_in_, 'importance': rf.feature_importances_}
)
importance = rf_importance_table.sort_values(by='importance', ascending=False).head(10)

# Horizontal bar chart of the ten highest importances.
fig, ax = plt.subplots()
ax.barh(importance['feature'], importance['importance'])
ax.set_title('Istotność cech dla lasu losowego', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=14)
ax.set_ylabel('Nazwa cechy', fontsize=14)
plt.show()

In [None]:
# Rebuild LightGBM from the best trial, fit it and predict the hold-out set.
lightgbm = define_lgb(study_lgb.best_trial).fit(X_train, y_train)
lightgbm_pred = lightgbm.predict(X_test)

# Hold-out RMSE of the tuned LightGBM model.
root_mean_squared_error(y_true=y_test, y_pred=lightgbm_pred)

#### Inne metryki

In [None]:
# (R^2, MAE) of the tuned LightGBM model on the hold-out set.
(
    r2_score(y_test, lightgbm_pred),
    mean_absolute_error(y_test, lightgbm_pred),
)

#### Istotność cech dla LightGBM

In [None]:
# Rank the features by `lightgbm.feature_importances_` and keep the top ten.
lgb_importance_table = pd.DataFrame(
    {'feature': lightgbm.feature_name_, 'importance': lightgbm.feature_importances_}
)
importance = lgb_importance_table.sort_values(by='importance', ascending=False).head(10)

# Horizontal bar chart of the ten highest importances.
fig, ax = plt.subplots()
ax.barh(importance['feature'], importance['importance'])
ax.set_title('Istotność cech dla LightGBM', fontsize=16)
ax.set_xlabel('Wartość istotności', fontsize=14)
ax.set_ylabel('Nazwa cechy', fontsize=14)
plt.show()

#### Voting

Polega na uśrednianiu wyników zwróconych przez modele składowe w celu uzyskania bardziej stabilnych i dokładnych predykcji.

In [None]:
# Ensemble of the two tuned models, fitted on the training part.
base_regressors = [
    ('randomforest', rf),
    ('lightgbm', lightgbm),
]
voting = VotingRegressor(estimators=base_regressors, n_jobs=-1)
voting.fit(X_train, y_train)

voting_pred = voting.predict(X_test)

# Hold-out RMSE of the ensemble.
root_mean_squared_error(y_true=y_test, y_pred=voting_pred)

#### Inne metryki

In [None]:
# (R^2, MAE) of the voting ensemble on the hold-out set.
(
    r2_score(y_test, voting_pred),
    mean_absolute_error(y_test, voting_pred),
)

#### <center>Krzywe uczenia</center>

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 20))

# Refit the ensemble on the complete data set.
# NOTE(review): the curves below are computed on X and y directly, so this
# refit is presumably meant for later use of `voting` - confirm.
voting.fit(X, y)

plt.suptitle('Krzywe uczenia dla wybranych regresorów', fontsize=16, fontweight='bold')
models = [
    voting,
    rf,
    lightgbm,
    linear
]
titles = ['Voting', 'Las losowy', 'LightGBM', 'Regresja liniowa']

# One time-ordered splitter is enough for every model.
tss = TimeSeriesSplit(n_splits=5)

for i, (model, title) in enumerate(zip(models, titles)):
    # Panel position in the 2x2 grid: xc is the row, yc is the column.
    xc, yc = divmod(i, 2)
    panel = ax[xc, yc]

    train_size, train_scores, test_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        cv=tss,
        train_sizes=np.linspace(0.1, 1, 10),
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )
    # Scores are negated RMSE; average over the folds and flip the sign.
    train_scores_mean = -1 * np.mean(train_scores, axis=1)
    test_scores_mean = -1 * np.mean(test_scores, axis=1)

    panel.plot(train_size, train_scores_mean, 'o-', color='skyblue', label='Train')
    panel.plot(train_size, test_scores_mean, 'o-', color='orange', label='Test')
    panel.legend(loc='best')
    panel.grid(True)
    panel.set_title(title, fontsize=16, pad=10)
    panel.set_ylabel('Wartość RMSE')
    # Label the x axis on the bottom row only. The previous condition tested
    # the column index, which labelled the right-hand panels instead.
    panel.set_xlabel('Liczba obserwacji z zbioru treningowego' if xc == 1 else '')

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()