In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Read both splits from the local ``data`` directory.
train, test = (
    pd.read_csv(os.path.join('data', file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Keep the target column in its own variable, separate from the feature frame.
train_target = train['pm2_5']

# Convert the ``date`` column of both frames to datetimes.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
# Summary statistics of the training frame's date column.
train['date'].describe()

In [None]:
# Summary statistics of the test frame's date column, for comparison with train.
test['date'].describe()

In [None]:
# Identifier / location / time-of-day columns removed from both frames.
meta_columns = ['id', 'city', 'country', 'site_id', 'site_latitude', 'site_longitude', 'hour']


def _sensor_angle_columns(frame):
    """Return the frame's column names containing 'sensor_zenith' or 'sensor_azimuth'."""
    return [name for name in frame.columns if 'sensor_zenith' in name or 'sensor_azimuth' in name]


# The training frame additionally loses the target column.
train = train.drop(columns=meta_columns + ['pm2_5'])
test = test.drop(columns=meta_columns)

# Each frame is scanned on its own, so only columns it actually has are dropped.
train = train.drop(columns=_sensor_angle_columns(train))
test = test.drop(columns=_sensor_angle_columns(test))

In [None]:
def drop_high_nans(frame=None, threshold=0.9):
    """Return the names of columns whose share of missing values is at least ``threshold``.

    Despite its name, this function drops nothing; it only reports the
    column names so that the caller can drop them.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``train`` frame is
        used, which keeps the original zero-argument call working.
    threshold : float, default 0.9
        Minimum fraction of NaN rows (inclusive) for a column to be reported.

    Returns
    -------
    list
        Column labels in the frame's column order.
    """
    if frame is None:
        frame = train
    # An empty frame has no meaningful NaN ratio; report nothing.
    if len(frame) == 0:
        return []
    nan_share = frame.isna().sum() / len(frame)
    return nan_share.index[nan_share >= threshold].tolist()


# Columns judged too sparse on the training frame are removed from both frames.
high_nans = drop_high_nans()
train = train.drop(columns=high_nans)
test = test.drop(columns=high_nans)

In [None]:
# Print the test frame's summary after the column drops.
test.info()

In [None]:
# Print the training frame's summary after the column drops.
train.info()

In [None]:
# Training columns whose name contains 'number_density'.
vertical_columns = list(filter(lambda name: 'number_density' in name, train.columns))

In [None]:
# fig, ax = plt.subplots(nrows=len(vertical_columns), ncols=2, figsize=(20, 45))
# for i, column in enumerate(vertical_columns):
#     sns.lineplot(data=train, x='date', y=column, ax=ax[i, 0])
#     sns.lineplot(data=test, x='date', y=column, ax=ax[i, 1])
#     ax[i, 0].set_title(f'train_{column}')
#     ax[i, 1].set_title(f'test_{column}')

### Która kolumna pozwala odróżnić dane treningowe od testowych?

In [None]:
# Mark each row with its origin: 1 for training rows, 0 for test rows.
for frame, origin_flag in ((train, 1), (test, 0)):
    frame['is_train'] = origin_flag

In [None]:
# Stack train on top of test; ignore_index=True already yields a fresh 0..n-1 index.
whole = pd.concat([train, test], ignore_index=True)

In [None]:
# Shuffle BEFORE setting the dates aside. `whole_dates` is later re-attached to
# `whole` by index label, so both must be taken from the same (shuffled and
# re-indexed) row order; capturing the dates first would pair each row with
# another row's date. The shuffle is seeded so the notebook is reproducible.
whole = whole.sample(frac=1, random_state=4).reset_index(drop=True)
whole_dates = whole['date']
whole = whole.drop(columns='date')
whole.info()

In [None]:
from sklearn.model_selection import train_test_split


# The label is the train/test membership flag; every other column is a feature.
y = whole['is_train']
X = whole.drop(columns=['is_train'])

# Hold out 30% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

# Re-wrap both feature parts as DataFrames carrying X's column labels.
X_train, X_test = (pd.DataFrame(part, columns=X.columns) for part in (X_train, X_test))

In [None]:
import lightgbm as lgb
import optuna
from sklearn.model_selection import KFold, cross_val_score


def define_model(trial):
    """Build an unfitted LightGBM binary classifier from a trial's suggestions.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int(name, low, high)`` and
        ``suggest_float(name, low, high, log=...)``; each searched
        hyper-parameter is requested from it by name.

    Returns
    -------
    lgb.LGBMClassifier
        Classifier configured with the suggested values plus the fixed settings.
    """
    # Hyper-parameters requested from the trial, in a fixed order.
    searched = dict(
        max_bin=trial.suggest_int('max_bin', 10, 200),
        num_leaves=trial.suggest_int('num_leaves', 100, 400),
        max_depth=trial.suggest_int('max_depth', 5, 12),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        n_estimators=trial.suggest_int('n_estimators', 200, 500),
        bagging_fraction=trial.suggest_float('bagging_fraction', 0.6, 1.0, log=True),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 0.8, log=True),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 50, 250),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-2, 1e-1, log=True),
        bagging_freq=trial.suggest_int('bagging_freq', 1, 5),
    )
    # Settings that are the same for every trial.
    constant = dict(
        boosting_type='gbdt',
        objective='binary',
        tree_learner='voting',
        device='cpu',
        n_jobs=-1,
        random_state=4,
        verbosity=-1,
        is_unbalance=True,
    )
    return lgb.LGBMClassifier(**constant, **searched)

def objective(trial):
    """Optuna objective: mean 10-fold cross-validated accuracy of the candidate model.

    The folds are drawn from ``X_train`` / ``y_train`` only. Tuning on the full
    ``X`` / ``y`` would let the rows of ``X_test`` influence the selected
    hyper-parameters, making any later score on ``X_test`` optimistic.

    Parameters
    ----------
    trial
        Trial object forwarded to ``define_model``.

    Returns
    -------
    float
        Mean accuracy over the folds (the study maximizes this value).
    """
    candidate = define_model(trial)
    folds = KFold(n_splits=10, random_state=4, shuffle=True)
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=folds, n_jobs=-1, scoring='accuracy'
    )
    return fold_scores.mean()

In [None]:
# Seed the sampler so the hyper-parameter search proposes the same trials on
# every Restart & Run All (the rest of the notebook already uses seed 4).
study_lightgbm = optuna.create_study(
    direction='maximize',
    study_name='AirQualityWithLightGBM',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_lightgbm.optimize(objective, n_trials=100)

# Rebuild a classifier from the best trial and fit it on the training part only.
# NOTE(review): this relies on the best trial's suggest_* methods returning the
# values recorded for that trial - confirm against the installed Optuna version.
lightgbm = define_model(study_lightgbm.best_trial)
lightgbm.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report


# Score the fitted classifier on the held-out rows and print the per-class report.
y_pred = lightgbm.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Plot the fitted classifier's feature importances on a large, high-DPI figure.
lgb.plot_importance(lightgbm, figsize=(20, 12), dpi=200)

In [None]:
# Draw a tree of the fitted classifier top-down, annotating nodes with 'data_percentage'.
# NOTE(review): no tree index is passed, so presumably the library default tree is shown - confirm.
lgb.plot_tree(lightgbm, precision=2, figsize=(20, 12), show_info=['data_percentage'], dpi=200, orientation='vertical')

In [None]:
# Put the target back as the last column of `train`. `assign` aligns on the index
# like the column-wise concat did, but overwrites an existing `pm2_5` column
# instead of appending a duplicate, so re-running this cell is harmless.
train = train.assign(pm2_5=train_target)

In [None]:
# Re-attach the dates as the first column. The concat aligns on index labels, so
# `whole_dates` must have been taken in the same row order as `whole`.
# Any `date` column left over from a previous run of this cell is dropped first,
# so re-running does not create a duplicate column.
whole = pd.concat([whole_dates, whole.drop(columns='date', errors='ignore')], axis=1)

whole.info()

In [None]:
# Columns singled out for a closer train-vs-test comparison.
very_important = [
    'nitrogendioxide_tropopause_pressure',
    'uvaerosolindex_sensor_altitude',
    'ozone_o3_column_number_density_amf',
    'ozone_o3_column_number_density',
]

# 'nitrogendioxide_tropopause_pressure' over time, one line per is_train value.
sns.lineplot(
    data=whole,
    x='date',
    y='nitrogendioxide_tropopause_pressure',
    hue='is_train',
)

In [None]:
# 'ozone_o3_column_number_density_amf' over time, one line per is_train value.
sns.lineplot(
    data=whole,
    x='date',
    y='ozone_o3_column_number_density_amf',
    hue='is_train',
)

In [None]:
# 'ozone_o3_column_number_density' over time, one line per is_train value.
sns.lineplot(
    data=whole,
    x='date',
    y='ozone_o3_column_number_density',
    hue='is_train',
)

In [None]:
# 'uvaerosolindex_sensor_altitude' over time, one line per is_train value.
sns.lineplot(
    data=whole,
    x='date',
    y='uvaerosolindex_sensor_altitude',
    hue='is_train',
)

In [None]:
# Distribution of 'nitrogendioxide_tropopause_pressure', coloured by is_train.
sns.histplot(
    data=whole,
    x='nitrogendioxide_tropopause_pressure',
    hue='is_train',
)

In [None]:
# Distribution of 'ozone_o3_column_number_density_amf', coloured by is_train.
sns.histplot(
    data=whole,
    x='ozone_o3_column_number_density_amf',
    hue='is_train',
)

In [None]:
# Distribution of 'ozone_o3_column_number_density', coloured by is_train.
sns.histplot(
    data=whole,
    x='ozone_o3_column_number_density',
    hue='is_train',
)

In [None]:
# Distribution of 'uvaerosolindex_sensor_altitude', coloured by is_train.
sns.histplot(
    data=whole,
    x='uvaerosolindex_sensor_altitude',
    hue='is_train',
)

In [None]:
# Side-by-side summary statistics of 'uvaerosolindex_sensor_altitude'.
# `keys` labels the two columns by source; without it both carry the same
# column name and cannot be told apart in the output.
pd.concat(
    [
        test['uvaerosolindex_sensor_altitude'].describe(),
        train['uvaerosolindex_sensor_altitude'].describe(),
    ],
    axis=1,
    keys=['test', 'train'],
)

In [None]:
# Side-by-side summary statistics of 'nitrogendioxide_tropopause_pressure'.
# `keys` labels the two columns by source; without it both carry the same
# column name and cannot be told apart in the output.
pd.concat(
    [
        test['nitrogendioxide_tropopause_pressure'].describe(),
        train['nitrogendioxide_tropopause_pressure'].describe(),
    ],
    axis=1,
    keys=['test', 'train'],
)

In [None]:
# Keep only the training rows whose 'uvaerosolindex_sensor_altitude' lies strictly
# inside the range observed in the test frame. The original condition compared
# with `>` against both the test minimum and the test maximum, which made the
# lower bound redundant and selected only rows ABOVE the test range.
# NOTE(review): assumes the intent is "inside the test range" - confirm.
altitude_column = 'uvaerosolindex_sensor_altitude'
test_low, test_high = test[altitude_column].min(), test[altitude_column].max()
inside_test_range = (train[altitude_column] > test_low) & (train[altitude_column] < test_high)
limit = train.loc[inside_test_range, :]