In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Submission IDs come from the raw test file; only that one column is kept.
raw_test_path = os.path.join('data', 'test.csv')
test_ids = pd.read_csv(raw_test_path)['ID_Zindi']

# Model inputs are read from the 'prepared' directory.
prepared_dir = 'prepared'
train = pd.read_csv(os.path.join(prepared_dir, 'train.csv'))
test = pd.read_csv(os.path.join(prepared_dir, 'test.csv'))

# Print column dtypes and non-null counts of the training frame.
train.info()

In [None]:
from sklearn.model_selection import train_test_split


# Separate the regression target from the feature columns.
target_column = 'GT_NO2'
y = train[target_column]
X = train.drop(columns=[target_column])

# Hold out 30% of the rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4
)

# Re-wrap both feature splits as DataFrames carrying X's column labels.
X_train, X_test = (
    pd.DataFrame(split, columns=X.columns) for split in (X_train, X_test)
)

In [None]:
import lightgbm as lgb
import optuna
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score, GroupKFold
from catboost import CatBoostRegressor

In [None]:
def define_lightgbm(trial):
    """Build an unfitted ``lgb.LGBMRegressor`` configured from ``trial``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; each tunable
        hyperparameter below is requested from it by name.

    Returns
    -------
    lgb.LGBMRegressor
        Estimator built from the suggested values plus the fixed settings.
    """
    # Search space. The suggest calls are kept in this exact order so the
    # sampler sees the parameters in the same sequence on every trial.
    tuned = dict(
        max_bin=trial.suggest_int('max_bin', 70, 250),
        num_leaves=trial.suggest_int('num_leaves', 150, 400),
        max_depth=trial.suggest_int('max_depth', 6, 15),
        learning_rate=trial.suggest_float('learning_rate', 2e-3, 1e-1, log=True),
        n_estimators=trial.suggest_int('n_estimators', 400, 700),
        bagging_fraction=trial.suggest_float('bagging_fraction', 0.6, 1.0, log=True),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.4, 0.8, log=True),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 100, 400),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-2, 1, log=True),
        bagging_freq=trial.suggest_int('bagging_freq', 1, 5),
    )

    # Settings that are identical for every trial.
    fixed = dict(
        objective='root_mean_squared_error',
        boosting_type='gbdt',
        tree_learner='voting',
        device='cpu',
        n_jobs=-1,
        random_state=4,
        verbosity=-1,
    )

    return lgb.LGBMRegressor(**tuned, **fixed)

def objective_lightgbm(trial):
    """Return the mean cross-validated RMSE of a LightGBM model for ``trial``.

    The folds come from ``GroupKFold`` grouped on ``X['Season']``, with one
    split per distinct season value. The scorer yields negated RMSE, so the
    mean is negated again before being returned.
    """
    seasons = X['Season']
    splitter = GroupKFold(n_splits=seasons.nunique())
    fold_scores = cross_val_score(
        define_lightgbm(trial),
        X,
        y,
        groups=seasons,
        cv=splitter,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    return -fold_scores.mean()

In [None]:
def define_catboost(trial):
    """Build an unfitted, silent ``CatBoostRegressor`` configured from ``trial``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_int`` / ``suggest_float``; each tunable
        hyperparameter below is requested from it by name.

    Returns
    -------
    CatBoostRegressor
        Estimator built from the suggested values plus the fixed settings.
    """
    # Search space. The suggest calls are kept in this exact order.
    # NOTE(review): CatBoost documents min_data_in_leaf as applying only to the
    # Lossguide and Depthwise grow policies, and no grow policy is set here -
    # confirm this parameter behaves as intended.
    tuned = dict(
        iterations=trial.suggest_int('iterations', 100, 400),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        depth=trial.suggest_int('depth', 5, 12),
        subsample=trial.suggest_float('subsample', 0.4, 1.0, log=True),
        colsample_bylevel=trial.suggest_float('colsample_bylevel', 0.4, 1.0, log=True),
        min_data_in_leaf=trial.suggest_int('min_data_in_leaf', 1, 100),
    )

    # Settings shared by every trial: fixed seed and the columns CatBoost
    # should treat as categorical.
    fixed = dict(
        random_state=4,
        cat_features=['Season', 'DayOfWeek', 'Week', 'Year', 'Month'],
    )

    return CatBoostRegressor(**tuned, **fixed, silent=True)

def objective_catboost(trial):
    """Return the mean cross-validated RMSE of a CatBoost model for ``trial``.

    The folds come from ``GroupKFold`` grouped on ``X['Season']``, with one
    split per distinct season value. The scorer yields negated RMSE, so the
    mean is negated again before being returned.
    """
    seasons = X['Season']
    splitter = GroupKFold(n_splits=seasons.nunique())
    fold_scores = cross_val_score(
        define_catboost(trial),
        X,
        y,
        groups=seasons,
        cv=splitter,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error',
    )
    return -fold_scores.mean()

In [None]:
# Minimise the cross-validated RMSE returned by objective_lightgbm.
# The TPE sampler is seeded (same seed as the models and the data split) so
# the hyperparameter search does not draw a different sequence of suggestions
# on every kernel restart.
study_lightgbm = optuna.create_study(
    direction='minimize',
    study_name='GeoAIWithLightGBM',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_lightgbm.optimize(objective_lightgbm, n_trials=100)

In [None]:
# Minimise the cross-validated RMSE returned by objective_catboost.
# The TPE sampler is seeded (same seed as the models and the data split) so
# the hyperparameter search does not draw a different sequence of suggestions
# on every kernel restart.
study_catboost = optuna.create_study(
    direction='minimize',
    study_name='GeoAIWithCatBoost',
    sampler=optuna.samplers.TPESampler(seed=4),
)
study_catboost.optimize(objective_catboost, n_trials=100)

In [None]:
# best_params =  {'max_bin': 84, 'num_leaves': 269, 'max_depth': 11, 'learning_rate': 0.009567724370102485, 'n_estimators': 621, 'bagging_fraction': 0.6947463877737182, 'colsample_bytree': 0.40279034207476644, 'min_data_in_leaf': 111, 'reg_lambda': 0.14810770431354064, 'bagging_freq': 1, 'device': 'cpu', 'n_jobs': -1, 'random_state': 4, 'verbosity': -1, 'tree_learner': 'voting', 'objective': 'root_mean_squared_error', 'boosting_type': 'gbdt'}
# 
# lgb_model = lgb.LGBMRegressor(**best_params)
# lgb_model.fit(X, y)

In [None]:
# Rebuild the LightGBM estimator from the study's best trial, then train it
# on the complete training set (features X, target y).
best_lgb_trial = study_lightgbm.best_trial
lgb_model = define_lightgbm(best_lgb_trial)
lgb_model.fit(X, y)

In [None]:
# Rebuild the CatBoost estimator from the study's best trial, then train it
# on the complete training set (features X, target y).
best_cat_trial = study_catboost.best_trial
cat_model = define_catboost(best_cat_trial)
cat_model.fit(X, y)

In [None]:
# Names of the tuned LightGBM hyperparameters (not referenced by the other
# cells visible in this notebook).
lightgbm_params = ['max_bin', 'num_leaves', 'max_depth', 'learning_rate', 'n_estimators', 'bagging_fraction', 'colsample_bytree', 'min_data_in_leaf']

# lgb_model was fitted on all of X, which includes every row of X_test, so
# scoring it on X_test would report a training-set error. Fit a separate model
# with the same best-trial hyperparameters on the training split only and
# evaluate that one on the held-out rows. lgb_model itself is left untouched.
# NOTE: the hyperparameters were still selected with cross-validation over all
# of X, so this estimate remains somewhat optimistic.
holdout_lgb_model = define_lightgbm(study_lightgbm.best_trial)
holdout_lgb_model.fit(X_train, y_train)

lgb_pred = holdout_lgb_model.predict(X_test)
root_mean_squared_error(y_test, lgb_pred)

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Learning curve for the tuned LightGBM model, using the same season-grouped
# folds and negated-RMSE scorer as the hyperparameter search.
season_groups = X['Season']
LearningCurveDisplay.from_estimator(
    lgb_model,
    X,
    y,
    groups=season_groups,
    cv=GroupKFold(n_splits=season_groups.nunique()),
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=4,
)

In [None]:
from sklearn.model_selection import LearningCurveDisplay


# Learning curve for the tuned CatBoost model, using the same season-grouped
# folds and negated-RMSE scorer as the hyperparameter search.
season_groups = X['Season']
LearningCurveDisplay.from_estimator(
    cat_model,
    X,
    y,
    groups=season_groups,
    cv=GroupKFold(n_splits=season_groups.nunique()),
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    random_state=4,
)

In [None]:
# Display the hyperparameter values of the best trial in the LightGBM study.
study_lightgbm.best_params

In [None]:
# Display the hyperparameter values of the best trial in the CatBoost study.
study_catboost.best_params

In [None]:
def save_to_csv(y_pred, save_as):
    """Write a submission CSV pairing each test ID with its prediction.

    The file has the ID column taken from the module-level ``test_ids``
    followed by a ``GT_NO2`` column, and is written without the index.

    Parameters
    ----------
    y_pred : array-like
        One-dimensional predictions, one per entry of ``test_ids`` and in the
        same order. Values are matched to IDs by position.
    save_as : str
        File name to create inside the ``result`` directory, which is created
        when it does not exist yet.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of test IDs.
    """
    predictions = np.asarray(y_pred)
    # A length mismatch would otherwise be padded with NaN by the column-wise
    # concat below and silently written to the submission file.
    if len(predictions) != len(test_ids):
        raise ValueError(
            f'got {len(predictions)} predictions for {len(test_ids)} test ids'
        )

    os.makedirs('result', exist_ok=True)

    # Both sides carry a fresh 0..n-1 index, so the concat pairs rows by position.
    final_df = pd.concat(
        [test_ids.reset_index(drop=True), pd.DataFrame({'GT_NO2': predictions})],
        axis=1,
    )
    final_df.to_csv(os.path.join('result', save_as), index=False)
    
# Predict the test set with the LightGBM model and write the submission file.
lgb_test_predictions = lgb_model.predict(test)
save_to_csv(lgb_test_predictions, 'lightgbm.csv')

In [None]:
# Predict the test set with the CatBoost model and write the submission file.
cat_test_predictions = cat_model.predict(test)
save_to_csv(cat_test_predictions, 'cat_boost.csv')

In [None]:
# Feature-importance chart for the fitted LightGBM model.
importance_plot_options = {'figsize': (20, 12), 'dpi': 200}
lgb.plot_importance(lgb_model, **importance_plot_options)

In [None]:
# Draw one tree of the fitted LightGBM model; no tree_index is passed, so the
# library default is used. Each node is annotated with its data percentage.
lgb.plot_tree(
    lgb_model,
    orientation='vertical',
    show_info=['data_percentage'],
    precision=2,
    figsize=(20, 12),
    dpi=600,
)

In [None]:
# Reload the LightGBM submission written earlier in the notebook.
xd = pd.read_csv(os.path.join('result', 'lightgbm.csv'), header=0)

# One uniform [0, 1) offset per row. The generator is seeded (same seed as the
# rest of the notebook) so re-running the cell reproduces the same offsets and
# therefore the same output file; the offsets are also saved for reference.
rand = np.random.default_rng(4).random(len(xd))
pd.DataFrame(rand, columns=['random']).to_csv(os.path.join('result', 'random.csv'), index=False)

# xd['GT_NO2'] *= 0.97
# Shift every prediction down by its offset and write the perturbed submission.
xd['GT_NO2'] -= rand
xd.to_csv(os.path.join('result', 'lightgbm_even_better.csv'), index=False)

In [None]:
# Equal-weight blend of the LightGBM and CatBoost test predictions.
lgb_part = lgb_model.predict(test)
cat_part = cat_model.predict(test)
predictions = 0.5 * lgb_part + 0.5 * cat_part

save_to_csv(predictions, 'cat_lgb_fifty_fifty.csv')