In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

# Apply seaborn's default theme first, then override the default figure size.
sns.set()
rcParams.update({'figure.figsize': (20, 10)})
# Suppress every warning for the remainder of the session.
warnings.simplefilter('ignore')

In [3]:
# Switch between a local checkout and Google Colab (data read from a mounted Drive).
inlocal = True
if not inlocal:
  from google.colab import drive, output
  output.enable_custom_widget_manager()
  drive.mount('/content/drive')
  path_to_data = '/content/drive/My Drive/WiDS 2023/data/'
else:
  path_to_data = '../Data/created2/'

# Both processed CSVs are read from the same directory prefix.
df_train, df_submit = (
  pd.read_csv(path_to_data + csv_name)
  for csv_name in ('train_processed.csv', 'test_processed.csv')
)

In [4]:
# Hold-out split on `startdate`: rows before 2016-05-01 train, the rest validate.
# `startdate` and `day` are dropped from every frame once the split is made.
train = df_train.loc[df_train['startdate'] < '2016-05-01'].drop(columns=['startdate', 'day'])
valid = df_train.loc[df_train['startdate'] >= '2016-05-01'].drop(columns=['startdate', 'day'])
X_submit = df_submit.drop(columns=['startdate', 'day'])

train.shape, valid.shape, X_submit.shape

((312512, 249), (63222, 249), (31354, 248))

In [6]:
target = 'contest_tmp2m_14d__tmp2m'

# Separate the label column from the feature columns for both splits.
y_train, y_valid = train[target], valid[target]
X_train, X_valid = train.drop(columns=[target]), valid.drop(columns=[target])
# X_submit is left untouched here; it is passed to predict() as-is further down.

In [7]:
from catboost import CatBoostRegressor
from optuna.integration.catboost import CatBoostPruningCallback
from optuna import Trial, visualization, create_study, load_study, study, TrialPruned, pruners

In [None]:
import numpy as np
import optuna
from optuna.integration.catboost import CatBoostPruningCallback

import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def objective(trial: optuna.Trial):
    """Fit a CatBoost classifier on a random breast-cancer split and return hold-out accuracy.

    Hyper-parameters are sampled from ``trial``. A ``CatBoostPruningCallback``
    watching the "Accuracy" metric is attached to the fit, and
    ``check_pruned()`` is called afterwards.
    """
    features, labels = load_breast_cancer(return_X_y=True)
    x_fit, x_hold, y_fit, y_hold = train_test_split(features, labels, test_size=0.25)

    # Search space; the order of the suggest_* calls is deliberately unchanged.
    search_space = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
        "eval_metric": "Accuracy",
    }

    # One extra parameter is sampled for two of the three bootstrap types.
    bootstrap = search_space["bootstrap_type"]
    if bootstrap == "Bernoulli":
        search_space["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)
    elif bootstrap == "Bayesian":
        search_space["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)

    classifier = cb.CatBoostClassifier(**search_space)
    prune_hook = CatBoostPruningCallback(trial, "Accuracy")
    classifier.fit(
        x_fit,
        y_fit,
        eval_set=[(x_hold, y_hold)],
        callbacks=[prune_hook],
        early_stopping_rounds=100,
        verbose=0,
    )

    # evoke pruning manually.
    prune_hook.check_pruned()

    # Round the predictions to integer labels before scoring.
    hold_labels = np.rint(classifier.predict(x_hold))
    return accuracy_score(y_hold, hold_labels)


if __name__ == "__main__":
    # Maximise the objective's return value; MedianPruner is configured with n_warmup_steps=5.
    study = optuna.create_study(
        direction="maximize",
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    )
    # Budget: n_trials=100 and timeout=600.
    study.optimize(objective, n_trials=100, timeout=600)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

In [14]:
# Baseline CatBoost regressor trained and evaluated with RMSE; no hyper-parameter search.
base_model = CatBoostRegressor(
    iterations=1460,
    loss_function='RMSE',
    eval_metric='RMSE',
    # Columns CatBoost should treat as categorical rather than numeric.
    cat_features=['lat', 'lon', 'location', 'climateregions__climateregion', 'mjo1d__phase', 'month', 'week'],
)
# The validation split is passed as eval_set; plot=True renders the MetricVisualizer widget.
base_model.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    plot=True,
    verbose=0,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRegressor at 0x29ef63130>

In [21]:
from sklearn.metrics import mean_squared_error
# Score the baseline on the validation split (printed value is MSE, not RMSE),
# then print what get_best_score() returns for the fitted model.
y_pred = base_model.predict(X_valid)
print('MSE:', mean_squared_error(y_true=y_valid, y_pred=y_pred))
print(base_model.get_best_score())

MSE: 1.376881889658913
{'learn': {'RMSE': 0.4460101768805059}, 'validation': {'RMSE': 1.1734061055871092}}


In [16]:
# Pair the baseline's predictions on X_submit with the saved `index` column and write the CSV.
submit_index = pd.read_csv('../Data/created/submit_index.csv').loc[:, 'index']
y_submit_pred = base_model.predict(X_submit)
submit = pd.DataFrame(
  data={'contest-tmp2m-14d__tmp2m': y_submit_pred, 'index': submit_index},
)
submit.to_csv(path_or_buf='../submission/catboost1.csv', index=False)

In [8]:
from sklearn.metrics import mean_squared_error
def objective(trial):
    """Train one CatBoost regressor with trial-sampled hyper-parameters and return its validation RMSE.

    Reads the notebook-level ``X_train``/``y_train`` (fit data) and
    ``X_valid``/``y_valid`` (evaluation data).

    Args:
        trial: Optuna trial used to sample hyper-parameters and, through
            ``CatBoostPruningCallback``, to report intermediate RMSE values
            so the study's pruner can act.

    Returns:
        float: RMSE of the fitted model's predictions on ``X_valid``.

    Raises:
        optuna.TrialPruned: from ``check_pruned()`` when the pruner decided
            to stop this trial.
    """
    param = {
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        # NOTE(review): the pruning callback below is believed to be CPU-only in
        # CatBoost — confirm before re-enabling the GPU settings.
        # 'task_type': 'GPU',
        # 'devices': '0:1',
        'cat_features': ['lat', 'lon', 'location', 'climateregions__climateregion', 'mjo1d__phase', 'month', 'week'],
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 2, 30),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_int('random_strength', 1, 100),
        'bagging_temperature': trial.suggest_int('bagging_temperature', 0, 100),
        'od_type': 'Iter',
        'od_wait': 200,
        'iterations': 10000,
    }
    model = CatBoostRegressor(**param)

    # Without this callback the study's pruner never receives intermediate
    # values and therefore can never prune a trial.
    pruning_callback = CatBoostPruningCallback(trial, 'RMSE')
    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        verbose=0,
        plot=True,
        early_stopping_rounds=200,
        callbacks=[pruning_callback],
    )
    # The callback only records the pruning decision; raising is done here.
    pruning_callback.check_pruned()

    preds = model.predict(X_valid)
    # sqrt(MSE) instead of the deprecated `squared=False` keyword.
    score = float(np.sqrt(mean_squared_error(y_valid, preds)))
    return score

# Minimise the objective's return value (validation RMSE) over 10 trials.
# NOTE: this rebinds `study`, shadowing the `optuna.study` module imported earlier.
study = create_study(
    pruner=pruners.HyperbandPruner(),
    direction='minimize',
)
study.optimize(func=objective, n_trials=10)

[32m[I 2023-02-04 11:54:24,222][0m A new study created in memory with name: no-name-fa242dfe-9051-4409-b795-dbb0bac58ddc[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[33m[W 2023-02-04 11:55:08,047][0m Trial 0 failed because of the following error: KeyboardInterrupt('')[0m
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/sx/c_sxyj753hx_sjy7c76rlnt00000gn/T/ipykernel_9476/2440691933.py", line 20, in objective
    model.fit(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 5730, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 2355, in _fit
    self._train(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 1759, in _train
    self._object._train(train

KeyboardInterrupt: 