In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

sns.set()
rcParams['figure.figsize'] = (20,10)
pd.options.display.max_columns = None
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor
from optuna import Trial, visualization, create_study

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

In [85]:
inlocal = True
if inlocal:
  path_to_data = '../Data/created/'
else:
  from google.colab import drive
  from google.colab import output
  output.enable_custom_widget_manager()
  drive.mount('/content/drive')
  path_to_data = '/content/drive/My Drive/WiDS 2023/data/'

df = pd.read_csv(path_to_data + 'train_processed.csv')
df_submit = pd.read_csv(path_to_data + 'test_processed.csv')

In [115]:
target = 'contest_tmp2m_14d__tmp2m'
df.index = pd.RangeIndex(start=0, stop=len(df), step=1)
X_train_temp, X_test, y_train_temp, y_test = train_test_split(df.drop(target, axis=1), df[target], test_size=0.05, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.15, shuffle=False)

In [118]:
def objective(trial: Trial):
  params = {
    'random_seed': 42,
    'iterations': 10000,
    'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
  }

  model = CatBoostRegressor(
    **params,
    cat_features=['lat', 'lon', 'climateregions__climateregion', 'elevation__elevation', 'mjo1d__phase', 'month', 'week', 'is_mei__nip_3', 'location', 'day'],
    loss_function= 'RMSE',
    eval_metric= 'RMSE',
    # task_type= 'GPU',
    # devices= '0:1',
  )

  model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val), 
    early_stopping_rounds=200, 
    verbose=False,
    plot=True,
  )
  
  preds = model.predict(X_val)
  mse = mean_squared_error(y_val, preds)
  return mse

In [119]:
study = create_study(
  direction='minimize',
  study_name='catboost',
)
study.optimize(objective, n_trials=10)

[32m[I 2023-01-23 20:57:08,081][0m A new study created in memory with name: catboost[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-01-23 20:58:46,090][0m Trial 0 finished with value: 2.655847162373183 and parameters: {'learning_rate': 0.0788729691817006}. Best is trial 0 with value: 2.655847162373183.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-01-23 21:00:18,157][0m Trial 1 finished with value: 2.9292443819881746 and parameters: {'learning_rate': 0.06805647781024136}. Best is trial 0 with value: 2.655847162373183.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-01-23 21:01:32,234][0m Trial 2 finished with value: 2.8109540930348023 and parameters: {'learning_rate': 0.11293143595147728}. Best is trial 0 with value: 2.655847162373183.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[33m[W 2023-01-23 21:02:00,174][0m Trial 3 failed because of the following error: KeyboardInterrupt('')[0m
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/sx/c_sxyj753hx_sjy7c76rlnt00000gn/T/ipykernel_57546/4060173120.py", line 15, in objective
    model.fit(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 5730, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 2355, in _fit
    self._train(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/catboost/core.py", line 1759, in _train
    self._object._train(trai

KeyboardInterrupt: 

In [None]:
import joblib
joblib.dump(study, '/content/drive/My Drive/WiDS 2023/catboost_study.pkl')

In [None]:
study = joblib.load('/content/drive/My Drive/WiDS 2023/catboost_study.pkl')
study.optimize(objective, n_trials=500)

In [None]:
visualization.plot_optimization_history(study)
visualization.plot_slice(study)
visualization.plot_contour(study, params=['iterations', 'learning_rate'])
visualization.plot_parallel_coordinate(study, params=['iterations', 'learning_rate'])
visualization.plot_param_importances(study)

In [None]:
# get top 10 trials
temp = study.trials_dataframe().sort_values(by='value')
candidate = temp[['number','params_learning_rate']][~temp['params_learning_rate'].isnull()].to_dict('records')[:10]
candidate = list(map(lambda x: (x['number'],x['params_learning_rate']), candidate))

In [None]:
for number, learning_rate in candidate:
  params = {
    'random_seed': 42,
    'iterations': 10000,
    'learning_rate': learning_rate,
  }

  model = CatBoostRegressor(
    **params,
    cat_features=['lat', 'lon', 'climateregions__climateregion', 'elevation__elevation', 'mjo1d__phase', 'month', 'week', 'is_mei__nip_3', 'location', 'day'],
    loss_function= 'RMSE',
    eval_metric= 'RMSE',
    task_type= 'GPU',
    devices= '0:1',
  )
  model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=200,
    verbose=False,
  )
  preds = model.predict(X_test)
  mse = mean_squared_error(y_test, preds)
  print(f'number: {number}, learning_rate: {learning_rate}, mse: {mse}')
  
    

In [None]:
# retrain model with best params
params = study.best_params
params['cat_features'] = ['lat', 'lon', 'climateregions__climateregion', 'elevation__elevation', 'mjo1d__phase', 'month', 'week', 'is_mei__nip_3', 'location', 'day']
params['loss_function'] = 'RMSE'
params['eval_metric'] = 'RMSE'
params['random_seed'] = 42
# params['task_type'] = 'GPU'
# params['devices'] = '0:1'
final_model = CatBoostRegressor(**params)
final_model.fit(X_train, y_train, verbose=True)

preds = final_model.predict(X_test)
mse = mean_squared_error(y_test, preds)
print(f'MSE: {mse}')

In [20]:
submit_index = pd.read_csv('../Data/created/submit_index.csv')['index']
y_submit_pred = final_model.predict(X_submit)
submit = pd.DataFrame({
  'contest-tmp2m-14d__tmp2m': y_submit_pred,
  'index':submit_index
})
submit.to_csv('../submission/catboost.csv', index=False)

NameError: name 'final_model' is not defined