In [None]:
# Optional dependencies: uncomment and run once on a fresh environment (e.g. Colab).
# %pip install optuna
# %pip install catboost

In [5]:
# Switch between a local checkout and Google Colab.
# Only the Colab branch has to mount Drive and enable the custom widget manager.
inlocal = True
if not inlocal:
    from google.colab import drive, output

    output.enable_custom_widget_manager()
    drive.mount('/content/drive')
    path_to_data = '/content/drive/My Drive/WiDS 2023/data/'
else:
    path_to_data = '../Data/created/'

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

# Apply seaborn's default plot theme to all subsequent figures.
sns.set()
# Default (width, height) used for every matplotlib figure in this notebook.
rcParams['figure.figsize'] = (20,10)
# Do not truncate wide DataFrames when they are displayed.
pd.options.display.max_columns = None
# NOTE(review): this silences *every* warning, including deprecation notices from
# the modelling libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from lightgbm import LGBMRegressor

import optuna
from optuna.integration import lightgbm as lgb
from optuna import Trial, visualization

In [7]:
# Column that the model has to predict.
target = 'contest_tmp2m_14d__tmp2m'

# Read the processed training frame first, then the processed submission frame,
# both from the directory configured in `path_to_data`.
df, df_submit = (
    pd.read_csv(path_to_data + csv_name)
    for csv_name in ('train_processed.csv', 'test_processed.csv')
)

In [9]:
# Feature matrix: every column except the target.
X = df.drop(columns=target)
# Target vector.
y = df[target]

In [13]:
# Names of the feature columns whose dtype compares equal to the built-in `int`.
X.columns[X.dtypes == int]

Index(['lat', 'lon', 'climateregions__climateregion', 'elevation__elevation',
       'mjo1d__phase', 'month', 'week', 'day', 'is_mei__nip_3', 'location'],
      dtype='object')

In [25]:
# Number of distinct values in each integer-typed feature column.
X.loc[:, X.dtypes == int].nunique()

lat                               23
lon                               31
climateregions__climateregion     15
elevation__elevation              51
mjo1d__phase                       8
month                             12
week                              53
day                               31
is_mei__nip_3                      2
location                         514
dtype: int64

In [None]:


# Give the frame a clean 0..n-1 index before slicing it.
df.index = pd.RangeIndex(len(df))

# Ordered (shuffle=False) hold-out: the last 5% of the rows become the test set ...
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    df.drop(columns=target),
    df[target],
    test_size=0.05,
    shuffle=False,
)
# ... and the last 10% of what remains becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=0.1,
    shuffle=False,
)

In [None]:
def objective(trial):
    """Optuna objective: train one GPU LightGBM model and score it on the validation split.

    Hyperparameters are sampled from ``trial``. The model is fitted on the
    notebook-level ``X_train``/``y_train`` and early-stopped on
    ``X_val``/``y_val`` using the RMSE metric configured in ``params``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean squared error of the model's predictions on ``X_val``.
    """
    # Import the real LightGBM under its own name. The notebook-level `lgb` is
    # `optuna.integration.lightgbm`, whose `train` is the LightGBMTuner wrapper:
    # called inside an objective it would run its own stepwise search for every
    # trial and override the values sampled below.
    import lightgbm as lgbm

    params = {
        # Upper bound on boosting rounds; early stopping decides the actual count.
        'n_estimators': 10000,
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        # `suggest_float` replaces the deprecated suggest_uniform / suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 15),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'random_state': 42,
        # Train on the first GPU of the first platform.
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0
    }

    dtrain = lgbm.Dataset(X_train, y_train)
    dval = lgbm.Dataset(X_val, y_val, reference=dtrain)

    # Early stopping is passed as a callback: the `early_stopping_rounds=` and
    # `verbose_eval=` keyword arguments are no longer accepted by LightGBM 4.
    model = lgbm.train(
        params,
        dtrain,
        valid_sets=[dval],
        callbacks=[lgbm.early_stopping(stopping_rounds=250, verbose=False)],
    )
    preds = model.predict(X_val)
    return mean_squared_error(y_val, preds)

# train model

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is the same on
# every run; TPE is the sampler `create_study` would use anyway.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Run 10 trials of `objective`, minimising its return value.
study.optimize(objective, n_trials=10)

In [None]:
# Objective value of every trial, in the order the study ran them.
optuna.visualization.plot_optimization_history(study)