In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings

# Plot styling and display defaults applied for the rest of the notebook.
sns.set()
rcParams.update({'figure.figsize': (20, 10)})
pd.set_option('display.max_columns', None)  # never truncate wide frames
warnings.filterwarnings('ignore')  # suppress all warnings from here on

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# import target encoder
from category_encoders import TargetEncoder

In [None]:
# Flip to False when running on Google Colab: the Drive is then mounted and
# the data directory points inside it.
inlocal = True
if not inlocal:
  from google.colab import drive, output
  output.enable_custom_widget_manager()
  drive.mount('/content/drive')
  path_to_data = '/content/drive/My Drive/WiDS 2023/data/'
else:
  path_to_data = '../Data/created/'

In [None]:
# Name of the column the models learn to predict.
target = 'contest_tmp2m_14d__tmp2m'

# Read the processed train frame and the processed test (submission) frame.
df, df_submit = (
    pd.read_csv(path_to_data + csv_name)
    for csv_name in ('train_processed.csv', 'test_processed.csv')
)

In [None]:
# Separate the target vector from the feature matrix.
y_train = df[target]
X_train = df.drop(columns=target)

In [None]:
# Columns that every model in this notebook treats as categorical.
cat_cols = [
    'lat',
    'lon',
    'climateregions__climateregion',
    'elevation__elevation',
    'mjo1d__phase',
    'month',
    'week',
    'is_mei__nip_3',
    'location',
    'day',
]

In [None]:
# CatBoost baseline: receives the raw frame and is told which columns are
# categorical through `cat_features`.
catboost_params = dict(
    iterations=1700,
    random_state=42,
    cat_features=cat_cols,
    verbose=False,
    loss_function='RMSE',
    eval_metric='RMSE',
    task_type='GPU',
    devices='0:1',
)
catboost_base = CatBoostRegressor(**catboost_params)
catboost_base.fit(X_train, y_train, plot=True)

In [26]:
# Importance of every feature according to the fitted CatBoost model,
# ranked so that 1 is the most important, and ordered by that rank.
feature_importances = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': catboost_base.feature_importances_,
    })
    .assign(rank=lambda fi: fi['importance'].rank(ascending=False))
    .sort_values('rank')
)

# Show only the rows belonging to the categorical columns.
feature_importances.loc[feature_importances['feature'].isin(cat_cols)]

Unnamed: 0,feature,importance,rank
0,lat,,
1,lon,,
2,contest_pevpr_sfc_gauss_14d__pevpr,,
3,nmme0_tmp2m_34w__cancm30,,
4,nmme0_tmp2m_34w__cancm40,,
5,nmme0_tmp2m_34w__ccsm30,,
6,nmme0_tmp2m_34w__ccsm40,,
7,nmme0_tmp2m_34w__cfsv20,,
8,nmme0_tmp2m_34w__gfdlflora0,,
9,nmme0_tmp2m_34w__gfdlflorb0,,


In [None]:
# Submission built from the CatBoost model alone.
submit_index = pd.read_csv(path_to_data + 'submit_index.csv')['index']
catboost_submit_preds = catboost_base.predict(df_submit)

submit = pd.DataFrame({
  'contest-tmp2m-14d__tmp2m': catboost_submit_preds,
  'index': submit_index,
})
submit.to_csv('submit.csv', index=False)

In [None]:
# XGBoost baseline: the categorical columns are replaced by their
# target-encoded values before fitting. The fitted encoder is kept so the
# same mapping can be applied to other frames later.
target_encoder = TargetEncoder(cols=cat_cols)
encoded_cat_cols = target_encoder.fit_transform(X_train[cat_cols], y_train)

X_train_xgb = X_train.copy()
X_train_xgb[cat_cols] = encoded_cat_cols

xgb_params = dict(
    random_state=42,
    tree_method='gpu_hist',
    gpu_id=0,
    verbosity=0,
    objective='reg:squarederror',
)
xgboost_base = XGBRegressor(**xgb_params)
xgboost_base.fit(X_train_xgb, y_train)

In [None]:
# LightGBM baseline: the categorical columns are cast to pandas `category`
# dtype on a separate copy of the training frame.
X_train_lgbm = X_train.astype(dict.fromkeys(cat_cols, 'category'))

lgbm_params = dict(
    random_state=42,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    verbosity=-1,
    objective='regression',
)
lgbm_base = LGBMRegressor(**lgbm_params)
lgbm_base.fit(X_train_lgbm, y_train)

In [None]:
# Predictions of each base model on its own version of the training frame.
y_preds_lgbm = lgbm_base.predict(X_train_lgbm)
y_preds_xgboost = xgboost_base.predict(X_train_xgb)
y_preds_catboost = catboost_base.predict(X_train)

In [None]:
# One column per base model; the column names are what the linear blender
# sees as its feature names.
# NOTE(review): these predictions were made on the same rows the base models
# were fitted on, so the blend weights are estimated from in-sample fits.
# Consider out-of-fold predictions here - TODO confirm.
base_model_preds = {
    'catboost': y_preds_catboost,
    'xgboost': y_preds_xgboost,
    'lgbm': y_preds_lgbm,
}
combine_df = pd.DataFrame(base_model_preds)

# Linear meta-model that maps the three base predictions to the target.
lr_model = LinearRegression()
lr_model.fit(combine_df, y_train)

In [None]:
def get_submit(data=None):
  """Blend the three base models' predictions with the fitted linear model.

  Each base model is given the frame in the form it was trained on:
  CatBoost gets the frame as-is, XGBoost gets the categorical columns
  passed through the already-fitted ``target_encoder``, and LightGBM gets
  the categorical columns cast to ``category`` dtype. The three prediction
  vectors are then fed to ``lr_model``.

  Parameters
  ----------
  data : pandas.DataFrame, optional
      Feature frame to score. When omitted, the module-level ``df_submit``
      is used, which is the original behaviour. The frame passed in is not
      modified; the model-specific variants are built on copies.

  Returns
  -------
  The output of ``lr_model.predict`` for the blended base predictions
  (one value per row of the scored frame).
  """
  features = df_submit if data is None else data

  # CatBoost scores the frame without any extra preprocessing.
  y_submit_catboost = catboost_base.predict(features)

  # XGBoost: apply the fitted encoder (transform only, never refit here).
  features_xgb = features.copy()
  features_xgb[cat_cols] = target_encoder.transform(features_xgb[cat_cols])
  y_submit_xgboost = xgboost_base.predict(features_xgb)

  # LightGBM: same categorical casting that was applied to its training frame.
  features_lgbm = features.copy()
  features_lgbm[cat_cols] = features_lgbm[cat_cols].astype('category')
  y_submit_lgbm = lgbm_base.predict(features_lgbm)

  # Column names and order mirror the frame the blender was fitted on.
  combine_df_submit = pd.DataFrame({
      'catboost': y_submit_catboost,
      'xgboost': y_submit_xgboost,
      'lgbm': y_submit_lgbm,
  })

  return lr_model.predict(combine_df_submit)