In [1]:
%%capture
!pip install scikit-optimize catboost optuna bayesian-optimization

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# After running
! git clone --recursive https://github.com/Microsoft/LightGBM

#You can run this oneliner which will build and compile LightGBM with GPU enabled in colab:
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

Cloning into 'LightGBM'...
remote: Enumerating objects: 29073, done.[K
remote: Counting objects: 100% (328/328), done.[K
remote: Compressing objects: 100% (200/200), done.[K
remote: Total 29073 (delta 172), reused 216 (delta 126), pack-reused 28745[K
Receiving objects: 100% (29073/29073), 20.56 MiB | 24.28 MiB/s, done.
Resolving deltas: 100% (21505/21505), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.       

In [1]:
%cd drive/MyDrive/kaggle/WiDS

%load_ext autoreload
%autoreload 2

/content/drive/MyDrive/kaggle/WiDS


In [2]:
import pandas as pd
import numpy as np
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 100)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
import sys
from datetime import datetime
import gc

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from statsmodels.tsa.arima.model import ARIMA
from scipy.stats import uniform, randint
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from catboost import CatBoostRegressor
import xgboost as xgb
import lightgbm as lgb
import optuna

import warnings; warnings.filterwarnings('ignore')

sys.path.append('..')
from config import CFG
from src.data_processing.reduce_mem import reduce_mem_usage
CFG = CFG()

In [3]:
BASE_PATH = os.getcwd()
DATA_PATH = os.path.join(BASE_PATH, 'data')
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')

CFG.DATA_PATH = DATA_PATH
CFG.RAW_DATA_PATH = RAW_DATA_PATH
CFG.BASE_PATH = BASE_PATH

In [4]:
train = pd.read_csv(os.path.join(CFG.RAW_DATA_PATH, 'train_data.csv'), index_col='index', parse_dates=["startdate"])
test = pd.read_csv(os.path.join(CFG.RAW_DATA_PATH, 'test_data.csv'), index_col='index', parse_dates=["startdate"])
sample = pd.read_csv(os.path.join(CFG.DATA_PATH, 'sample_solution.csv'))
target = 'contest-tmp2m-14d__tmp2m'

In [None]:
contest = [x for x in train.columns.tolist() if 'contest' in x]
tmp2 = [x for x in train.columns.tolist() if 'tmp2' in x and x not in contest]
prate = [x for x in train.columns.tolist() if 'prate' in x and x not in contest]
vwnd = [x for x in train.columns.tolist() if 'vwnd' in x and x not in contest]
uwnd = [x for x in train.columns.tolist() if 'uwnd' in x and x not in contest]
wind_hgt = [x for x in train.columns.tolist() if 'wind-hgt' in x and x not in contest]
sst_2010 = [x for x in train.columns.tolist() if 'sst-2010' in x and x not in contest]
icec = [x for x in train.columns.tolist() if 'icec' in x and x not in contest]

In [5]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 353.67 Mb (49.8% reduction)
Mem. usage decreased to 29.60 Mb (49.5% reduction)


In [None]:
print(train.shape, test.shape)

(375734, 246) (31354, 245)


In [6]:
def loc_norm(train, test):
    scale = 14

    train.loc[:, 'lat'] = round(train.lat, scale)
    train.loc[:, 'lon'] = round(train.lon, scale)
    test.loc[:, 'lat'] = round(test.lat, scale)
    test.loc[:, 'lon'] = round(test.lon, scale)

    all_df = pd.concat([train, test], axis=0)
    all_df['loc_group'] = all_df.groupby(['lat', 'lon']).ngroup()
    train = all_df.iloc[:len(train)]
    test = all_df.iloc[len(train):].drop(target, axis=1)
    return train, test

def cat_enc(train, test):
    le = LabelEncoder()
    train['climateregions__climateregion'] = le.fit_transform(train['climateregions__climateregion'])
    test['climateregions__climateregion'] = le.transform(test['climateregions__climateregion'])
    train = train.drop('climateregions__climateregion', axis=1)
    test = test.drop('climateregions__climateregion', axis=1)
    return train, test

def sep_date(train, test):
    train['year'] = train['startdate'].dt.year
    train['month'] = train['startdate'].dt.month
    train['day'] = train['startdate'].dt.dayofyear

    test['year'] = test['startdate'].dt.year
    test['month'] = test['startdate'].dt.month
    test['day'] = test['startdate'].dt.dayofyear

    return train, test

def feat_eng(train, test):
    train, test = loc_norm(train, test)
    train, test = cat_enc(train, test)
    train, test = sep_date(train, test)

    X = train.drop(['startdate', 'lat', 'lon', target], axis=1)
    X_test = test.drop(['startdate', 'lat', 'lon'], axis=1)
    y = train[target]

    return X, X_test, y

In [7]:
X, X_test, y = feat_eng(train, test)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=CFG.SEED)
print(f"X_train shape: {X_train.shape} X_val: {X_val.shape} y_train: {y_train.shape} y_val: {y_val.shape}")

X_train shape: (300587, 244) X_val: (75147, 244) y_train: (300587,) y_val: (75147,)


In [9]:
def adv_val(train_X, test_X):
    lgb_params = {
        'n_estimators': 100,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'verbose': 1
    }
    ad_y = np.array([1]*train_X.shape[0] + [0]*test_X.shape[0])
    ad_x = pd.concat([train_X, test_X])

    lgb_data = lgb.Dataset(ad_x, ad_y)
    cv_lgb = lgb.cv(lgb_params, lgb_data)

    print("Adversarial Validation AUC Score: {}".format(cv_lgb['valid auc-mean'][-1]))

    ad_val_mod = lgb.train(lgb_params, lgb_data)
    
    print(pd.DataFrame(
        {'feat':ad_x.columns, 
         'imp':ad_val_mod.feature_importance()}).sort_values('imp', ascending = False))
    
    return ad_val_mod, cv_lgb

In [10]:
ad_val_mod, cv_lgb = adv_val(X_train, X_test)

[LightGBM] [Info] Number of positive: 240469, number of negative: 25083
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 59880
[LightGBM] [Info] Number of data points in the train set: 265552, number of used features: 244
[LightGBM] [Info] Number of positive: 240469, number of negative: 25084
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 59880
[LightGBM] [Info] Number of data points in the train set: 265553, number of used features: 244
[LightGBM] [Info] Number of positive: 240470, number of negative: 25083
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 59880
[LightGBM] [Info] Number of data points in the train set: 265553, number of used features: 244
[LightGBM] [Info] Number of positive: 240470, number of negative: 25083
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 59880
[LightGBM] [Info] Number of data points in the train se

In [None]:
# X.drop(['contest-pevpr-sfc-gauss-14d__pevpr','nmme0-tmp2m-34w__cancm30'], inplace = True)
# X_test.drop(['contest-pevpr-sfc-gauss-14d__pevpr','nmme0-tmp2m-34w__cancm30'], inplace = True)

In [14]:
gc.collect()

X1, Y1 = X.copy(), y.copy()

from catboost import Pool, cv, CatBoostRegressor
from bayes_opt import BayesianOptimization
from bayes_opt import BayesianOptimization as BO
import warnings
from sklearn.model_selection import * 
from sklearn.metrics import *

def rmse(actual, predicted):
    return mean_squared_error(actual, predicted, squared=False)

Use_BO = CFG.USE_BO
if Use_BO:
    #n_estimators,
    # num_leaves
    def CB_opt(depth, learning_rate, l2_leaf_reg, model_size_reg): 
        scores = []
    #     skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 1944)
        trainx, valx, trainy, valy = train_test_split(X1, Y1, test_size=0.33, random_state=42)
        reg = CatBoostRegressor(   
                                        verbose = 0,
                                        #iterations=10,
                                        #n_estimators = 10,
                                        learning_rate = learning_rate,
                                        #subsample = subsample, 
                                        l2_leaf_reg = l2_leaf_reg,
                                        max_depth = int(depth),
                                        #num_leaves = int(num_leaves),
                                        random_state = 1212,
                                        #grow_policy = "Lossguide",
    #                                     max_bin = int(max_bin),  
                                        use_best_model = True, 
                                        # bootstrap_type='Bayesian',
                                        loss_function='RMSE',
                                        model_size_reg = model_size_reg,
                                        task_type = 'GPU',
                                        devices = '0'
                                        
                                    )

        reg.fit(trainx, trainy, eval_set = (valx, valy))
        y_pred = reg.predict(valx)
        scores.append(rmse(valy, y_pred))
        return 1/np.mean(scores)

    #"n_estimators": (150,1200),
    # "num_leaves": (100,150),
    # "max_bin":(150,300),
    pbounds = {
               "depth": (6, 7),
               "learning_rate": (0.09, 0.0980689972639084),
               #"subsample":(0.7, 0.800000011920929),
               "l2_leaf_reg":(2,4),
               "model_size_reg": (0.48, 0.5)
    }
    optimizer = BayesianOptimization(f = CB_opt, pbounds = pbounds,  verbose = 2, random_state = 1212)
    optimizer.maximize(init_points = 7, n_iter = 30, acq = 'ucb', alpha = 1e-6)

    print(optimizer.max)

    max_bo_params = optimizer.max['params']
    max_bo_params

|   iter    |  target   |   depth   | l2_lea... | learni... | model_... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m1.654    [0m | [0m6.245    [0m | [0m3.609    [0m | [0m0.09141  [0m | [0m0.4994   [0m |
| [0m2        [0m | [0m1.648    [0m | [0m6.225    [0m | [0m3.738    [0m | [0m0.09227  [0m | [0m0.4945   [0m |
| [95m3        [0m | [95m1.674    [0m | [95m6.752    [0m | [95m2.493    [0m | [95m0.09623  [0m | [95m0.4817   [0m |
| [95m4        [0m | [95m1.683    [0m | [95m6.891    [0m | [95m2.436    [0m | [95m0.09754  [0m | [95m0.4818   [0m |
| [0m5        [0m | [0m1.676    [0m | [0m6.807    [0m | [0m2.779    [0m | [0m0.09762  [0m | [0m0.4803   [0m |
| [0m6        [0m | [0m1.674    [0m | [0m6.542    [0m | [0m2.583    [0m | [0m0.09693  [0m | [0m0.4901   [0m |
| [0m7        [0m | [0m1.65     [0m | [0m6.719    [0m | [0m2.623    [0m | [0m0.09214  [0m | [0m0.

In [15]:
Use_BO_result = CFG.USE_BO

if Use_BO_result:
    opt_params = {
              'iterations':20000,
              'verbose':0,
              'learning_rate' : max_bo_params['learning_rate'],
              #'subsample' : max_bo_params['subsample'], 
              'l2_leaf_reg' : max_bo_params['l2_leaf_reg'],
              'max_depth' : int(max_bo_params['depth']), 
              'use_best_model' : True, 
              'loss_function' : 'RMSE',
              'model_size_reg' : max_bo_params['model_size_reg'],
              'task_type': 'GPU',
                'devices': '0'
             }
else:
    opt_params = {
          'iterations':25000,
          'verbose':0,
          'learning_rate' : 0.0980689972639084,
          # 'subsample' : 0.7443133148363695, 
          'l2_leaf_reg' : 2.3722386345448316,
          'max_depth' : int(6.599144674342465),
          'use_best_model' : True, 
          'loss_function' : 'RMSE',
          'model_size_reg' : 0.4833187897595954,
          'task_type': 'GPU',
          'devices': '0'
         }

In [16]:
## catBoost Pool object
train_pool = Pool(data=X1,label = Y1)

X_train, X_test2, y_train, y_test = train_test_split(X1, Y1, test_size=0.33, random_state=42)

bst = CatBoostRegressor(**opt_params)
bst.fit(train_pool, eval_set=(X_test2, y_test), plot=False,silent=True)
print(bst.get_best_score())

{'learn': {'RMSE': 0.13720851627361427}, 'validation': {'RMSE': 0.13745436977896283}}


In [17]:
# Pseudo Labelling
train_pseudo = X_test.copy()
# ddf = pd.read_csv('/kaggle/input/wids-2023-sub3/submission (17).csv')
y_test_pred  = bst.predict(X_test)  #ddf[target]
train_pseudo[target] = y_test_pred
train_mod = pd.concat([X_train.copy(), train_pseudo], axis=0).reset_index(drop=True)
features = [c for c in X_test.columns if (c != 'id')]
display(train_mod)

XX = train_mod[features]
yy = train_mod[target]
y_oof_pred = np.zeros(len(yy))

X_testt = X_test[features].values
y_test_pred2 = np.zeros(len(X_testt))

Unnamed: 0,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,nmme0-tmp2m-34w__gfdlflora0,nmme0-tmp2m-34w__gfdlflorb0,nmme0-tmp2m-34w__gfdl0,nmme0-tmp2m-34w__nasa0,nmme0-tmp2m-34w__nmme0mean,contest-wind-h10-14d__wind-hgt-10,nmme-tmp2m-56w__cancm3,nmme-tmp2m-56w__cancm4,nmme-tmp2m-56w__ccsm3,nmme-tmp2m-56w__ccsm4,nmme-tmp2m-56w__cfsv2,nmme-tmp2m-56w__gfdl,nmme-tmp2m-56w__gfdlflora,nmme-tmp2m-56w__gfdlflorb,...,wind-vwnd-925-2010-6,wind-vwnd-925-2010-7,wind-vwnd-925-2010-8,wind-vwnd-925-2010-9,wind-vwnd-925-2010-10,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20,loc_group,year,month,day,contest-tmp2m-14d__tmp2m
0,158.93,18.39,21.47,17.64,19.88,16.98,16.02,16.34,18.78,19.43,18.33,30890.89,9.38,13.10,10.37,11.29,9.89,10.18,9.45,9.80,...,31.45,-66.48,-50.80,30.94,41.74,-18.12,7.56,71.09,28.97,-32.43,-11.45,-4.40,6.22,6.80,13.80,345,2015,10,287,
1,17.68,-2.01,-1.47,-1.93,0.01,-1.26,-3.83,-3.46,-0.44,-1.53,-1.77,30825.85,-6.22,-5.35,-5.67,-6.74,-4.93,-6.09,-8.63,-8.10,...,73.02,2.04,10.86,11.91,69.28,-78.56,-49.10,-48.89,-0.90,-52.67,-13.40,-13.62,16.01,32.08,41.37,194,2015,12,356,
2,15.29,5.69,6.56,6.70,7.49,6.76,9.54,10.16,6.81,6.14,7.32,30956.16,5.41,6.36,6.28,7.54,5.53,6.31,9.00,8.98,...,18.46,7.86,-42.84,-23.45,44.48,19.11,74.71,-21.27,-18.74,-37.49,-100.61,28.04,2.20,25.30,8.73,209,2016,1,14,
3,39.74,-6.40,-4.13,-5.81,-8.79,-6.02,-3.84,-3.43,-5.28,-9.13,-5.87,31142.85,-7.57,-5.77,-6.38,-10.55,-6.69,-5.96,-5.20,-5.74,...,-3.59,5.83,8.27,-12.58,45.44,-28.86,-2.19,-19.22,-13.55,-78.15,-70.60,38.93,-31.88,32.66,-4.57,366,2016,1,21,
4,176.57,9.80,11.86,11.74,13.53,11.94,12.30,12.23,13.11,14.51,12.33,30839.99,4.63,7.60,8.01,8.40,8.03,7.89,6.93,7.13,...,32.09,-41.25,12.76,-68.56,-2.28,-71.86,-89.94,-22.72,26.19,-41.52,-17.59,-6.89,20.52,4.04,31.79,35,2015,12,348,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283090,62.72,4.60,8.71,6.05,10.08,6.39,8.42,9.08,5.53,6.97,7.32,30269.05,0.86,3.26,0.38,2.02,0.12,1.97,1.76,2.99,...,-62.88,-25.85,9.34,20.33,28.17,74.96,-8.49,32.39,38.82,7.42,11.75,-23.62,-0.24,-5.94,51.23,513,2022,12,361,3.82
283091,73.41,4.60,8.71,6.05,10.08,6.39,8.42,9.08,5.53,6.97,7.32,30264.55,0.29,2.57,-0.44,1.27,-0.57,1.29,1.28,2.55,...,-60.45,-33.44,-3.25,26.70,31.26,88.57,0.83,26.23,37.64,13.01,17.84,-22.05,-3.03,1.31,51.45,513,2022,12,362,3.55
283092,70.00,4.60,8.71,6.05,10.08,6.39,8.42,9.08,5.53,6.97,7.32,30274.65,-0.29,1.87,-1.26,0.52,-1.25,0.62,0.80,2.11,...,-55.91,-37.35,-18.20,37.80,33.81,99.43,10.90,21.06,36.53,14.15,23.12,-25.60,-5.88,9.32,45.32,513,2022,12,363,2.96
283093,79.81,4.60,8.71,6.05,10.08,6.39,8.42,9.08,5.53,6.97,7.32,30296.92,-1.44,0.49,-2.91,-0.97,-2.63,-0.73,-0.17,1.23,...,-59.46,-36.64,-35.02,45.66,35.37,109.39,21.37,20.42,36.05,6.38,29.00,-27.06,-1.42,16.06,31.88,513,2022,12,364,3.77


In [18]:
yy[np.isnan(yy)] = 0
train_pool = Pool(data=XX,label = yy)

X_train3, X_test3, y_trai3, y_test3 = train_test_split(XX, yy, test_size=0.33, random_state=42)

bst2 = CatBoostRegressor(**opt_params)
bst2.fit(train_pool, eval_set=(X_test3, y_test3), plot=True,silent=True)
print(bst2.get_best_score())

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

{'learn': {'RMSE': 0.01443546618823433}, 'validation': {'RMSE': 0.014498142649282186}}


In [19]:
# set up parameters for LightGBM
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          'metric': 'rmse',
          'max_depth': 4,
          'num_leaves': 31,
          'learning_rate': 0.05,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.8,
          'bagging_freq': 5,
          'early_stopping_round': 50,
          'n_estimators': 15000,
          'device': 'gpu'}

reg_lgb = lgb.LGBMRegressor(**params)

reg_lgb.fit(X_train3, y_trai3, eval_set=(X_test3, y_test3))

y_pred_cat = bst2.predict(X_test)

y_pred_lgb = reg_lgb.predict(X_test)

ensemble_preds = y_pred_lgb*0.60+y_pred_cat*0.40

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Did not meet early stopping. Best iteration is:
[14999]	valid_0's rmse: 0.0592734


In [20]:
submit_cat = sample.copy()
submit_cat[target] = y_pred_cat
submit_cat.to_csv(CFG.DATA_PATH + '/y_pred_cat.csv', index = False)

submit_lgb = sample.copy()
submit_lgb[target] = y_pred_lgb
submit_lgb.to_csv(CFG.DATA_PATH + '/y_pred_lgb.csv', index = False)

submit_lgb = sample.copy()
submit_lgb[target] = ensemble_preds
submit_lgb.to_csv(CFG.DATA_PATH + '/ensemble_preds.csv', index = False)