In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
from sklearn.preprocessing import FunctionTransformer
import seaborn as sns 
from scipy import stats
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
import h2o
from h2o.automl import H2OAutoML
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.neural_network import MLPRegressor
from scipy.stats import norm
import copy
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool, metrics, cv
import xgboost as xgb
from scipy.stats import gmean
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pystacknet
import joblib
import sys
sys.modules['sklearn.externals.joblib'] = joblib
from pystacknet.pystacknet import StackNetRegressor


In [2]:
# Read the pre-built train and test feature tables from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./best_train.csv', './best_test.csv')
)

In [3]:
# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` to several of the estimators defined below.
seed = 1234
np.random.seed(seed=seed)

In [4]:
# Ordered list of the model input columns (137 names). The order is significant:
# it fixes the column order of the matrices built from `train_df` / `test_df`.
features = [
    # Raw contest / forecast columns and hand-picked engineered columns.
    'contest-wind-h500-14d__wind-hgt-500',
    'contest-slp-14d__slp',
    'nmme-tmp2m-34w__ccsm3',
    'elevation__elevation',
    'lon',
    'contest-prwtr-eatm-14d__prwtr',
    'lat',
    'climateregions__climateregion',
    'contest-pres-sfc-gauss-14d__pres',
    'season_sin',
    'day_of_year_sin',
    'contest-precip-14d__precip',
    'contest-wind-uwnd-250-14d__wind-uwnd-250',
    'nmme-prate-34w__cfsv2',
    'nmme-prate-34w__nasa',
    'nmme-prate-56w__gfdlflora',
    'wind-uwnd-250-2010-7',
    'contest-wind-vwnd-925-14d__wind-vwnd-925',
    'nmme-prate-34w__nmmemean',
    'nmme0-prate-34w__ccsm30',
    'contest-wind-h850-14d__wind-hgt-850',
    'contest-wind-uwnd-925-14d__wind-uwnd-925',
    'nmme0-prate-56w__cfsv20',
    'nmme-prate-34w__cancm3',
    'contest-rhum-sig995-14d__rhum',
    'nmme-prate-34w__gfdlflorb',
    'wind-hgt-850-2010-4',
    'contest-wind-vwnd-250-14d__wind-vwnd-250',
    'wind-hgt-100-2010-2',
    'wind-uwnd-250-2010-18',
    'wind-hgt-10-2010-5',
    'wind-uwnd-250-2010-15',
    'wind-uwnd-250-2010-4',
    'nmme0-prate-56w__nasa0',
    'nmme0-prate-34w__cfsv20',
    'wind-vwnd-250-2010-10',
    'contest-wind-h10-14d__wind-hgt-10',
    'wind-uwnd-925-2010-15',
    'wind-vwnd-250-2010-3',
    'nmme-prate-34w__cancm4',
    'sst-2010-4',
    'nmme0-prate-56w__ccsm30',
    'wind-uwnd-250-2010-16',
    'nmme0-prate-34w__gfdl0',
    'nmme0-prate-56w__cancm40',
    'sst-2010-1',
    'sst-2010-3',
    'wind-uwnd-250-2010-14',
    'nmme0-prate-34w__gfdlflora0',
    'nmme-prate-34w__gfdl',
    'wind-hgt-850-2010-9',
    'wind-vwnd-250-2010-1',
    'sst-2010-5',
    'cancm30',
    'nmme-prate-34w__ccsm4',
    'nmme0-prate-34w__nasa0',
    'wind-hgt-500-2010-9',
    'nmme0-prate-34w__cancm30',
    'wind-vwnd-250-2010-13',
    'wind_diff',
    'wind_diff_min',
    'wind_diff_min_month',
    'wind_diff_month',
    'diff_slp_first',
    'diff_wind_first',
    'diff_precip_first',
] + [
    # Five summary columns per base signal, emitted stem by stem:
    # diff_<stem>_first, diff_<stem>_min, diff_<stem>_max, range_<stem>, scale_<stem>.
    name_template.format(stem)
    for stem in (
        'sst_10', 'sst_9', 'sst_8', 'sst_7', 'sst_6',
        'sst_1', 'sst_2', 'sst_3', 'sst_4', 'sst_5',
        'pres_1', 'ccsm3_month_1', 'sst_1_month_1',
    )
    for name_template in (
        'diff_{}_first', 'diff_{}_min', 'diff_{}_max', 'range_{}', 'scale_{}',
    )
] + [
    # Lag columns: the three plain lags first, then their "_season" variants.
    lag_name + suffix
    for suffix in ('', '_season')
    for lag_name in ('sst_1_lag_1', 'sst_5_lag_1', 'sst_4_lag_1')
]

# Bookkeeping columns that are not model inputs (not referenced again in the visible cells).
exclude_cols = ['index', 'startdate']

# Regression target, kept as a one-element list; cells below read it as target[0].
target = ["contest-tmp2m-14d__tmp2m"]

In [35]:
# Level-0 learner: CatBoost regressor configured for GPU training, logging every
# 200 iterations. Unlike the other estimators in this notebook, no seed is passed.
# NOTE(review): early_stopping_rounds normally needs a validation set at fit time;
# confirm StackNet supplies one, otherwise all 5000 iterations are always run.
l1_clf1 = CatBoostRegressor(
    iterations=5000,
    early_stopping_rounds=100,
    task_type="GPU",
    devices='0:1',
    verbose=200,
)

In [36]:
# Gradient-boosting candidate for the first stacking level. It is defined here
# but is not part of the `models` list that is handed to StackNet further down.
l1_clf2 = GradientBoostingRegressor(
    random_state=seed,
    n_estimators=400,
    max_depth=15,
    min_samples_leaf=10,
    max_features='sqrt',
    subsample=0.85,
)

In [37]:
# Level-0 learner: LightGBM gradient-boosted trees.
#
# The previous version of this call also passed colsample_bytree=0.8,
# subsample=0.85 and min_child_samples=79. LightGBM documents those as aliases
# of feature_fraction, bagging_fraction and min_data_in_leaf; when an alias and
# its primary name are both supplied, the primary name is kept and the alias is
# dropped with a warning (hidden here by verbosity=-1). Only the values that
# actually take effect are kept, so the call states the real configuration.
#
# NOTE(review): per the LightGBM docs, bagging_fraction is only applied when
# bagging_freq is non-zero, and none is set here, so row subsampling is
# presumably inactive. Confirm whether that is intended before changing it.
l1_clf3 = LGBMRegressor(
    boosting_type='gbdt',
    boost_from_average="false",
    # Tree shape / complexity.
    num_leaves=491,
    max_depth=20,
    min_child_weight=0.035,
    min_data_in_leaf=100,
    max_bin=255,
    # Column / row sampling.
    feature_fraction=0.38,
    bagging_fraction=0.42,
    bagging_seed=seed,
    # Regularisation.
    reg_alpha=0.4,
    reg_lambda=0.65,
    # Misc.
    importance_type='split',
    random_state=seed,
    verbosity=-1,
)

In [38]:
# Second-level (meta) learner: a shallow random forest. It is the only entry in
# the second level of the `models` layout used for stacking.
l2_clf1 = RandomForestRegressor(
    random_state=seed,
    n_estimators=250,
    max_depth=5,
    max_features='sqrt',
)

In [44]:
# StackNet layout: one inner list of estimators per stacking level.
models = [
    [l1_clf1, l1_clf3],  # first level: CatBoost + LightGBM (l1_clf2 is left out)
    [l2_clf1],           # second level: random-forest meta-learner
]

In [45]:
# Wrap the level layout in a StackNet regressor using 5 folds.
# NOTE(review): restacking / use_retraining semantics are defined by pystacknet;
# check its documentation before changing either flag.
model = StackNetRegressor(
    models,
    folds=5,
    use_retraining=True,
    restacking=False,
    n_jobs=-1,
    random_state=seed,
    verbose=1,
)

In [41]:
# Training inputs as plain NumPy arrays: the selected feature columns and the
# single target column.
X, y = train_df[features].values, train_df[target[0]].values

In [46]:
# Train the stacked model on the full training arrays. The run recorded below
# reports "fit() lasted 4275.712422 seconds", so expect this cell to take over an hour.
model.fit(X, y)

Input Dimensionality 137 at Level 0 
2 models included in Level 0 








Fold 1/5 , model 0 , rmse===0.463557 
Fold 1/5 , model 1 , rmse===0.445516 




Learning rate set to 0.032632
0:	learn: 9.5859863	total: 235ms	remaining: 19m 32s
200:	learn: 1.4949676	total: 46.6s	remaining: 18m 33s
400:	learn: 1.2028441	total: 1m 33s	remaining: 17m 55s
600:	learn: 1.0491437	total: 2m 20s	remaining: 17m 4s
800:	learn: 0.9454620	total: 3m 6s	remaining: 16m 18s
1000:	learn: 0.8647953	total: 3m 53s	remaining: 15m 32s
1200:	learn: 0.8042314	total: 4m 40s	remaining: 14m 45s
1400:	learn: 0.7573156	total: 5m 26s	remaining: 13m 58s
1600:	learn: 0.7172783	total: 6m 13s	remaining: 13m 12s
1800:	learn: 0.6839304	total: 6m 57s	remaining: 12m 20s
2000:	learn: 0.6547821	total: 7m 40s	remaining: 11m 30s
2200:	learn: 0.6303596	total: 8m 26s	remaining: 10m 43s
2400:	learn: 0.6086755	total: 9m 10s	remaining: 9m 55s
2600:	learn: 0.5877842	total: 9m 55s	remaining: 9m 9s
2800:	learn: 0.5701944	total: 10m 40s	remaining: 8m 23s
3000:	learn: 0.5540251	total: 11m 19s	remaining: 7m 32s
3200:	learn: 0.5389343	total: 12m 4s	remaining: 6m 47s
3400:	learn: 0.5257429	total: 12m



Fold 2/5 , model 0 , rmse===0.464801 
Fold 2/5 , model 1 , rmse===0.451094 




Learning rate set to 0.032632
0:	learn: 9.5916145	total: 249ms	remaining: 20m 44s
200:	learn: 1.4993934	total: 45.5s	remaining: 18m 6s
400:	learn: 1.2013081	total: 1m 30s	remaining: 17m 21s
600:	learn: 1.0461403	total: 2m 15s	remaining: 16m 34s
800:	learn: 0.9400857	total: 2m 58s	remaining: 15m 35s
1000:	learn: 0.8640202	total: 3m 33s	remaining: 14m 13s
1200:	learn: 0.8048973	total: 3m 59s	remaining: 12m 36s
1400:	learn: 0.7580563	total: 4m 12s	remaining: 10m 49s
1600:	learn: 0.7147601	total: 4m 53s	remaining: 10m 22s
1800:	learn: 0.6815048	total: 5m 33s	remaining: 9m 53s
2000:	learn: 0.6516807	total: 6m 15s	remaining: 9m 22s
2200:	learn: 0.6249126	total: 6m 55s	remaining: 8m 49s
2400:	learn: 0.6036967	total: 7m 39s	remaining: 8m 16s
2600:	learn: 0.5839660	total: 8m 25s	remaining: 7m 46s
2800:	learn: 0.5663332	total: 9m 12s	remaining: 7m 13s
3000:	learn: 0.5508176	total: 9m 59s	remaining: 6m 39s
3200:	learn: 0.5363540	total: 10m 46s	remaining: 6m 3s
3400:	learn: 0.5231164	total: 11m 33



Fold 3/5 , model 0 , rmse===0.464653 
Fold 3/5 , model 1 , rmse===0.447400 




Fold 4/5 , model 0 , rmse===0.467727 
Fold 4/5 , model 1 , rmse===0.451349 




Learning rate set to 0.032632
0:	learn: 9.5937987	total: 248ms	remaining: 20m 39s
200:	learn: 1.5059385	total: 47.1s	remaining: 18m 43s
400:	learn: 1.2090083	total: 1m 33s	remaining: 17m 54s
600:	learn: 1.0490938	total: 2m 19s	remaining: 17m 3s
800:	learn: 0.9411904	total: 3m 5s	remaining: 16m 12s
1000:	learn: 0.8653851	total: 3m 51s	remaining: 15m 26s
1200:	learn: 0.8046049	total: 4m 38s	remaining: 14m 41s
1400:	learn: 0.7559864	total: 5m 25s	remaining: 13m 55s
1600:	learn: 0.7164589	total: 6m 11s	remaining: 13m 7s
1800:	learn: 0.6817186	total: 6m 57s	remaining: 12m 21s
2000:	learn: 0.6533045	total: 7m 44s	remaining: 11m 36s
2200:	learn: 0.6294036	total: 8m 31s	remaining: 10m 51s
2400:	learn: 0.6066536	total: 9m 19s	remaining: 10m 5s
2600:	learn: 0.5869737	total: 10m 6s	remaining: 9m 19s
2800:	learn: 0.5693628	total: 10m 53s	remaining: 8m 32s
3000:	learn: 0.5534999	total: 11m 25s	remaining: 7m 36s
3200:	learn: 0.5385784	total: 11m 36s	remaining: 6m 31s
3400:	learn: 0.5246580	total: 11



Learning rate set to 0.032632
0:	learn: 9.6013171	total: 106ms	remaining: 8m 51s
200:	learn: 1.4947982	total: 11.1s	remaining: 4m 24s
400:	learn: 1.2078738	total: 21.2s	remaining: 4m 2s
600:	learn: 1.0492974	total: 32.5s	remaining: 3m 57s
800:	learn: 0.9410084	total: 43.4s	remaining: 3m 47s
1000:	learn: 0.8632162	total: 53.8s	remaining: 3m 35s
1200:	learn: 0.8052214	total: 1m 5s	remaining: 3m 26s
1400:	learn: 0.7543185	total: 1m 15s	remaining: 3m 14s
1600:	learn: 0.7152279	total: 1m 26s	remaining: 3m 3s
1800:	learn: 0.6812265	total: 1m 37s	remaining: 2m 53s
2000:	learn: 0.6539635	total: 1m 47s	remaining: 2m 41s
2200:	learn: 0.6280288	total: 1m 59s	remaining: 2m 31s
2400:	learn: 0.6066674	total: 2m 10s	remaining: 2m 21s
2600:	learn: 0.5871831	total: 2m 21s	remaining: 2m 10s
2800:	learn: 0.5698450	total: 2m 32s	remaining: 1m 59s
3000:	learn: 0.5541396	total: 2m 43s	remaining: 1m 49s
3200:	learn: 0.5396177	total: 2m 50s	remaining: 1m 35s
3400:	learn: 0.5259553	total: 2m 52s	remaining: 1m 



Output dimensionality of level 0 is 2 
 level 0 lasted 3977.405770 seconds 
Input Dimensionality 2 at Level 1 
1 models included in Level 1 




Learning rate set to 0.032632
0:	learn: 9.5935330	total: 61.4ms	remaining: 5m 6s
200:	learn: 1.4979156	total: 10.5s	remaining: 4m 10s
400:	learn: 1.2058552	total: 21.8s	remaining: 4m 10s
600:	learn: 1.0516304	total: 32.6s	remaining: 3m 58s
800:	learn: 0.9376608	total: 42.8s	remaining: 3m 44s
1000:	learn: 0.8609830	total: 53.7s	remaining: 3m 34s
1200:	learn: 0.7989886	total: 1m 4s	remaining: 3m 24s
1400:	learn: 0.7508669	total: 1m 15s	remaining: 3m 13s
1600:	learn: 0.7113878	total: 1m 26s	remaining: 3m 2s
1800:	learn: 0.6770394	total: 1m 37s	remaining: 2m 52s
2000:	learn: 0.6498737	total: 1m 48s	remaining: 2m 41s
2200:	learn: 0.6251161	total: 1m 59s	remaining: 2m 31s
2400:	learn: 0.6036929	total: 2m 9s	remaining: 2m 20s
2600:	learn: 0.5848077	total: 2m 20s	remaining: 2m 9s
2800:	learn: 0.5666329	total: 2m 31s	remaining: 1m 58s
3000:	learn: 0.5503166	total: 2m 42s	remaining: 1m 47s
3200:	learn: 0.5357229	total: 2m 52s	remaining: 1m 36s
3400:	learn: 0.5219047	total: 3m 2s	remaining: 1m 25



Fold 3/5 , model 0 , rmse===0.503848 




Fold 4/5 , model 0 , rmse===0.507386 




Fold 5/5 , model 0 , rmse===0.502646 




Output dimensionality of level 1 is 1 
 level 1 lasted 298.304870 seconds 
 fit() lasted 4275.712422 seconds 
Learning rate set to 0.0336
0:	learn: 9.5849130	total: 42.7ms	remaining: 3m 33s
200:	learn: 1.4852771	total: 11.8s	remaining: 4m 41s
400:	learn: 1.1961458	total: 22.5s	remaining: 4m 17s
600:	learn: 1.0326194	total: 32.5s	remaining: 3m 57s
800:	learn: 0.9278262	total: 43.2s	remaining: 3m 46s
1000:	learn: 0.8518391	total: 54.1s	remaining: 3m 35s
1200:	learn: 0.7921581	total: 1m 5s	remaining: 3m 26s
1400:	learn: 0.7461349	total: 1m 16s	remaining: 3m 15s
1600:	learn: 0.7073456	total: 1m 27s	remaining: 3m 6s
1800:	learn: 0.6725886	total: 1m 39s	remaining: 2m 55s
2000:	learn: 0.6453437	total: 1m 50s	remaining: 2m 45s
2200:	learn: 0.6204015	total: 2m 1s	remaining: 2m 34s
2400:	learn: 0.5988708	total: 2m 12s	remaining: 2m 23s
2600:	learn: 0.5792382	total: 2m 22s	remaining: 2m 11s
2800:	learn: 0.5609923	total: 2m 34s	remaining: 2m 1s
3000:	learn: 0.5439915	total: 2m 45s	remaining: 1m 49

In [47]:
# Predict on the test feature matrix with the fitted stack. The next cell reads
# the result as s[:, 0], i.e. it is treated as 2-D with the prediction in column 0.
s = model.predict(test_df[features].values)

1 estimators included in Level 0 




1 estimators included in Level 1 




In [48]:
# Write the stacked predictions into the sample-submission template, then report
# how close they are (cosine similarity) to a previously saved best submission.

# model.predict returned a 2-D array; the submitted prediction is its first column.
# The same 1-D vector is reused for the similarity so that the metric describes
# exactly what was written to disk and comes out as a scalar, not a 1-element array.
pred = s[:, 0]

res_df = pd.read_csv('./sample_solution.csv')
res_df[target[0]] = pred
res_df.to_csv('submission_stacking.csv', index=False)

best_df = pd.read_csv('best_sub.csv')
y_best = best_df[target[0]].values

# Use the np.-qualified functions instead of `from numpy.linalg import norm`:
# that import would rebind the `norm` already imported from scipy.stats at the
# top of the notebook.
cos_sim = np.dot(y_best, pred) / (np.linalg.norm(y_best) * np.linalg.norm(pred))
print("cos_sim with best submission:", cos_sim)

cos_sim with best submission: [0.99907485]


In [None]:
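# NOTE: disabled experiment (manual 5-fold loop around StackNet), kept for reference only.
# It would not run as written: `models = []` inside train_stack_model shadows the
# level layout, and the function returns `cat_models`, which is never defined in this cell.
# X = train_df[features].values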
# y = train_df[target[0]].values

# skf = KFold(n_splits=5, random_state=2048, shuffle=True)
# skf.get_n_splits(X, y)

# train_index_list = []
# test_index_list = []
# train_x_list = []
# val_x_list = []
# train_y_list = []
# val_y_list = []

# for i, (train_index, test_index) in tqdm(enumerate(skf.split(X, y))):
#     train_index_list.append(train_index)
#     test_index_list.append(test_index)

#     train_x_list.append(X[train_index, :])
#     val_x_list.append(X[test_index, :])
#     train_y_list.append(y[train_index])
#     val_y_list.append(y[test_index]) 
    
   
# def train_stack_model(x, y):
#     print(x.shape, y.shape)
    
#     models = []
#     for i in tqdm(range(5)):
#         train_x, val_x = train_x_list[i], val_x_list[i]
#         train_y, val_y = train_y_list[i], val_y_list[i]
        
#         model = StackNetRegressor(models, 
#                            restacking=False,
#                            use_retraining=True,
#                            random_state=seed,
#                            n_jobs=-1, 
#                            verbose=1)
#         model.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=200, early_stopping_rounds=100)
#         models.append(model)
    
#     return cat_models
# models = train_stack_model(X, y)

# preds = []
# for model in tqdm(models):
#     preds.append(model.predict(test_df[features].values))
# s = 0
# for pred in preds:
#     s += pred
# s = s/5
# res_df = pd.read_csv('./sample_solution.csv')
# res_df['contest-tmp2m-14d__tmp2m'] = s
# res_df.to_csv('submission_stacking.csv', index=False)


# from numpy import dot
# from numpy.linalg import norm

# cos_sim = dot(y_best, s)/(norm(y_best)*norm(s))
# print("cos_sim with best submission:", cos_sim)