In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as f
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn import preprocessing
from tqdm import tqdm
from sklearn.preprocessing import FunctionTransformer
import seaborn as sns 
from scipy import stats
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
import h2o
from h2o.automl import H2OAutoML
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.neural_network import MLPRegressor
from scipy.stats import norm
import copy
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool, metrics, cv
import xgboost as xgb
from scipy.stats import gmean

In [5]:
# Read the training and test tables from the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./n_train.csv', './n_test.csv')
)

In [18]:
# Named column groups; every training column that belongs to none of them is
# treated as a "main" feature.
exclude_cols = ['index', 'startdate']
temporal_attrs = [
    'year', 'quarter', 'month', 'week', 'dayofyear', 'season',
    'day_of_year_sin', 'day_of_year_cos',
    'week_sin', 'week_cos',
    'month_sin', 'month_cos',
    'season_sin', 'season_cos',
    'quarter_sin', 'quarter_cos',
]
loc_attrs = ['lat', 'lon', 'loc_group']
embedding_attrs = ['climateregions__climateregion']
target = ["contest-tmp2m-14d__tmp2m"]

# One set lookup per column instead of five separate list scans.
non_main_cols = set(exclude_cols + temporal_attrs + loc_attrs + target + embedding_attrs)
main_attrs = [col for col in train_df.columns if col not in non_main_cols]
print("Main features:", len(main_attrs))

Main features: 240


In [20]:
# Greedy correlation filter: a column is dropped when its correlation with ANY
# earlier column (in the order main -> temporal -> location -> embedding) is
# at least 0.85.
# NOTE(review): the comparison is signed, so strongly negative correlations
# are kept -- confirm whether abs() was intended.
train_independent_corr = train_df[main_attrs + temporal_attrs + loc_attrs + embedding_attrs].corr()

# Strict upper triangle (k=1) keeps only pairs (i, j) with i < j, which is
# exactly what the nested `for i ... for j in range(i + 1, ...)` scan visited.
# NaN correlations compare False and therefore never drop a column.
highly_correlated = np.triu(train_independent_corr.values >= 0.85, k=1)

# Boolean keep-mask, one entry per column of the correlation matrix.
train_seleted_corr_columns = ~highly_correlated.any(axis=0)

# Index the correlation matrix's own columns so the mask always lines up with
# them, even if corr() left out a column (some pandas versions silently skip
# non-numeric columns).
train_selected_columns = train_independent_corr.columns[train_seleted_corr_columns]

In [21]:
# Columns removed by hand on top of the correlation filter.
out_cols = [
    'icec-2010-1', 'icec-2010-2', 'icec-2010-3', 'icec-2010-4', 'icec-2010-5',
    'icec-2010-6', 'icec-2010-7', 'icec-2010-8', 'icec-2010-9', 'icec-2010-10',
    'year', 'quarter', 'month_cos', 'quarter_sin', 'quarter_cos',
]

# Preserve the order of the surviving columns; the set is only for fast lookup.
manually_dropped = set(out_cols)
features = [col for col in train_selected_columns if col not in manually_dropped]
print(features)

['contest-pevpr-sfc-gauss-14d__pevpr', 'nmme0-tmp2m-34w__cancm30', 'contest-wind-h10-14d__wind-hgt-10', 'contest-rhum-sig995-14d__rhum', 'nmme-prate-34w__cancm3', 'nmme-prate-34w__ccsm3', 'nmme-prate-34w__ccsm4', 'nmme-prate-34w__cfsv2', 'nmme-prate-34w__gfdl', 'nmme-prate-34w__gfdlflora', 'nmme-prate-34w__nasa', 'nmme0-prate-56w__cancm30', 'nmme0-prate-56w__cancm40', 'nmme0-prate-56w__ccsm30', 'nmme0-prate-56w__ccsm40', 'nmme0-prate-56w__cfsv20', 'nmme0-prate-56w__gfdlflora0', 'nmme0-prate-56w__gfdl0', 'nmme0-prate-56w__nasa0', 'nmme0-prate-34w__cancm40', 'contest-slp-14d__slp', 'contest-wind-vwnd-925-14d__wind-vwnd-925', 'contest-pres-sfc-gauss-14d__pres', 'contest-wind-uwnd-250-14d__wind-uwnd-250', 'contest-prwtr-eatm-14d__prwtr', 'contest-wind-vwnd-250-14d__wind-vwnd-250', 'contest-precip-14d__precip', 'contest-wind-h850-14d__wind-hgt-850', 'contest-wind-uwnd-925-14d__wind-uwnd-925', 'elevation__elevation', 'wind-vwnd-250-2010-1', 'wind-vwnd-250-2010-2', 'wind-vwnd-250-2010-3', 'wi

In [35]:
class StackingCVRegressorRetrained(BaseEstimator, RegressorMixin, TransformerMixin):
    """Two-level stacking regressor whose base models are retrained on all data.

    ``fit`` builds out-of-fold predictions for every base regressor with
    K-fold cross-validation, trains the meta-regressor on those predictions
    (optionally concatenated with the original features), and finally refits
    each base regressor on the complete training set so that ``predict`` uses
    models that have seen all rows.

    Parameters
    ----------
    regressors : sequence of estimators
        Base regressors. They are cloned in ``fit``; the passed objects are
        never fitted.
    meta_regressor : estimator
        Second-level regressor, also cloned in ``fit``.
    n_folds : int, default=2
        Number of K-fold splits used for the out-of-fold predictions.
    use_features_in_secondary : bool, default=False
        If True the meta-regressor sees ``[X, base predictions]``; otherwise
        only the base predictions.

    Attributes
    ----------
    regr_ : list of estimators
        Base regressors fitted on the full training set (set by ``fit``).
    meta_regr_ : estimator
        Fitted meta-regressor (set by ``fit``).
    """

    def __init__(self, regressors, meta_regressor, n_folds=2, use_features_in_secondary=False):
        # Only store the hyper-parameters here. Fitted state (trailing
        # underscore) is created in fit(), so an unfitted instance does not
        # look fitted and set_params() changes are honoured.
        self.regressors = regressors
        self.meta_regressor = meta_regressor
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary

    @staticmethod
    def _as_array(data):
        """Return a private NumPy copy of a DataFrame or any array-like."""
        if isinstance(data, pd.DataFrame):
            data = data.values
        # np.array copies by default, so estimators cannot alias caller data.
        return np.array(data)

    def fit(self, train, y):
        """Fit the base regressors and the meta-regressor.

        Parameters
        ----------
        train : pandas.DataFrame or array-like, 2-D
            Training features, one row per sample.
        y : array-like
            Training targets, one per row of ``train``.

        Returns
        -------
        self
        """
        X = self._as_array(train)
        # Positional indexing below must not depend on a pandas index.
        y = np.asarray(y)

        self.regr_ = [clone(regr) for regr in self.regressors]
        self.meta_regr_ = clone(self.meta_regressor)

        kfold = KFold(n_splits=self.n_folds, shuffle=True)

        # Out-of-fold predictions: every row is predicted by a model that did
        # not see it during training. Column i belongs to base regressor i.
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.regr_)))
        for i, regr in enumerate(self.regr_):
            for train_idx, holdout_idx in kfold.split(X, y):
                instance = clone(regr)
                instance.fit(X[train_idx], y[train_idx])
                out_of_fold_predictions[holdout_idx, i] = instance.predict(X[holdout_idx])

        # Retrain the base models on all data; these are the ones predict() uses.
        for regr in self.regr_:
            regr.fit(X, y)

        # Train the meta-model on the out-of-fold predictions.
        if self.use_features_in_secondary:
            self.meta_regr_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_regr_.fit(out_of_fold_predictions, y)

        return self

    def predict(self, test):
        """Predict targets for ``test`` with the fitted stack.

        Parameters
        ----------
        test : pandas.DataFrame or array-like, 2-D
            Features with the same columns, in the same order, as in ``fit``.

        Returns
        -------
        Whatever the meta-regressor's ``predict`` returns for these rows.
        """
        X = self._as_array(test)

        # One column of base-model predictions per base regressor.
        meta_features = np.column_stack([
            regr.predict(X) for regr in self.regr_
        ])

        if self.use_features_in_secondary:
            return self.meta_regr_.predict(np.hstack((X, meta_features)))
        return self.meta_regr_.predict(meta_features)

In [24]:
# Restrict both tables to the selected feature columns and pull the target
# out of the training table as a plain array.
train, test = (frame.loc[:, features] for frame in (train_df, test_df))
y_train = train_df.loc[:, target[0]].values

In [40]:
# Candidate models. None of them is fitted here; the cells below clone them
# for cross-validation and stacking.

# Robust scaling -> Lasso-based feature selection -> ElasticNet.
en = make_pipeline(RobustScaler(), SelectFromModel(Lasso(alpha=0.03)), ElasticNet(alpha=0.001, l1_ratio=0.1))

cat = CatBoostRegressor(iterations=4900, verbose=200)

adb = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=2017)

bag = BaggingRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=2017, verbose=200)

rf = RandomForestRegressor(n_estimators=250, n_jobs=4, min_samples_split=25, min_samples_leaf=25, verbose=200)

et = ExtraTreesRegressor(n_estimators=250, n_jobs=4, min_samples_split=25, min_samples_leaf=25, verbose=200)

gbr = GradientBoostingRegressor(loss='huber', learning_rate=0.1, verbose=200, n_estimators=350, min_samples_split=25, min_samples_leaf=25)

# 'reg:linear' is the deprecated alias of 'reg:squarederror' (same squared
# loss). `verbose` is not an XGBRegressor constructor parameter, so it has
# been dropped rather than forwarded to the booster as an unknown setting.
xgbm = xgb.sklearn.XGBRegressor(max_depth=6, learning_rate=0.005, subsample=0.6,
                                objective='reg:squarederror', n_estimators=1000)

# NOTE: this rebinds the name `nn`, which the import cell bound to torch.nn.
# The name is kept because later cells refer to this model as `nn`.
nn = MLPRegressor(hidden_layer_sizes=(200, 400, 50), random_state =2017, early_stopping=True, verbose=200)

svm = SVR(kernel='poly', degree=3, gamma='auto', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=True, max_iter=-1)

In [None]:
# 5-fold cross-validation of the CatBoost model, scored with negated MSE.
# Score noted by the author next to this call: 0.673187 (0.066807).
results = cross_val_score(
    cat,
    train.values,
    y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)
cat_mean, cat_std = results.mean(), results.std()
print("cat boost regressor score: {:4f} ({:4f})".format(cat_mean, cat_std))

In [None]:
# 5-fold cross-validation (negated MSE) of each stand-alone model, in the same
# order as before. The trailing comments are the scores the author had noted
# next to each call.
single_model_runs = [
    ("gradient boosting regressor score: {:4f} ({:4f})", gbr),  # 0.673187 (0.066807)
    ("adboost regressor score: {:4f} ({:4f})", adb),            # -0.373219 (0.343094)
    ("bagging regressor score: {:4f} ({:4f})", bag),            # 0.567878 (0.058145)
    ("neural network score: {:4f} ({:4f})", nn),                # -17.162579 (34.638616)
    ("RandomForest score: {:4f} ({:4f})", rf),                  # 0.518(0.057)
    ("ExtraTrees score: {:4f} ({:4f})", et),                    # 0.617(0.05)
    ("XGBoost score: {:4f} ({:4f})", xgbm),                     # 0.658(0.065)
    ("SVM score: {:4f} ({:4f})", svm),
]

for report_template, estimator in single_model_runs:
    results = cross_val_score(estimator, train.values, y_train, cv=5, scoring='neg_mean_squared_error')
    print(report_template.format(results.mean(), results.std()))


# Stacking candidates, cross-validated in the same order as before.
stack_with_feats = StackingCVRegressorRetrained((nn, rf, et), cat, use_features_in_secondary=True)

# The final candidate keeps the name `stack_with_feats_2` because the
# fit/predict step further down uses it to build the submission.
stack_with_feats_2 = StackingCVRegressorRetrained([xgbm], cat, use_features_in_secondary=True)

# A (bag, cat, rf, et) -> gbr stack used to be constructed here and was
# overwritten before it was ever used; that dead assignment has been removed.
# NOTE: the "(with primary feats) 2" label is shared by several different
# configurations, including the one with use_features_in_secondary=False.
# The printed text is left unchanged.
# Scores noted by the author: first stack "en: 0.674925(0.06)" and
# "nn score: 0.674353 (0.059497)"; the others 0.675728 (0.066580).
stacking_runs = [
    ("Stacking (with primary feats) score: {:4f} ({:4f})",
     stack_with_feats),
    ("Stacking (with primary feats) 2 score: {:4f} ({:4f})",
     StackingCVRegressorRetrained((bag, gbr, rf, et), cat, use_features_in_secondary=True)),
    ("Stacking (with primary feats) 2 score: {:4f} ({:4f})",
     StackingCVRegressorRetrained((gbr, et), cat, use_features_in_secondary=True)),
    ("Stacking (with primary feats) 2 score: {:4f} ({:4f})",
     StackingCVRegressorRetrained([xgbm, gbr], cat, use_features_in_secondary=False)),
    ("Stacking (with primary feats) 2 score: {:4f} ({:4f})",
     stack_with_feats_2),
]

for report_template, stacked_model in stacking_runs:
    results = cross_val_score(stacked_model, train.values, y_train, cv=5, scoring='neg_mean_squared_error')
    print(report_template.format(results.mean(), results.std()))

# Fit the chosen stack on the full training set and predict the test set.
stack_with_feats_2.fit(train, y_train)
y_pred = stack_with_feats_2.predict(test)

# Write the predictions into the sample-submission layout.
df = pd.read_csv('./sample_solution.csv')
df[target[0]] = y_pred
df.to_csv('submission_corr_important.csv', index=False)

# Compare the new predictions with the best previous submission.
best_df = pd.read_csv('best_sub.csv')
y_best = best_df[target[0]].values

# The original compared against an undefined name `s`; the freshly computed
# predictions are the only candidate in this notebook, so they are used here.
# np.dot / np.linalg.norm replace the mid-cell `from numpy... import` lines,
# which rebound `norm` away from scipy.stats.norm imported at the top.
cos_sim = np.dot(y_best, y_pred) / (np.linalg.norm(y_best) * np.linalg.norm(y_pred))
print("cos_sim with best submission:", cos_sim)

# A trailing `stack_res = stack()` call was removed: `stack` is not defined
# anywhere in the visible notebook and `stack_res` was never read.

      Iter       Train Loss   Remaining Time 
         1          37.5992           93.62m
         2          31.7201           75.62m
         3          26.9288           68.04m
         4          22.9560           63.91m
         5          19.7134           61.59m
         6          17.0372           59.59m
         7          14.8279           58.45m
         8          12.9184           57.57m
         9          11.3425           57.03m
        10          10.0332           56.40m
        11           8.9431           55.86m
        12           8.0546           55.40m
        13           7.2884           54.81m
        14           6.6528           54.20m
        15           6.1081           53.70m
        16           5.6096           53.27m
        17           5.1953           52.93m
        18           4.8493           52.59m
        19           4.5339           52.28m
        20           4.2724           51.93m
        21           4.0299           51.64m
        2

       183           1.0121           30.41m
       184           1.0101           30.20m
       185           1.0080           30.69m
       186           1.0034           30.47m
       187           1.0004           30.25m
       188           0.9962           30.02m
       189           0.9936           29.82m
       190           0.9892           29.66m
       191           0.9869           29.51m
       192           0.9851           29.35m
       193           0.9828           29.20m
       194           0.9798           29.04m
       195           0.9782           28.88m
       196           0.9762           28.73m
       197           0.9726           28.57m
       198           0.9666           28.40m
       199           0.9643           28.24m
       200           0.9616           28.07m
       201           0.9584           27.90m
       202           0.9559           27.73m
       203           0.9543           27.56m
       204           0.9520           27.40m
       205

        15           6.4032           73.99m
        16           5.8676           73.64m
        17           5.4151           73.42m
        18           5.0270           73.08m
        19           4.7061           72.89m
        20           4.4312           72.67m
        21           4.1815           72.35m
        22           3.9517           72.04m
        23           3.7672           71.76m
        24           3.5909           71.52m
        25           3.4448           71.26m
        26           3.3024           71.04m
        27           3.1811           70.89m
        28           3.0675           71.50m
        29           2.9729           71.46m
        30           2.8811           71.46m
        31           2.8010           71.58m
        32           2.7309           71.53m
        33           2.6620           71.29m
        34           2.6026           71.07m
        35           2.5440           70.84m
        36           2.4783           70.57m
        37

       198           0.9706           27.63m
       199           0.9687           27.42m
       200           0.9658           27.21m
       201           0.9637           27.01m
       202           0.9608           26.80m
       203           0.9584           26.59m
       204           0.9569           26.38m
       205           0.9537           26.18m
       206           0.9512           25.98m
       207           0.9481           25.77m
       208           0.9433           25.58m
       209           0.9403           25.38m
       210           0.9377           25.19m
       211           0.9364           25.03m
       212           0.9330           24.86m
       213           0.9313           24.66m
       214           0.9293           24.46m
       215           0.9267           24.25m
       216           0.9248           24.05m
       217           0.9221           23.85m
       218           0.9193           23.65m
       219           0.9175           23.46m
       220

        30           2.9727           63.29m
        31           2.8817           62.59m
        32           2.8030           61.94m
        33           2.7126           61.26m
        34           2.6332           60.68m
        35           2.5632           60.41m
        36           2.5045           60.42m
        37           2.4535           59.94m
        38           2.4018           59.41m
        39           2.3465           58.90m
        40           2.2914           58.37m
        41           2.2526           57.91m
        42           2.2089           57.43m
        43           2.1750           57.01m
        44           2.1420           56.56m
        45           2.1144           56.17m
        46           2.0857           55.76m
        47           2.0566           55.32m
        48           2.0292           54.94m
        49           2.0002           54.53m
        50           1.9770           54.16m
        51           1.9449           53.78m
        52