In [45]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb
import catboost as catb

from sklearn.model_selection import KFold, StratifiedKFold

import warnings
warnings.filterwarnings("ignore")

In [2]:
train_df = pd.DataFrame()
test_df = pd.DataFrame()

for i in range(20):
    train_df[f"F-{i+1}"] = np.random.randn(5000)
    test_df[f"F-{i+1}"] = np.random.randn(1000)
for i in range(3):
    train_df[f"C-{i+1}"] = np.random.randint(5, size=5000)
    test_df[f"C-{i+1}"] = np.random.randint(5, size=1000)
    
train_df[f'target'] = train_df[[f"F-{i+1}" for i in range(20)]].mean(1)+0.1*np.random.randn(5000)
test_df[f'target'] = test_df[[f"F-{i+1}" for i in range(20)]].mean(1)+0.1*np.random.randn(1000)
    
k = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
train_df["Fold"]=0
for i, (trn, val) in  enumerate( k.split(train_df, (2000*train_df["target"]).astype(int)) ) :
    train_df.loc[val, "Fold"] = i

In [3]:
class LightGBM_wrapper:
    def __init__(self, features, target, cat_cols="auto"):
        self.param = {
            'boosting_type': 'gbdt',
            'metric': 'rmse',
            "objective":"rmse",
            
            
            #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
            #'metric': 'rmse',  'metric': 'tweedie',
            #'objective': 'tweedie',
            #'tweedie_variance_power': trial.suggest_uniform('tweedie_variance_power', 1.01, 1.8),

            'n_estimators': 1400,
            'boost_from_average': False,'verbose': -1,'random_state':2020,
            
            
            'num_leaves': 96,
            'max_depth': 10,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.95,
            'bagging_freq': 5
        }
        
        self.features = features
        self.target = target
        self.cat_cols = cat_cols 
        
    def set_param(self, param):
        self.param = param
    
    def train_predict(self, train_df, test_df):
        """
        Fold のカラム名は　基本Fold
        """        
        models=[]
        test_df[f'{self.target}_lgb_predict']=0
        Folds = set(train_df["Fold"])
        valid = []
        
        for fold in Folds:
            train_data = lgb.Dataset(train_df.loc[train_df["Fold"]!=fold, self.features], train_df.loc[train_df["Fold"]!=fold, self.target])
            valid_df = train_df[train_df["Fold"]==fold]
            valid_data = lgb.Dataset(valid_df[self.features], valid_df[self.target])
            model = lgb.train(self.param,  train_data,  num_boost_round=3000,  valid_sets=[train_data, valid_data], 
                              verbose_eval=500, early_stopping_rounds=100, categorical_feature=self.cat_cols)
            
            
            valid_df[f'{self.target}_lgb_predict'] = model.predict(valid_df[self.features])
            test_df[f'{self.target}_lgb_predict'] += model.predict(test_df[self.features])/len(Folds)
            valid.append(valid_df)
            models.append(model)
            
        train_df = pd.concat(valid, axis=0)
        self.models = models
        return train_df, test_df

In [42]:
class XGBoost_wrapper:
    def __init__(self, features, target):
        self.param = {
            'objective': 'reg:squarederror',
          #  'eval_metric': 'rmse',
           # 'num_class':4,
            
              'n_estimators':500,
            'colsample_bytree': 0.8,                 
            'learning_rate': 0.08,
            'max_depth': 10,
            'subsample': 1,
            'min_child_weight':3,
            'gamma':0.25,
          
        }
        
        self.features = features
        self.target = target
        
    def set_param(self, param):
        self.param = param
    
    def train_predict(self, train_df, test_df):
        models = []
        """
        Fold のカラム名は　基本Fold
        """        
        train_df = train_df.fillna(-999)
        test_df = test_df.fillna(-999)
        
        test_df[f'{self.target}_xgb_predict']=0
        test_data = xgb.DMatrix(test_df[self.features], missing=-999)
        Folds = set(train_df["Fold"])
        valid = []
        
        for fold in Folds:
            tmp_df = train_df[train_df["Fold"]!=fold]
            
            train_data = xgb.DMatrix(tmp_df[self.features], label=tmp_df[ self.target], missing=-999)
            valid_df = train_df[train_df["Fold"]==fold]
            valid_data = xgb.DMatrix(valid_df[self.features], label=valid_df[self.target], missing=-999)
            
            model = xgb.train(self.param, train_data, num_boost_round=1000, evals=[(train_data, 'train'), (valid_data, 'val')], 
                      verbose_eval=100, early_stopping_rounds=50)
            
            valid_df[f'{self.target}_xgb_predict'] = model.predict(valid_data)
            test_df[f'{self.target}_xgb_predict'] += model.predict(test_data)/len(Folds)
            
            valid.append(valid_df)
            models.append(model)
            
        train_df = pd.concat(valid, axis=0)
        self.models=models
        return train_df, test_df

In [4]:
class CatBoost_wrapper:
    def __init__(self, features, target, cat_cols=None):
        self.param = {
            'boosting_type': 'gbdt',
            'metric': 'rmse',
            "objective":"rmse",
            
            
            #'objective': 'multiclass','metric': 'multiclass', 'num_class':4,
            #'metric': 'rmse',  'metric': 'tweedie',
            #'objective': 'tweedie',
            #'tweedie_variance_power': trial.suggest_uniform('tweedie_variance_power', 1.01, 1.8),

            'n_estimators': 1400,
            'boost_from_average': False,'verbose': -1,'random_state':2020,
            
            
            'num_leaves': 96,
            'max_depth': 10,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.95,
            'bagging_freq': 5
        }
        
        self.features = features
        self.target = target
        self.cat_cols = cat_cols 
        
    def set_param(self, param):
        self.param = param
    
    def train_predict(self, train_df, test_df):
        """
        Fold のカラム名は　基本Fold
        """        
        models=[]
        test_df[f'{self.target}_catbst_predict']=0
        Folds = set(train_df["Fold"])
        valid = []
        
        for fold in Folds:
            
            train_data = lgb.Dataset(train_df.loc[train_df["Fold"]!=fold, self.features], train_df.loc[train_df["Fold"]!=fold, self.target])
            valid_df = train_df[train_df["Fold"]==fold]
            valid_data = lgb.Dataset(valid_df[self.features], valid_df[self.target])
            
            model = lgb.train(self.param,  train_data,  num_boost_round=3000,  valid_sets=[train_data, valid_data], 
                              verbose_eval=500, early_stopping_rounds=100)
            
            
            valid_df[f'{self.target}_catbst_predict'] = model.predict(valid_df[self.features])
            test_df[f'{self.target}_catbst_predict'] += model.predict(test_df[self.features])/len(Folds)
                
            valid.append(valid_df)
            models.append(model)
            
        train_df = pd.concat(valid, axis=0)
        self.models = models
        return train_df, test_df

In [16]:
features = [f"F-{i+1}" for i in range(20)]
target = "target"
cat_cols = [f"C-{i+1}" for i in range(3)]

features+=cat_cols

In [43]:
lightgbm_model = LightGBM_wrapper(features, target, cat_cols)
train_df, test_df = lightgbm_model.train_predict(train_df, test_df)

Training until validation scores don't improve for 100 rounds


KeyboardInterrupt: 

In [46]:
xgb_model = XGBoost_wrapper(features, target)
train_df, test_df = xgb_model.train_predict(train_df, test_df)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:0.51942	val-rmse:0.51336
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[100]	train-rmse:0.08369	val-rmse:0.14524
Stopping. Best iteration:
[106]	train-rmse:0.08352	val-rmse:0.14515

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:0.51832	val-rmse:0.51758
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will tr

In [47]:
train_df

Unnamed: 0,F-1,F-2,F-3,F-4,F-5,F-6,F-7,F-8,F-9,F-10,...,F-18,F-19,F-20,C-1,C-2,C-3,target,Fold,target_lgb_predict,target_xgb_predict
1,-0.904807,-2.329425,-0.777753,-0.466021,-0.817599,-1.620667,-0.522693,0.436611,1.975566,-0.890367,...,-0.675441,0.004294,-1.375143,3,3,4,-0.409058,0,-0.281957,-0.258552
2,1.273125,-1.012009,-0.271562,0.471718,0.871602,0.452382,0.178096,0.512615,1.951222,0.246685,...,0.740974,1.232847,-0.325171,0,1,4,0.105263,0,0.126393,0.094460
5,-0.149552,0.701165,-1.301870,-0.956208,-0.121461,0.230688,-0.690058,-0.209999,-1.028294,-0.426856,...,-0.382658,0.741096,0.672676,1,2,3,-0.280788,0,-0.296439,-0.250667
13,-0.581027,0.604209,-0.327847,-0.250518,1.361976,0.071762,-0.297529,-0.412899,0.770452,-0.280054,...,-0.789319,-1.882323,-1.215639,3,4,3,-0.024233,0,-0.133146,-0.138577
29,-0.146767,-0.635896,-0.995952,1.713335,-0.102747,-0.539946,1.761099,-0.369458,0.359207,-0.858735,...,-1.671627,0.564417,0.507709,4,1,0,0.042914,0,-0.078203,-0.015974
34,-0.254326,0.436764,1.043271,0.778564,0.233856,0.090584,1.320751,-0.942540,-0.760667,0.495681,...,0.874349,0.320837,0.475689,4,4,3,0.152363,0,0.198514,0.139724
35,0.031767,1.929047,-0.070900,1.722443,0.994972,-0.165906,0.445832,-0.402373,-0.222685,-1.805660,...,0.417783,0.080170,0.690078,4,3,1,0.363901,0,0.283817,0.166887
40,1.770880,1.497164,1.362226,0.715655,-0.058987,-2.523901,2.130093,-0.551186,-0.904781,1.308723,...,-0.703488,-0.889065,-0.835628,4,4,0,0.057680,0,-0.049500,-0.013465
44,-1.067266,2.414701,0.140295,2.249959,1.070339,0.709220,-0.951209,0.358168,0.640999,-0.418128,...,-1.141297,0.190284,-0.151431,1,0,1,0.163109,0,0.339085,0.261976
52,0.123896,0.628263,0.542392,-0.176155,-0.371858,-0.249939,-0.228117,-2.182274,1.859571,-0.543709,...,0.031244,0.236507,2.292551,2,4,0,0.027844,0,0.028753,0.024904


In [72]:
train_df.head()

Unnamed: 0,F-1,F-2,F-3,F-4,F-5,F-6,F-7,F-8,F-9,F-10,...,F-17,F-18,F-19,F-20,C-1,C-2,C-3,target,Fold,target_lgb_predict
1,0.225132,-2.329318,1.05898,-2.123879,-1.018657,0.404889,-0.983345,1.334326,0.898735,-0.284811,...,-2.409123,0.407514,-0.58725,0.29622,2,2,0,-0.228797,0,-0.187161
2,-0.239848,-0.971187,-1.248627,1.293189,-0.498288,0.04772,0.275242,0.47223,0.107576,-0.536529,...,-2.215553,-0.946058,0.650546,-1.839742,2,0,0,-0.095174,0,-0.167507
6,-0.13227,-0.472223,-0.618476,1.114498,1.079823,-0.152007,2.159515,0.272505,-0.578421,0.157657,...,0.295759,0.621841,1.015149,-0.290344,1,4,4,-0.039823,0,-0.12615
8,-0.496385,0.196068,1.772326,-1.383614,1.471695,-0.837056,-1.178166,-1.791408,-0.138647,0.869542,...,-0.018994,-0.927803,0.231405,-1.292182,1,3,0,0.050056,0,-0.036466
11,-0.304878,2.006305,0.620965,0.783064,0.440139,0.848947,0.224513,1.203717,0.222593,-1.780872,...,-0.450807,0.508951,1.02608,-0.313216,4,3,1,0.065313,0,0.042379


In [71]:
test_df.head()

Unnamed: 0,F-1,F-2,F-3,F-4,F-5,F-6,F-7,F-8,F-9,F-10,...,F-16,F-17,F-18,F-19,F-20,C-1,C-2,C-3,target,target_lgb_predict
0,-1.289295,-0.324608,1.628307,0.225956,-0.41104,0.361094,-1.183165,0.034036,-1.112994,-0.663756,...,-0.534598,-0.325803,-0.158735,-0.077218,0.820054,2,3,4,-0.345793,-0.206606
1,0.330874,-0.016019,1.182377,-0.995726,1.020174,0.818459,1.295896,-1.911299,-0.925972,-0.136488,...,-1.090801,-0.615711,1.564897,1.11841,-0.698912,1,4,4,0.120074,-0.085043
2,-2.175271,-0.108922,0.262382,0.960336,0.437409,0.433336,1.332441,0.833576,-0.083542,-0.172972,...,-0.421596,0.122742,0.382677,1.839474,0.456839,1,3,3,0.263362,0.056367
3,0.919909,2.154887,0.693598,0.715809,0.743402,0.754806,-0.26634,-0.422949,0.278539,0.766621,...,0.465704,1.171798,0.396139,0.130628,0.630531,0,1,0,0.684973,0.441926
4,0.225601,2.099686,-0.302992,-0.307833,0.767957,-0.220622,0.074358,-0.548538,-0.162078,3.428322,...,0.212756,0.02626,-0.872583,1.075112,0.549678,0,3,0,0.436006,0.139881
