In [6]:
from sklearn.grid_search import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import log_loss, mean_squared_error as mse, r2_score 
from sklearn.metrics.scorer import make_scorer

In [7]:
def runXGB(X_train, y_train, X_test=None, num_class=1, feature_names=None, seed=0, num_rounds=1000, early_stopping_rounds=None):
    params = {
    'booster': 'gbtree',
    'objective': 'reg:linear', #'multi:softprob'
    'subsample': 0.8,
    'colsample_bytree': 0.85, #like max_features
    'eta': 0.05,
    'max_depth': 7,
    'seed': seed,
    'silent': 0,
    'eval_metric': 'rmse' # "logloss", "mlogloss", auc" # for ranking 
    }
    
    if num_class!=1:
        params['num_class']=num_class

    plst = list(params.items())
    dtrain = xgb.DMatrix(X_train, y_train)
    
    model = xgb.train(plst, dtrain, num_boost_round=num_rounds, early_stopping_rounds=early_stopping_rounds)

    if X_test is not None:
        dtest = xgb.DMatrix(X_test)
        pred = model.predict(dtest)
        return pred, model
    return None, model

In [8]:
def runXGBShuffle(X_train, y_train, X_test, num_class=1, feature_names=None, seed=0, num_rounds=1000, test_size=.3, \
               early_stopping_rounds=None):
    params = {
    'booster': 'gbtree',
    'objective': 'reg:linear', #'multi:softprob'
    'subsample': 0.8,
    'colsample_bytree': 0.85, #like max_features
    'eta': 0.05,
    'max_depth': 7,
    'seed': seed,
    'silent': 0,
    'eval_metric': 'rmse' # "logloss", "mlogloss", auc" # for ranking 
    }
    
    if num_class!=1:
        params['num_class']=num_class

    plst = list(params.items())
    X_dtrain, X_deval, y_dtrain, y_deval=train_test_split(X_train, y_train, random_state=seed, test_size=test_size)
    dtrain = xgb.DMatrix(X_dtrain, y_dtrain)
    deval = xgb.DMatrix(X_deval, y_deval)
    watchlist = [(deval, 'eval')]
    
    model = xgb.train(plst, dtrain, num_rounds, watchlist, early_stopping_rounds=early_stopping_rounds)

    if X_test is not None:
        dtest = xgb.DMatrix(X_test)
        pred = model.predict(dtest)
        return pred, model
    return None, model

In [15]:
class xgbClass(object):
    def __init__(self, eta=.1, subsample=.8, num_class=1, max_depth=5, seed=17, silent=0, eva_metric='mlogloss',\
                colsample_bytree=.8, objective='solfprob'):
        self.params={
        'objective' : objective, #'reg:linear','multi:softprob'
        'subsample' : subsample,
        'colsample_bytree' : colsample_bytree, #like max_features
        'eta': eta,
        'max_depth': max_depth,
        'seed': seed,
        'silent': silent,
        'eval_metric': eva_metric#'rmse' "logloss", "mlogloss", auc" # for ranking 
        }
        
        if num_class!=1:
            self.params['num_class']=num_class
        self.model=[]
        
    def fit(X_train, y_train, num_rounds=500, early_stopping_rounds=None):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        self.model = xgb.train(self.params, dtrain, num_boost_round=num_rounds, early_stopping_rounds=early_stopping_rounds)
    
    def predict(X_test):
        return self.model.predict(X_test)

In [16]:
import kagglegym
env = kagglegym.make()
o = env.reset()
excl = [env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
col = [c for c in o.train.columns if c not in excl]

O = pd.read_hdf('../input/train.h5')
d_mean= O[col].median(axis=0)

ymean_dict = dict(o.train.groupby(["id"])["y"].median())

X_train=(O[col])[O.timestamp <= 905]
y_train=O.y[O.timestamp <= 905]
X_test=(O[col])[O.timestamp > 905]
y_test=O.y[O.timestamp > 905]
X_train=X_train.fillna(d_mean)
X_test=X_test.fillna(d_mean)

In [61]:
pred, model=runXGB(X_train=X_train, y_train=y_train, X_test=X_test, num_rounds=500)
print(r2_score(pred, y_test))
#-44.5409187933

-44.3721467031


In [41]:
pred, model=runXGBShuffle(X_train=X_train, y_train=y_train, X_test=X_test, num_rounds=500, test_size=.3)
print(r2_score(pred, y_test))

[0]	eval-rmse:0.475282
Will train until eval-rmse hasn't improved in 50 rounds.
[1]	eval-rmse:0.451571
[2]	eval-rmse:0.429047
[3]	eval-rmse:0.407654
[4]	eval-rmse:0.387334
[5]	eval-rmse:0.368031
[6]	eval-rmse:0.349697
[7]	eval-rmse:0.332282
[8]	eval-rmse:0.315744
[9]	eval-rmse:0.300037
[10]	eval-rmse:0.285118
[11]	eval-rmse:0.27095
[12]	eval-rmse:0.257495
[13]	eval-rmse:0.244718
[14]	eval-rmse:0.232583
[15]	eval-rmse:0.221062
[16]	eval-rmse:0.210122
[17]	eval-rmse:0.199734
[18]	eval-rmse:0.189872
[19]	eval-rmse:0.180509
[20]	eval-rmse:0.171622
[21]	eval-rmse:0.163186
[22]	eval-rmse:0.15518
[23]	eval-rmse:0.147581
[24]	eval-rmse:0.14037
[25]	eval-rmse:0.133526
[26]	eval-rmse:0.127036
[27]	eval-rmse:0.120879
[28]	eval-rmse:0.115039
[29]	eval-rmse:0.109503
[30]	eval-rmse:0.104253
[31]	eval-rmse:0.099277
[32]	eval-rmse:0.094563
[33]	eval-rmse:0.090098
[34]	eval-rmse:0.085867
[35]	eval-rmse:0.081861
[36]	eval-rmse:0.078069
[37]	eval-rmse:0.074482
[38]	eval-rmse:0.07109
[39]	eval-rmse:0.0678

In [12]:
model=xgbClass()
model.fit(X_train, y_train)
pred=model.predict(X_test)
print(r2_score(pred, y_test))

TypeError: can not initialize DMatrix from xgbClass