In [25]:
from sklearn.grid_search import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import log_loss, mean_squared_error as mse, r2_score 
from sklearn.metrics.scorer import make_scorer

In [11]:
def runXGB(X_train, y_train, X_test=None, num_class=1, feature_names=None, seed=0, num_rounds=1000, early_stopping_rounds=None):
    params = {
    'booster': 'gbtree',
    'objective': 'reg:linear', #'multi:softprob'
    'subsample': 0.8,
    'colsample_bytree': 1, #0.85, #like max_features
    'eta': 0.05,
    'max_depth': 7,
    'seed': seed,
    'silent': 0,
    'eval_metric': 'rmse' # "logloss", "mlogloss", auc" # for ranking 
    }
    
    if num_class!=1:
        params['num_class']=num_class

    plst = list(params.items())
    dtrain = xgb.DMatrix(X_train, y_train)
    
    model = xgb.train(plst, dtrain, num_boost_round=num_rounds, early_stopping_rounds=early_stopping_rounds)

    if X_test is not None:
        dtest = xgb.DMatrix(X_test)
        pred = model.predict(dtest)
        return pred, model
    return None, model

In [12]:
def runXGBShuffle(X_train, y_train, X_test, num_class=1, feature_names=None, seed=0, num_rounds=1000, test_size=.3, \
               early_stopping_rounds=None):
    params = {
    'booster': 'gbtree',
    'objective': 'reg:linear', #'multi:softprob'
    'subsample': 0.8,
    'colsample_bytree': 1, #0.85, #like max_features
    'eta': 0.05,
    'max_depth': 7,
    'seed': seed,
    'silent': 0,
    'eval_metric': 'rmse' # "logloss", "mlogloss", auc" # for ranking 
    }
    
    if num_class!=1:
        params['num_class']=num_class

    plst = list(params.items())
    X_dtrain, X_deval, y_dtrain, y_deval=train_test_split(X_train, y_train, random_state=seed, test_size=test_size)
    dtrain = xgb.DMatrix(X_dtrain, y_dtrain)
    deval = xgb.DMatrix(X_deval, y_deval)
    watchlist = [(deval, 'eval')]
    
    model = xgb.train(plst, dtrain, num_rounds, watchlist, early_stopping_rounds=early_stopping_rounds)

    if X_test is not None:
        dtest = xgb.DMatrix(X_test)
        pred = model.predict(dtest)
        return pred, model
    return None, model

In [18]:
class xgbClass(object):
    def __init__(self, eta=.1, subsample=.8, num_class=1, max_depth=5, seed=17, silent=0, eva_metric='mlogloss',\
                colsample_bytree=1, objective='solfprob'):
        self.params={
        'objective' : objective, #'reg:linear','multi:softprob'
        'subsample' : subsample,
        'colsample_bytree' : colsample_bytree, #like max_features
        'eta': eta,
        'max_depth': max_depth,
        'seed': seed,
        'silent': silent,
        'eval_metric': eva_metric#'rmse' "logloss", "mlogloss", auc" # for ranking 
        }
        
        if num_class!=1:
            self.params['num_class']=num_class
        self.model=[]
        
    def fit(self, X_train, y_train, num_rounds=500, early_stopping_rounds=None):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        self.model = xgb.train(self.params, dtrain, num_boost_round=num_rounds, early_stopping_rounds=early_stopping_rounds)
    
    def predict(self, X_test):
        dtest = xgb.DMatrix(X_test)
        return self.model.predict(dtest)

In [19]:
n=100000
X=pd.DataFrame(np.random.randn(n,1))
y=X.iloc[:,0]+.2*pd.Series(np.random.randn(n))
X_train,y_train=X.iloc[:n/2], y.iloc[:n/2]
X_test, y_test=X.iloc[n/2:], y.iloc[n/2:]

# import kagglegym
# env = kagglegym.make()
# o = env.reset()
# excl = [env.ID_COL_NAME, env.SAMPLE_COL_NAME, env.TARGET_COL_NAME, env.TIME_COL_NAME]
# col = [c for c in o.train.columns if c not in excl]

# O = pd.read_hdf('../input/train.h5')
# d_mean= O[col].median(axis=0)

# ymean_dict = dict(o.train.groupby(["id"])["y"].median())

# X_train=(O[col])[O.timestamp <= 905]
# y_train=O.y[O.timestamp <= 905]
# X_test=(O[col])[O.timestamp > 905]
# y_test=O.y[O.timestamp > 905]
# X_train=X_train.fillna(d_mean)
# X_test=X_test.fillna(d_mean)

In [20]:
pred, model=runXGB(X_train=X_train, y_train=y_train, X_test=X_test, num_rounds=500)
print(r2_score(pred, y_test))
#-44.5409187933

0.95912476935


In [16]:
pred, model=runXGBShuffle(X_train=X_train, y_train=y_train, X_test=X_test, num_rounds=500, test_size=.3)
print(r2_score(pred, y_test))

[0]	eval-rmse:1.08444
[1]	eval-rmse:1.03227
[2]	eval-rmse:0.982866
[3]	eval-rmse:0.93602
[4]	eval-rmse:0.891542
[5]	eval-rmse:0.849469
[6]	eval-rmse:0.809562
[7]	eval-rmse:0.771749
[8]	eval-rmse:0.735948
[9]	eval-rmse:0.702123
[10]	eval-rmse:0.670081
[11]	eval-rmse:0.639809
[12]	eval-rmse:0.611173
[13]	eval-rmse:0.584058
[14]	eval-rmse:0.558446
[15]	eval-rmse:0.534348
[16]	eval-rmse:0.511632
[17]	eval-rmse:0.490155
[18]	eval-rmse:0.469953
[19]	eval-rmse:0.45091
[20]	eval-rmse:0.433076
[21]	eval-rmse:0.416246
[22]	eval-rmse:0.400447
[23]	eval-rmse:0.3856
[24]	eval-rmse:0.371737
[25]	eval-rmse:0.358743
[26]	eval-rmse:0.346619
[27]	eval-rmse:0.335264
[28]	eval-rmse:0.324691
[29]	eval-rmse:0.314848
[30]	eval-rmse:0.305699
[31]	eval-rmse:0.297187
[32]	eval-rmse:0.289293
[33]	eval-rmse:0.281913
[34]	eval-rmse:0.275099
[35]	eval-rmse:0.268825
[36]	eval-rmse:0.263026
[37]	eval-rmse:0.257696
[38]	eval-rmse:0.252784
[39]	eval-rmse:0.248266
[40]	eval-rmse:0.244088
[41]	eval-rmse:0.240266
[42]	eva

In [21]:
model=xgbClass(objective='reg:linear', eva_metric='rmse')
model.fit(X_train, y_train)
pred=model.predict(X_test)
print(r2_score(pred, y_test))

0.959140452508
