In [None]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
#from IPython import get_ipython
#get_ipython().magic('matplotlib inline')
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

train = pd.read_csv('../Data/subset_training_crossV_1.csv',delimiter=",", header=0, index_col=0)
target = 'loss'
IDcol = 'id'


def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=100):
    
    if useTrainCV:
        print "is in if"
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, 
                          num_boost_round=alg.get_params()['n_estimators'], 
                          nfold=cv_folds, 
                          metrics=['mae'], 
                          early_stopping_rounds=early_stopping_rounds)#, show_progress=False)
        print cvresult
        print cvresult.shape[0]
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Train on data
    xgb_param = alg.get_xgb_params()
    final_gb = xgb.train(xgb_param, xgtrain, num_boost_round = cvresult.shape[0])
    print "trained"
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['loss'],eval_metric='mae')
    print "Fitted"
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    print "Predicted"
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    print "Probs"
    
    #Print model report:
    print "\nModel Report"
    print "Explained variance: %.4g" % metrics.explained_variance_score(dtrain['loss'].values, dtrain_predictions)
    print "MAE (Train): %f" % metrics.mean_absolute_error(dtrain['loss'], dtrain_predprob)
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')


    
#Choose all predictors except target & IDcols
predictors = [x for x in train.columns if x not in [target, IDcol]]

#print predictors
xgb1 = XGBClassifier(
 learning_rate =0.05,
 n_estimators=3000,
 max_depth=3,
 min_child_weight=1,
 gamma=0,
 subsample=0.9,
 colsample_bytree=0.54,
 objective= 'reg:linear',
 nthread=4,
 scale_pos_weight=1,
 seed=42)
print "Classifier"
modelfit(xgb1, train, predictors)





Classifier
is in if
      test-mae-mean  test-mae-std  train-mae-mean  train-mae-std
0       2886.007373      4.456550     2886.016065       1.172841
1       2742.052539      4.371791     2742.183496       1.100577
2       2606.419287      4.247414     2606.512012       0.988036
3       2479.553174      4.109457     2479.541601       0.762356
4       2361.880371      3.929556     2361.835547       1.073810
5       2253.427734      4.104824     2253.337647       1.046024
6       2154.374951      3.795487     2154.315967       1.253383
7       2065.360303      4.255397     2065.411084       1.673943
8       1985.320801      3.661455     1985.181616       2.171704
9       1913.682641      3.675978     1913.519873       2.392394
10      1850.245630      3.414214     1849.909375       2.275843
11      1794.409522      4.277369     1794.010010       3.287976
12      1743.867236      4.637342     1743.446606       4.324663
13      1698.492359      4.681446     1697.943604       4.098709
14   

In [8]:
param_test1 = {
 'max_depth':range(1,10,2),
 'min_child_weight':range(1,6,2)
}

gsearch1 = GridSearchCV(
    estimator = XGBClassifier( 
        learning_rate =0.05, 
        n_estimators=121, 
        max_depth=3,
        min_child_weight=1, 
        gamma=0, 
        subsample=0.9, 
        colsample_bytree=0.53,
        objective= 'reg:linear', 
        nthread=4, 
        scale_pos_weight=1, 
        seed=42), 
    param_grid = param_test1, scoring='neg_mean_absolute_error',n_jobs=1,iid=False, cv=2)
gsearch1.fit(train[predictors],train[target])
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: -2405.00224, std: 106.83809, params: {'max_depth': 1, 'min_child_weight': 1},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 1, 'min_child_weight': 3},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 1, 'min_child_weight': 5},
  mean: -2405.00224, std: 106.83809, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: -2405.00224, std: 106.83809, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: -2405.00224, std: 106.83809, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: -2873.69097, std: 27.75296, params: {'max_depth': 7, 