In [1]:
import pandas as pd
train = pd.read_csv('train_final.csv')
test = pd.read_csv('test_final.csv')
print(train.shape,test.shape)

(131662, 34) (87395, 34)


In [2]:
train[['Customer_Since_Months','Life_Style_Index','Var1']].describe()

Unnamed: 0,Customer_Since_Months,Life_Style_Index,Var1
count,131662.0,131662.0,131662.0
mean,0.601591,0.417258,0.189695
std,0.354441,0.058405,0.082265
min,0.0,0.078299,0.0
25%,0.3,0.385157,0.188889
50%,0.6,0.417384,0.189422
75%,1.0,0.448369,0.189422
max,1.0,1.0,1.0


In [3]:
X = train.drop('Surge_Pricing_Type',axis=1)
y = train.Surge_Pricing_Type
X_test = test.drop('Surge_Pricing_Type',axis=1)
y_test = test.Surge_Pricing_Type
print(X.shape,y.shape,X_test.shape,y_test.shape)

(131662, 33) (131662,) (87395, 33) (87395,)


In [4]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier()

#brute force scan for all parameters, here are the tricks
#usually max_depth is 6,7,8
#learning rate is around 0.05, but small changes may make big diff
#tuning min_child_weight subsample colsample_bytree can have 
#much fun of fighting against overfit 
#n_estimators is how many round of boosting
#finally, ensemble xgboost with multiple seeds may reduce variance
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['multi:softmax'],
              'learning_rate': [0.005], #so called `eta` value
              'max_depth': [7],
              'min_child_weight': [7,9],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [1100], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [91,13255]}

print("Model Ready")

Model Ready


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(n_splits=5, shuffle=True), 
                   scoring='accuracy',
                   verbose=2, refit=True)

clf.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  20 out of  20 | elapsed: 57.6min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_...
             iid='deprecated', n_jobs=5,
             param_grid={'colsample_bytree': [0.7], 'learning_rate': [0.005],
                         'max_depth': [7], 'min_child_weight': [7, 9],
                         'missing': [-999], 'n_estimators': [1100],
                         'nthread': [4], 'objective': ['multi:softmax'],
              

In [6]:
preds = clf.predict(X_test)
df_submit = pd.read_csv('sample_submission.csv')
df_submit['Surge_Pricing_Type'] = preds
df_submit.to_csv('BasicTree.csv',index=False , header=True)

In [7]:
## Getting best params from GridSearchCV
from sklearn.metrics import classification_report

print("Best parameters set found on development set:")
print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

print("Detailed classification report:")
y_true, y_pred = y, clf.predict(X)
print(classification_report(y_true, y_pred))

Best parameters set found on development set:
{'colsample_bytree': 0.7, 'learning_rate': 0.005, 'max_depth': 7, 'min_child_weight': 7, 'missing': -999, 'n_estimators': 1100, 'nthread': 4, 'objective': 'multi:softmax', 'seed': 13255, 'silent': 1, 'subsample': 0.7}
0.687 (+/-0.004) for {'colsample_bytree': 0.7, 'learning_rate': 0.005, 'max_depth': 7, 'min_child_weight': 7, 'missing': -999, 'n_estimators': 1100, 'nthread': 4, 'objective': 'multi:softmax', 'seed': 91, 'silent': 1, 'subsample': 0.7}

0.688 (+/-0.004) for {'colsample_bytree': 0.7, 'learning_rate': 0.005, 'max_depth': 7, 'min_child_weight': 7, 'missing': -999, 'n_estimators': 1100, 'nthread': 4, 'objective': 'multi:softmax', 'seed': 13255, 'silent': 1, 'subsample': 0.7}

0.687 (+/-0.004) for {'colsample_bytree': 0.7, 'learning_rate': 0.005, 'max_depth': 7, 'min_child_weight': 9, 'missing': -999, 'n_estimators': 1100, 'nthread': 4, 'objective': 'multi:softmax', 'seed': 91, 'silent': 1, 'subsample': 0.7}

0.688 (+/-0.005) for {