In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Row cap applied to every CSV read in this cell (max: 2_000_000).
Nrows = 20000

# Measured fares used as the stacking target; this file is read without a header row.
measured_data = pd.read_csv(
    '../input/nyc-taxi-fare-initial-split/stack_validation_label.csv',
    header=None,
    nrows=Nrows,
)

# Outputs of the two first-level models, loaded from their submission files.
xgb_prediction = pd.read_csv(
    '../input/nyc-taxi-fare-prediction-xgboost/submission.csv',
    nrows=Nrows,
)
rnf_prediction = pd.read_csv(
    '../input/nyc-taxi-fare-prediction-randomforest-new-forked/submission.csv',
    nrows=Nrows,
)

# Put the three frames side by side (aligned on the row index) and name the columns.
# NOTE(review): assigning exactly three names only works if each file contributes
# a single column -- confirm against the input files.
train_data = pd.concat([xgb_prediction, rnf_prediction, measured_data], axis=1)
train_data.columns = ['xgb', 'rnf', 'fare_amount']
train_data.head()

In [None]:
import matplotlib.pyplot as plt

# Upper bound on the number of points drawn in the scatter plot.
Nrows_to_show = 20_000

# Compare the two first-level models against each other: every point is one row,
# with the xgb prediction on the x axis and the rnf prediction on the y axis.
# The columns are selected by name so the axis labels cannot drift away from the
# data actually being plotted (the labels previously read 'measured'/'predicted'
# although the measured fare was never drawn).
fig, ax = plt.subplots()
ax.scatter(
    x=train_data['xgb'].values[:Nrows_to_show],
    y=train_data['rnf'].values[:Nrows_to_show],
    alpha=0.3,
)
ax.set_xlabel('xgb prediction')
ax.set_ylabel('rnf prediction')
ax.set_title('First-level model predictions: xgb vs. rnf')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split

import xgboost as xgb

# Hold out 20% of the stacking set for validation / early stopping.
# The split is bound to fresh names and nothing is dropped in place, so
# `train_data` keeps its 'fare_amount' column and this cell can be re-run
# without failing on a missing column.
stack_train, stack_valid = train_test_split(train_data, test_size=0.2, random_state=1)

# Targets and feature matrices for both partitions (features: 'xgb', 'rnf').
train_data_labels = stack_train['fare_amount']
test_data_labels = stack_valid['fare_amount']
train_features = stack_train.drop('fare_amount', axis=1)
test_data = stack_valid.drop('fare_amount', axis=1)

dtrain = xgb.DMatrix(train_features.values, label=train_data_labels.values)
dvalid = xgb.DMatrix(test_data.values, label=test_data_labels.values)
# When several sets are watched, early stopping is driven by the last entry ('valid').
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Fine-tuned based on <>/misc/XGB_hyperparameter_fine_tune.py
# NOTE(review): 'silent' and 'reg:linear' are deprecated in newer XGBoost releases
# (superseded by 'verbosity' and 'reg:squarederror'); kept unchanged here so the
# cell still runs on the version it was tuned with -- confirm the installed version.
xgb_pars = {'min_child_weight': 75, 'eta': 0.05, 'colsample_bytree': 0.5, 'max_depth': 30,
            'subsample': 0.6, 'lambda': 2., 'nthread': 4, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}

# Train for at most 2000 rounds, stopping once the validation RMSE has not
# improved for 100 consecutive rounds; progress is printed every 10 rounds.
xgb_reg = xgb.train(xgb_pars, dtrain, num_boost_round=2000, evals=watchlist,
                    early_stopping_rounds=100, maximize=False, verbose_eval=10)


# final prediction
# Both first-level prediction files are read in full (no `nrows` cap): the
# submission needs one row per test key, independent of how many rows were
# sampled for training the stacker.
prediction_data1 = pd.read_csv('../input/nyc-taxi-fare-prediction-xgboost/submission_of_final_test.csv')
prediction_data2 = pd.read_csv('../input/nyc-taxi-fare-prediction-randomforest-new-forked/submission_of_final_test.csv')

# The two files are combined positionally and the keys are taken from the first
# one only, so they must describe the same rows in the same order.
if len(prediction_data1) != len(prediction_data2):
    raise ValueError(
        'First-level prediction files differ in length: %d vs %d rows'
        % (len(prediction_data1), len(prediction_data2))
    )
if 'key' in prediction_data2.columns and not (
        prediction_data1['key'].values == prediction_data2['key'].values).all():
    raise ValueError('First-level prediction files list their keys in a different order')

keys = prediction_data1.key.values
# Same feature order as used for training: xgb first, rnf second.
final_data = pd.concat((prediction_data1['fare_amount'], prediction_data2['fare_amount']), axis=1)
final_data.columns = ['xgb', 'rnf']


X_test_csv_prepared = xgb.DMatrix(final_data.values)
# NOTE(review): depending on the XGBoost version, predict() may use every trained
# round rather than the early-stopped best iteration -- confirm for the installed version.
final_predictions_csv = xgb_reg.predict(X_test_csv_prepared)

final_data_set = pd.DataFrame({'key': keys, 'fare_amount': final_predictions_csv})
# index=False keeps the row index out of the submission file.
final_data_set.to_csv('ensemble_submission.csv', index=False)

final_data_set.head()