In [4]:
from abc import ABCMeta, abstractmethod
from sklearn import preprocessing, model_selection, metrics
import datetime
from sklearn.metrics import mean_squared_error
import os
import pandas as pd
import numpy as np
import xgboost as xgb


class BaseModelInterface:
    __metaclass__ = ABCMeta

    @abstractmethod
    def fit(self, x_train, y_train, x_val=None, y_val=None, seed=42):
        """Train the model on the x_train and y_train. x_val and y_val is used for early stopping for some models.

        :param x_train: 2-d numpy array
        :param y_train: 1-d numpy array
        :param x_val: 2-d numpy array - validation data
        :param y_val: 1-d numpy array - validation data
        :param seed
        :return: 1-dim array - prediction result
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, X):
        """Predict on the data X.

        :param X: 2-d numpy array
        :return: 1-dim array - prediction result
        """
        raise NotImplementedError


class XgbWrapper(BaseModelInterface):

    default_params = {
        'learning_rate': 0.3,
        'max_depth': 6,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 1,
        'colsample_bytree': 1,
        'num_boost_round': 1000,
        'early_stopping_rounds': 40,
        'verbose_eval': 10,
        'lambda': 1,
        'alpha': 0,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
    }

    def __init__(self, params, feature_names=None):
        self.params = self.default_params.copy()

        for k in params:
            self.params[k] = params[k]

        self.model = None
        self.feature_names = feature_names

    def fit(self, x_train, y_train, x_val=None, y_val=None, seed=42):
        params = self.params.copy()
        params['seed'] = seed

        # Slight improvement
        params['base_score'] = np.median(y_train)

        dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=self.feature_names)

        if x_val is None:
            watchlist = [(dtrain, 'train')]
        else:
            dval = xgb.DMatrix(x_val, label=y_val, feature_names=self.feature_names)
            watchlist = [(dval, 'eval'), (dtrain, 'train')]

        self.model = xgb.train(params, dtrain, evals=watchlist)

    def predict(self, X):
        return self.model.predict(xgb.DMatrix(X, feature_names=self.feature_names))


def calculate_score_fun(y_true, y_pred):
    """Calculate the mean squared error.

    :param y_true: 1-d array
    :param y_pred: 1-d array
    :return: double
    """
    if not isinstance(y_true, np.ndarray) or not isinstance(y_pred, np.ndarray):
        raise Exception("y_true and y_pred must have type 'np.ndarray'")

    if y_true.ndim != 1 or y_pred.ndim != 1:
        raise Exception("y_true and y_pred should be 1-dim array")

    return mean_squared_error(y_true=y_true, y_pred=y_pred)


class ModelClient(object):
    seed = 2018
    test_id_col_name = "fullVisitorId"
    test_target_col_name = "PredictedLogRevenue"

    def __init__(self, model, X_train, y_train, X_test, test_ids,
                 metric_fun,
                 data_name="google_analytics",
                 n_fold=10,
                 n_bag=1):

        if not isinstance(model, BaseModelInterface):
            raise Exception('Your model must implement the BaseModelInterface')

        if (not isinstance(X_train, np.ndarray) or not isinstance(y_train, np.ndarray)
                or not isinstance(X_test, np.ndarray) or not isinstance(test_ids, np.ndarray)):
            raise Exception("X_train, y_train, X_test, test_ids must have type 'np.ndarray'!")

        if y_train.ndim != 1 or test_ids.ndim != 1:
            raise Exception("y_train and test_ids should be 1-dimension array")

        if X_train.ndim != 2 or X_test.ndim != 2:
            raise Exception("X_train and X_test should be 2-dimension array")

        if X_train.shape[1] != X_test.shape[1]:
            raise Exception("X_train and X_test must have the same number of features")

        if X_train.shape[0] != y_train.shape[0]:
            raise Exception("X_train and y_train must have the same number of rows / records")

        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.test_ids = test_ids

        self.n_folds = n_fold
        self.n_bags = n_bag

        self.metric_fun = metric_fun
        self.data_name = data_name

    def fit_predict(self):
        kf = model_selection.KFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed)

        total_test_preds = np.zeros(self.X_test.shape[0], dtype=np.float64)
        fold_val_preds = np.zeros(self.X_train.shape[0], dtype=np.float64)
        val_score_sum = 0

        start_time = datetime.datetime.now()
        for fold, (train_index, val_index) in enumerate(kf.split(self.X_train)):
            x_train, x_val = self.X_train[train_index], self.X_train[val_index]
            y_train, y_val = self.y_train[train_index], self.y_train[val_index]

            print("\n\n")
            print('fold - ', fold, '\n')

            fold_start = datetime.datetime.now()
            fold_score_sum = 0

            # Do bagging for models on which random sees have big impact like xgboost, neural network etc.
            for bag in range(self.n_bags):
                bag_start = datetime.datetime.now()
                seed = 42 + 17 * fold + 13 * bag
                print('bag - ', bag, '\n')
                print('seed: ', seed)

                print('Training...')
                self.model.fit(x_train, y_train, x_val, y_val, seed)

                print("Validating...")
                y_preds = self.model.predict(x_val)
                fold_val_preds[val_index] += y_preds

                bag_score = self.metric_fun(y_true=y_val, y_pred=y_preds)

                fold_score_sum += bag_score
                val_score_sum += bag_score
                print('Bag score: ', bag_score)
                print("Elapsed time for fold %d - bag %d: " % (fold, bag), datetime.datetime.now() - bag_start)
                print("\n")

                total_test_preds += self.model.predict(self.X_test)

            print("Elapsed time for fold %d: " % fold, datetime.datetime.now() - fold_start)
            print('Fold score: ', fold_score_sum / self.n_bags)
            print("\n")

        avg_val_score = val_score_sum / (self.n_folds * self.n_bags)
        test_preds = total_test_preds / (self.n_folds * self.n_bags)
        fold_val_preds /= self.n_bags

        test_submission = pd.DataFrame({
            self.test_id_col_name: self.test_ids,
            self.test_target_col_name: test_preds
        })

        if not os.path.exists("Result"):
            os.mkdir("Result")
        file_path_template = 'Result/' + self.data_name + "_" + self.model.__class__.__name__ + "_" + str(
            self.n_folds) + 'fold_' + str(self.n_bags) + '_bag_' + datetime.datetime.now().strftime(
            "%Y-%m-%d-%H-%M-%S") + '_%s_score_%s.csv'

        # Write test result
        test_file_path = file_path_template % ("test", str(avg_val_score))
        print("Writing test result to %s" % test_file_path)
        test_submission.to_csv(test_file_path, index=False)

        # Write val result
        val_file_path = file_path_template % ("val", str(avg_val_score))
        print("Writing validation result to %s" % val_file_path)
        pd.DataFrame(fold_val_preds).to_csv(val_file_path, index=False, header=False)

        print('\n')
        print('Total Elapsed time: ', datetime.datetime.now() - start_time)
        print('Local cv score: ', avg_val_score)

In [3]:
# Data sample taken from https://www.kaggle.com/c/santander-value-prediction-challenge/data
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
test = test.iloc[:1000]

In [5]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [6]:
test.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
params = {
    'learning_rate': 0.5,
    'max_depth': 8,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 1,
    'colsample_bytree': 1,
}
model = XgbWrapper(params, feature_names=train.columns[2:])

X_train = train.iloc[:, 2:].values
y_train = train["target"].values
X_test = test.iloc[:, 1:].values
test_ids = test["ID"].values
data_name = "santander"

client = ModelClient(model, X_train, y_train, X_test, test_ids, 
                     metric_fun=calculate_score_fun,
                     data_name=data_name,
                     n_fold=5,
                     n_bag=2)
client.fit_predict()




fold -  0 

bag -  0 

seed:  42
Training...
[0]	eval-rmse:7.44879e+06	train-rmse:7.48892e+06
[1]	eval-rmse:7.02782e+06	train-rmse:6.71831e+06
[2]	eval-rmse:6.97963e+06	train-rmse:6.1762e+06
[3]	eval-rmse:7.00159e+06	train-rmse:5.84672e+06
[4]	eval-rmse:6.99531e+06	train-rmse:5.52838e+06
[5]	eval-rmse:6.96317e+06	train-rmse:5.38728e+06
[6]	eval-rmse:6.9588e+06	train-rmse:5.25654e+06
[7]	eval-rmse:6.96659e+06	train-rmse:5.05333e+06
[8]	eval-rmse:6.98377e+06	train-rmse:4.98559e+06
[9]	eval-rmse:6.94952e+06	train-rmse:4.90423e+06
Validating...
Bag score:  4.82957563279e+13
Elapsed time for fold 0 - bag 0:  0:00:04.117852


bag -  1 

seed:  55
Training...
[0]	eval-rmse:7.44879e+06	train-rmse:7.48892e+06
[1]	eval-rmse:7.02782e+06	train-rmse:6.71831e+06
[2]	eval-rmse:6.97963e+06	train-rmse:6.1762e+06
[3]	eval-rmse:7.00159e+06	train-rmse:5.84672e+06
[4]	eval-rmse:6.99531e+06	train-rmse:5.52838e+06
[5]	eval-rmse:6.96317e+06	train-rmse:5.38728e+06
[6]	eval-rmse:6.9588e+06	train-rmse:5.25654