In [64]:
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline

import xgboost as xgb

class BostonHousingModel(object):
    """XGBoost regressor wrapped in a scikit-learn ``Pipeline``.

    Optional transformers run, in the order given, ahead of the final
    ``xgb.XGBRegressor`` step (named ``'clf'``). The pipeline can be trained,
    scored, persisted with joblib and restored.
    """

    # Metric names understood by ``score``.
    SUPPORTED_METRICS = ('r2', 'mse')

    def __init__(self, hyperparameters=None, transformers=None):
        """Build the unfitted pipeline.

        Args:
            hyperparameters: Optional dict of keyword arguments forwarded to
                ``xgb.XGBRegressor``. ``None`` or an empty dict means the
                regressor is created with its own defaults.
            transformers: Optional iterable of transformer objects placed
                before the regressor. ``None`` (the default) means no
                transformers; a ``None`` default is used instead of ``[]`` so
                no list object is shared between instances.
        """
        model = xgb.XGBRegressor(**(hyperparameters or {}))
        self.pipeline = Pipeline(self._build_steps(transformers, model))

    @staticmethod
    def _build_steps(transformers, model):
        """Return the ``(name, estimator)`` step list for the pipeline.

        Transformers are named ``'t_1'``, ``'t_2'``, ... by position so every
        step name is unique; the model is appended last as ``'clf'``.
        """
        steps = [('t_%d' % position, transformer)
                 for position, transformer in enumerate(transformers or (), start=1)]
        steps.append(('clf', model))
        return steps

    def train(self, data, target):
        """Fit the whole pipeline on ``data`` / ``target`` and keep the result."""
        self.pipeline = self.pipeline.fit(data, target)

    def score(self, data, target, metric='r2'):
        """Predict on ``data`` and score the predictions against ``target``.

        Args:
            data: Feature matrix passed to ``predict``.
            target: True values to compare the predictions with.
            metric: ``'r2'`` (default) or ``'mse'``.

        Returns:
            The value returned by ``r2_score`` or ``mean_squared_error``.

        Raises:
            ValueError: If ``metric`` is not one of ``SUPPORTED_METRICS``
                (previously this case silently returned ``None``).
        """
        # Reject bad metric names before doing any prediction work.
        if metric not in self.SUPPORTED_METRICS:
            raise ValueError('Unsupported metric %r; expected one of %s'
                             % (metric, ', '.join(self.SUPPORTED_METRICS)))

        # Imported locally so the class does not rely on these names having
        # been imported by some other notebook cell.
        from sklearn.metrics import mean_squared_error, r2_score

        predictions = self.predict(data)

        if metric == 'r2':
            return r2_score(target, predictions)
        return mean_squared_error(target, predictions)

    def save(self, file):
        """Dump the current pipeline to ``file`` with joblib."""
        joblib.dump(self.pipeline, file)

    def load(self, file, replace=True):
        """Load a pipeline from ``file`` with joblib and return it.

        Args:
            file: Path or file object accepted by ``joblib.load``.
            replace: When true (default), the loaded pipeline also replaces
                ``self.pipeline``.

        Returns:
            The loaded pipeline object.
        """
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only load files from a trusted source.
        pipeline = joblib.load(file)

        if replace:
            self.pipeline = pipeline

        return pipeline

    def predict(self, data):
        """Return the pipeline's predictions for ``data``."""
        return self.pipeline.predict(data)

In [None]:
# Usage: settings read by the training pipeline.
# File the trained model pipeline is saved to.
BOSTON_MODEL_PATH = 'boston-housing.pkl'
# File the pickled ModelTracker is loaded from and saved to.
MODEL_TRACKER_PATH = 'data/tracker.pkl'
# Keyword arguments handed to the model constructor.
HYPER_PARAMETERS = dict(objective='reg:squarederror')
# Metric name passed to the model's score() call.
METRIC = 'r2'

In [97]:
import pickle
from datetime import datetime

class ModelTracker(object):
    """Keeps a history of training scores and the best score seen so far.

    "Best" means numerically highest, so the tracker fits higher-is-better
    metrics such as r2.
    NOTE(review): a lower-is-better metric (e.g. mse) would need the
    comparisons flipped -- confirm which metrics are tracked in practice.
    """

    def __init__(self):
        # (metric, score, record_time) tuples in insertion order.
        self.training_results = []
        # Highest score recorded so far. Starts at 0, so a negative score
        # never becomes the best.
        self.best_score = 0

    def add_training_result(self, metric, score, record_time=None):
        """Record one training result and update ``best_score`` if beaten.

        Args:
            metric: Name of the metric the score was computed with.
            score: The score value.
            record_time: Timestamp stored with the record. Defaults to the
                current time *at call time*; a ``datetime.now()`` default in
                the signature would be evaluated only once, when the class is
                defined, stamping every record with the same instant.
        """
        if record_time is None:
            record_time = datetime.now()

        if score > self.best_score:
            self.best_score = score

        self.training_results.append((metric, score, record_time))

    def compare_score(self, score):
        """Return True when ``score`` is at least as good as ``best_score``.

        Equality counts as a match, so the check also succeeds for a score
        that has just been recorded as the new best.
        """
        return score >= self.best_score

    @staticmethod
    def save(obj, file):
        """Pickle ``obj`` to the path ``file`` (opened in binary write mode)."""
        with open(file, 'wb') as outfile:
            pickle.dump(obj, outfile)

    @staticmethod
    def load(file):
        """Unpickle and return the object stored at the path ``file``."""
        # NOTE: pickle.load can execute arbitrary code -- only load files
        # that come from a trusted source.
        with open(file, 'rb') as readfile:
            return pickle.load(readfile)
            

In [80]:
def setup():
    """Run the training pipeline once, ignoring whatever it returns."""
    run_pipeline()

In [103]:
# pipeline
import os

def run_pipeline():
    """Train the model once, record its score and keep it if it is the best.

    Steps:
      1. Load the pickled ``ModelTracker`` from ``MODEL_TRACKER_PATH`` when
         that file exists, otherwise start a fresh tracker.
      2. Load the data, train a ``BostonHousingModel`` built from
         ``HYPER_PARAMETERS`` and score it with ``METRIC``.
      3. Record the score in the tracker.
      4. Save the model to ``BOSTON_MODEL_PATH`` only when the score beats the
         best score recorded by earlier runs.
      5. Save the tracker back to ``MODEL_TRACKER_PATH``.

    NOTE(review): the model is scored on the same data it was trained on, so
    the score measures fit, not generalisation -- confirm this is intended.
    """
    # Call isfile on the path; the bare function object is always truthy and
    # would send a first run (no tracker file yet) into a failing load.
    if os.path.isfile(MODEL_TRACKER_PATH):
        model_tracker = ModelTracker.load(MODEL_TRACKER_PATH)
    else:
        model_tracker = ModelTracker()

    # load model and data (feature names and description are not needed here)
    data, target = load_data()[:2]
    model = BostonHousingModel(HYPER_PARAMETERS)

    # train and score
    model.train(data, target)
    score = model.score(data, target, METRIC)

    # Capture the best score of earlier runs *before* recording this one:
    # add_training_result folds the new score into best_score, so a comparison
    # made afterwards could never see this run as worse.
    previous_best = model_tracker.best_score
    model_tracker.add_training_result(METRIC, score)

    # save the model only if it beats every earlier run
    is_better = score > previous_best
    if is_better:
        model.save(BOSTON_MODEL_PATH)

    # Make sure the tracker's directory exists before writing into it.
    tracker_dir = os.path.dirname(MODEL_TRACKER_PATH)
    if tracker_dir and not os.path.isdir(tracker_dir):
        os.makedirs(tracker_dir)

    ModelTracker.save(model_tracker, MODEL_TRACKER_PATH)

In [107]:
# Display the array of feature names.
features

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [114]:
import numpy as np

In [116]:
# Scratch check: reshape(1, -1) turns a flat 4-element array into a single row.
np.array([1,2,3,4]).reshape(1,-1)

array([[1, 2, 3, 4]])

In [112]:
# server
# View the first sample as a single-row 2-D array, the form passed to
# model.predict in the following cell.
data[0].reshape(1, -1)

array([[6.320e-03, 1.800e+01, 2.310e+00, 0.000e+00, 5.380e-01, 6.575e+00,
        6.520e+01, 4.090e+00, 1.000e+00, 2.960e+02, 1.530e+01, 3.969e+02,
        4.980e+00]])

In [113]:
# Predict for the first sample and unwrap the single value from the result.
# NOTE(review): in the visible cells `model` is only assigned inside
# run_pipeline(), so this relies on a notebook-level `model` created
# elsewhere -- confirm it still exists after Restart & Run All.
model.predict(data[0].reshape(1, -1))[0]

26.647913

In [10]:
# utils
from sklearn.datasets import load_boston

def load_data():
    """Fetch the Boston house-prices regression dataset.

    Returns:
        A ``(data, target, features, description)`` tuple, read from the
        ``'data'``, ``'target'``, ``'feature_names'`` and ``'DESCR'`` entries
        of the object returned by ``load_boston()``.

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    needs an older scikit-learn -- confirm the pinned version.
    """
    boston = load_boston()
    wanted_keys = ('data', 'target', 'feature_names', 'DESCR')
    return tuple(boston[key] for key in wanted_keys)


In [11]:
# Load the dataset into notebook-level variables; other cells read `data`
# and `features` directly.
data, target, features, description = load_data()

In [120]:
# Mapping of feature name -> value for one sample; filled by a later cell.
data_dict = {}

In [121]:
# Feature names, hard-coded in the same order as the `features` array
# displayed elsewhere in this notebook.
columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
'TAX', 'PTRATIO', 'B', 'LSTAT']

In [123]:
# Pair each column name with the matching value of the first sample.
# zip stops at the shorter of the two sequences.
for feature, value in zip(columns, data[0]):
    data_dict[feature] = value

In [125]:
import json

In [126]:
# Serialize the sample dict to a JSON string.
# NOTE(review): presumably an example request payload for the prediction
# server -- confirm.
json.dumps(data_dict)

'{"CRIM": 0.00632, "ZN": 18.0, "INDUS": 2.31, "CHAS": 0.0, "NOX": 0.538, "RM": 6.575, "AGE": 65.2, "DIS": 4.09, "RAD": 1.0, "TAX": 296.0, "PTRATIO": 15.3, "B": 396.9, "LSTAT": 4.98}'

In [15]:
# Display the feature names loaded by load_data().
features

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')