# Data

## Taxifare

In [0]:
import pandas as pd

def get_data():
    url = "s3://wagon-public-datasets/taxi-fare-train.csv"
    df = pd.read_csv(url, nrows=100)
    return df

In [0]:
df = get_data()
df.head(3)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2


## Clean data

In [0]:
def clean_df(df):
    df = df.dropna(how='any', axis='rows')
    df = df[(df.dropoff_latitude != 0) | (df.dropoff_longitude != 0)]
    df = df[(df.pickup_latitude != 0) | (df.pickup_longitude != 0)]
    if "fare_amount" in list(df):
        df = df[df.fare_amount.between(0, 4000)]
    df = df[df.passenger_count < 8]
    df = df[df.passenger_count >= 1]
    df = df[df["pickup_latitude"].between(left=40, right=42)]
    df = df[df["pickup_longitude"].between(left=-74.3, right=-72.9)]
    df = df[df["dropoff_latitude"].between(left=40, right=42)]
    df = df[df["dropoff_longitude"].between(left=-74, right=-72.9)]
    return df

In [0]:
df.shape

(100, 8)

In [0]:
df = clean_df(df)

In [0]:
df.shape

(84, 8)

# Holdout

In [0]:
from sklearn.model_selection import train_test_split

y_train = df["fare_amount"]
X_train = df.drop("fare_amount", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1)

# Model

In [0]:
from sklearn.ensemble import RandomForestRegressor

model_params = dict(
  n_estimators=100,
  max_depth=1)

model = RandomForestRegressor()
model.set_params(**model_params)

RandomForestRegressor(max_depth=1)

# Transformers

In [0]:
def minkowski_distance(df, p,
                       start_lat="pickup_latitude",
                       start_lon="pickup_longitude",
                       end_lat="dropoff_latitude",
                       end_lon="dropoff_longitude"):
    x1 = df[start_lon]
    x2 = df[end_lon]
    y1 = df[start_lat]
    y2 = df[end_lat]
    return ((abs(x2 - x1) ** p) + (abs(y2 - y1)) ** p) ** (1 / p)

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class DistanceTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, distance_type="euclidian", **kwargs):
        self.distance_type = distance_type

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        if self.distance_type == "euclidian":
            X["distance"] = minkowski_distance(X, p=2)
        if self.distance_type == "manhattan":
            X["distance"] = minkowski_distance(X, p=1)
        return X[["distance"]]

    def fit(self, X, y=None):
        return self

# Pipeline

In [0]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

pipe_distance = make_pipeline(
    DistanceTransformer(),
    StandardScaler())


cols = ["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"]

feateng_blocks = [
    ('distance', pipe_distance, cols),
]

features_encoder = ColumnTransformer(feateng_blocks)

pipeline = Pipeline(steps=[
            ('features', features_encoder),
            ('model', model)])

In [0]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 ColumnTransformer(transformers=[('distance',
                                                  Pipeline(steps=[('distancetransformer',
                                                                   DistanceTransformer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['pickup_latitude',
                                                   'pickup_longitude',
                                                   'dropoff_latitude',
                                                   'dropoff_longitude'])])),
                ('model', RandomForestRegressor(max_depth=1))])

# Metrics

In [0]:
import numpy as np

def compute_rmse(y_pred, y_true):
    return np.sqrt(((y_pred - y_true) ** 2).mean())

In [0]:
y_pred = pipeline.predict(X_test)

In [0]:
rmse = compute_rmse(y_pred, y_test)

In [0]:
rmse

2.589335478037243

# MLflow

In [0]:
from memoized_property import memoized_property

import mlflow
from  mlflow.tracking import MlflowClient

class MLFlowBase():

    def __init__(self, experiment_name, MLFLOW_URI):
        self.experiment_name = experiment_name
        self.MLFLOW_URI = MLFLOW_URI

    @memoized_property
    def mlflow_client(self):
        mlflow.set_tracking_uri(self.MLFLOW_URI)
        return MlflowClient()

    @memoized_property
    def mlflow_experiment_id(self):
        try:
            return self.mlflow_client \
                .create_experiment(self.experiment_name)
        except BaseException:
            return self.mlflow_client \
                .get_experiment_by_name(self.experiment_name).experiment_id

    def mlflow_create_run(self):
        self.mlflow_run = self.mlflow_client \
            .create_run(self.mlflow_experiment_id)

    def mlflow_log_param(self, key, value):
        self.mlflow_client \
            .log_param(self.mlflow_run.info.run_id, key, value)

    def mlflow_log_metric(self, key, value):
        self.mlflow_client \
            .log_metric(self.mlflow_run.info.run_id, key, value)


# Trainer parameters

In [0]:
pipeline.get_params()

  and should_run_async(code)


{'memory': None,
 'steps': [('features',
   ColumnTransformer(transformers=[('distance',
                                    Pipeline(steps=[('distancetransformer',
                                                     DistanceTransformer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    ['pickup_latitude', 'pickup_longitude',
                                     'dropoff_latitude', 'dropoff_longitude'])])),
  ('model', RandomForestRegressor(max_depth=1))],
 'verbose': False,
 'features': ColumnTransformer(transformers=[('distance',
                                  Pipeline(steps=[('distancetransformer',
                                                   DistanceTransformer()),
                                                  ('standardscaler',
                                                   StandardScaler())]),
                                  ['p

# Gridsearch

In [0]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(
    pipeline, 
    param_grid={
        'features__distance__standardscaler__copy': [True],
        'model__min_samples_leaf': [3],
        'model__oob_score': [True],
        'model__min_weight_fraction_leaf': [0.0, 0.1]
    },
    cv=5
)

grid_search.fit(X_train, y_train)
grid_search.score(X_test, y_test)

grid_search.best_estimator_
grid_search.best_params_

  and should_run_async(code)


{'features__distance__standardscaler__copy': True,
 'model__min_samples_leaf': 3,
 'model__min_weight_fraction_leaf': 0.0,
 'model__oob_score': True}