## Chicago taxi fare training experiment

This experiment uses scikit-learn's Random Forest implementation to train an ML model on the Chicago taxi dataset to estimate taxi trip fares. The selected approach and feature engineering are based on the data exploration and analysis at https://github.com/v-loves-avocados/chicago-taxi by [Aradhana Chaturvedi](https://www.linkedin.com/in/aradhana-chaturvedi-b91b8818).

In [19]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from joblib import dump, load
from scipy import stats

from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# plotting libraries:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# Google clients
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

# Default appearance
sns.set(color_codes=True) # override matplotlib's ugly default colours
mpl.rcParams['figure.figsize'] = [13, 8] #default figure size

In [20]:
# BigQuery source identifiers.
# NOTE(review): BQ_QUERY below spells out the fully-qualified table name itself
# and does not reference these two constants.
BQ_DATASET = "chicago_taxi_trips"
BQ_TABLE = "taxi_trips"
# Sampling query template. The single `{}` placeholder is filled through
# str.format() with a predicate on the hash bucket
# MOD(ABS(FARM_FINGERPRINT(unique_key)), 100), e.g. "between 0 and 99".
# The inner CTE keeps trips with non-null coordinates and positive fare and
# trip_miles, converts both timestamps to America/Chicago local time, flags trips
# whose pickup or dropoff community area is 56, 64 or 76 as is_airport, and
# limits the result to 20000 rows ordered by RAND(). The outer SELECT adds the
# year / month / day / hour / weekday of the local start timestamp.
# NOTE(review): ORDER BY RAND() presumably yields a different sample on every
# run - confirm whether a reproducible sample is required.
BQ_QUERY = """
with tmp_table as (
SELECT trip_seconds, trip_miles, fare, tolls, 
    company, pickup_latitude, pickup_longitude, dropoff_latitude, dropoff_longitude,
    DATETIME(trip_start_timestamp, 'America/Chicago') trip_start_timestamp,
    DATETIME(trip_end_timestamp, 'America/Chicago') trip_end_timestamp,
    CASE WHEN (pickup_community_area IN (56, 64, 76)) OR (dropoff_community_area IN (56, 64, 76)) THEN 1 else 0 END is_airport,
FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
WHERE
  dropoff_latitude IS NOT NULL and
  dropoff_longitude IS NOT NULL and
  pickup_latitude IS NOT NULL and
  pickup_longitude IS NOT NULL and
  fare > 0 and 
  trip_miles > 0 and
  MOD(ABS(FARM_FINGERPRINT(unique_key)), 100) {}
ORDER BY RAND()
LIMIT 20000)
SELECT *,
    EXTRACT(YEAR FROM trip_start_timestamp) trip_start_year,
    EXTRACT(MONTH FROM trip_start_timestamp) trip_start_month,
    EXTRACT(DAY FROM trip_start_timestamp) trip_start_day,
    EXTRACT(HOUR FROM trip_start_timestamp) trip_start_hour,
    FORMAT_DATE('%a', DATE(trip_start_timestamp)) trip_start_day_of_week
FROM tmp_table
"""

# Create BigQuery client
# google.auth.default() resolves the ambient credentials together with their
# project id; both the query client and the BigQuery Storage read client (handed
# to to_dataframe() when the query result is downloaded) reuse those credentials.
credentials, your_project_id = google.auth.default(
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)
bqclient = bigquery.Client(credentials=credentials, project=your_project_id,)
bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)

  and should_run_async(code)


### Query dataset

In [21]:
# Fill the hash-bucket placeholder so that all 100 buckets are eligible, run the
# query and download the result through the BigQuery Storage read client.
sampling_sql = BQ_QUERY.format("between 0 and 99")
df = bqclient.query(sampling_sql).result().to_dataframe(bqstorage_client=bqstorageclient)

### Column info

Watch the number of null values in the 'Non-Null Count' column.

In [22]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# bare: wrapping it in display() only appends a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   trip_seconds            19999 non-null  float64       
 1   trip_miles              20000 non-null  float64       
 2   fare                    20000 non-null  float64       
 3   tolls                   15898 non-null  float64       
 4   company                 16287 non-null  object        
 5   pickup_latitude         20000 non-null  float64       
 6   pickup_longitude        20000 non-null  float64       
 7   dropoff_latitude        20000 non-null  float64       
 8   dropoff_longitude       20000 non-null  float64       
 9   trip_start_timestamp    20000 non-null  datetime64[ns]
 10  trip_end_timestamp      20000 non-null  datetime64[ns]
 11  is_airport              20000 non-null  int64         
 12  trip_start_year         20000 non-null  int64 

None

### Raw descriptive statistics

In [23]:
# Summary statistics of the numeric columns of the freshly queried sample,
# i.e. before feature_engineering() has filtered anything.
display(df.describe())

Unnamed: 0,trip_seconds,trip_miles,fare,tolls,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,is_airport,trip_start_year,trip_start_month,trip_start_day,trip_start_hour
count,19999.0,20000.0,20000.0,15898.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,845.438872,3.84146,12.903325,0.0,41.900434,-87.656891,41.900987,-87.654205,0.1284,2015.9193,6.5056,15.65755,11.08705
std,1180.802602,11.237028,11.230263,0.0,0.037356,0.069259,0.038678,0.05895,0.334543,1.887582,3.398874,8.66668,5.608212
min,0.0,0.01,0.01,0.0,41.67382,-87.913625,41.663671,-87.913625,0.0,2012.0,1.0,1.0,0.0
25%,360.0,0.9,6.25,0.0,41.880994,-87.654007,41.880994,-87.655998,0.0,2014.0,4.0,8.0,7.0
50%,600.0,1.59,8.44,0.0,41.892073,-87.632746,41.892508,-87.633308,0.0,2016.0,6.0,16.0,11.0
75%,971.0,3.7,13.65,0.0,41.907492,-87.626211,41.919225,-87.626211,0.0,2017.0,9.0,23.0,15.0
max,66091.0,1320.0,117.5,0.0,42.009623,-87.551428,42.009623,-87.534903,1.0,2020.0,12.0,31.0,23.0


### Feature engineering

In [24]:
def feature_engineering(data):
    """Clean the raw trip frame and derive the straight-line distance feature.

    Steps, in order:
      1. Fill missing ``company`` with ``'N/A'`` and missing ``tolls`` with 0.
      2. Drop every row that still contains a null value.
      3. Add ``abs_distance``: Euclidean distance between the pickup and dropoff
         coordinates (in degrees), multiplied by 100.
      4. Keep only rows whose ``trip_seconds``, ``trip_miles``, ``fare`` and
         ``abs_distance`` z-scores are all below 3 in absolute value.
      5. Round the four coordinate columns to 3 decimal places.

    The input frame is never modified; every step works on a new frame.

    Args:
        data: pandas DataFrame holding at least the columns ``company``,
            ``tolls``, ``trip_seconds``, ``trip_miles``, ``fare`` and the four
            ``pickup_*`` / ``dropoff_*`` latitude/longitude columns.

    Returns:
        A new DataFrame with the cleaned rows and the extra ``abs_distance``
        column. The original row labels are kept (the index is not reset).
    """
    # fillna/dropna without inplace=True return new frames, so the caller's
    # DataFrame stays exactly as it was passed in.
    cleaned = data.fillna(value={'company': 'N/A', 'tolls': 0})
    # Drop rows contains null data.
    cleaned = cleaned.dropna(how='any', axis='rows')
    # Pickup and dropoff locations distance
    cleaned = cleaned.assign(
        abs_distance=np.hypot(
            cleaned["dropoff_latitude"] - cleaned["pickup_latitude"],
            cleaned["dropoff_longitude"] - cleaned["pickup_longitude"],
        ) * 100
    )

    # Remove extremes, outliers
    possible_outliers_cols = ['trip_seconds', 'trip_miles', 'fare', 'abs_distance']
    within_limits = (np.abs(stats.zscore(cleaned[possible_outliers_cols])) < 3).all(axis=1)
    cleaned = cleaned[within_limits].copy()
    # Reduce location accuracy
    cleaned = cleaned.round({'pickup_latitude': 3, 'pickup_longitude': 3, 'dropoff_latitude': 3, 'dropoff_longitude': 3})
    return cleaned

  and should_run_async(code)


In [25]:
# Apply the cleaning / outlier filter and show the statistics of what is left.
# NOTE(review): df is replaced by its own filtered version, so running this cell
# a second time applies the z-score filter to already-filtered data; re-run the
# query cell first to start again from the raw sample.
df=feature_engineering(df)
display(df.describe())

Unnamed: 0,trip_seconds,trip_miles,fare,tolls,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude,is_airport,trip_start_year,trip_start_month,trip_start_day,trip_start_hour,abs_distance
count,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0,19226.0
mean,752.596588,3.153647,11.616097,0.0,41.8992,-87.652258,41.900203,-87.650938,0.096952,2015.906897,6.504005,15.657547,11.14626,4.677448
std,603.71692,4.184648,9.083996,0.0,0.035887,0.060654,0.037686,0.051092,0.2959,1.884732,3.410381,8.66908,5.606075,6.508893
min,0.0,0.01,0.01,0.0,41.674,-87.914,41.664,-87.914,0.0,2012.0,1.0,1.0,0.0,0.0
25%,360.0,0.8,6.25,0.0,41.881,-87.651,41.881,-87.656,0.0,2014.0,4.0,8.0,7.0,1.241323
50%,598.0,1.5,8.25,0.0,41.892,-87.633,41.893,-87.633,0.0,2016.0,6.0,16.0,11.0,2.264018
75%,900.0,3.3,12.65,0.0,41.906,-87.626,41.915,-87.626,0.0,2017.0,9.0,23.0,15.0,4.820092
max,4380.0,35.0,46.5,0.0,42.01,-87.551,42.01,-87.535,1.0,2020.0,12.0,31.0,23.0,29.004328


#### Remaining null values per column after feature engineering

In [26]:
# Null count per column, largest first. Every count is expected to be 0 because
# feature_engineering() drops all rows that contain a null value.
print(df.isnull().sum().sort_values(ascending=False))

abs_distance              0
trip_start_day_of_week    0
trip_miles                0
fare                      0
tolls                     0
company                   0
pickup_latitude           0
pickup_longitude          0
dropoff_latitude          0
dropoff_longitude         0
trip_start_timestamp      0
trip_end_timestamp        0
is_airport                0
trip_start_year           0
trip_start_month          0
trip_start_day            0
trip_start_hour           0
trip_seconds              0
dtype: int64


  and should_run_async(code)


### Data profiling

(executing the next cell takes a long time)

In [None]:
# Build the pandas-profiling report for the engineered frame and embed it in the
# notebook as an iframe.
ProfileReport(df, title='Chicago taxi dataset profiling Report').to_notebook_iframe()

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=33.0), HTML(value='')))

### Visualize dropoff locations

In [None]:
# Dropoff locations coloured by fare. Title, axis labels and a colorbar label are
# set so the figure can be read on its own, and plt.show() ends the cell so the
# Colorbar object's repr is not echoed into the output.
fig, ax = plt.subplots()
sc = ax.scatter(df.dropoff_longitude, df.dropoff_latitude, c = df["fare"], cmap = "summer")
ax.set(title="Dropoff locations coloured by fare", xlabel="dropoff_longitude", ylabel="dropoff_latitude")
fig.colorbar(sc, ax=ax, label="fare")
plt.show()

#### Location histograms

In [None]:
# One two-panel figure per trip end: longitude on top, latitude below.
location_panels = (
    ('Pickup location histograms', 'pickup_longitude', 'pickup_latitude'),
    ('Dropoff location histograms', 'dropoff_longitude', 'dropoff_latitude'),
)
for panel_title, lon_col, lat_col in location_panels:
    fig, axs = plt.subplots(2)
    fig.suptitle(panel_title)
    df.hist(lon_col, bins=100, ax=axs[0])
    df.hist(lat_col, bins=100, ax=axs[1])
    plt.show()

### Time based explorations

#### Trip start distribution

In [None]:
# Four stacked histograms of the trip start time parts; the bin count of each
# panel is listed next to its column.
fig, axs = plt.subplots(4)
fig.suptitle('Trip start histograms')
fig.set_size_inches(18, 12, forward=True)
start_part_bins = (
    ('trip_start_year', 8),
    ('trip_start_month', 12),
    ('trip_start_day', 31),
    ('trip_start_hour', 24),
)
for panel, (part_col, n_bins) in zip(axs, start_part_bins):
    df.hist(part_col, bins=n_bins, ax=panel)
plt.show()

#### Trip length

In [None]:
# Distance (miles) on the top panel, duration (seconds) on the bottom panel.
fig, axs = plt.subplots(2)
fig.set_size_inches(18, 8, forward=True)
for panel, length_col in zip(axs, ('trip_miles', 'trip_seconds')):
    df.hist(length_col, bins=50, ax=panel)
plt.show()

#### Fare by trip start hour

In [None]:
# Mean fare per trip start hour. The Axes returned by .plot() is labelled and the
# cell ends with plt.show(), instead of passing the Axes to display(), which only
# echoes the object's repr next to the figure.
ax = df.groupby("trip_start_hour")["fare"].mean().plot()
ax.set(title="Mean fare by trip start hour", xlabel="trip_start_hour", ylabel="mean fare")
plt.show()

### Split dataframe to examples and output

In [None]:
# Record the size of the engineered frame, then separate model inputs from the target.
mlflow.log_param('training_shape', str(df.shape))

# Only the start timestamp is removed here; 'fare' is still a column of X at this point.
X = df.drop(columns=['trip_start_timestamp'])
y = df['fare']

### Training pipeline

In [None]:
# Preprocessing: one-hot encode the start hour and the weekday with fixed
# category lists, and standardise the numeric columns.
hour_encoder = OneHotEncoder(categories=[range(0, 24)], sparse=False)
weekday_encoder = OneHotEncoder(
    categories=[['Mon', 'Tue', 'Sun', 'Wed', 'Sat', 'Fri', 'Thu']],
    sparse=False,
)
scaled_columns = [
    'trip_start_year',
    'abs_distance',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'trip_miles',
    'trip_seconds',
]
ct_pipe = ColumnTransformer(transformers=[
    ('hourly_cat', hour_encoder, ["trip_start_hour"]),
    ('dow', weekday_encoder, ["trip_start_day_of_week"]),
    ('std_scaler', StandardScaler(), scaled_columns),
])

# 75/25 split with a fixed seed; the target column is then removed from the
# training inputs only (X_test keeps its 'fare' column).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)
X_train = X_train.drop(columns='fare')

In [None]:
# for more details: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# Preprocessing and random forest chained into a single estimator.
# NOTE(review): max_features='auto' is presumably rejected by recent
# scikit-learn releases - confirm against the installed version before upgrading.
rfr_pipe = Pipeline([
    ('ct', ct_pipe),
    ('forest_reg', RandomForestRegressor(
        n_estimators = 20,
        max_features = 'auto',
        n_jobs = -1,
        random_state = 3,
        max_depth=None,
        max_leaf_nodes=None,
    ))
])

# 5-fold cross-validation on the training split. The scorer yields negated MSE
# values, so the sign is flipped before taking the square root to get RMSE.
rfr_score = cross_val_score(rfr_pipe, X_train, y_train, scoring = "neg_mean_squared_error", cv = 5)
rfr_rmse = np.sqrt(-rfr_score)
mlflow.log_metric("train_cross_valid_score_rmse_mean", rfr_rmse.mean())
# Log the tree count the forest is actually configured with, so the logged
# parameter cannot drift from the estimator when n_estimators is edited above.
mlflow.log_param("number_of_estimators", rfr_pipe.named_steps['forest_reg'].n_estimators)

#### Option 1: Simple training
(~fast)

In [None]:
# To see all RandomForestRegressor hyper parameters:
# estimator=RandomForestRegressor()
# display(estimator.get_params())

# Train model
# Select the MLflow experiment, fit the whole pipeline (preprocessing + forest)
# on the training split inside a nested run, and log the fitted pipeline to that
# run under the artifact path "chicago_rnd_forest".
mlflow.set_experiment("chicago-taxi-0")
# mlflow.sklearn.autolog()
with mlflow.start_run(nested=True) as mlflow_run:
    # final_model is what the prediction and evaluation cells use afterwards.
    final_model=rfr_pipe.fit(X_train, y_train)
    mlflow.sklearn.log_model(final_model, "chicago_rnd_forest")

#### Option 2: Parameter search + training
(time consuming)

In [None]:
# Grid over the forest size and the number of features considered per split.
# Keys use the '<step>__<param>' form to address the 'forest_reg' step of rfr_pipe.
param_grid = {'forest_reg__n_estimators': [5, 250], 'forest_reg__max_features': [6, 16, 'auto']}
# Exhaustive search with 5-fold cross-validation, scored by negated MSE.
forest_gs = GridSearchCV(rfr_pipe, param_grid, cv = 5, scoring = 'neg_mean_squared_error', n_jobs = -1)
forest_gs.fit(X_train, y_train)
print(f'Best parameters: {forest_gs.best_params_}')
# Scores are negated MSE values, hence the sign flip before the square root (RMSE).
print(f'Best score: {np.sqrt(-forest_gs.best_score_)}')

print(f'(All scores: {np.sqrt(-forest_gs.cv_results_["mean_test_score"])})')

# Replaces any final_model produced by the simple-training option.
final_model=forest_gs.best_estimator_

### Prediction test

In [None]:
# Predict on the test inputs (without the target column) and put the predictions
# next to the original columns. assign() returns a new frame, so X_test itself is
# never altered; pd.DataFrame(X_test, ...) does not copy DataFrame input by
# default and could share data with X_test.
X_pred = X_test.assign(fare_pred=final_model.predict(X_test.drop('fare', axis=1)))
X_pred.head(5)

### Cross-validation score on the test set

In [None]:
# 5-fold cross-validation over the test split, reported as mean RMSE (the scorer
# returns negated MSE, so the sign is flipped before the square root).
# NOTE(review): cross_val_score presumably refits the estimator on folds of
# X_test rather than scoring the already fitted final_model - confirm this is
# the intended evaluation.
rfr_score = cross_val_score(final_model, X_test, y_test, scoring='neg_mean_squared_error', cv = 5)
rfr_rmse = np.sqrt(-rfr_score)
mlflow.log_metric("eval_cross_valid_score_rmse_mean", rfr_rmse.mean())

In [65]:
# Comparer test
def model_comparer(job_name, **kwargs):
    """Show the MLflow runs of one training job and single out one by train RMSE.

    Looks up the "chicago-taxi-1" experiment, lists every run whose
    ``job_name`` tag matches ``'<job_name>_%'`` (ILIKE pattern) and displays
    the run with the largest ``train_cross_valid_score_rmse_mean`` metric.

    Args:
        job_name: Common prefix of the ``job_name`` tag of the runs to compare.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        None. Results are printed / displayed only. When the experiment does
        not exist, or no matching run carries the metric, a message is printed
        and nothing else is displayed.
    """
    metric_col = 'metrics.train_cross_valid_score_rmse_mean'

    experiment = mlflow.get_experiment_by_name("chicago-taxi-1")
    print(experiment)
    if experiment is None:
        # Without this guard the attribute access below fails on None.
        print('Experiment "chicago-taxi-1" was not found; nothing to compare.')
        return

    filter_string = f"tags.job_name ILIKE '{job_name}_%'"
    runs = mlflow.search_runs([experiment.experiment_id], filter_string=filter_string)
    display(runs)
    # The metric column is looked up by name and then reduced with idxmax, so
    # make sure it exists and holds at least one value first.
    if metric_col not in runs.columns or runs[metric_col].dropna().empty:
        print(f'No run matching {filter_string!r} has the metric {metric_col!r}.')
        return

    # NOTE(review): idxmax selects the run with the HIGHEST RMSE, i.e. the worst
    # fit; if the goal is to pick the best model, idxmin is probably intended - confirm.
    maxr = runs.loc[runs[metric_col].idxmax()]
    display(maxr)
    
# Compare the runs whose job_name tag starts with this training job's name.
model_comparer("training_job_20210118_205422")

<Experiment: artifact_location='gs://mlops-50-artifacts/experiments/6', experiment_id='6', lifecycle_stage='active', name='chicago-taxi-1', tags={}>


  and should_run_async(code)


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.train_cross_valid_score_rmse_mean,metrics.eval_cross_valid_score_rmse_mean,params.number_of_estimators,tags.job_name,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.source.type,tags.version
0,627a7528afc94fba8882493434cd0ce5,6,FINISHED,gs://mlops-50-artifacts/experiments/6/627a7528...,2021-01-18 20:54:24.071000+00:00,2021-01-18 20:54:24.289000+00:00,0.628741,0.572777,99,training_job_20210118_205422_1,/usr/local/bin/airflow,airflow,LOCAL,fake


run_id                                                        627a7528afc94fba8882493434cd0ce5
experiment_id                                                                                6
status                                                                                FINISHED
artifact_uri                                 gs://mlops-50-artifacts/experiments/6/627a7528...
start_time                                                    2021-01-18 20:54:24.071000+00:00
end_time                                                      2021-01-18 20:54:24.289000+00:00
metrics.train_cross_valid_score_rmse_mean                                             0.628741
metrics.eval_cross_valid_score_rmse_mean                                              0.572777
params.number_of_estimators                                                                 99
tags.job_name                                                   training_job_20210118_205422_1
tags.mlflow.source.name                           