In [3]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import datetime
import s3fs
from dask.distributed import Client

In [5]:
# Create a Dask distributed client. Called with no arguments, Client() starts
# a local scheduler and workers on this machine and connects to them.
client = Client()
# Bare last expression so the notebook renders the client/cluster summary
# (scheduler address, dashboard URL, worker/core/memory counts).
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46405 instead


0,1
Client  Scheduler: tcp://127.0.0.1:36871  Dashboard: http://127.0.0.1:46405/status,Cluster  Workers: 4  Cores: 12  Memory: 8.18 GB


In [6]:

# Explicit dtypes handed to dd.read_csv so these columns are not inferred.
# The flag column is read as text; the listed code/count columns are read as
# float64.
taxi_dtypes = {
    'store_and_fwd_flag': str,
    **dict.fromkeys(
        ('RatecodeID', 'VendorID', 'passenger_count', 'payment_type'),
        'float64',
    ),
}

In [7]:
# Options forwarded to s3fs when opening the CSV files.
# - 'anon': connect without credentials.
# - 'config_kwargs': passed through to the botocore client Config. Reading
#   these files has been seen to fail with a read timeout on the S3 endpoint,
#   so allow more time per read and more retry attempts than the defaults.
#   NOTE(review): the exact values are a judgement call - tune for your network.
s3_options = {
    'anon': True,
    'config_kwargs': {
        'connect_timeout': 60,
        'read_timeout': 300,
        'retries': {'max_attempts': 10},
    },
}

# Lazily define the Dask DataFrame over every 2019 yellow-taxi CSV matched by
# the glob; no data is read until something is computed.
taxi = dd.read_csv(
    's3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv',
    dtype=taxi_dtypes,
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    storage_options=s3_options,
)

In [8]:
# Total in-memory footprint of the DataFrame (deep=True also counts the
# contents of object columns), displayed in units of 1e9 bytes.
taxi_total_bytes = taxi.memory_usage(deep=True).sum().compute()
taxi_total_bytes / 1e9

AioReadTimeoutError: Read timeout on endpoint URL: "https://nyc-tlc.s3.amazonaws.com/trip%20data/yellow_tripdata_2019-02.csv"

In [None]:
# Summary statistics of the raw columns, rounded to 3 decimals and transposed
# so that each column of `taxi` becomes one row of the displayed table.
raw_summary = taxi.describe().compute()
np.round(raw_summary, 3).transpose()

## **Feature Engineering**

In [None]:

def make_features(df):
    """Add pickup-time features to ``df`` and clean three categorical columns.

    The frame is modified in place (columns are assigned on ``df``) and
    nothing is returned.

    Columns added, all derived from ``tpep_pickup_datetime``:
      - ``pickup_weekday``: day of week from ``.dt.weekday``.
      - ``pickup_weekofyear``: ISO calendar week as an integer.
      - ``pickup_hour`` / ``pickup_minute``: clock components.
      - ``pickup_year_seconds``: total seconds elapsed since
        2019-01-01 00:00:00 (float).
      - ``pickup_week_hour``: ``pickup_weekday * 24 + pickup_hour``.

    Columns overwritten:
      - ``store_and_fwd_flag``: 1 where the value equals ``'Y'``, else 0.
      - ``VendorID`` / ``RatecodeID``: missing values replaced with -1.

    Note: because ``store_and_fwd_flag`` is overwritten with integers, calling
    this twice on the same frame turns that column into all zeros.
    """
    pickup = df.tpep_pickup_datetime
    year_start = datetime.datetime(2019, 1, 1, 0, 0, 0)

    df['pickup_weekday'] = pickup.dt.weekday
    df['pickup_weekofyear'] = pickup.dt.isocalendar().week.astype(int)
    df['pickup_hour'] = pickup.dt.hour
    df['pickup_minute'] = pickup.dt.minute
    # `.dt.seconds` is only the seconds-within-the-day component of a timedelta
    # (0..86399) and silently drops whole days; `.dt.total_seconds()` gives the
    # full elapsed time since the start of the year.
    df['pickup_year_seconds'] = (pickup - year_start).dt.total_seconds()
    df['pickup_week_hour'] = (df.pickup_weekday * 24) + df.pickup_hour

    df['store_and_fwd_flag'] = (df.store_and_fwd_flag == 'Y').astype(int)
    # -1 acts as an explicit "missing" code for these columns.
    df['VendorID'] = df.VendorID.fillna(-1)
    df['RatecodeID'] = df.RatecodeID.fillna(-1)

In [None]:
# make_features assigns the new/cleaned columns directly on `taxi` and has no
# return value, so there is nothing to rebind here.
make_features(taxi)

In [None]:
# Display the first rows of `taxi` as a quick visual check of the columns.
taxi.head()

In [None]:
# If the cluster has enough RAM, persist() keeps the computed partitions in
# worker memory so later steps do not reload the CSVs each time. The call
# returns right away with a new Dask DataFrame; the work continues in the
# background until it is complete.
taxi = taxi.persist()

In [None]:
from dask.distributed import wait
# Block until the background persist() of `taxi` has finished. The return
# value is assigned to `_` so the cell produces no output.
_ = wait(taxi)

In [None]:
# Summary statistics again, rounded to 3 decimals and transposed so that each
# column of `taxi` becomes one row of the displayed table.
feature_summary = taxi.describe().compute()
np.round(feature_summary, 3).transpose()

## **Machine Learning**

In [None]:
# Random seed shared by the train/test split and the model so results are
# reproducible. It must be defined before those cells run; the value itself
# is arbitrary.
seed = 42

# Model inputs treated as numeric values.
numeric_feat = [
    'pickup_weekday',
    'pickup_weekofyear',
    'pickup_hour',
    'pickup_minute',
    'pickup_year_seconds',
    'pickup_week_hour',
    'passenger_count',
]
# Model inputs that are category codes rather than quantities.
categorical_feat = [
    'VendorID',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
]
# Full ordered list of model input columns, and the target column.
features = numeric_feat + categorical_feat
y_col = 'total_amount'

In [None]:
# note the dask_ml imports rather than sklearn
from dask_ml.model_selection import train_test_split
from dask_ml.metrics import mean_squared_error
from dask_ml.xgboost import XGBRegressor

In [None]:
# Shuffled split of the model inputs and the target, holding out 33% of the
# rows for testing. `seed` is expected to be defined earlier in the notebook.
feature_frame = taxi[features]
target = taxi[y_col]
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    target,
    test_size=0.33,
    shuffle=True,
    random_state=seed,
)

In [None]:
# Persist the training inputs and target before fitting.
X_train, y_train = X_train.persist(), y_train.persist()

In [None]:
# Hyperparameters for the regressor, gathered in one place so they are easy to
# tweak. `seed` is expected to be defined earlier in the notebook.
xgb_params = {
    'n_estimators': 10,
    'max_depth': 3,
    'learning_rate': 0.1,
    'random_state': seed,
}
xgb = XGBRegressor(**xgb_params)

In [None]:
# Train the regressor on the training split. fit()'s return value is assigned
# to `_` so the cell produces no output.
_ = xgb.fit(X_train, y_train)

In [None]:
# Test-set RMSE: square root of the mean squared error between the held-out
# targets and the model's predictions.
preds = xgb.predict(X_test)
# The metric's documented argument order is (y_true, y_pred). The unweighted
# squared error is symmetric, so the value is the same either way, but the
# conventional order keeps the call correct if options such as sample
# weights are added later.
np.sqrt(mean_squared_error(y_test.to_dask_array(), preds))