# Random Forest

## Dask + RAPIDS 

 <img src="https://images.exxactcorp.com/CMS/landing-page/resource-center/supported-software/deep-learning/rapids/Rapids-Logo-lg.png" width="400" />
 
**Hardware**: 20 worker nodes, each a g4dn.xlarge (4 CPUs, 16 GB RAM; 1 GPU with 16 GB GPU RAM)

# Load data

In [1]:
from dask.distributed import Client, wait
from dask import persist
from dask_saturn import SaturnCluster

# Saturn Cloud Dask cluster with 20 workers, driven through a distributed Client.
cluster = SaturnCluster(
    n_workers=20,
    scheduler_size='xlarge',
    worker_size='g4dnxlarge',
)
client = Client(cluster)
cluster  # last expression of the cell: shows the cluster's notebook repr

[2020-07-26 23:51:45] INFO - dask-saturn | Cluster is ready


VBox(children=(HTML(value='<h2>SaturnCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n   …

In [2]:
import dask_cudf
import dask.dataframe as dd
import s3fs

In [3]:
# Anonymous (unauthenticated) S3 access is used for the bucket listing.
fs = s3fs.S3FileSystem(anon=True)

# Keep only the yellow-cab files whose path mentions 2017, 2018 or 2019.
years = ('2017', '2018', '2019')
files = [
    f"s3://{path}"
    for path in fs.ls('s3://nyc-tlc/trip data/')
    if 'yellow' in path and any(year in path for year in years)
]

# Columns to load from each CSV file.
cols = [
    'VendorID',
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'trip_distance',
    'RatecodeID',
    'store_and_fwd_flag',
    'PULocationID',
    'DOLocationID',
    'payment_type',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
]

# Read the selected columns from every file; the columns at positions 1 and 2
# are parsed as dates.
taxi = dask_cudf.read_csv(
    files,
    assume_missing=True,
    parse_dates=[1, 2],
    usecols=cols,
    storage_options={'anon': True},
)

In [4]:
%%time
# Count the rows across all loaded files; the count is the cell's output.
len(taxi)

CPU times: user 117 ms, sys: 4.59 ms, total: 121 ms
Wall time: 20.8 s


300700143

# Feature engineering

In [5]:
# Derive calendar features from the pickup timestamp.
pickup = taxi['tpep_pickup_datetime']
taxi['pickup_weekday'] = pickup.dt.weekday
taxi['pickup_hour'] = pickup.dt.hour
taxi['pickup_minute'] = pickup.dt.minute

# Hour index within the week: 24 hours per weekday plus the hour of the day.
taxi['pickup_week_hour'] = taxi['pickup_weekday'] * 24 + taxi['pickup_hour']

# Passenger count is used as a float feature.
taxi['passenger_count'] = taxi['passenger_count'].astype(float)

In [6]:
# Column names used for modelling: the model inputs and the regression target.
numeric_feat = [
    'pickup_weekday', 'pickup_hour', 'pickup_minute',
    'pickup_week_hour', 'passenger_count',
]
categorical_feat = [
    'VendorID', 'RatecodeID', 'store_and_fwd_flag',
    'PULocationID', 'DOLocationID',
]

# Numeric columns first, followed by the categorical ones.
features = [*numeric_feat, *categorical_feat]
y_col = 'total_amount'

In [7]:
# Cast the categorical feature columns to the 'category' dtype, then replace
# each of them with its integer category codes.
# NOTE(review): on a Dask collection, astype('category') may build the
# categories separately per partition, in which case the codes would not be
# comparable across partitions — confirm for dask_cudf (Dask documents
# categorize() / cat.as_known() for obtaining known categories).
taxi[categorical_feat] = taxi[categorical_feat].astype('category')
for cat in categorical_feat:
    taxi[cat] = taxi[cat].cat.codes

# Feature matrix: the selected columns as float32, missing values replaced by -1.
X = taxi[features].astype('float32').fillna(-1)
# Target column, taken as-is (no dtype cast or missing-value fill is applied here).
y = taxi[y_col]

In [8]:
%%time
# Persist X and y and rebind the names to the persisted collections.
X, y = persist(X, y)
# Wait on both collections, so the timing covers the full computation.
_ = wait([X, y])

CPU times: user 1.44 s, sys: 63.3 ms, total: 1.5 s
Wall time: 24.1 s


In [9]:
%%time
# Row count of the feature matrix; the count is the cell's output.
len(X)

CPU times: user 26.8 ms, sys: 7.21 ms, total: 34 ms
Wall time: 115 ms


300700143

In [10]:
# To check GPU utilization on each worker.
# The report is printed by the worker process (in Saturn Cloud, go to the Logs
# page and examine the worker logs) and is also returned to the caller.
def smi():
    """Run ``nvidia-smi`` on the current machine and report its output.

    The combined stdout/stderr text of the command is printed, so it still
    appears in the calling process's own output stream, and is also returned
    so the caller can inspect it directly.

    Returns
    -------
    str
        The output of ``nvidia-smi``, or a short diagnostic message if the
        executable could not be launched.
    """
    # Imported locally so the function is self-contained wherever it executes.
    import subprocess

    try:
        proc = subprocess.run(
            ['nvidia-smi'],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True,
        )
        report = proc.stdout
    except OSError as exc:
        # e.g. nvidia-smi is not installed or not on PATH on this machine
        report = f"nvidia-smi could not be run: {exc}"

    print(report)
    return report
    
# Run smi through the Dask client; the value returned by client.run is not used.
_ = client.run(smi)

# Train random forest!

In [11]:
from cuml.dask.ensemble import RandomForestRegressor

# Random-forest regressor: 200 estimators, max depth 15, seed fixed at 42.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    seed=42,
)

In [12]:
%%time

# Fit the forest on the feature matrix X and target y; fit's return value is discarded.
_ = rf.fit(X, y)

CPU times: user 122 ms, sys: 1.06 ms, total: 123 ms
Wall time: 1.75 s
