# **How RayDP works together with Ray**

## 1. Colab environment setup

In [None]:
# Install ray and raydp
! pip install ray==1.9
! pip install raydp
! pip install --upgrade pip
! pip install raydp
! pip install ray[tune]
! pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
! pip list

## 2. Import dependencies

In [17]:
import ray
from ray.util.sgd.torch import TrainingOperator
from ray.util.sgd import TorchTrainer
from ray import tune
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader

import raydp
from raydp.torch import TorchEstimator
from raydp.utils import random_split
from raydp.spark import RayMLDataset

import os
import argparse

import numpy as np
import pandas as pd

from os.path import dirname, realpath

import numpy as np
from pyspark.sql.functions import hour, quarter, month, year, dayofweek, dayofmonth, weekofyear, col, lit, udf, abs as functions_abs

## 3. Generate a synthetic data file

In [18]:

# Earliest pickup time; random second offsets are added to it below.
base_date = np.datetime64("2010-01-01 00:00:00")

# The size of data
N = 2000

# Uniformly distributed fares and coordinates for N fake trips.
fare_amount = np.random.uniform(3.0, 50.0, size=N)
pick_long = np.random.uniform(-74.2, -73.8, size=N)
pick_lat = np.random.uniform(40.7, 40.8, size=N)
drop_long = np.random.uniform(-74.2, -73.8, size=N)
drop_lat = np.random.uniform(40.7, 40.8, size=N)
# randint's upper bound is exclusive, so passenger counts are 1..4.
passenger_count = np.random.randint(1, 5, size=N)
# Offsets of up to 157,680,000 seconds (5 * 365 days) after base_date.
date = np.random.randint(0, 157680000, size=N) + base_date
# "%M" is the minute field; the previous "%m" wrote the month number there instead.
date = np.array([t.item().strftime("%Y-%m-%d %H:%M:%S UTC") for t in date])
key = ["fake_key"] * N
df = pd.DataFrame({
    "key": key,
    "fare_amount":fare_amount,
    "pickup_datetime": date,
    "pickup_longitude": pick_long,
    "pickup_latitude": pick_lat,
    "dropoff_longitude": drop_long,
    "dropoff_latitude": drop_lat,
    "passenger_count": passenger_count
    })
# Written without the index so the CSV only contains the columns above.
df.to_csv("/content/fake_nyctaxi.csv", index=False)

## 4. Define the data processing functions

In [19]:
def clean_up(data):
    """Keep only rows whose coordinates, passenger count and fare are plausible.

    Args:
        data: DataFrame exposing ``filter`` and the columns
            ``pickup_longitude``, ``pickup_latitude``, ``dropoff_longitude``,
            ``dropoff_latitude``, ``passenger_count`` and ``fare_amount``.

    Returns:
        The DataFrame with every predicate below applied in sequence.
    """
    pickup_lon = col("pickup_longitude")
    pickup_lat = col("pickup_latitude")
    dropoff_lon = col("dropoff_longitude")
    dropoff_lat = col("dropoff_latitude")
    passengers = col("passenger_count")
    fare = col("fare_amount")

    predicates = (
        # Longitudes must lie in [-76, -72]
        pickup_lon <= -72,
        pickup_lon >= -76,
        dropoff_lon <= -72,
        dropoff_lon >= -76,
        # Latitudes must lie in [38, 42]
        pickup_lat <= 42,
        pickup_lat >= 38,
        dropoff_lat <= 42,
        dropoff_lat >= 38,
        # Between 1 and 6 passengers
        passengers <= 6,
        passengers >= 1,
        # Fare strictly between 0 and 250
        fare > 0,
        fare < 250,
        # Pickup and dropoff must differ on both axes
        dropoff_lon != pickup_lon,
        dropoff_lat != pickup_lat,
    )

    cleaned = data
    for predicate in predicates:
        cleaned = cleaned.filter(predicate)
    return cleaned

# Add time related features
def add_time_features(data):
    """Derive calendar and time-of-day columns from ``pickup_datetime``.

    Adds, in this order: ``day``, ``hour_of_day``, ``day_of_week``,
    ``week_of_year``, ``month_of_year``, ``quarter_of_year``, ``year``,
    ``night`` and ``late_night``.

    Args:
        data: DataFrame with a ``pickup_datetime`` column.

    Returns:
        The DataFrame with the nine columns above appended.
    """
    pickup_ts = col("pickup_datetime")

    # NOTE(review): Spark documents dayofweek as 1 = Sunday ... 7 = Saturday, so
    # after the "- 2" shift Sunday becomes -1 and still satisfies the
    # "weekday < 5" check in night() below - confirm this is intended.
    calendar_columns = (
        ("day", dayofmonth(pickup_ts)),
        ("hour_of_day", hour(pickup_ts)),
        ("day_of_week", dayofweek(pickup_ts) - 2),
        ("week_of_year", weekofyear(pickup_ts)),
        ("month_of_year", month(pickup_ts)),
        ("quarter_of_year", quarter(pickup_ts)),
        ("year", year(pickup_ts)),
    )
    for column_name, expression in calendar_columns:
        data = data.withColumn(column_name, expression)

    @udf("int")
    def night(hour_value, weekday):
        # 1 when the hour is within 16..20 and the shifted weekday is below 5
        return int((16 <= hour_value <= 20) and (weekday < 5))

    @udf("int")
    def late_night(hour_value):
        # 1 for hours up to 6 or from 20 onwards
        return int((hour_value <= 6) or (hour_value >= 20))

    with_night = data.withColumn("night", night("hour_of_day", "day_of_week"))
    return with_night.withColumn("late_night", late_night("hour_of_day"))

def add_distance_features(data):
    """Append coordinate-difference and landmark-distance columns.

    Adds ``abs_diff_longitude``, ``abs_diff_latitude`` and ``manhattan`` (their
    sum), followed by ``pickup_distance_<place>`` / ``dropoff_distance_<place>``
    for ``jfk``, ``ewr``, ``lgr`` and ``downtown``.

    Args:
        data: DataFrame with pickup/dropoff longitude and latitude columns.

    Returns:
        The DataFrame with the eleven columns above appended.
    """
    @udf("float")
    def manhattan(lon1, lat1, lon2, lat2):
        # Sum of absolute coordinate differences between the two points
        return float(np.abs(lon2 - lon1) + np.abs(lat2 - lat1))

    # Reference points as (longitude, latitude), in the order their columns are added:
    # the three NYC airports first, then NYC downtown.
    landmarks = (
        ("jfk", (-73.7822222222, 40.6441666667)),
        ("ewr", (-74.175, 40.69)),
        ("lgr", (-73.87, 40.77)),
        ("downtown", (-74.0063889, 40.7141667)),
    )

    # Absolute pickup/dropoff differences per axis, and their sum
    lon_gap = functions_abs(col("dropoff_longitude") - col("pickup_longitude"))
    lat_gap = functions_abs(col("dropoff_latitude") - col("pickup_latitude"))
    data = data.withColumn("abs_diff_longitude", lon_gap)
    data = data.withColumn("abs_diff_latitude", lat_gap)
    data = data.withColumn(
        "manhattan", col("abs_diff_latitude") + col("abs_diff_longitude"))

    # Distance from the pickup and the dropoff point to every landmark
    for place, (place_lon, place_lat) in landmarks:
        for endpoint in ("pickup", "dropoff"):
            distance = manhattan(
                endpoint + "_longitude",
                endpoint + "_latitude",
                lit(place_lon),
                lit(place_lat),
            )
            data = data.withColumn(endpoint + "_distance_" + place, distance)
    return data

def drop_col(data):
    """Remove the raw input columns that were only needed to derive features.

    Args:
        data: DataFrame exposing ``drop``.

    Returns:
        The DataFrame without the datetime, coordinate, passenger-count and
        key columns.
    """
    raw_columns = (
        "pickup_datetime",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "key",
    )
    trimmed = data
    for column_name in raw_columns:
        trimmed = trimmed.drop(column_name)
    return trimmed

def nyc_taxi_preprocess(data):
    """Run the full preprocessing pipeline on the taxi DataFrame.

    Stages run in order: row filtering, time features, distance features and
    finally removal of the raw input columns.

    Args:
        data: the raw taxi DataFrame.

    Returns:
        The DataFrame produced by the last stage.
    """
    stages = (clean_up, add_time_features, add_distance_features, drop_col)
    for stage in stages:
        data = stage(data)
    return data

## 5. Initialize or connect to a Ray cluster

In [20]:
# Alternative: connect to an already running Ray cluster instead of starting one:
# ray.init(address="auto")
# Here Ray is initialized with num_cpus=6.
ray.init(num_cpus=6)

2022-05-06 04:09:27,083	INFO services.py:1340 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'metrics_export_port': 59568,
 'node_id': 'bac610b2509573b1935eadbffd5d63cbe3e7b255b5ed3834a8dc5a41',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2022-05-06_04-09-24_557903_60/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2022-05-06_04-09-24_557903_60/sockets/raylet',
 'redis_address': '172.28.0.2:6379',
 'session_dir': '/tmp/ray/session_2022-05-06_04-09-24_557903_60',
 'webui_url': '127.0.0.1:8265'}

## 6. Get a Spark session

In [21]:
# Settings passed positionally to raydp.init_spark below.
app_name = "NYC Taxi Fare Prediction with RayDP"
# Also reused later when building the RayMLDatasets and as TorchTrainer's num_workers.
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
# `spark` is the session object used by the data-processing cell.
spark = raydp.init_spark(app_name, num_executors, cores_per_executor, memory_per_executor)

## 7. Data processing

In [22]:
# Set the Spark session time zone before anything is read, so that it is already
# in effect for the schema-inference pass triggered by load() as well as for the
# datetime functions applied during preprocessing.
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Read data from file
data = spark.read.format("csv").option("header", "true") \
        .option("inferSchema", "true") \
        .load('/content/fake_nyctaxi.csv')
# Transform the dataset
data = nyc_taxi_preprocess(data)
# Split data into train_dataset and test_dataset
train_df, test_df = random_split(data, [0.9, 0.1], 0)
# Every remaining column except the label is a model input
features = [field.name for field in list(train_df.schema) if field.name != "fare_amount"]
# Convert spark dataframe into ML Dataset
train_ml_dataset = RayMLDataset.from_spark(train_df, num_executors, 32)
test_ml_dataset = RayMLDataset.from_spark(test_df, num_executors, 32)
# Then convert to torch datasets; these are the names the training operator reads
train_dataset = train_ml_dataset.to_torch(feature_columns=features, label_column="fare_amount")
test_dataset = test_ml_dataset.to_torch(feature_columns=features, label_column="fare_amount")



## 8. Define a neural network model

In [23]:
class NYC_Model(nn.Module):
    """Fully connected regressor with four Linear -> ReLU -> BatchNorm1d stages.

    Layer widths are ``cols -> 256 -> 128 -> 64 -> 16 -> 1``; the final layer
    has no activation.
    """

    def __init__(self, cols):
        """Create the layers.

        Args:
            cols: width of the input after all inputs are concatenated.
        """
        super().__init__()

        # Dense layers
        self.fc1 = nn.Linear(cols, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 16)
        self.fc5 = nn.Linear(16, 1)

        # One batch-norm layer per hidden dense layer
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(16)

    def forward(self, *x):
        """Concatenate the inputs along dim 1 and run them through the network.

        Args:
            *x: one or more tensors that ``torch.cat`` can join along dim 1.

        Returns:
            The output of the final linear layer.
        """
        hidden = torch.cat(x, dim=1)
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
        )
        for dense, norm in hidden_stages:
            # ReLU is applied before batch normalisation
            hidden = norm(F.relu(dense(hidden)))
        return self.fc5(hidden)

## 9. Define the TrainingOperator

In [24]:
class CustomOperator(TrainingOperator):
    """Training operator that registers the NYC taxi model, loss, optimizer and data loaders.

    Relies on the notebook-level names ``features``, ``train_dataset`` and
    ``test_dataset`` being defined before ``setup`` runs.
    """
    def setup(self, config):
        """Build and register the model, optimizer, criterion and data loaders.

        Args:
            config: mapping that must contain an ``"lr"`` entry, which is used
                as the Adam learning rate.
        """
        # The model takes one input per name listed in ``features``.
        nyc_model = NYC_Model(len(features))
        criterion = nn.SmoothL1Loss()
        optimizer = torch.optim.Adam(nyc_model.parameters(), lr=config['lr'])
        # A quick work-around for https://github.com/ray-project/ray/issues/14352
        # The model and optimizer are passed as one-element lists and the first
        # element of each returned value is taken right after.
        self.model, self.optimizer, self.criterion = self.register(
            models=[nyc_model], optimizers=[optimizer], criterion=criterion)
        self.model = self.model[0]
        self.optimizer = self.optimizer[0]
        # Get the corresponding shard, selected by this operator's world_rank
        train_shard = train_dataset.get_shard(self.world_rank)
        train_loader = DataLoader(train_shard, batch_size=64)
        test_shard = test_dataset.get_shard(self.world_rank)
        val_loader = DataLoader(test_shard, batch_size=64)
        # The test shard is registered as the validation loader.
        self.register_data(train_loader=train_loader, validation_loader=val_loader)

## 10. Train model via TorchTrainer

In [25]:
# Number of train/validate rounds to run.
NUM_EPOCHS = 10

# One worker per Spark executor, so each worker reads its own dataset shard.
trainer = TorchTrainer(training_operator_cls=CustomOperator,
                       num_workers=num_executors,
                       add_dist_sampler=False,
                       num_cpus_per_worker=1,
                       config={"lr":0.01})
try:
    for epoch in range(NUM_EPOCHS):
        stats = trainer.train()
        print(stats)
        val_stats = trainer.validate()
        print(val_stats)
finally:
    # Always release the trainer's workers, even if training or validation raises.
    trainer.shutdown()

[2m[36m(TorchRunner pid=1683)[0m   t = torch.as_tensor(column, dtype=dtype)


{'num_samples': 1789, 'epoch': 1.0, 'batch_count': 28.0, 'train_loss': 24.79593428371338, 'last_train_loss': 22.45583724975586}
{'num_samples': 211, 'batch_count': 4.0, 'val_loss': 39.25195845834452, 'last_val_loss': 40.1341438293457, 'val_accuracy': 0.0, 'last_val_accuracy': 0.0}
{'num_samples': 1789, 'epoch': 2.0, 'batch_count': 28.0, 'train_loss': 20.867513675806975, 'last_train_loss': 16.393869400024414}
{'num_samples': 211, 'batch_count': 4.0, 'val_loss': 19.96777047252203, 'last_val_loss': 20.588455200195312, 'val_accuracy': 0.0, 'last_val_accuracy': 0.0}
{'num_samples': 1789, 'epoch': 3.0, 'batch_count': 28.0, 'train_loss': 14.817819767520707, 'last_train_loss': 14.141854286193848}
{'num_samples': 211, 'batch_count': 4.0, 'val_loss': 13.604915072002681, 'last_val_loss': 12.041680335998535, 'val_accuracy': 0.0, 'last_val_accuracy': 0.0}
{'num_samples': 1789, 'epoch': 4.0, 'batch_count': 28.0, 'train_loss': 11.712579469163702, 'last_train_loss': 12.467962265014648}
{'num_samples':

## 11. Shut down Ray and RayDP

In [26]:
# Stop the Spark session started through raydp.init_spark, then shut down Ray.
raydp.stop_spark()
ray.shutdown()