# **How RayDP works together with PyTorch**

## 1. Colab environment setup

In [1]:
# Install ray and raydp
! pip install ray==1.9
! pip install raydp
! pip install --upgrade pip
! pip install raydp
! pip install ray[tune]
! pip install torch==1.8.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
! pip list

Collecting ray==1.9
  Downloading ray-1.9.0-cp37-cp37m-manylinux2014_x86_64.whl (57.6 MB)
[K     |████████████████████████████████| 57.6 MB 1.5 kB/s 
Collecting redis>=3.5.0
  Downloading redis-4.2.2-py3-none-any.whl (226 kB)
[K     |████████████████████████████████| 226 kB 43.3 MB/s 
Collecting async-timeout>=4.0.2
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting deprecated>=1.2.3
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: deprecated, async-timeout, redis, ray
Successfully installed async-timeout-4.0.2 deprecated-1.2.13 ray-1.9.0 redis-4.2.2
Collecting raydp
  Downloading raydp-0.4.2-py3-none-any.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 7.3 MB/s 
[?25hCollecting pyspark>=3.2.0
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 32 kB/s 
Collecting netifaces
  Downloading netifaces-0.11.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (

Collecting pip
  Downloading pip-22.0.4-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 11.8 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.0.4
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.3/125.3 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.5
[0mLooking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.1+cpu
  Downloading https://download.pytorch.org/whl/cpu/torch-1.8.1%2Bcpu-cp37-cp37m-linux_x86_64.whl (169.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.1/169.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages:

## 2. Import dependencies

In [2]:
import ray
import torch
import torch.nn as nn
import torch.nn.functional as F

import raydp
from raydp.torch import TorchEstimator
from raydp.utils import random_split

import os
import argparse

import numpy as np
import pandas as pd

from os.path import dirname, realpath

import numpy as np
from pyspark.sql.functions import hour, quarter, month, year, dayofweek, dayofmonth, weekofyear, col, lit, udf, abs as functions_abs

## 3. Get the data file

In [3]:

# Earliest possible pickup time; random second offsets are added to it below.
base_date = np.datetime64("2010-01-01 00:00:00")

# The size of data
N = 2000

# Draw every column independently from a uniform range.
fare_amount = np.random.uniform(3.0, 50.0, size=N)
pick_long = np.random.uniform(-74.2, -73.8, size=N)
pick_lat = np.random.uniform(40.7, 40.8, size=N)
drop_long = np.random.uniform(-74.2, -73.8, size=N)
drop_lat = np.random.uniform(40.7, 40.8, size=N)
passenger_count = np.random.randint(1, 5, size=N)
# Offsets of up to 157,680,000 (5 * 365 * 24 * 3600) added to base_date.
date = np.random.randint(0, 157680000, size=N) + base_date
# Render as "YYYY-MM-DD HH:MM:SS UTC". The minutes field must use %M;
# %m is the month number and would repeat the month in the minutes position.
date = np.array([t.item().strftime("%Y-%m-%d %H:%M:%S UTC") for t in date])
key = ["fake_key"] * N
df = pd.DataFrame({
    "key": key,
    "fare_amount":fare_amount,
    "pickup_datetime": date,
    "pickup_longitude": pick_long,
    "pickup_latitude": pick_lat,
    "dropoff_longitude": drop_long,
    "dropoff_latitude": drop_lat,
    "passenger_count": passenger_count
    })
# Written without the index so the CSV header matches the column names above.
df.to_csv("/content/fake_nyctaxi.csv", index=False)

## 4. Define the data processing functions

In [4]:
def clean_up(data):
    """Filter out rows whose coordinates, passenger count or fare are out of range.

    A row is kept only if all of the following hold:
    - pickup/dropoff longitude lies in [-76, -72]
    - pickup/dropoff latitude lies in [38, 42]
    - passenger_count lies in [1, 6]
    - fare_amount is strictly between 0 and 250
    - dropoff longitude differs from pickup longitude, and likewise for latitude
    """
    # (column name, lower bound, upper bound); both bounds are inclusive.
    inclusive_ranges = (
        ("pickup_longitude", -76, -72),
        ("dropoff_longitude", -76, -72),
        ("pickup_latitude", 38, 42),
        ("dropoff_latitude", 38, 42),
        ("passenger_count", 1, 6),
    )
    for name, lower, upper in inclusive_ranges:
        data = data.filter(col(name) <= upper).filter(col(name) >= lower)

    # Fare bounds are exclusive on both ends.
    fare = col("fare_amount")
    data = data.filter(fare > 0).filter(fare < 250)

    # Discard rows whose dropoff shares a coordinate with the pickup.
    data = data.filter(col("dropoff_longitude") != col("pickup_longitude"))
    data = data.filter(col("dropoff_latitude") != col("pickup_latitude"))
    return data

# Add time related features
def add_time_features(data):
    """Add calendar and time-of-day feature columns derived from ``pickup_datetime``.

    New columns, in order: ``day``, ``hour_of_day``, ``day_of_week``
    (0 = Monday ... 6 = Sunday), ``week_of_year``, ``month_of_year``,
    ``quarter_of_year``, ``year``, ``night`` and ``late_night`` (0/1 flags).

    Returns the DataFrame with these columns appended.
    """
    pickup = col("pickup_datetime")
    data = data.withColumn("day", dayofmonth(pickup))
    data = data.withColumn("hour_of_day", hour(pickup))
    # Spark's dayofweek() numbers days 1 (Sunday) .. 7 (Saturday). Remap to
    # 0 (Monday) .. 6 (Sunday) so that `weekday < 5` below means Monday-Friday.
    # Subtracting 2 would map Sunday to -1 and wrongly treat it as a weekday.
    data = data.withColumn("day_of_week", (dayofweek(pickup) + 5) % 7)
    data = data.withColumn("week_of_year", weekofyear(pickup))
    data = data.withColumn("month_of_year", month(pickup))
    data = data.withColumn("quarter_of_year", quarter(pickup))
    data = data.withColumn("year", year(pickup))

    @udf("int")
    def night(hour_of_day, weekday):
        # 1 when the pickup hour is 16..20 on a Monday-Friday, otherwise 0.
        if (16 <= hour_of_day <= 20) and (weekday < 5):
            return 1
        return 0

    @udf("int")
    def late_night(hour_of_day):
        # 1 when the pickup hour is 6 or earlier, or 20 or later, otherwise 0.
        if (hour_of_day <= 6) or (hour_of_day >= 20):
            return 1
        return 0

    data = data.withColumn("night", night("hour_of_day", "day_of_week"))
    data = data.withColumn("late_night", late_night("hour_of_day"))
    return data

def add_distance_features(data):
    """Add coordinate-difference and landmark-distance feature columns.

    Appends ``abs_diff_longitude``, ``abs_diff_latitude`` and ``manhattan``
    (their sum), then for each landmark (jfk, ewr, lgr, downtown) a
    ``pickup_distance_<landmark>`` and ``dropoff_distance_<landmark>`` column
    holding the sum of absolute longitude and latitude differences.
    """
    @udf("float")
    def manhattan(lon_a, lat_a, lon_b, lat_b):
        # Sum of absolute coordinate differences between two (lon, lat) points.
        return float(np.abs(lon_b - lon_a) + np.abs(lat_b - lat_a))

    # Reference points as (longitude, latitude): the three NYC airports and
    # NYC downtown. Order here fixes the order of the generated columns.
    landmarks = (
        ("jfk", (-73.7822222222, 40.6441666667)),
        ("ewr", (-74.175, 40.69)),
        ("lgr", (-73.87, 40.77)),
        ("downtown", (-74.0063889, 40.7141667)),
    )

    # Absolute pickup-to-dropoff differences and their sum.
    lon_delta = functions_abs(col("dropoff_longitude") - col("pickup_longitude"))
    lat_delta = functions_abs(col("dropoff_latitude") - col("pickup_latitude"))
    data = data.withColumn("abs_diff_longitude", lon_delta)
    data = data.withColumn("abs_diff_latitude", lat_delta)
    data = data.withColumn(
        "manhattan", col("abs_diff_latitude") + col("abs_diff_longitude"))

    # Distance from each trip endpoint to each landmark.
    for tag, (lon, lat) in landmarks:
        for endpoint in ("pickup", "dropoff"):
            data = data.withColumn(
                f"{endpoint}_distance_{tag}",
                manhattan(f"{endpoint}_longitude", f"{endpoint}_latitude",
                          lit(lon), lit(lat)))
    return data

def drop_col(data):
    """Return ``data`` without the raw input columns listed below."""
    unused_columns = (
        "pickup_datetime",
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "key",
    )
    # A single variadic drop removes the same columns as chaining one drop per name.
    return data.drop(*unused_columns)

def nyc_taxi_preprocess(data):
    """Run the full preprocessing pipeline on ``data`` and return the result.

    Stages run in order: row clean-up, time features, distance features,
    then removal of the raw input columns.
    """
    pipeline = (clean_up, add_time_features, add_distance_features, drop_col)
    for stage in pipeline:
        data = stage(data)
    return data

## 5. Init or connect to a ray cluster

In [5]:
# To connect to an already-running Ray cluster instead, replace the call
# below with:
# ray.init(address="auto")
# Otherwise start Ray locally, limited to 6 CPUs.
ray.init(num_cpus=6)

2022-05-06 04:26:42,662	INFO services.py:1340 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'metrics_export_port': 61762,
 'node_id': 'd15a375babb1c84bf443eb45461f3e6438419ef34a7137df6c4f8498',
 'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2022-05-06_04-26-39_113481_60/sockets/plasma_store',
 'raylet_ip_address': '172.28.0.2',
 'raylet_socket_name': '/tmp/ray/session_2022-05-06_04-26-39_113481_60/sockets/raylet',
 'redis_address': '172.28.0.2:6379',
 'session_dir': '/tmp/ray/session_2022-05-06_04-26-39_113481_60',
 'webui_url': '127.0.0.1:8265'}

## 6. Get a spark session

In [6]:
# Settings for the Spark session that RayDP starts: one executor with one
# core and 500M of memory.
app_name = "NYC Taxi Fare Prediction with RayDP"
num_executors = 1
cores_per_executor = 1
memory_per_executor = "500M"
# Arguments are passed positionally in the order app name, executor count,
# cores per executor, memory per executor.
spark = raydp.init_spark(app_name, num_executors, cores_per_executor, memory_per_executor)

## 7. Data processing

In [7]:
# Load the generated CSV: the first row holds the column names and the
# column types are inferred from the data
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", "true").option("inferSchema", "true")
data = csv_reader.load("/content/fake_nyctaxi.csv")
# Use UTC as the Spark session timezone when processing datetimes
spark.conf.set("spark.sql.session.timeZone", "UTC")
# Apply the preprocessing pipeline
data = nyc_taxi_preprocess(data)
# 90% / 10% train/test split; the last argument is presumably the random seed
# (confirm against raydp.utils.random_split)
split_weights = [0.9, 0.1]
train_df, test_df = random_split(data, split_weights, 0)
# Every column except the label is used as a model feature
features = [field.name for field in train_df.schema if field.name != "fare_amount"]



## 8. Define a neural network model

In [8]:
class NYC_Model(nn.Module):
    """Fully connected network mapping ``cols`` inputs to a single output.

    Layer widths are cols -> 256 -> 128 -> 64 -> 16 -> 1. Each of the four
    hidden linear layers is followed by ReLU and then batch normalization;
    the final linear layer has no activation.
    """

    def __init__(self, cols):
        """Create the layers; ``cols`` is the width of the concatenated input."""
        super().__init__()
        # Linear layers.
        self.fc1 = nn.Linear(cols, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 16)
        self.fc5 = nn.Linear(16, 1)
        # One batch-norm layer per hidden width.
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)
        self.bn4 = nn.BatchNorm1d(16)

    def forward(self, *x):
        """Concatenate the input tensors along dim 1 and run them through the network."""
        hidden = torch.cat(x, dim=1)
        hidden_stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
            (self.fc4, self.bn4),
        )
        # linear -> ReLU -> batch norm for every hidden stage.
        for linear, norm in hidden_stages:
            hidden = norm(F.relu(linear(hidden)))
        return self.fc5(hidden)

## 9. Create model, criterion and optimizer

In [9]:
# The network's input width is the number of feature columns.
nyc_model = NYC_Model(len(features))
# Smooth L1 loss as the training criterion.
criterion = nn.SmoothL1Loss()
# Adam over all model parameters with learning rate 0.001.
optimizer = torch.optim.Adam(nyc_model.parameters(), lr=0.001)

## 10. Create distributed estimator and train

In [10]:
# Wrap the model, optimizer and loss in a RayDP estimator: one worker,
# batches of 64 rows, 30 epochs, with "fare_amount" as the label column.
estimator = TorchEstimator(num_workers=1, model=nyc_model, optimizer=optimizer, loss=criterion,
                           feature_columns=features, label_column="fare_amount", batch_size=64,
                           num_epochs=30)
# Train the model
# test_df is passed as the second argument, presumably as the evaluation
# dataset (confirm against the RayDP TorchEstimator documentation).
estimator.fit_on_spark(train_df, test_df)
# Shut the estimator down once training has finished.
estimator.shutdown()

Epoch-0: {'num_samples': 1789, 'epoch': 1.0, 'batch_count': 28.0, 'train_loss': 25.841306849522855, 'last_train_loss': 24.456932067871094}
Epoch-1: {'num_samples': 1789, 'epoch': 2.0, 'batch_count': 28.0, 'train_loss': 25.7200011066247, 'last_train_loss': 25.709367752075195}
Epoch-2: {'num_samples': 1789, 'epoch': 3.0, 'batch_count': 28.0, 'train_loss': 25.54968092946649, 'last_train_loss': 26.506052017211914}
Epoch-3: {'num_samples': 1789, 'epoch': 4.0, 'batch_count': 28.0, 'train_loss': 25.321286180687057, 'last_train_loss': 24.703866958618164}
Epoch-4: {'num_samples': 1789, 'epoch': 5.0, 'batch_count': 28.0, 'train_loss': 25.030547279029534, 'last_train_loss': 22.527454376220703}
Epoch-5: {'num_samples': 1789, 'epoch': 6.0, 'batch_count': 28.0, 'train_loss': 24.677452941391305, 'last_train_loss': 23.636171340942383}
Epoch-6: {'num_samples': 1789, 'epoch': 7.0, 'batch_count': 28.0, 'train_loss': 24.256907315411468, 'last_train_loss': 25.541582107543945}
Epoch-7: {'num_samples': 1789,

## 11. Shut down Ray and RayDP

In [11]:
# Stop the Spark session started through RayDP first, then shut down Ray.
raydp.stop_spark()
ray.shutdown()

