In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">

  <td>
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/notebook_template.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-managed-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/official/workbench/spark/spark_ml.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>                                                                                               
</table>

## Overview

This notebook tutorial shows you how to run Apache SparkML jobs with Dataproc and BigQuery. Through this notebook, you can learn the common stages of a machine learning pipeline: ingestion, data cleaning, feature engineering, modeling, and evaluation.

### Dataset

The two datasets, the [NYC TLC (Taxi and Limousine Commission) Trips](https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-tlc-trips) dataset and the [NYC Citi Bike Trips](https://console.cloud.google.com/marketplace/product/city-of-new-york/nyc-citi-bike) dataset, are available in [BigQuery Public Datasets](https://cloud.google.com/bigquery/public-data), which provide free querying of up to 1TB of data each month. They contain trip data for taxis and for Citi Bike, the public bicycle sharing system serving New York City.

### Objective

This notebook tutorial runs an Apache SparkML job that fetches data from the BigQuery dataset, performs exploratory data analysis, cleans the data, executes feature engineering, trains the model, evaluates the model, summarizes the results, and saves the model to a Cloud Storage bucket.

This notebook tutorial performs the following steps:

- Setting up a Google Cloud project and Dataproc cluster.
- Configuring the spark-bigquery-connector.
- Ingesting data from BigQuery into a Spark DataFrame.
- Performing Exploratory Data Analysis (EDA).
- Visualizing Data with samples.
- Cleaning Data.
- Training the model.
- Saving the model to a Cloud Storage path.
- Deleting the resources created for this notebook tutorial.

### Costs 

This tutorial uses billable components of Google Cloud:

* [Vertex AI](https://cloud.google.com/vertex-ai/pricing)
* [Cloud Storage](https://cloud.google.com/storage/pricing)
* [Dataproc](https://cloud.google.com/dataproc/pricing)

You can use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

### Set up your Google Cloud project:

1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you create an account, you receive a $300 credit towards your compute and storage costs.

1. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).

1. [Enable the Notebooks API, Vertex AI API, and Dataproc API](https://console.cloud.google.com/flows/enableapi?apiid=notebooks.googleapis.com,aiplatform.googleapis.com,dataproc&_ga=2.209429842.1903825585.1657549521-326108178.1655322249)

**Note**: Jupyter runs lines prefixed with `!` as shell commands, and inserts the value of Python variables prefixed with `$` into the commands.

### Installation

Install the following packages to run this notebook.

In [None]:
import os

# Google Cloud Notebooks images ship this metadata file; its presence is used
# to detect that environment.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebooks require dependencies to be installed with '--user';
# everywhere else no extra pip flag is passed.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""

In [None]:
# This whole cell only runs when the IS_TESTING environment variable is set.
if os.getenv("IS_TESTING"):
    """
    The testing suite does not currently support testing on Dataproc clusters,
    so the testing environment is setup to replicate Dataproc via the following steps.
    """
    # Names and locations of the OpenJDK archive that is downloaded below.
    JAVA_VER = "8u332-b09"
    JAVA_FOLDER = "/tmp/java"
    FILE_NAME = f"openlogic-openjdk-{JAVA_VER}-linux-x64"
    TAR_FILE = f"{JAVA_FOLDER}/{FILE_NAME}.tar.gz"
    DOWNLOAD_LINK = f"https://builds.openlogic.com/downloadJDK/openlogic-openjdk/{JAVA_VER}/openlogic-openjdk-{JAVA_VER}-linux-x64.tar.gz"
    PYSPARK_VER = "3.1.3"

    # Download Open JDK 8. Spark requires Java to execute.
    # Start from an empty download folder so repeated runs do not collide.
    ! rm -rf $JAVA_FOLDER
    ! mkdir $JAVA_FOLDER
    ! wget -P $JAVA_FOLDER $DOWNLOAD_LINK
    # JAVA_HOME is set to JAVA_FOLDER/FILE_NAME — presumably the top-level
    # directory the archive unpacks into; verify against the tarball layout.
    os.environ["JAVA_HOME"] = f"{JAVA_FOLDER}/{FILE_NAME}"
    ! tar -zxf $TAR_FILE -C $JAVA_FOLDER
    # Print JAVA_HOME so the value is visible in the cell output.
    ! echo $JAVA_HOME

    # Pin the Spark version to match that of the Dataproc 2.0 cluster.
    ! pip install {USER_FLAG} pyspark==$PYSPARK_VER -q

### Create a Dataproc cluster

The Spark job executed in this notebook tutorial is compute intensive. Since the job can take a significant amount of time to complete in a standard notebook environment, this notebook tutorial runs on a Dataproc cluster that is created with the Dataproc Component Gateway and Jupyter component installed on the cluster.

**Existing Dataproc with Jupyter cluster?**: If you have a running Dataproc cluster that has the [Component Gateway and Jupyter component installed on the cluster](https://cloud.google.com/dataproc/docs/concepts/components/jupyter#gcloud-command), you can use it in this tutorial. If you plan to use it, skip this step, and go to `Switch your kernel`.

In [None]:
if not os.getenv("IS_TESTING"):
    CLUSTER_NAME = "[your-cluster]"  # @param {type: "string"}
    CLUSTER_REGION = "[your-region]"  # @param {type: "string"}

    # Use us-central1 when the region placeholder was left untouched.
    CLUSTER_REGION = "us-central1" if CLUSTER_REGION == "[your-region]" else CLUSTER_REGION

    # Echo the settings that the cluster-creation cell will use.
    for label, value in (("CLUSTER_NAME", CLUSTER_NAME), ("CLUSTER_REGION", CLUSTER_REGION)):
        print(f"{label}: {value}")

In [None]:
# Skipped when IS_TESTING is set: the test environment does not use a cluster.
if not os.getenv("IS_TESTING"):
    # Create a Dataproc cluster on image version 2.0 with the Component Gateway
    # enabled and the Jupyter optional component installed.
    !gcloud dataproc clusters create $CLUSTER_NAME \
        --region=$CLUSTER_REGION \
        --enable-component-gateway \
        --image-version=2.0 \
        --optional-components=JUPYTER

Your `CLUSTER_NAME` must be **unique within your Google Cloud project**. It must start with a lowercase letter, followed by up to 51 lowercase letters, numbers, and hyphens, and cannot end with a hyphen.

#### Switch your kernel

Your notebook kernel is listed at the top of the notebook page. Your notebook should run on the Python 3 kernel running on your Dataproc cluster.

Select **Kernel > Change Kernel** from the top menu, then select `Python 3 on CLUSTER_NAME: Dataproc cluster in REGION (Remote)`.

### Set your project ID

Run the following cell to get your project ID.

In [None]:
import os

PROJECT_ID = ""

# Get your Google Cloud project ID from gcloud
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)

If the previous command has no output, copy your project ID from the project selector in the [Google Cloud console](https://console.cloud.google.com/). Insert the ID in the `[your-project-id]` placeholder, then run the following command.

In [None]:
# Fall back to the manually entered ID when no project ID has been set yet.
if not PROJECT_ID:
    PROJECT_ID = "[your-project-id]"  # @param {type: "string"}

In [None]:
# Make PROJECT_ID the active project for the gcloud commands that follow (-q: no prompts).
! gcloud config set project $PROJECT_ID -q

### Create a Cloud Storage bucket

The Spark DataFrame created in this tutorial is stored in BigQuery, with the data first being written to a Google Cloud Storage bucket before it is written into BigQuery.

#### Region

Before creating a Cloud Storage bucket, re-define the `REGION` variable (when you changed the notebook kernel earlier, previously set variables were deleted).

In [None]:
REGION = "[your-region]"  # @param {type: "string"}

# Use us-central1 when the placeholder was left untouched.
REGION = "us-central1" if REGION == "[your-region]" else REGION

#### Timestamp

To avoid name collisions, you can create a timestamp for the current notebook session, then append the timestamp to the name of resources that you create in this tutorial, such as the Cloud Storage bucket or BigQuery dataset that you create in this tutorial.

In [None]:
from datetime import datetime

# Session timestamp (YYYYmmddHHMMSS) appended to resource names to keep them unique.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"

Replace the `[your-bucket-name]` placeholder with the name of your Cloud Storage bucket. The name must be unique across all Cloud Storage buckets.

In [None]:
BUCKET_NAME = "[your-bucket-name]"  # @param {type:"string"}

# When no bucket name was supplied, derive one from the project ID and the
# session timestamp.
if BUCKET_NAME in ("", None, "[your-bucket-name]"):
    BUCKET_NAME = f"{PROJECT_ID}{TIMESTAMP}"

BUCKET_URI = f"gs://{BUCKET_NAME}/"

In [None]:
# Create the bucket at BUCKET_URI in REGION, under PROJECT_ID.
! gsutil mb -l $REGION -p $PROJECT_ID $BUCKET_URI

Confirm your access to the Cloud Storage bucket by displaying the bucket's metadata:

In [None]:
# Long listing (-L) of the bucket itself (-b) rather than of its contents.
! gsutil ls -L -b $BUCKET_URI

## Tutorial

### Import required libraries

In [None]:
# A Spark Session is how you interact with Spark SQL to create Dataframes
from pyspark.sql import SparkSession
# PySpark functions
from pyspark.sql.functions import col, udf, to_timestamp, unix_timestamp, pandas_udf, PandasUDFType, to_date, floor, abs, lit
# These allow us to create a schema for our data
from pyspark.sql.types import DoubleType, BooleanType

from geopandas import gpd
from shapely.geometry import Point
import pandas as pd
import numpy as np
from datetime import datetime

from pyspark.ml.regression import GBTRegressor
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

### Initialize the SparkSession

To use Apache Spark with BigQuery, you must include the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) when you initialize the SparkSession.

In [None]:
# Version and jar file name of the spark-bigquery-connector to load.
VER = "0.26.0"
FILE_NAME = f"spark-bigquery-with-dependencies_2.12-{VER}.jar"

# Under IS_TESTING the jar comes from the connector's GitHub release page;
# otherwise the copy in the gs://spark-lib bucket is used.
connector = (
    f"https://github.com/GoogleCloudDataproc/spark-bigquery-connector/releases/download/{VER}/{FILE_NAME}"
    if os.getenv("IS_TESTING")
    else f"gs://spark-lib/bigquery/{FILE_NAME}"
)

# Create (or reuse) the SparkSession with the connector jar on "spark.jars".
spark = SparkSession.builder.appName(
    "spark-bigquery-polyglot-language-demo"
).config(
    "spark.jars", connector
).config(
    "spark.sql.debug.maxToStringFields", "500"
).getOrCreate()


### Fetch data from BigQuery

In [None]:
# Load the 2018 NYC yellow taxi trips table from the BigQuery public datasets.
taxi_df = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018")
    .load()
)

# Load the NYC Citi Bike trips table from the BigQuery public datasets.
bike_df = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data.new_york_citibike.citibike_trips")
    .load()
)

# Processing the full tables consumes too much time and computing resources, so
# keep roughly 10% of the rows of each dataset. The fixed seed makes the sample
# repeatable between runs (as far as the input partitioning is stable).
taxi_df = taxi_df.sample(fraction=0.1, seed=42)
bike_df = bike_df.sample(fraction=0.1, seed=42)

# Under IS_TESTING, shrink the data further. A different seed is used so this
# second draw does not reuse the random sequence of the first one.
if os.getenv("IS_TESTING"):
    taxi_df = taxi_df.sample(fraction=0.001, seed=43)
    bike_df = bike_df.sample(fraction=0.001, seed=43)

### Perform Exploratory Data Analysis(EDA)

As we get started with a new problem, the first step is to gain an understanding of what the dataset contains. EDA is used to derive insights from the data. Data scientists and analysts try to find different patterns, relations, and anomalies in the data using some statistical graphs and other visualization techniques. It allows analysts to understand the data better before making any assumptions.

Check the data types for Taxi dataset first.

In [None]:
# Print the column names and data types of the taxi DataFrame.
taxi_df.printSchema()

Filter out unnecessary columns and check null counts of the fields.

In [None]:
# Keep only the columns used by the rest of the tutorial.
taxi_df = taxi_df.select(
    "pickup_datetime",
    "dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "pickup_location_id",
    "dropoff_location_id",
)

# Show summary statistics (count, mean, stddev, min, max) for those columns.
taxi_summary = taxi_df.describe()
taxi_summary.show()

This summary reveals a lot of information.
  - There are over 11 million trip records for Yellow Taxi in 2018.
  - The current dataset has some abnormal values, such as nulls and negative values.
  - `pickup_datetime` and `dropoff_datetime` are in string format. To use them effectively, they need to be re-formatted.
  - In previous years, the exact latitude and longitude were used for the pickup and the dropoff locations. This raised a lot of [privacy concerns](https://agkn.wordpress.com/2014/09/15/riding-with-the-stars-passenger-privacy-in-the-nyc-taxicab-dataset/), so the dataset now provides `pickup_location_id` and `dropoff_location_id` instead. These IDs correspond to the [NYC Taxi Zones](https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc), which are roughly based on NYC Department of City Planning’s Neighborhood Tabulation Areas (NTAs) and are meant to approximate neighborhoods.
  - The maximum value of `pickup_location_id` and `dropoff_location_id` is shown as `99`. However, this is probably misleading: both columns are strings, so their values are compared as text rather than as numbers.

First, you can manipulate the time. `pickup_datetime` and `dropoff_datetime` are currently in string format, so by using the `to_timestamp()` and `unix_timestamp()` functions, you can get each pickup and dropoff datetime as a Unix timestamp.

Unix time is a way of representing time as the number of seconds since `January 1st, 1970 at 00:00:00 UTC`. Compared to the Timestamp type, Unix time can be represented as an integer, making it easier to parse and use across different systems.

After we get `start_time` and `end_time` by converting the original `pickup_datetime` and `dropoff_datetime`, we are able to derive more interesting columns from these two timestamps.

In [None]:
# Convert the pickup/dropoff datetimes to Unix timestamps (seconds since the epoch).
taxi_df = taxi_df.withColumn('start_time', unix_timestamp(to_timestamp(col('pickup_datetime'))))
taxi_df = taxi_df.withColumn('end_time', unix_timestamp(to_timestamp(col('dropoff_datetime'))))

# Cast the type of location id from string to integer.
taxi_df = taxi_df.withColumn("start_zone_id", taxi_df.pickup_location_id.cast('int'))
taxi_df = taxi_df.withColumn("end_zone_id", taxi_df.dropoff_location_id.cast('int'))

# Flag trips that start on a weekday. The Unix epoch (1970-01-01) was a
# Thursday, so (days_since_epoch + 4) % 7 maps Sunday to 0 and Saturday to 6;
# the values 1-5 are Monday to Friday.
# NOTE(review): days are cut at multiples of 86400 seconds of the Unix
# timestamp, while start_time_in_minute below is shifted by 300 minutes —
# confirm which time zone the weekday flag is meant to use.
taxi_df = taxi_df.withColumn(
    'is_weekdays',
    ((floor(col('start_time') / 86400) + 4) % 7 > 0) & ((floor(col('start_time') / 86400) + 4) % 7 < 6))

# Minute of the day of start_time, minus 300 minutes (5 hours) — presumably an
# offset towards New York local time; TODO confirm. The result is negative for
# timestamps in the first five hours of a 86400-second day.
taxi_df = taxi_df.withColumn(
    'start_time_in_minute',
    floor((col('start_time') % 86400) / 60) - 300)

# Trip duration in seconds (difference of the two Unix timestamps).
taxi_df = taxi_df.withColumn('trip_duration', col('end_time') - col('start_time'))

Before visualizing the modified Taxi dataset, do similar work for the Citi Bike dataset.

In [None]:
# Print the column names and data types of the Citi Bike DataFrame.
bike_df.printSchema()

In [None]:
# Keep only the columns used by the rest of the tutorial; "tripduration" is
# renamed to "trip_duration" so it matches the taxi column name.
bike_df = bike_df.select(
    col("tripduration").alias("trip_duration"),
    "starttime",
    "stoptime",
    "start_station_latitude",
    "start_station_longitude",
    "end_station_latitude",
    "end_station_longitude",
    "usertype",
)

# Show summary statistics (count, mean, stddev, min, max) for those columns.
bike_summary = bike_df.describe()
bike_summary.show()

This summary also reveals interesting information about the dataset.
  - There are over 53 million trip records for Citi Bike from 2013 to 2018.
  - The current dataset has some abnormal values.
  - `starttime` and `stoptime` are in string format. To use them effectively, they need to be re-formatted.
  - Unlike the Taxi dataset, the starting and ending locations have exact latitudes and longitudes, but since every bike is parked at a station, these coordinates represent the stations.

Assuming that Citi Bike users move along Manhattan's roads, which form a grid of perpendicular streets, the trip distance can be calculated by applying the Manhattan distance formula.

Calculating the Manhattan distance is easy. Assume that you are at **W 15th St and 9th Ave, where Google NYC is located**, and you want to go to **W 33rd St and 5th Ave, where the Empire State Building is located**. In a grid of city blocks, the fastest way to get to the destination is to move east until you reach 5th Avenue, and then move north until you reach W 33rd St. In this case, **W 15th St and 5th Ave** or **W 33rd St and 9th Ave** can be a hinge point. If we set the starting point as **S**, the ending point as **E**, and the hinge point as **H**, the formula of Manhattan distance is `distance(S, H) + distance(H, E)`.

However, in the real world, it takes more calculation to get the "real" Manhattan distance. First of all, unlike in the Cartesian coordinate system, the actual distance covered by one degree of longitude varies with latitude. For example, 1 degree of longitude represents roughly 69 miles at the equator, the same as 1 degree of latitude, while it is about 49 miles at 45 degrees North or South. To calculate the exact distance in the real world using latitudes and longitudes, the [Haversine distance](https://en.wikipedia.org/wiki/Haversine_formula) is one of the good choices.

Furthermore, the streets in Manhattan are inclined at about 29 degrees to the True north, so you should rotate the point S and E by 29 degrees anti-clockwise.

Therefore, the final formula should be `manhattan_distance = haversine_distance(S', H') + haversine_distance(H', E')` where `S'`, `H'`, `E'` are the rotated points.

In [None]:
@udf(returnType=DoubleType())
def manhattan_dist(lat1, lon1, lat2, lon2):
    """
    Compute the Manhattan (street-grid) distance between two positions.

    Both positions are rotated by THETA, a hinge point is built from the rotated
    start latitude and the rotated end longitude, the hinge is rotated back, and
    the Haversine distances start -> hinge and hinge -> end are added.

    Args:
        lat1: The latitude of the start position, in degrees.
        lon1: The longitude of the start position, in degrees.
        lat2: The latitude of the end position, in degrees.
        lon2: The longitude of the end position, in degrees.
    Returns:
        The Manhattan distance between the start and end position in miles, or
        -1.0 when any of the four coordinates is missing (None) or equal to 0.
    """
    EARTH_RADIUS = 3958.76  # Approximate radius of the Earth in miles.
    THETA = np.radians(-28.904)  # Approximate inclination of the streets in Manhattan, in radians.

    def haversine(pos1, pos2):
        """
        Return the Haversine (great-circle) distance between two positions.
        Args:
            pos1: a latitude and a longitude of the start position, in degrees, in np.array form.
            pos2: a latitude and a longitude of the end position, in degrees, in np.array form.
        Returns:
            The distance between pos1 and pos2 on a sphere of radius EARTH_RADIUS.
        """
        rad_dist_lat, rad_dist_lon = np.radians(pos2[0] - pos1[0]), np.radians(pos2[1] - pos1[1])
        rad_lat1, rad_lat2 = np.radians(pos1[0]), np.radians(pos2[0])
        dist = 2 * np.arcsin(np.sqrt(np.sin(rad_dist_lat / 2) ** 2 + np.cos(rad_lat1) * np.cos(rad_lat2) * np.sin(rad_dist_lon / 2) ** 2))
        return EARTH_RADIUS * dist

    def rotate(pos, theta):
        """
        Rotate a position by the angle theta.
        Args:
            pos: a latitude and a longitude of the position in np.array form.
            theta: the rotation angle in radians.
        Returns:
            The rotated position.
        """
        # Named "rotation" so the matrix does not shadow this helper's own name.
        rotation = np.array([[np.cos(theta), np.sin(theta)],
                             [-np.sin(theta), np.cos(theta)]])
        return np.matmul(rotation, pos)

    # Missing coordinates cannot be measured. The sentinel must be a float:
    # a Python int returned from a DoubleType UDF is turned into null by Spark.
    if not (lat1 and lon1 and lat2 and lon2):
        return -1.0

    # Convert positions to np.array format.
    start, end = np.array([lat1, lon1]), np.array([lat2, lon2])

    # Rotate both positions by THETA so the axes line up with the street grid.
    rotated_start, rotated_end = rotate(start, THETA), rotate(end, THETA)

    # Get the rotated hinge point using the rotated start and end points.
    rotated_hinge = np.array([rotated_start[0], rotated_end[1]])

    # Rotate the hinge point back into the original coordinates.
    hinge = rotate(rotated_hinge, -THETA)

    # Return the Haversine distance from start to hinge plus hinge to end.
    return float(haversine(start, hinge) + haversine(hinge, end))

In [None]:
# Convert the start/stop times to Unix timestamps (seconds since the epoch),
# overwriting the original columns.
bike_df = bike_df.withColumn('starttime', unix_timestamp(to_timestamp(col('starttime'))))
bike_df = bike_df.withColumn('stoptime', unix_timestamp(to_timestamp(col('stoptime'))))

# Flag trips that start on a weekday. The Unix epoch (1970-01-01) was a
# Thursday, so (days_since_epoch + 4) % 7 maps Sunday to 0 and Saturday to 6;
# the values 1-5 are Monday to Friday.
# NOTE(review): days are cut at multiples of 86400 seconds of the Unix
# timestamp, while start_time_in_minute below is shifted by 300 minutes —
# confirm which time zone the weekday flag is meant to use.
bike_df = bike_df.withColumn(
    'is_weekdays',
    ((floor(col('starttime') / 86400) + 4) % 7 > 0) & ((floor(col('starttime') / 86400) + 4) % 7 < 6))

# Minute of the day of starttime, minus 300 minutes (5 hours) — presumably an
# offset towards New York local time; TODO confirm.
bike_df = bike_df.withColumn(
    'start_time_in_minute',
    floor((col('starttime') % 86400) / 60) - 300)

# Calculate the Manhattan distance between start_station and end_station
# with the manhattan_dist UDF.
bike_df = bike_df.withColumn("trip_distance", manhattan_dist(col("start_station_latitude"), col("start_station_longitude"), col("end_station_latitude"), col("end_station_longitude")))

In [None]:
# Print the schema again to confirm the columns added to the Citi Bike DataFrame.
bike_df.printSchema()

#### Visualization
Check the distributions for the numerical columns. In PySpark, visualizing is expensive because the data is too large. For example, the NYC Taxi dataset in 2018 has more than 11M rows. Therefore, approximately 3% of total data (approx. 330k rows) are extracted as a sample.

In [None]:
%%time
# Draw a random sample of about 3% of the rows of each Spark DataFrame.
taxi_sample = taxi_df.sample(0.03)
bike_sample = bike_df.sample(0.03)

# Collect the samples into pandas DataFrames for plotting.
taxi_pd = taxi_sample.toPandas()
bike_pd = bike_sample.toPandas()

# Show column dtypes, non-null counts and memory usage of both frames.
taxi_pd.info()
bike_pd.info()

When a [Decimal Type](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.types.DecimalType.html) column in PySpark is converted to a Pandas DataFrame, it is converted into the object type, not the float type. To visualize these "object" columns, they need to be converted into the float type.

In [None]:
%%time
FLOAT_TYPE_COLUMNS = ["trip_distance", "fare_amount"]

taxi_pd = taxi_pd.drop(columns=['pickup_datetime', 'dropoff_datetime'])

for COLUMN in FLOAT_TYPE_COLUMNS:
    taxi_pd[COLUMN] = taxi_pd[COLUMN].astype(float)

taxi_pd.info()

Since there are some extreme data points, remove them and bin the data into a box and histogram.

In [None]:
TAXI_COLUMNS_TO_SHOW = {"trip_distance", "trip_duration", "fare_amount"}

# Remove extreme data points before plotting.
taxi_pd_filtered = taxi_pd.query("trip_distance > 0 and trip_distance < 8 \
                                and fare_amount > 0 and fare_amount < 40 \
                                and trip_duration > 0 and trip_duration < 5000")
taxi_pd_filtered.plot(x='trip_distance', y='trip_duration', style='o')

# Draw a box plot and a histogram side by side for every selected column,
# following the column order of the DataFrame.
for name in (c for c in taxi_pd_filtered.columns if c in TAXI_COLUMNS_TO_SHOW):
    _, axes = plt.subplots(1, 2, figsize=(10, 4))
    series = taxi_pd_filtered[name]
    series.plot(kind="box", ax=axes[0])
    series.plot(kind="hist", ax=axes[1])
    plt.title(name)
    plt.show()

We found that most columns are skewed right and the trip_distance basically increases as trip_duration increases.

Do the similar work for the Citi Bike dataset.

In [None]:
BIKE_COLUMNS_TO_SHOW = {"trip_distance", "trip_duration"}

# Remove extreme data points before plotting.
bike_pd_filtered = bike_pd.query("trip_distance > 0 and trip_distance < 8 \
                                and trip_duration > 0 and trip_duration < 5000")
bike_pd_filtered.plot(x='trip_distance', y='trip_duration', style='o')

# Share of sampled trips that lasted more than an hour yet covered under a mile.
total_count = bike_pd["trip_duration"].count()
outlier_count = bike_pd.loc[
    (bike_pd.trip_duration > 3600) & (bike_pd.trip_distance < 1), "trip_distance"
].count()
print(f"Approximately {outlier_count/total_count * 100:.2f}% of data records more than 1 hour of trip_duration and less than 1 mile of trip_distance.")

The plot of trip_distance against trip_duration for Citi Bike looks odder than the same plot for the Taxi dataset.

A lot of trips took far more time than their trip_distance would suggest. The likely reason for this phenomenon is that, unlike taxi passengers, Citi Bike users did not always have a destination in mind and simply enjoyed the ride, or they did not dock their bicycles properly at the station.

To remove outliers, let's assume the average bike speed is at least 6 miles per hour, which is slightly faster than the walking pace, and filter out a trip if its trip_duration multiplied by that speed is greater than its trip_distance.

In [None]:
# Same filter as before, plus the minimum-speed condition
# (0.00166666667 miles per second is about 6 miles per hour).
bike_pd_filtered = bike_pd.query("trip_distance > 0 and trip_distance < 8 \
                                and (trip_duration * 0.00166666667) <= trip_distance \
                                and trip_duration > 0 and trip_duration < 5000")
bike_pd_filtered.plot(x='trip_distance', y='trip_duration', style='o')

# Draw a box plot and a histogram side by side for every selected column,
# following the column order of the DataFrame.
for name in (c for c in bike_pd_filtered.columns if c in BIKE_COLUMNS_TO_SHOW):
    _, axes = plt.subplots(1, 2, figsize=(10, 4))
    series = bike_pd_filtered[name]
    series.plot(kind="box", ax=axes[0])
    series.plot(kind="hist", ax=axes[1])
    plt.title(name)
    plt.show()

### Data Cleaning

In [None]:
%%time

# Download the NYC Taxi Zones as GeoJSON into a GeoDataFrame.
gdf_zone = gpd.read_file("https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON")
# Cast location_id to an integer type so it can be compared with the integer zone ids.
gdf_zone['location_id'] = gdf_zone['location_id'].astype('long')
# Row-index -> row-dict mapping of the zones table.
hm = gdf_zone.to_dict('index')
# Set of the location ids whose borough is Manhattan; read by is_in_manhattan.
location_set = {hm[i]['location_id'] for i in hm if hm[i]['borough'] == "Manhattan"}


@udf(returnType=BooleanType())
def is_in_manhattan(location_id):
    """
    Tell whether a zone id belongs to Manhattan.

    Args:
        location_id: The zone id to look up.
    Returns:
        True when location_id is a member of the module-level location_set,
        False otherwise.
    """
    return location_id in location_set

@pandas_udf('long')
def preprocess_zone_id(lat: pd.Series, lon: pd.Series) -> pd.Series:
    """
    Map coordinates to the location_id of the taxi zone that contains them.

    Args:
        lat: Latitudes of the points.
        lon: Longitudes of the points, aligned with lat.
    Returns:
        One location_id per input point, taken from a left spatial join of the
        points with the module-level gdf_zone. Points that match no zone get
        the missing value the left join produces.
    """
    points = [Point(xy) for xy in zip(lon, lat)]
    gdf_points = gpd.GeoDataFrame(pd.DataFrame({'lat': lat, 'lon': lon}), crs='epsg:4326', geometry=points)
    gdf_joined = gpd.sjoin(gdf_points, gdf_zone, how='left')
    # A left spatial join emits one row per matching zone, so a point that
    # touches more than one zone polygon shows up several times. Keep only the
    # first match per input row: a Series-to-Series pandas UDF must return
    # exactly as many values as it received.
    gdf_joined = gdf_joined[~gdf_joined.index.duplicated(keep='first')]
    return gdf_joined['location_id']

@udf(returnType=BooleanType())
def is_summer(start_year, time):
    """
    Tell whether a Unix timestamp falls inside a summer window.

    Args:
        start_year: First year to check; every year from start_year up to and
            including 2018 is tried.
        time: The Unix timestamp to test.
    Returns:
        True when time lies between June 1st 04:00:00 and September 1st
        03:59:00 (as built by datetime(...).timestamp()) of any checked year;
        False otherwise, including when time is missing or 0.
    """
    if not time:
        return False

    def within_summer_of(year):
        # Window boundaries for one year, both inclusive.
        summer_start = datetime(year, 6, 1, 4, 0, 0).timestamp()
        summer_end = datetime(year, 9, 1, 3, 59, 0).timestamp()
        return summer_start <= time <= summer_end

    return any(within_summer_of(year) for year in range(start_year, 2019))

In [None]:
%%time
taxi_df = taxi_df.withColumn("is_start_manhattan", is_in_manhattan(col("start_zone_id")))
taxi_df = taxi_df.withColumn("is_end_manhattan", is_in_manhattan(col("end_zone_id")))

taxi_df = taxi_df.where(
    (col('start_time') < col('end_time'))
    & (col('trip_duration') > 0) & (col('trip_duration') < 3600)
    & (col('trip_distance') > 0.2) & (col('trip_distance') < 15)
    & (col("start_zone_id") != col("end_zone_id")) 
    & (col("fare_amount") > 0) & (col("fare_amount") < 500)
    & (col("is_start_manhattan") == True)
    & (col("is_end_manhattan") == True)
    & ((col("trip_duration") * 0.00167) <= col("trip_distance"))
).dropna()

taxi_df.printSchema()
taxi_df.describe().show()

In [None]:
%%time
bike_df = bike_df.withColumn('start_zone_id', preprocess_zone_id(bike_df['start_station_latitude'], bike_df['start_station_longitude']))
bike_df = bike_df.withColumn('end_zone_id', preprocess_zone_id(bike_df['end_station_latitude'], bike_df['end_station_longitude']))
bike_df = bike_df.withColumn('is_start_manhattan', is_in_manhattan(col('start_zone_id')))
bike_df = bike_df.withColumn('is_end_manhattan', is_in_manhattan(col('end_zone_id')))

bike_df = bike_df.where(
    (col('tripduration') > 0) & (col('tripduration') < 3600)
    & (col("start_zone_id") != col("end_zone_id")) 
    & (col('usertype') == "Subscriber")
    & (col('starttime') < col('stoptime'))
    & (col("is_start_manhattan") == True) & (col("is_end_manhattan") == True)
    & ((col("trip_duration") * 0.00167) <= col("trip_distance"))
).dropna()

bike_df.printSchema()
bike_df.describe().show()

### Feature Selection

Not all features in our dataset will be useful. 
Since the purpose of this tutorial is comparing two datasets, we need to use similar features. There are some useful features for training the Taxi dataset, such as `fare_amount`, but I removed it since Citi Bike dataset does not have it.

After choosing the columns to use as features, they need to be assembled with `VectorAssembler()`.

In [None]:
%%time
feature_cols = [
    "is_weekdays",
    "start_time_in_minute",
    "start_zone_id",
    "end_zone_id",
    "trip_distance",
]
assembler = VectorAssembler(inputCols=taxi_feature_cols, outputCol='features')
taxi_transformed_data = assembler.transform(taxi_df)
bike_transformed_data = assembler.transform(bike_df)

In [None]:
# Scaler that writes a standardized copy of "features" to "features_scaled".
# NOTE(review): the scaler is fitted on the full dataset before the split, and
# the estimators defined in this notebook read "features", not
# "features_scaled" — confirm both are intended.
standard_scaler = StandardScaler(inputCol="features", outputCol="features_scaled")

taxi_scaled_df = standard_scaler.fit(taxi_transformed_data).transform(taxi_transformed_data)
taxi_scaled_df.select("features", "features_scaled").show(10, truncate=False)
# 70/30 train/test split; the fixed seed keeps the split repeatable between runs.
(taxi_training_data, taxi_test_data) = taxi_scaled_df.randomSplit([0.7, 0.3], seed=42)

bike_scaled_df = standard_scaler.fit(bike_transformed_data).transform(bike_transformed_data)
bike_scaled_df.select("features", "features_scaled").show(10, truncate=False)
(bike_training_data, bike_test_data) = bike_scaled_df.randomSplit([0.7, 0.3], seed=42)

### Training the Model

TODO: description for what is GBT and why GBT

In [None]:
%%time

gbt = GBTRegressor(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
)
evaluator_r2 = RegressionEvaluator(
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
    metricName="r2"
)
evaluator_rmse = RegressionEvaluator(
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
    metricName="rmse"
)

In [None]:
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression, DecisionTreeRegressor, RandomForestRegressor

# Baseline regressors. All of them read "features", learn "trip_duration" and
# write to "pred_trip_duration", so the same evaluators can score each one.

# Linear regression: 10 iterations, regularization parameter 0.3.
lr = LinearRegression(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
    maxIter=10,
    regParam=0.3,
)

# Generalized linear regression with a gaussian family and identity link;
# the link prediction is written to the "linkOut" column.
glr = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3,
    linkPredictionCol="linkOut",
)

# Single decision tree with library-default hyperparameters.
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
)

# Random forest with library-default hyperparameters.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
)

In [None]:
# Fit each baseline estimator on the taxi training split, in the order
# linear, generalized linear, decision tree, random forest.
taxi_lr_model, taxi_glr_model, taxi_dtr_model, taxi_rf_model = (
    estimator.fit(taxi_training_data) for estimator in (lr, glr, dtr, rf)
)

In [None]:
# Fit each baseline estimator on the bike training split, in the order
# linear, generalized linear, decision tree, random forest.
bike_lr_model, bike_glr_model, bike_dtr_model, bike_rf_model = (
    estimator.fit(bike_training_data) for estimator in (lr, glr, dtr, rf)
)

In [None]:
# Apply every fitted taxi model to the held-out taxi test split.
(
    taxi_lr_predictions,
    taxi_glr_predictions,
    taxi_dtr_predictions,
    taxi_rf_predictions,
) = (
    fitted_model.transform(taxi_test_data)
    for fitted_model in (taxi_lr_model, taxi_glr_model, taxi_dtr_model, taxi_rf_model)
)

In [None]:
# Score the bike test split with the models that were fitted on the bike
# training split. Using the taxi-fitted models here would leave the bike
# models unused and report taxi-model performance under a "bike" label.
bike_lr_predictions = bike_lr_model.transform(bike_test_data)
bike_glr_predictions = bike_glr_model.transform(bike_test_data)
bike_dtr_predictions = bike_dtr_model.transform(bike_test_data)
bike_rf_predictions = bike_rf_model.transform(bike_test_data)

In [None]:
# R2 of each baseline model on the taxi test predictions.
taxi_lr_accuracy_r2 = evaluator_r2.evaluate(taxi_lr_predictions)
taxi_glr_accuracy_r2 = evaluator_r2.evaluate(taxi_glr_predictions)
taxi_dtr_accuracy_r2 = evaluator_r2.evaluate(taxi_dtr_predictions)
taxi_rf_accuracy_r2 = evaluator_r2.evaluate(taxi_rf_predictions)

# Report the scores in the same order they were computed.
print(f"Taxi lr R2 = {taxi_lr_accuracy_r2}")
print(f"Taxi glr R2 = {taxi_glr_accuracy_r2}")
print(f"Taxi dtr R2 = {taxi_dtr_accuracy_r2}")
print(f"Taxi rf R2 = {taxi_rf_accuracy_r2}")

In [None]:
# R2 of each baseline model on the bike test predictions.
bike_lr_accuracy_r2 = evaluator_r2.evaluate(bike_lr_predictions)
bike_glr_accuracy_r2 = evaluator_r2.evaluate(bike_glr_predictions)
bike_dtr_accuracy_r2 = evaluator_r2.evaluate(bike_dtr_predictions)
bike_rf_accuracy_r2 = evaluator_r2.evaluate(bike_rf_predictions)

# Report the scores in the same order they were computed.
print(f"bike lr R2 = {bike_lr_accuracy_r2}")
print(f"bike glr R2 = {bike_glr_accuracy_r2}")
print(f"bike dtr R2 = {bike_dtr_accuracy_r2}")
print(f"bike rf R2 = {bike_rf_accuracy_r2}")

In [None]:
# Side-by-side recap of the glr / dtr / rf R2 scores for both datasets,
# one score per line.
print(
    f"Taxi glr R2 = {taxi_glr_accuracy_r2}",
    f"Taxi dtr R2 = {taxi_dtr_accuracy_r2}",
    f"Taxi rf R2 = {taxi_rf_accuracy_r2}",
    f"bike glr R2 = {bike_glr_accuracy_r2}",
    f"bike dtr R2 = {bike_dtr_accuracy_r2}",
    f"bike rf R2 = {bike_rf_accuracy_r2}",
    sep="\n",
)

In [None]:
%%time
# Fit the GBT regressor on the taxi training split, then add its predictions
# to the taxi test split.
taxi_gbt_model = gbt.fit(taxi_training_data)
taxi_gbt_predictions = taxi_gbt_model.transform(taxi_test_data)

In [None]:
%%time
# Fit the GBT regressor on the bike training split, then add its predictions
# to the bike test split.
bike_gbt_model = gbt.fit(bike_training_data)
bike_gbt_predictions = bike_gbt_model.transform(bike_test_data)

In [None]:
%%time
taxi_gbt_accuracy_r2 = evaluator_r2.evaluate(taxi_gbt_predictions)
print(f"Taxi Test GBT R2 Accuracy = {taxi_gbt_accuracy_r2}")

taxi_gbt_accuracy_rmse = evaluator_rmse.evaluate(taxi_gbt_predictions)
print(f"Taxi Test GBT RMSE Accuracy = {taxi_gbt_accuracy_rmse}")

# print(f"Taxi Coefficients: {taxi_model.coefficients}")
# print(f"Taxi Intercept: {taxi_model.intercept}")
# Taxi Test GBT R2 Accuracy = 0.708183954907601 <- 
# Taxi Test GBT R2 Accuracy = 0.6598682899938532
# rmse = 265.5806200025988
# Wall time: 2min 13s
# Taxi Test GBT R2 Accuracy = 0.7779961832460531
# Taxi Test GBT RMSE Accuracy = 197.87517973345072
# CPU times: user 34.8 ms, sys: 9.06 ms, total: 43.9 ms
# Wall time: 19.6 s


In [None]:
%%time

bike_gbt_accuracy_r2 = evaluator_r2.evaluate(bike_gbt_predictions)
print(f"Bike Test GBT R2 Accuracy = {bike_gbt_accuracy_r2}")

bike_gbt_accuracy_rmse = evaluator_rmse.evaluate(bike_gbt_predictions)
print(f"Bike Test GBT RMSE Accuracy = {bike_gbt_accuracy_rmse}")

# print(f"bike Coefficients: {bike_gbt_model.coefficients}")
# print(f"bike Intercept: {bike_gbt_model.intercept}")

# Bike Test GBT R2 Accuracy = 0.9077256551331765
# Bike Test GBT RMSE Accuracy = 124.30534645838586
# CPU times: user 61.9 ms, sys: 8.79 ms, total: 70.7 ms
# Wall time: 1min 58s
# Bike Test GBT R2 Accuracy = 0.8554198857801676
# Bike Test GBT RMSE Accuracy = 164.1698092435127
# CPU times: user 57.1 ms, sys: 4.66 ms, total: 61.8 ms
# Wall time: 1min 16s


### Save the model to a Cloud Storage path

In [None]:
BUCKET_FOLDER = "/tmp/bucket"

# In the testing environment, saving to GCS bucket routes to the local file system.
if os.getenv("IS_TESTING"):
    ! rm -rf $BUCKET_FOLDER
    ! mkdir $BUCKET_FOLDER
    bike_gbt_model.write().overwrite().save(f"{BUCKET_FOLDER}")
    ! gsutil cp $BUCKET_FOLDER gs://$BUCKET_URI
else:
    bike_gbt_model.write().overwrite().save(f"{BUCKET_URI}/")

## Cleaning up

See [Clean up](https://cloud.google.com/vertex-ai/docs/workbench/managed/create-managed-notebooks-instance-console-quickstart#clean-up) to delete your project or the managed notebook created in this tutorial.

In [None]:
# Delete the Cloud Storage bucket at BUCKET_URI together with everything in it
# (-r removes the contents recursively). This runs unconditionally.
! gsutil rm -r $BUCKET_URI

### Delete Dataproc Cluster

It is not possible to delete the cluster you are currently using unless you switch the kernel to local. To delete it, you need to switch the kernel to local `Python 3` or `PySpark`, set your `CLUSTER_NAME` and `CLUSTER_REGION` manually in the following cell, and execute the `gcloud` command.

See [Deleting a cluster](https://cloud.google.com/dataproc/docs/guides/manage-cluster#console) to delete the Dataproc cluster created in this tutorial.

In [None]:
# Placeholders: replace both values with your own cluster's name and region
# before running the delete command in the next cell.
CLUSTER_NAME = "[your-cluster-name]"
CLUSTER_REGION = "[your-cluster-region]"

In [None]:
# Skip the deletion when the IS_TESTING environment variable is set.
# -q suppresses gcloud's interactive confirmation prompt.
if not os.getenv("IS_TESTING"):
    ! gcloud dataproc clusters delete $CLUSTER_NAME --region=$CLUSTER_REGION -q

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# Disabled hyperparameter search for the GBT regressor. When uncommented, the
# grid below covers 3 x 3 x 3 = 27 parameter combinations (maxDepth, maxBins,
# maxIter) and the cross-validator scores each one with 5 folds using the R2
# evaluator. Only the import above is active.
# %%time
# param_grid = (
#     ParamGridBuilder()
#     .addGrid(gbt.maxDepth, [2, 5, 10])
#     .addGrid(gbt.maxBins, [10, 20, 40])
#     .addGrid(gbt.maxIter, [5, 10, 20])
#     .build()
# )
# cv = CrossValidator(
#     estimator=gbt,
#     evaluator=evaluator_r2,
#     estimatorParamMaps=param_grid,
#     numFolds=5
# )


In [None]:
# Disabled: cross-validated fit on the taxi training split and R2 on the taxi
# test split. It needs a `cv` object, which is not defined while the
# CrossValidator construction in this notebook remains commented out.
# %%time
# taxi_cv_model = cv.fit(taxi_training_data)
# taxi_predictions = taxi_cv_model.transform(taxi_test_data)
# print(f"R2:{evaluator_r2.evaluate(taxi_predictions)}")

In [None]:
# Disabled: cross-validated fit on the bike training split and R2 on the bike
# test split. It needs a `cv` object, which is not defined while the
# CrossValidator construction in this notebook remains commented out.
# %%time
# bike_cv_model = cv.fit(bike_training_data)
# bike_predictions = bike_cv_model.transform(bike_test_data)
# print(f"R2:{evaluator_r2.evaluate(bike_predictions)}")