In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
  <td><a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/regression/distributed_pyspark_xgboost/distributed_pyspark_xgboost.ipynb"><img src="https://avatars.githubusercontent.com/u/33467679?s=200&v=4" width="32px" alt="Colab logo"> Run in Colab</a></td>
  <td><a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/regression/distributed_pyspark_xgboost/distributed_pyspark_xgboost.ipynb"><img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px" alt="GitHub logo"> View on GitHub</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/regression/distributed_pyspark_xgboost/distributed_pyspark_xgboost.ipynb"><img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> Open in Vertex AI Workbench</a></td>
  <td><a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/regression/distributed_pyspark_xgboost/distributed_pyspark_xgboost.ipynb"><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35"> Open in BQ Studio</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks/regression/distributed_pyspark_xgboost/distributed_pyspark_xgboost.ipynb"><img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"> Open in Colab Enterprise</a></td>
</table>

# Data Science with PySpark and Distributed XGBoost

## Overview

In this notebook, you will learn how to parallelize and scale your data science and machine learning workflows on large datasets using distributed computing with Apache Spark (`PySpark`).

You will build a machine learning pipeline to predict NYC taxi tip amounts. You will focus on scaling data processing (`PySpark DataFrames`/`Spark SQL`), using `PySpark MLlib` for baseline algorithms, and training a distributed `XGBoost` model via the PySpark Estimator API.

### Objectives

* Initialize a cluster-ready PySpark session.
* Construct an end-to-end data pipeline with Spark DataFrames.
* Train regression models with Spark MLlib.
* Configure and train distributed XGBoost on Spark.
* Evaluate each model's error (RMSE) on a held-out test dataset.

## Setup

To get started, we need to install the required packages and initialize a `SparkSession`.

### Install Dependencies

First, we will install the necessary packages for PySpark and XGBoost.

In [None]:
# Install dependencies for Spark XGBoost
!pip3 install pyarrow pandas xgboost

#### Initialize Spark Session

Initialize the `SparkSession`, which is the entry point for using Spark functionality. We configure it with an application name and allocate memory resources.

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the session. Driver and executors each get 4 GB, and
# dynamic allocation is switched off.
# NOTE(review): presumably dynamic allocation is disabled so distributed
# XGBoost trains on a stable set of executors — confirm.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Regression with XGBoost")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.dynamicAllocation.enabled", "false")
    .getOrCreate()
)

# Last expression of the cell: display the session object.
spark

---

## Prepare the NYC taxi dataset

This tutorial uses the [NYC Taxi & Limousine Commission (TLC) Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).

The dataset contains individual trip records from yellow taxis in New York City, and includes fields like:
*   Pick-up and drop-off dates, times, and locations
*   Trip distances
*   Itemized fare amounts
*   Passenger counts
*   **Tip amounts** (*the target variable we will predict*)

#### Define Data Path

Specify the Google Cloud Storage (GCS) path where the NYC taxi dataset is located.

In [None]:
# Cloud Storage location of the NYC Taxi Parquet files; every `*.parquet`
# file under this prefix is read when the data is loaded.
# NOTE(review): originally described as holding one month of trip data —
# confirm against the bucket contents.
DATA_DIR = "gs://dataproc-metastore-public-binaries/nyc_taxi_data"
print(f"Using dataset from {DATA_DIR}")

### Load and Clean Data

Let's load the data directly from Cloud Storage and filter out invalid trips.

In [None]:
import time
from pyspark.sql.functions import col

start_time = time.perf_counter()

# 1. Load data: read every Parquet file found under DATA_DIR
parquet_glob = f"{DATA_DIR}/*.parquet"
df = spark.read.parquet(parquet_glob)
print(f"Loaded {df.count():,} records.")

# 2. Clean data: one named predicate per validity rule
fare = col('fare_amount')
distance = col('trip_distance')
tip = col('tip_amount')

fare_in_range = (fare > 0) & (fare < 500)
distance_in_range = (distance > 0) & (distance < 100)
tip_in_range = (tip >= 0) & (tip < 100)
paid_by_card = col('payment_type') == 1  # Credit card only (tips recorded)

df = df.filter(fare_in_range & distance_in_range & tip_in_range & paid_by_card)
print(f"Clean records remaining: {df.count():,}.")
print(f"Load and Clean Time: {time.perf_counter() - start_time:.2f} seconds")

### Feature Engineering

Next, we prepare our features for the machine learning models. We will extract time-based features, log-transform the fare, and create route-specific aggregations based on the pickup and dropoff location IDs.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Every step in this cell is a lazy DataFrame transformation: the columns are
# only computed when a later action (show/count/fit) runs, so no timer is
# started here.

# Time Features
df = df.withColumn('hour', F.hour('tpep_pickup_datetime'))
df = df.withColumn('dow', F.dayofweek('tpep_pickup_datetime'))  # Note: 1=Sunday, 7=Saturday in PySpark
df = df.withColumn('is_weekend', F.when(F.col('dow').isin(1, 7), 1).otherwise(0))
# Rush hour = pickup hour in 7-9 or 17-19 (both ranges inclusive)
df = df.withColumn(
    'is_rush_hour',
    F.when(F.col('hour').between(7, 9) | F.col('hour').between(17, 19), 1).otherwise(0),
)

# Amount Features
df = df.withColumn('fare_log', F.log1p('fare_amount'))
# Cents part of the fare. Round before casting to int: floating-point
# arithmetic can leave e.g. 28.999999... for a fare of 12.29, and a bare
# cast would truncate that to 28 instead of 29.
df = df.withColumn('fare_decimal', F.round(F.col('fare_amount') % 1 * 100).cast('int'))
df = df.withColumn('is_round_fare', F.when(F.col('fare_amount') % 5 == 0, 1).otherwise(0))

# Route Features: number of trips sharing the same pickup/dropoff pair
df = df.withColumn(
    'route_id',
    F.concat_ws('_', F.col('PULocationID').cast('string'), F.col('DOLocationID').cast('string')),
)
route_window = Window.partitionBy('route_id')
df = df.withColumn('route_frequency', F.count('*').over(route_window))

# Location Aggregation Features (Mean and Std Tip by Pickup Location)
# NOTE(review): these aggregates are computed from `tip_amount` (the label)
# over all rows, and the train/test split happens later in the notebook, so
# test-set labels contribute to these features. Consider computing them from
# the training split only.
pu_window = Window.partitionBy('PULocationID')
df = df.withColumn('pu_tip_mean', F.mean('tip_amount').over(pu_window))
df = df.withColumn('pu_tip_std', F.stddev('tip_amount').over(pu_window))
# Fill missing std values (e.g. a pickup location with too few trips) with 0
df = df.fillna({'pu_tip_std': 0.0})

#### Display Engineered Features

Display the first few rows of the DataFrame with the newly engineered features to inspect the results.

In [None]:
# Show a handful of rows, restricted to the fare, the target and the
# engineered columns, so the output stays narrow enough to read.
engineered_cols = [
    'hour', 'dow', 'is_weekend', 'is_rush_hour',
    'fare_log', 'fare_decimal', 'is_round_fare',
    'route_id', 'route_frequency', 'pu_tip_mean', 'pu_tip_std',
]
df.select('fare_amount', 'tip_amount', *engineered_cols).show(5)

### Prepare Features and Target

Let's select our feature columns, assemble them into a Vector, and split the data into a Train and Test set.

In [None]:
from pyspark.ml.feature import Imputer, VectorAssembler

feature_cols = [
    'trip_distance', 'fare_amount', 'passenger_count',
    'hour', 'dow', 'is_weekend', 'is_rush_hour',
    'fare_log', 'fare_decimal', 'is_round_fare',
    'route_frequency', 'pu_tip_mean', 'pu_tip_std',
    'PULocationID', 'DOLocationID'
]

# Split data first, so that the imputation statistics below are learned from
# training rows only and no information from the test split leaks into them.
train_raw, test_raw = df.randomSplit([0.8, 0.2], seed=42)

# Impute nulls with the (training-set) mean for safe vector assembly
imputer = Imputer(inputCols=feature_cols, outputCols=feature_cols, strategy="mean")
imputer_model = imputer.fit(train_raw)

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Both splits are reused by every model trained below; caching avoids
# re-reading the Parquet files and recomputing the window features each time.
train_df = assembler.transform(imputer_model.transform(train_raw)).cache()
test_df = assembler.transform(imputer_model.transform(test_raw)).cache()

# The counts below are the first actions on the cached splits and
# materialize them.
print(f"Train rows: {train_df.count():,}")
print(f"Test rows: {test_df.count():,}")

---

## Train Models

Now we will train three different predictive models to forecast tip amounts using PySpark.

### 1. Linear Regression


Implement and train a Linear Regression model from PySpark MLlib to predict tip amounts. Evaluate its performance using RMSE.

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

start_time = time.perf_counter()

# Fit on the training split
lr = LinearRegression(featuresCol='features', labelCol='tip_amount')
lr_model = lr.fit(train_df)

# Score the test split; the prediction column gets a model-specific name
lr_preds = lr_model.transform(test_df).withColumnRenamed('prediction', 'lr_prediction')

evaluator = RegressionEvaluator(labelCol="tip_amount", predictionCol="lr_prediction", metricName="rmse")
lr_rmse = evaluator.evaluate(lr_preds)

# Elapsed wall-clock time covers both training and test-set evaluation
lr_time = time.perf_counter() - start_time

print(f"\n{'Linear Reg RMSE:':<20} ${lr_rmse:.4f}")
print(f"{'Linear Reg Time:':<20} {lr_time:.2f} seconds")

### 2. Random Forest


Train a Random Forest Regressor model from PySpark MLlib. Configure parameters like maximum depth and number of trees, and then evaluate its RMSE.

In [None]:
from pyspark.ml.regression import RandomForestRegressor

start_time = time.perf_counter()

# Fit on the training split (20 trees, depth <= 10, fixed seed)
rf = RandomForestRegressor(featuresCol='features', labelCol='tip_amount', maxDepth=10, numTrees=20, seed=42)
rf_model = rf.fit(train_df)

# Score the test split; the prediction column gets a model-specific name
rf_preds = rf_model.transform(test_df).withColumnRenamed('prediction', 'rf_prediction')

evaluator_rf = RegressionEvaluator(labelCol="tip_amount", predictionCol="rf_prediction", metricName="rmse")
rf_rmse = evaluator_rf.evaluate(rf_preds)

# Elapsed wall-clock time covers both training and test-set evaluation
rf_time = time.perf_counter() - start_time

print(f"\n{'Random Forest RMSE:':<20} ${rf_rmse:.4f}")
print(f"{'Random Forest Time:':<20} {rf_time:.2f} seconds")

### 3. Distributed XGBoost
To use `SparkXGBRegressor`, ensure that the `xgboost` Python package is installed in your Python environment.

Utilize `SparkXGBRegressor` to train a distributed XGBoost model. Configure the number of workers, maximum depth, and learning rate for the model.

In [None]:
from xgboost.spark import SparkXGBRegressor

start_time = time.perf_counter()

# Distributed XGBoost estimator: training is spread over `num_workers`
# Spark tasks.
xgb = SparkXGBRegressor(
    features_col="features",
    label_col="tip_amount",
    num_workers=2,
    max_depth=5,
    learning_rate=0.1,
)

xgb_model = xgb.fit(train_df)

# Score the test split; the prediction column gets a model-specific name
xgb_preds = xgb_model.transform(test_df).withColumnRenamed('prediction', 'xgb_prediction')

evaluator_xgb = RegressionEvaluator(labelCol="tip_amount", predictionCol="xgb_prediction", metricName="rmse")
xgb_rmse = evaluator_xgb.evaluate(xgb_preds)

# Elapsed wall-clock time covers both training and test-set evaluation
xgb_time = time.perf_counter() - start_time

print(f"\n{'XGBoost RMSE:':<20} ${xgb_rmse:.4f}")
print(f"{'XGBoost Time:':<20} {xgb_time:.2f} seconds")

---

## Evaluate the Models

Finally, let's compare the test-set RMSE of our three models side by side.

### Compare Model Performance

Review the RMSE for each trained model (Linear Regression, Random Forest, and Distributed XGBoost) to compare their predictive performance on the test dataset.

In [None]:
from pyspark.sql.functions import col, lit

# Each model was scored separately on the same test split, so the RMSE values
# computed in the training cells are tabulated directly here. No ensemble is
# built: that would require joining the per-model predictions on a unique
# row id, which this notebook does not add.
results = [
    ('Linear Regression', lr_rmse),
    ('Random Forest', rf_rmse),
    ('XGBoost', xgb_rmse),
]
divider = "-" * 32

print(f"\n{'Model':<20} {'RMSE':>10}")
print(divider)
for model_name, rmse in results:
    print(f"{model_name:<20} ${rmse:>9.4f}")
print(divider)

### Stop Spark Session

Stop the Spark session to release the resources.

In [None]:
# Stop the SparkSession created in the setup section of this notebook.
spark.stop()