In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Scenario

TheLook, a hypothetical eCommerce clothing retailer, stores data on customers, products, orders, logistics, web events, and digital marketing campaigns in BigQuery. The company wants to leverage the team's existing SQL and PySpark expertise to analyze this data using Apache Spark.

To avoid manual infrastructure provisioning or tuning for Spark, TheLook seeks an auto-scaling solution that allows them to focus on workloads rather than cluster management. Additionally, they want to minimize the effort required to integrate Spark and BigQuery while staying within the BigQuery Studio environment, possibly using BigQuery notebooks.

# Approach

In this use case, we will demonstrate how to build a logistic regression classification model using PySpark to predict whether a user will make a purchase. The entire workflow is executed within a Colab Enterprise notebook in BigQuery Studio, taking advantage of the built-in serverless Spark engine. This approach allows our data science team to use familiar PySpark libraries for data exploration and model training directly on data stored in BigQuery, creating a seamless experience from data to model within a single, integrated environment.



# **Step 1: Setup**

The following steps create resources that will be used throughout the tutorial.

Enable the following APIs and then **refresh the page**.

In [None]:
# Enable the cloudaicompanion and dataproc service APIs with the gcloud CLI.
# No --project flag is passed, so gcloud's currently configured project is used.
!gcloud services enable cloudaicompanion.googleapis.com dataproc.googleapis.com

Install necessary libraries.

In [None]:
!if [ $(pip show numpy 2>/dev/null | grep 'Version:' | sed 's/Version: \([0-9]\+\.[0-9]\+\).*/\1/') != "1.26" ]; then pip install -U numpy==1.26; fi

Configure a project id and location.

In [None]:
# Google Cloud project id. Later cells use it to derive the bucket name and the
# fully qualified BigQuery table that receives the predictions.
PROJECT_ID = "" # @param {type:"string"}

# Location referenced by the (commented-out) bucket creation call in the next cell.
LOCATION = "" # @param {type:"string"}

Create a [Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets?utm_campaign=CDR_0x225cfd13_default_b407565440&utm_source=external&utm_medium=web) or set an existing one.


In [None]:
from google.cloud import storage

# Bucket name derived from the project id configured above.
BUCKET_NAME = PROJECT_ID + "-demo"

# The bucket is not created automatically. To create it from this notebook,
# uncomment the two lines below; otherwise make sure it already exists.
# storage_client = storage.Client(project=PROJECT_ID)
# bucket_obj = storage_client.create_bucket(BUCKET_NAME, location=LOCATION)

# **Step 2: Configure Spark**

*   Set up the Spark environment: It imports necessary
libraries for connecting to Dataproc and using PySpark.
*   Configure the Dataproc session: It creates and configures a Spark Session with the necessary parameters, providing the spark object for subsequent Spark operations.

This step can also be accomplished in a single line of code below.



```
spark = DataprocSparkSession.builder.getOrCreate()
```





In [None]:
from google.cloud.dataproc_spark_connect import DataprocSparkSession
from google.cloud.dataproc_v1 import Session

# Describe the session: runtime version plus optional Spark properties.
# See https://cloud.google.com/dataproc-serverless/docs/concepts/properties.
session = Session()
session.runtime_config.version = "2.3"
session.runtime_config.properties = {"spark.dynamicAllocation.enabled": "false"}

# Build (or reuse) the Spark session with the configuration above; `spark` is
# the handle every later cell uses.
session_builder = DataprocSparkSession.builder.appName("CustomSparkSession")
spark = session_builder.dataprocSessionConfig(session).getOrCreate()

# **Step 3: Load data**


Load each table into Spark and register them as SparkSQL tables.

In [None]:
# Fully qualified ids of the public TheLook tables used in this tutorial.
users_table = "bigquery-public-data.thelook_ecommerce.users"
order_items_table = "bigquery-public-data.thelook_ecommerce.order_items"

# Load the users table from BigQuery and expose it to Spark SQL as `users`.
users = (
    spark.read.format("bigquery")
    .option("table", users_table)
    .load()
)
users.createOrReplaceTempView("users")

# Load the order_items table from BigQuery and expose it to Spark SQL as `order_items`.
order_items = (
    spark.read.format("bigquery")
    .option("table", order_items_table)
    .load()
)
order_items.createOrReplaceTempView("order_items")

# **Step 4: Data exploration**

BigQuery Studio can leverage Gemini for [advanced code completion capabilities](https://cloud.google.com/bigquery/docs/write-sql-gemini?utm_campaign=CDR_0x225cfd13_default_b407565440&utm_source=external&utm_medium=web#generate_python_code), which let you use natural language to perform exploratory analysis with SQL and even generate PySpark code for feature engineering.

Try the following examples.

**Prompt 1**: Use Spark to explore the users table and show the first 10 rows.

**Prompt 2**: Use Spark to explore the order_items table and show the first 10 rows.

**Prompt 3**: Generate PySpark code to show the top 5 most frequent countries in the users table. Display the country and the number of users from each country.

**Prompt 4**: Generate PySpark code to find the average sale price of items in the order_items table.

**Prompt 5**: Using the table "my_dataset.users", generate code to plot country vs traffic source using a suitable plotting library.

**Prompt 6:** Create a histogram showing the distribution of "age", "country_hash", "gender_hash", "traffic_source_hash"

In [None]:
# prompt: Use Spark to explore the users table and show the first 10 rows.

# Print the first 10 rows of the users DataFrame.
users.show(n=10)

In [None]:
# prompt: Use Spark to explore the order_items table and show the first 10 rows.

# Print the first 10 rows of the order_items DataFrame.
order_items.show(n=10)

In [None]:
# prompt: Generate PySpark code to show the top 5 most frequent countries in the users table. Display the country and the number of users from each country. All imports should use the Spark connect API, not the regular API.

from pyspark.sql.functions import col, count, desc

# Number of users per country, stored in a column named "count".
users_per_country = users.groupBy("country").agg(count("*").alias("count"))

# Keep the five largest groups and print them.
users_per_country.orderBy(desc("count")).limit(5).show()

In [None]:
# prompt: Generate code to find the average sale price of items in the order_items table. All imports should use the Spark connect API, not the regular API.

from pyspark.sql.functions import avg

# Global aggregate: mean of sale_price over every row of order_items.
order_items.select(avg("sale_price")).show()

In [None]:
# prompt: Create a histogram showing the distribution of "age", "country_hash", "gender_hash", "traffic_source_hash"

import hashlib

def hash_col(df, col):
    """Add a ``<col>_hash`` column with a numeric SHA-256 bucket of each value.

    Every value of ``df[col]`` is UTF-8 encoded, hashed with SHA-256, read as
    a base-16 integer and reduced modulo 10**8.

    Args:
        df: pandas DataFrame; the new column is written onto this frame.
        col: name of the column whose values are hashed. Values must support
            ``.encode``.

    Returns:
        The same ``df`` object, now carrying the extra ``<col>_hash`` column.
    """
    def _bucket(value):
        digest = hashlib.sha256(value.encode('utf-8')).hexdigest()
        return int(digest, 16) % 10**8

    hashed_name = f'{col}_hash'
    df[hashed_name] = df[col].apply(_bucket)
    return df

# Bring the users table into pandas and append one hash column per categorical field.
users_df = users.toPandas()
for categorical in ('country', 'gender', 'traffic_source'):
    users_df = hash_col(users_df, categorical)

import matplotlib.pyplot as plt

# (column, panel title) pairs, drawn left to right in a single row.
panels = [
    ('age', 'Age Distribution'),
    ('country_hash', 'Country Hash Distribution'),
    ('gender_hash', 'Gender Hash Distribution'),
    ('traffic_source_hash', 'Traffic Source Hash Distribution'),
]

plt.figure(figsize=(15, 5))
for position, (column, title) in enumerate(panels, start=1):
    plt.subplot(1, 4, position)
    plt.hist(users_df[column], bins=20)
    plt.title(title)

plt.tight_layout()
plt.show()

# **Step 4: Feature Engineering**

In this step, we derive two key columns from the input data:

**Creation of features column**:
It combines user attributes (age, hashed categorical features) into a numerical array, preparing them for a machine learning model.

**Generation of label column:**
It creates a binary target variable indicating whether a user has made a purchase or not, derived from order information.

In [None]:
# Feature engineering in Spark SQL over the `users` and `order_items` temp views.
# Each output row corresponds to one user group and carries:
#   features - array of [age, hash(country), hash(gender), hash(traffic_source)]
#   label    - 1 when the user's summed sale_price is greater than 0, else 0
feature_query = """
SELECT
  ARRAY(
        CAST(u.age AS DOUBLE),
        CAST(hash(u.country) AS BIGINT) * 1.0,
        CAST(hash(u.gender) AS BIGINT) * 1.0,
        CAST(hash(u.traffic_source) AS BIGINT) * 1.0
    ) AS features,
    CASE WHEN COALESCE(SUM(oi.sale_price), 0) > 0 THEN 1 ELSE 0 END AS label
FROM users AS u
LEFT JOIN order_items AS oi
ON u.id = oi.user_id
GROUP BY u.id, u.age, u.country, u.gender, u.traffic_source;
"""
features = spark.sql(feature_query)

# Preview the engineered rows.
features.show()

# **Step 5: Perform ML Task**

This code trains a logistic regression model to predict user purchase behavior, with these steps:

**Feature Scaling:** StandardScaler scales the "features" column.

**Model Initialization:** LogisticRegression is set up to predict the binary "label" (purchase/no purchase), with hyperparameters for training.

**Pipeline Definition:** A Pipeline chains StandardScaler and LogisticRegression for streamlined scaling and training.

**Model Training:** The data is first split into training and test sets; `pipeline.fit(train_data)` then trains the pipeline (scaling and then the model) on the training set.

**Prediction:** `pipeline_model.transform(test_data)` generates predictions for the held-out test set, and `transformed_dataset.show()` displays the results.

In short, this step scales features, trains a logistic regression model within a pipeline, and produces purchase predictions.

In [None]:
from pyspark.ml.connect.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.connect.evaluation import BinaryClassificationEvaluator
from pyspark.ml.connect.feature import StandardScaler
from pyspark.ml.connect.pipeline import Pipeline

# Split into train and test data (90:10) with a fixed seed.
split_weights = [0.9, 0.1]
train_data, test_data = features.randomSplit(split_weights, seed=42)

# Stage 1: scale the raw "features" array into "scaled_features".
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Stage 2: logistic regression on the scaled features, predicting "label".
lr = LogisticRegression(
    maxIter=30,
    learningRate=0.1,
    featuresCol="scaled_features",
    labelCol="label",
)

# Chain both stages so they are fitted and applied together.
pipeline = Pipeline(stages=[scaler, lr])

Train the model.

**Note**: If you see the following logging error, please ignore it: ```OSError: [Errno 99] Cannot assign requested address```

In [None]:
# Fit the model
# Fits the pipeline (StandardScaler followed by LogisticRegression) on the training split only.
pipeline_model = pipeline.fit(train_data)

In [None]:
# Transform the dataset using the trained model
# Applies the fitted pipeline to the held-out test split; later cells read the
# "label" and "prediction" columns from this result.
transformed_dataset = pipeline_model.transform(test_data)

# Print the new data
transformed_dataset.show()

# **Step 6: Evaluation**

This code evaluates the trained model's performance by:

**Initializing an Evaluator:** A BinaryClassificationEvaluator is set up to calculate the Area Under the Precision-Recall Curve (AUC-PR).

**Calculating AUC-PR:** The evaluate() method calculates the AUC-PR score using the model's predictions.

This step quantifies the model's ability to distinguish between the two classes (e.g., purchase/no purchase).


Further we will use NLP2SQL code generation to visualize the output

**Prompt 1:** Generate code to plot the Precision-Recall (PR) curve. Calculate precision and recall from the model's predictions and display the PR curve using a suitable plotting library.

**Prompt 2:** Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heatmap or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

In [None]:
!pip install torcheval

In [None]:
# Model evaluation
# Configure the evaluator for the area-under-precision-recall-curve metric,
# then score the transformed test split and print the result.
eva = BinaryClassificationEvaluator(metricName="areaUnderPR")
aucPR = eva.evaluate(transformed_dataset)
print(f"AUC PR: {aucPR}")

# **Step 7: Visualization**

Let's visualize the results to see how our model performs, and how it has predicted.

**Prompt 1:** Generate code to plot the Precision-Recall (PR) curve. Calculate precision and recall from the model's predictions and display the PR curve using a suitable plotting library.


**Prompt 2:** Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heat map or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).


In [None]:
# prompt: Generate code to plot the Precision-Recall (PR) curve. Calculate precision and recall from the model's predictions and display the PR curve using a suitable plotting library.

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
import pandas as pd

# Collect every needed column in ONE Spark action. Two separate toPandas() calls
# run the plan twice, and the row order of the two results is not guaranteed to
# match, which would silently misalign labels and predictions.
scored_pd = transformed_dataset.select("label", "prediction", "probability").toPandas()
labels = scored_pd[["label"]]
predictions = scored_pd[["prediction"]]

# A PR curve needs a continuous score; hard 0/1 predictions collapse it to a
# single operating point. Use the positive-class probability instead.
# NOTE(review): assumes the model emits a "probability" array column (the default
# probabilityCol) whose second entry is the class-1 probability — confirm.
positive_scores = scored_pd["probability"].map(lambda probs: float(probs[1]))

# Calculate precision and recall across all score thresholds
precision, recall, _ = precision_recall_curve(scored_pd["label"], positive_scores)

# Plot the PR curve
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.show()

In [None]:
# prompt: Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heatmap or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Collect labels and predictions in ONE Spark action. Two separate toPandas()
# calls run the plan twice, and the row order of the two results is not
# guaranteed to match, which would silently misalign the pairs.
scored_pd = transformed_dataset.select("label", "prediction").toPandas()
labels = scored_pd[["label"]]
predictions = scored_pd[["prediction"]]

# Calculate the confusion matrix (rows: true label, columns: predicted label)
cm = confusion_matrix(labels, predictions)

# Create a heatmap of the confusion matrix
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# prompt: Generate code to plot the Precision-Recall (PR) curve. Calculate precision and recall from the model's predictions and display the PR curve using a suitable plotting library.

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
import pandas as pd

# Convert Spark DataFrame to Pandas DataFrame (single action keeps rows aligned)
transformed_pd = transformed_dataset.select("label", "prediction", "probability").toPandas()

# A PR curve needs a continuous score; hard 0/1 predictions collapse it to a
# single operating point. Use the positive-class probability instead.
# NOTE(review): assumes the model emits a "probability" array column (the default
# probabilityCol) whose second entry is the class-1 probability — confirm.
positive_scores = transformed_pd["probability"].map(lambda probs: float(probs[1]))

# Calculate precision and recall across all score thresholds
precision, recall, _ = precision_recall_curve(transformed_pd["label"], positive_scores)

# Plot the PR curve
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.show()

In [None]:
# prompt: Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heatmap or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Assuming 'transformed_dataset' is a Spark DataFrame with 'label' and 'prediction' columns
predictions_pd = transformed_dataset.select('label', 'prediction').toPandas()

# Calculate the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 even when one class never appears in the
# test split; without it the tick labels, the table and the 4-way unpacking
# below would fail on a smaller matrix.
cm = confusion_matrix(predictions_pd['label'], predictions_pd['prediction'], labels=[0, 1])

# Create a heatmap visualization
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()


# Display the confusion matrix as a table.
# The heading is printed on its own line so the table's column header stays aligned.
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
print("Confusion Matrix (Table):")
print(cm_df)

# Extract TP, TN, FP, FN (row-major order of a 2x2 matrix: tn, fp, fn, tp)
tn, fp, fn, tp = cm.ravel()
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

In [None]:
# Import everything this cell uses so it does not depend on earlier
# visualization cells having been run first.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Convert feature list into individual columns; the names follow the order in
# which the feature-engineering query builds the array.
pdf = features.select("features", "label").toPandas()
feature_df = pd.DataFrame(pdf["features"].tolist(), columns=["age", "country_hash", "gender_hash", "traffic_source_hash"])

# Plot histograms (one panel per feature)
feature_df.hist(figsize=(10, 6), bins=30, color="dodgerblue", alpha=0.7)
plt.suptitle("Feature Distributions", fontsize=14)
plt.show()

# **Step 8: Write Predictions to BigQuery**

Use Gemini to write predictions to BigQuery.

**Prompt:** Using Spark, write the transformed dataset to BigQuery.

In [None]:
# prompt: Using Spark, write the transformed dataset to BigQuery.

# Destination table id, resolved against the configured project.
# NOTE(review): presumably the `my_dataset` dataset must already exist — confirm.
predictions_table = f"{PROJECT_ID}.my_dataset.predictions"

# Write the transformed dataset to BigQuery, replacing any existing table contents.
(
    transformed_dataset.write
    .format("bigquery")
    .option("table", predictions_table)
    .mode("overwrite")
    .save()
)