In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Propensity Modeling & Churn Predictions

## Overview

TheLook, a hypothetical eCommerce clothing retailer, stores data on customers, products, orders, logistics, web events, and digital marketing campaigns in BigQuery. The company wants to leverage the team's existing SQL and PySpark expertise to analyze this data using Apache Spark.

To avoid manual infrastructure provisioning or tuning for Spark, TheLook seeks an auto-scaling solution that allows them to focus on workloads rather than cluster management. Additionally, they want to minimize the effort required to integrate Spark and BigQuery while staying within the BigQuery Studio environment, possibly using BigQuery notebooks.

In this use case, we will demonstrate how to build a logistic regression classification model using PySpark to predict whether a user will make a purchase. The entire workflow is executed within a Colab Enterprise notebook in BigQuery Studio, taking advantage of the built-in serverless Spark engine. This approach allows our data science team to use familiar PySpark libraries for data exploration and model training directly on data stored in BigQuery, creating a seamless experience from data to model within a single, integrated environment.



## Setup

The following steps create resources that will be used throughout the tutorial.

In [None]:
!pip3 install torcheval google-cloud-bigquery google-cloud-storage scikit-learn matplotlib

In [None]:
import sys

# Authenticate your notebook environment (Colab only).
# Authentication is skipped unless the google.colab module has already been loaded.
in_colab = "google.colab" in sys.modules
if in_colab:
    from google.colab import auth

    auth.authenticate_user()

Enable the Dataproc API.

In [None]:
# Enable the Dataproc API (dataproc.googleapis.com) with the gcloud CLI.
# NOTE(review): this cell runs before the `gcloud config set project` cell further
# down, so it acts on whichever project gcloud is already configured with —
# confirm that is the intended project.
!gcloud services enable dataproc.googleapis.com

Install necessary libraries and then **restart the session** by clicking the down arrow next to **Run all** and clicking **Restart session**.

In [None]:
!if [ $(pip show numpy 2>/dev/null | grep 'Version:' | sed 's/Version: \([0-9]\+\.[0-9]\+\).*/\1/') != "1.26" ]; then pip install -U numpy==1.26; fi

Configure a project id and location.

In [None]:
# Google Cloud project ID and location used by later cells (gcloud configuration,
# bucket name, BigQuery client and dataset location). Replace the placeholder
# values before running the rest of the notebook.
PROJECT_ID = "PROJECT_ID" # @param {type:"string"}
LOCATION = "LOCATION" # @param {type:"string"}

In [None]:
# Point the gcloud CLI at the configured project; $PROJECT_ID is filled in from
# the Python variable of the same name.
!gcloud config set project $PROJECT_ID

Create a [Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets?utm_campaign=CDR_0x225cfd13_default_b407565440&utm_source=external&utm_medium=web) or set an existing one.


In [None]:
from google.cloud import storage

# Fail fast when the configuration cell was skipped or left at its defaults.
# The shipped placeholders ("PROJECT_ID" / "LOCATION") are non-empty strings, so a
# plain truthiness check alone would let them through.
if (
    not PROJECT_ID
    or PROJECT_ID == "PROJECT_ID"
    or not LOCATION
    or LOCATION == "LOCATION"
):
    raise ValueError("PROJECT_ID and LOCATION must be set before proceeding.")

# Bucket name derived from the project ID.
BUCKET_NAME = f"{PROJECT_ID}-demo"

# Uncomment to create a new bucket
# storage_client = storage.Client(project=PROJECT_ID)
# bucket_obj = storage_client.create_bucket(BUCKET_NAME, location=LOCATION)

## Configure Spark

*   Set up the Spark environment: It imports necessary
libraries for connecting to Dataproc and using PySpark.
*   Configure the Dataproc session: It creates and configures a Spark Session with the necessary parameters, providing the spark object for subsequent Spark operations.

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session named "App" with dynamic allocation turned off.
# The config key is a plain string literal: it has no placeholders, so no f-string
# prefix is needed.
spark = (
    SparkSession.builder
    .appName("App")
    .config("spark.dynamicAllocation.enabled", "false")
    .getOrCreate()
)

## Load Data


Load each table into Spark and register them as SparkSQL tables.

In [None]:
# Load the public thelook_ecommerce.users table through the BigQuery connector
# and expose it to Spark SQL as the temporary view "users".
users = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data.thelook_ecommerce.users")
    .load()
)
users.createOrReplaceTempView("users")

# Same for thelook_ecommerce.order_items, registered as the view "order_items".
order_items = (
    spark.read.format("bigquery")
    .option("table", "bigquery-public-data.thelook_ecommerce.order_items")
    .load()
)
order_items.createOrReplaceTempView("order_items")

## Data Exploration

BigQuery Studio can leverage Gemini for [advanced code completion capabilities](https://cloud.google.com/bigquery/docs/write-sql-gemini?utm_campaign=CDR_0x225cfd13_default_b407565440&utm_source=external&utm_medium=web#generate_python_code), which can use natural language to perform exploratory analysis using SQL and even generate PySpark code for feature engineering.

Try the following examples.

* **Prompt 1**: Use Spark to explore the users table and show the first 10 rows.
* **Prompt 2**: Use Spark to explore the order_items table and show the first 10 rows.
* **Prompt 3**: Generate PySpark code to show the top 5 most frequent countries in the users table. Display the country and the number of users from each country.
* **Prompt 4**: Generate PySpark code to find the average sale price of items in the order_items table.
* **Prompt 5**: Using the table "users", generate code to plot country vs traffic source using a suitable plotting library.
* **Prompt 6:** Create a histogram showing the distribution of "age", "country_hash", "gender_hash", "traffic_source_hash"

### Display the first 10 rows of the users table.

In [None]:
# prompt: Use Spark to explore the users table and show the first 10 rows.

# Print the first 10 rows of the users DataFrame.
users.show(n=10)

### Display the first 10 rows of the order_items table.

In [None]:
# prompt: Use Spark to explore the order_items table and show the first 10 rows.

# Print the first 10 rows of the order_items DataFrame.
order_items.show(n=10)

### Show the top 5 most frequent countries and their user counts.

In [None]:
# prompt: Generate PySpark code to show the top 5 most frequent countries in the users table. Display the country and the number of users from each country. All imports should use the Spark connect API, not the regular API.

from pyspark.sql.functions import col, count

# Count users per country, then keep the five largest groups.
users_per_country = users.groupBy("country").agg(count("*").alias("user_count"))
top_countries = users_per_country.orderBy(col("user_count").desc()).limit(5)
top_countries.show()

### Calculate the average sale price of items.

In [None]:
# prompt: Generate code to find the average sale price of items in the order_items table.

# Aggregate the whole table down to a single row holding avg(sale_price).
average_sale_price = order_items.agg({"sale_price": "avg"})
average_sale_price.show()

### Visualize the distribution of key user attributes.

In [None]:
# prompt: Create a histogram showing the distribution of "age", "country_hash", "gender_hash", "traffic_source_hash"

import matplotlib.pyplot as plt

# Convert to pandas for plotting. Only the four plotted columns are collected, so
# the remaining columns of the users table are never pulled onto the driver.
plot_columns = ["age", "country", "gender", "traffic_source"]
users_pd = users.select(*plot_columns).toPandas()

# 2x2 grid: one numeric histogram plus three categorical bar charts.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Distribution of Age, Country, Gender, and Traffic Source')

# Age histogram (rows with a missing age are dropped)
axes[0, 0].hist(users_pd['age'].dropna(), bins=20, edgecolor='black')
axes[0, 0].set_title('Age Distribution')
axes[0, 0].set_xlabel('Age')
axes[0, 0].set_ylabel('Frequency')

# Categorical columns are drawn as bar charts of their value counts.
categorical_panels = [
    ('country', 'Country', axes[0, 1]),
    ('gender', 'Gender', axes[1, 0]),
    ('traffic_source', 'Traffic Source', axes[1, 1]),
]
for column, title, ax in categorical_panels:
    users_pd[column].value_counts().plot(kind='bar', ax=ax)
    ax.set_title(f'{title} Distribution')
    ax.set_xlabel(title)
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', rotation=45)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

## Feature Engineering

In this step, we derive two key columns from the input data:

**Creation of features column**:
It combines user attributes (age, hashed categorical features) into a numerical array, preparing them for a machine learning model.

**Generation of label column:**
It creates a binary target variable indicating whether a user has made a purchase or not, derived from order information.

In [None]:
# Feature engineering in Spark SQL over the "users" and "order_items" views.
# One row per user:
#   features - array of age plus the hashed country, gender and traffic_source
#   label    - 1 when the user's summed order_items.sale_price is positive, else 0
#              (users with no matching order rows fall back to 0 via COALESCE)
feature_query = """
SELECT
  ARRAY(
        CAST(u.age AS DOUBLE),
        CAST(hash(u.country) AS BIGINT) * 1.0,
        CAST(hash(u.gender) AS BIGINT) * 1.0,
        CAST(hash(u.traffic_source) AS BIGINT) * 1.0
    ) AS features,
    CASE WHEN COALESCE(SUM(oi.sale_price), 0) > 0 THEN 1 ELSE 0 END AS label
FROM users AS u
LEFT JOIN order_items AS oi
ON u.id = oi.user_id
GROUP BY u.id, u.age, u.country, u.gender, u.traffic_source;
"""
features = spark.sql(feature_query)
features.show()

## Perform ML Task

This code trains a logistic regression model to predict user purchase behavior, with these steps:

**Feature Scaling:** StandardScaler scales the "features" column.

**Model Initialization:** LogisticRegression is set up to predict the binary "label" (purchase/no purchase), with hyperparameters for training.

**Pipeline Definition:** A Pipeline chains StandardScaler and LogisticRegression for streamlined scaling and training.

**Model Training:** `pipeline.fit(train_data)` trains the pipeline (scaling and then the model) on the training split.

**Prediction:** `pipeline_model.transform(test_data)` generates predictions on the held-out test split, and `transformed_dataset.show()` displays the results.

In short, this step scales features, trains a logistic regression model within a pipeline, and produces purchase predictions.

In [None]:
from pyspark.ml.connect.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.connect.evaluation import BinaryClassificationEvaluator
from pyspark.ml.connect.feature import StandardScaler
from pyspark.ml.connect.pipeline import Pipeline

# Seeded 90:10 split of the feature rows into training and test sets.
train_data, test_data = features.randomSplit(weights=[0.9, 0.1], seed=42)

# Stage 1: scale the raw "features" column into "scaled_features".
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Stage 2: logistic regression on the scaled features, predicting "label".
lr = LogisticRegression(
    featuresCol="scaled_features",
    labelCol="label",
    maxIter=30,
    learningRate=0.1,
)

# Chain both stages so that fitting and transforming apply them in order.
pipeline = Pipeline(stages=[scaler, lr])

Train the model.

**Note**: If you see the following logging error, please ignore it: OSError: [Errno 99] Cannot assign requested address

In [None]:
# Fit the model
# Train both pipeline stages (scaler, then logistic regression) on the training split.
pipeline_model = pipeline.fit(dataset=train_data)

### Generate and display predictions on the test dataset.

In [None]:
# Transform the dataset using the trained model
transformed_dataset = pipeline_model.transform(test_data)

# Print the new data
transformed_dataset.show()

## Evaluation

This code evaluates the trained model's performance by:

**Initializing an Evaluator:** A BinaryClassificationEvaluator is set up to calculate the Area Under the Precision-Recall Curve (AUC-PR).

**Calculating AUC-PR:** The evaluate() method calculates the AUC-PR score using the model's predictions.

This step quantifies the model's ability to distinguish between the two classes (e.g., purchase/no purchase).


Next, we will use natural-language code generation to visualize the output.

**Prompt 1:** Generate code to plot the Precision-Recall (PR) curve. Calculate precision and recall from the model's predictions and display the PR curve using a suitable plotting library.

**Prompt 2:** Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heatmap or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

In [None]:
# Model evaluation
eva = BinaryClassificationEvaluator(metricName="areaUnderPR")
aucPR = eva.evaluate(transformed_dataset)
print(f"AUC PR: {aucPR}")

## Visualization

Let's visualize the results to see how the model performs and what it predicted.

**Prompt 1:** Generate code to plot the Precision-Recall (PR) curve using a suitable plotting library.


**Prompt 2:** Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heat map or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).


### Plot the Precision-Recall (PR) Curve.

In [None]:
# prompt: Generate code to plot the Precision-Recall (PR) curve using a suitable plotting library.

from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt

# A precision-recall curve needs a continuous score per row. The hard 0/1
# "prediction" column only yields a single operating point, so the positive-class
# probability is used instead.
# NOTE(review): assumes the model writes per-class probabilities to its default
# "probability" column with index 1 = positive class — confirm against the
# pyspark.ml.connect LogisticRegression output schema.
scores_and_labels = transformed_dataset.select("probability", "label").collect()
y_scores = [float(row.probability[1]) for row in scores_and_labels]
y_true = [row.label for row in scores_and_labels]

# Calculate precision and recall across all score thresholds
precision, recall, _ = precision_recall_curve(y_true, y_scores)
pr_auc = auc(recall, precision)

# Plot the PR curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='blue', lw=2, label=f'PR curve (AUC = {pr_auc:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.grid(True)
plt.show()

### Visualize the Confusion Matrix.

In [None]:
# prompt: Generate code to create a confusion matrix visualization. Calculate the confusion matrix from the model's predictions and display it as a heatmap or a table with counts of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Get predictions and labels from the transformed dataset
predictions_and_labels = transformed_dataset.select("prediction", "label").collect()
y_pred = [row.prediction for row in predictions_and_labels]
y_true = [row.label for row in predictions_and_labels]

# Calculate the confusion matrix. Passing labels=[0, 1] forces a 2x2 result even
# when only one class appears in the data, which the two-entry tick labels and
# the four-way unpacking below both rely on.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

# Display the confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Display the confusion matrix as a table (row-major order: TN, FP, FN, TP)
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")

## Write Predictions to BigQuery

Use Gemini to write predictions to BigQuery.

**Prompt:** Use Python to create a new BigQuery dataset called predictions and then use Spark to write the new prediction data to a table in this dataset.

### Create BigQuery Dataset and Write Predictions.

In [None]:
# prompt: Use Python to create a new BigQuery dataset called predictions, and then use Spark to write the new prediction data to a table in this dataset.

from google.cloud import bigquery

# Construct a BigQuery client object.
client = bigquery.Client(project=PROJECT_ID)

# Fully qualified ID of the dataset that will hold the predictions.
dataset_id = f"{PROJECT_ID}.predictions"

# Construct a full Dataset object to send to the API.
dataset = bigquery.Dataset(dataset_id)

# Specify the geographic location where the dataset should reside.
dataset.location = LOCATION

# Create the dataset with an explicit timeout. exists_ok=True makes the cell safe
# to re-run when the dataset already exists, while real failures (permissions,
# invalid location, ...) surface here instead of being reported as "already
# exists" and then failing later in the Spark write.
dataset = client.create_dataset(dataset, exists_ok=True, timeout=30)  # Make an API request.
print(f"Dataset {client.project}.{dataset.dataset_id} is ready")

# Write the transformed_dataset to a BigQuery table, replacing any previous contents.
(
    transformed_dataset.write
    .format("bigquery")
    .option("table", f"{dataset_id}.user_purchase_predictions")
    .option("writeMethod", "direct")
    .mode("overwrite")
    .save()
)

print(f"Predictions written to BigQuery table: {dataset_id}.user_purchase_predictions")