# Auto Audience Segmentation (Interest based)

### Install (and update) additional packages

Install the following packages required to execute this notebook. 

In [1]:
!python --version

Python 3.9.2


In [2]:
import os

# The Vertex AI Workbench Notebook product has specific requirements
IS_WORKBENCH_NOTEBOOK = os.getenv("DL_ANACONDA_HOME")
IS_USER_MANAGED_WORKBENCH_NOTEBOOK = os.path.exists(
    "/opt/deeplearning/metadata/env_version"
)

# Vertex AI Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_WORKBENCH_NOTEBOOK:
    USER_FLAG = "--user"

! pip3 install --upgrade google-cloud-aiplatform {USER_FLAG} -q google-cloud-bigquery db-dtypes
! pip3 install -q --upgrade optuna==3.2.0 {USER_FLAG}
! pip3 install -q --upgrade scikit-learn==1.2.* {USER_FLAG}
! pip3 install -q --upgrade plotly==5.16.0 matplotlib==3.7.2 seaborn==0.12.2 {USER_FLAG}

### Restart the kernel

After you install the additional packages, you need to restart the notebook kernel so it can find the packages.

In [4]:
!pip3 freeze | grep -e 'scikit'

scikit-learn==1.2.2


In [None]:
# Automatically restart kernel after installs
import os

if not os.getenv("IS_TESTING"):
    # Automatically restart kernel after installs
    import IPython

    app = IPython.Application.instance()
    app.kernel.do_shutdown(True)

## Before you begin

#### Set your project ID

**If you don't know your project ID**, you may be able to get your project ID using `gcloud`.

In [6]:
!gcloud config set project maj-train-infer-auto

Updated property [core/project].


In [7]:
PROJECT_ID = "maj-train-infer-auto"

# Get your Google Cloud project ID from gcloud
import os

if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

Project ID: maj-train-infer-auto


Otherwise, set your project ID here.

In [8]:
if PROJECT_ID == "" or PROJECT_ID is None:
    PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
    
print ("Your set Project ID is:", PROJECT_ID)

Your set Project ID is: maj-train-infer-auto


In [9]:
REGION = "[your-region]"  # @param {type: "string"}

if REGION == "[your-region]":
    REGION = "us-central1"
    
print ('REGION:', REGION)

REGION: us-central1


### Authenticate your Google Cloud account

**If you are using Vertex AI Workbench Notebooks**, your environment is already
authenticated. Skip this step.

**If you are using Colab**, run the cell below and follow the instructions
when prompted to authenticate your account via oAuth.

**Otherwise**, follow these steps:

1. In the Cloud Console, go to the <a href="https://console.cloud.google.com/apis/credentials/serviceaccountkey" target="_blank">**Create service account key** page</a>.

2. Click **Create service account**.

3. In the **Service account name** field, enter a name, and
   click **Create**.

4. In the **Grant this service account access to project** section, click the **Role** drop-down list. Type "Vertex AI"
into the filter box, and select
   **Vertex AI Administrator**. Type "Storage Object Admin" into the filter box, and select **Storage Object Admin**.

5. Click *Create*. A JSON file that contains your key downloads to your
local environment.

6. Enter the path to your service account key as the
`GOOGLE_APPLICATION_CREDENTIALS` variable in the cell below and run the cell.

In [10]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account.

import os
import sys

# If on Vertex AI Workbench, then don't execute this code
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv(
    "DL_ANACONDA_HOME"
):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    # elif not os.getenv("IS_TESTING"):
    #     %env GOOGLE_APPLICATION_CREDENTIALS ''

In [11]:
SERVICE_ACCOUNT = "[your-service-account]"  # @param {type:"string"}

In [12]:
if (
    SERVICE_ACCOUNT == ""
    or SERVICE_ACCOUNT is None
    or SERVICE_ACCOUNT == "[your-service-account]"
):
    # Get your service account from gcloud
    if not IS_COLAB:
        shell_output = !gcloud config list account --format "value(core.account)"
        SERVICE_ACCOUNT = shell_output[0].strip()

    else:  # IS_COLAB:
        shell_output = ! gcloud projects describe  $PROJECT_ID
        project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
        SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"

    print("Service Account:", SERVICE_ACCOUNT)

Service Account: ctimoteo@google.com


### Import libraries

In [13]:
import numpy as np
import pandas as pd
pd.options.plotting.backend = "plotly"

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly

import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
from google.cloud import bigquery
import jinja2
import re

import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import silhouette_samples, silhouette_score

The scikit-learn version is 1.2.2.


  from .autonotebook import tqdm as notebook_tqdm


### Configuration

In [15]:
#@title Settings
SRC_PROJECT_ID = 'maj-train-infer-auto' #@param {type:"string"}
SRC_DATASET_ID = 'marketing_ga4_v1_prod' #@param {type:"string"}
DST_DATASET_ID = 'auto_audience_segmentation' #@param {type:"string"}

DATE_START = "2023-01-01" #@param {type:"date"}
DATE_END = "2023-12-31" #@param {type:"date"}
LOOKBACK_DAYS = 15 #@param {type:"integer"}

##### Creating BigQuery home dataset

In [16]:
from google.cloud import bigquery
client = bigquery.Client(project=PROJECT_ID)
dataset = bigquery.Dataset(f"{PROJECT_ID}.{DST_DATASET_ID}")
dataset.location = "US"
dataset = client.create_dataset(dataset, exists_ok=True, timeout=30)
print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

Created dataset maj-train-infer-auto.auto_audience_segmentation


## Creating Auto Generated Dataset

In [23]:
#@markdown RE_PAGE PATH is the regex expression that tells the query what part of page path to extract. Example: ^https://your-website.com(/[a-z-0-9]*/?).*
RE_PAGE_PATH = '^https://shop.googlemerchandisestore.com/([-a-zA-Z0-9@:%_+.~#?//=]*)$' #@param {type:"string"}

#@markdown PERC_KEEP is the percent of cumulative traffic you'd like to keep. (Give me all pages/folders which combine for up to X% of all traffic)
PERC_KEEP = 35 #@param {type:"slider", min:1, max:99, step:1}

In [24]:
client = bigquery.Client(project=PROJECT_ID)

In [25]:
sql = f"""
SELECT
    feature,
    ROUND(100 * SUM(users) OVER (ORDER BY users DESC) / SUM(users) OVER (), 2) as cumulative_traffic_percent,

FROM (
    SELECT
        REGEXP_EXTRACT(page_path, @RE_PAGE_PATH) as feature,
        COUNT(DISTINCT user_id) as users

    FROM (
        SELECT
            user_pseudo_id as user_id,
            page_location as page_path
        FROM `{SRC_PROJECT_ID}.{SRC_DATASET_ID}.event`
        WHERE
            event_name = 'page_view'
            AND DATE(event_timestamp) BETWEEN @DATE_START AND @DATE_END
    )
    GROUP BY 1
)
WHERE
    feature IS NOT NULL
QUALIFY
    cumulative_traffic_percent <= @PERC_KEEP
ORDER BY 2 ASC
"""

In [33]:
sql = """
SELECT
    feature,
    ROUND(100 * SUM(users) OVER (ORDER BY users DESC) / SUM(users) OVER (), 2) as cumulative_traffic_percent,

FROM (
    SELECT
        REGEXP_EXTRACT(page_path, '^https://shop.googlemerchandisestore.com/([-a-zA-Z0-9@:%_+.~#?//=]*)$') as feature,
        COUNT(DISTINCT user_id) as users

    FROM (
        SELECT
            user_pseudo_id as user_id,
            page_location as page_path
        FROM `maj-train-infer-auto.marketing_ga4_v1_prod.event`
        WHERE
            event_name = 'page_view'
            AND DATE(event_timestamp) BETWEEN '2023-01-01' AND '2023-12-31'
    )
    GROUP BY 1
)
WHERE
    feature IS NOT NULL
QUALIFY
    cumulative_traffic_percent <= 35
ORDER BY 2 ASC
"""

In [34]:
client.query(query=sql,
                  job_config=bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter("DATE_START", "DATE", DATE_START),
                        bigquery.ScalarQueryParameter("DATE_END", "DATE", DATE_END),
                        bigquery.ScalarQueryParameter("RE_PAGE_PATH", "STRING", RE_PAGE_PATH),
                        bigquery.ScalarQueryParameter("PERC_KEEP", "FLOAT64", PERC_KEEP)
                    ]
                ))

QueryJob<project=maj-train-infer-auto, location=US, id=90bed35f-43cc-4f65-af81-a06338d23203>

In [35]:
df = client.query(query=sql,
                  job_config=bigquery.QueryJobConfig(
                    query_parameters=[
                        bigquery.ScalarQueryParameter("DATE_START", "DATE", DATE_START),
                        bigquery.ScalarQueryParameter("DATE_END", "DATE", DATE_END),
                        bigquery.ScalarQueryParameter("RE_PAGE_PATH", "STRING", RE_PAGE_PATH),
                        bigquery.ScalarQueryParameter("PERC_KEEP", "FLOAT64", PERC_KEEP)
                    ]
                )
).to_dataframe()

In [28]:
print (f'Number of page path categories kept: {len(df)}')

Number of page path categories kept: 13


In [36]:
df

Unnamed: 0,feature,cumulative_traffic_percent
0,,10.31
1,Google+Redesign/Apparel/Mens?sortci=newest+desc,13.23
2,Google+Redesign/Clearance,16.02
3,basket.html,18.47
4,Google+Redesign/Stationery?sortci=newest+desc,20.67
5,Google+Redesign/Lifestyle/Drinkware,22.85
6,signin.html,25.02
7,Google+Redesign/New?sortci=newest+desc,27.1
8,Google+Redesign/Lifestyle/Bags,29.0
9,Google+Redesign/Apparel?sortci=newest+desc,30.71


In [37]:
def _clean_column_values(f):
    if f == '/' or f == '' or f is None: return 'homepage'
    if f.startswith('/'): f = f[1:]
    if f.endswith('/'): f = f[:-1]
    return re.sub('[^0-9a-zA-Z]+', '_', f)

In [42]:
t = jinja2.Template("""
CREATE OR REPLACE PROCEDURE {{ DST_DATASET_ID }}.create_auto_audience_segmentation_dataset(
  DATE_START DATE, DATE_END DATE, LOOKBACK_DAYS INT64
)
BEGIN

    DECLARE RE_PAGE_PATH STRING DEFAULT "{{ re_page_path|e }}";
    
    CREATE OR REPLACE TABLE `{{ DST_DATASET_ID }}.auto_audience_segmentation_full_dataset`
    AS
    WITH 
        visitor_pool AS (
            SELECT
              user_pseudo_id,
              MAX(event_timestamp) as feature_timestamp,
              DATE(MAX(event_timestamp)) - LOOKBACK_DAYS as date_lookback
            FROM `{{ PROJECT_ID }}.{{ DATASET_ID }}.event`
            WHERE DATE(event_timestamp) BETWEEN DATE_START AND DATE_END
            GROUP BY 1
    )

    SELECT
        user_id,
        feature_timestamp,
        {% for f in features %}COUNTIF( REGEXP_EXTRACT(page_path, RE_PAGE_PATH) = '{{ f }}' ) as {{ clean_column_values(f) }},
        {% endfor %}
    FROM (
        SELECT
            vp.feature_timestamp,
            ga.user_pseudo_id as user_id,
            page_location as page_path
        FROM `{{ PROJECT_ID }}.{{ DATASET_ID }}.event` as ga
        INNER JOIN visitor_pool as vp
            ON vp.user_pseudo_id = ga.user_pseudo_id
                AND DATE(ga.event_timestamp) >= vp.date_lookback
        WHERE
            event_name = 'page_view'
            AND DATE(ga.event_timestamp) BETWEEN DATE_START - LOOKBACK_DAYS AND DATE_END
    )
    GROUP BY 1, 2;

END
""")
t.globals.update({'clean_column_values': _clean_column_values})

In [43]:
sql

'\nCREATE OR REPLACE PROCEDURE auto_audience_segmentation.create_auto_audience_segmentation_dataset(\n  DATE_START DATE, DATE_END DATE, LOOKBACK_DAYS INT64\n)\nBEGIN\n\n    DECLARE RE_PAGE_PATH STRING DEFAULT "^https://shop.googlemerchandisestore.com/([-a-zA-Z0-9@:%_+.~#?//=]*)$";\n    \n    CREATE OR REPLACE TABLE `auto_audience_segmentation.auto_audience_segmentation_full_dataset`\n    AS\n    WITH \n        visitor_pool AS (\n            SELECT\n              user_pseudo_id,\n              MAX(event_timestamp) as feature_timestamp,\n              DATE(MAX(event_timestamp)) - LOOKBACK_DAYS as date_lookback\n            FROM `maj-train-infer-auto.marketing_ga4_v1_prod.event`\n            WHERE DATE(event_timestamp) BETWEEN DATE_START AND DATE_END\n            GROUP BY 1\n    )\n\n    SELECT\n        user_id,\n        feature_timestamp,\n        COUNTIF( REGEXP_EXTRACT(page_path, RE_PAGE_PATH) = \'\' ) as homepage,\n        COUNTIF( REGEXP_EXTRACT(page_path, RE_PAGE_PATH) = \'Google+Re

In [39]:
sql = t.render(
        PROJECT_ID=SRC_PROJECT_ID,
        DATASET_ID=SRC_DATASET_ID,
        DST_DATASET_ID=DST_DATASET_ID,
        re_page_path=RE_PAGE_PATH,
        features=df.feature.tolist()
)
client.query(query=sql).result()
client.query(
    query=f"CALL `{PROJECT_ID}.{DST_DATASET_ID}.create_auto_audience_segmentation_dataset`(@DATE_START, @DATE_END, @LOOKBACK_DAYS);",
    job_config=bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("DATE_START", "DATE", DATE_START),
            bigquery.ScalarQueryParameter("DATE_END", "DATE", DATE_END),
            bigquery.ScalarQueryParameter("LOOKBACK_DAYS", "INTEGER", LOOKBACK_DAYS)
        ]
    )
).result()

<google.cloud.bigquery.table._EmptyRowIterator at 0x7bb5f30c6940>

In [40]:
df = client.query(query=f"SELECT * FROM `{PROJECT_ID}.{DST_DATASET_ID}.auto_audience_segmentation_full_dataset`").to_dataframe()

In [None]:
df

## Cluster Model [for Interests on Site]

In [None]:
X = df.copy()
features = list(X.columns[2:])  # need to skip first two columns -> user_id, feature_timestamp
min_num_clusters = 3
max_num_clusters = len(features)

In [None]:
def create_model(params):
    model = Pipeline([
        ('transform', ColumnTransformer(
            transformers=[
                ('tfidf',
                 TfidfTransformer(norm='l2'),
                 list(range(2, len(features) + 2))  # need to skip first two columns -> user_id, feature_timestamp
                )
            ]
        )),
        ('model', KMeans(
            init='k-means++', n_init='auto',
            random_state=42,
            **params)
        )
    ])

    return model

def objective(trial):
    params = {
      "n_clusters": trial.suggest_int("n_clusters", min_num_clusters, max_num_clusters),
      "max_iter": trial.suggest_int("max_iter", 10, 1000, step=10),
      "tol": trial.suggest_float("tol", 1e-6, 1e-2, step=1e-6),
    }

    model = create_model(params)
    model.fit(X)
    labels = model.predict(X)

    return silhouette_score(
        model.named_steps['transform'].transform(X),
        labels, metric='euclidean',
        sample_size=int(len(df) * 0.1) if int(len(df) * 0.1) < 10_000 else 10_000,
        random_state=42
    ), params['n_clusters']

In [None]:
study = optuna.create_study(
    directions=["maximize", "minimize"],
    sampler=optuna.samplers.TPESampler(seed=42, n_startup_trials=25)
)
study.optimize(objective,
               n_trials=125,
               show_progress_bar=True,
               n_jobs=-1
)

### Optimization Results

In [None]:
fig = optuna.visualization.plot_pareto_front(study, target_names=['Silhuette', 'Num. Clusters'], include_dominated_trials=False)
fig.show()

#### Find a trial with the least number clusters while still retaining sufficient performance.
**P_WIGGLE** is max percentage a trial can be worse than the best trial to be considered based on the Silhuette Score.

In [None]:
P_WIGGLE = 10 #@param {type:"slider", min:1, max:99, step:1}

In [None]:
best_trials = sorted([(t.number, t.values[0], t.values[1], t.params) for t in study.best_trials], key=lambda x: x[1], reverse=True)
best_score = best_trials[0][1]
best_trials = sorted([(t.number, t.values[0], t.values[1], t.params) for t in study.best_trials], key=lambda x: (x[2], x[1]))
trial_chosen = None
for t in best_trials:
    if (1 - t[1]/best_score) <= P_WIGGLE/100:
        print (f'TRIAL {t[0]}:')
        print (f" Num. clusters: {int(t[2])}")
        print (f" Best score: {round(best_score, 4)} / Chosen trial Score: {round(t[1], 4)}")
        print (f" % worse than best: {100 * round((1 - t[1]/best_score), 4)}%")
        print (f" Params: {t[3]}")

        trial_chosen = t
        break

model = create_model(trial_chosen[3])
model.fit(X)
labels = model.predict(X)

## Visualization

### Silhouette Analysis

In [None]:
def silhouette_visualization(X, model):
    np.random.seed(42)
    # Create a subplot with 1 row and 2 columns
    fig, ax1 = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)

    sample = np.random.choice(len(X), 
                              size=int(len(X) * 0.1) if int(len(X) * 0.1) < 10_000 else 10_000)
    model_cluster_centers = model.named_steps['model'].cluster_centers_
    X_tr = model.named_steps['transform'].transform(X)

    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(sample) + (len(model_cluster_centers) + 1) * 10])

    cluster_labels = model.predict(X)
    print("Clustering done for n_clusters={}".format(len(model_cluster_centers)))

    cluster_labels = cluster_labels[sample]

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X_tr[ sample, :], cluster_labels)
    print("For n_clusters =", len(model_cluster_centers),
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample

    sample_silhouette_values = silhouette_samples(X_tr[ sample, :], cluster_labels)

    y_lower = 10
    for i in range(len(model_cluster_centers)):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / len(model_cluster_centers))
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(f"Silhouette analysis for KMeans clustering on sample data with n_clusters = {len(model_cluster_centers)}", 
                 fontsize=14, fontweight='bold')

    plt.show()

silhouette_visualization(X, model)

### Heatmap

In [None]:
model_cluster_centers = pd.DataFrame(model.named_steps['model'].cluster_centers_, columns=features)
mcc = model_cluster_centers.T / model_cluster_centers.T.sum(axis=1).values.reshape(1, len(features)).T

sns.set(font_scale=0.75)
fig, ax = plt.subplots(figsize=(int(model_cluster_centers.shape[1]/2), model_cluster_centers.shape[0]/2))
_ = sns.heatmap(mcc, ax=ax, cbar=False, annot=np.round(model_cluster_centers, 2).T)
ax.set_title('Heatmap')
ax.set(xlabel='Cluster')
plt.yticks(rotation=0)
plt.show()

### Cluster Sizes

In [None]:
c = np.array(np.bincount(labels), dtype=np.float64)
c /= c.sum() /100
df_c = pd.DataFrame(c, index=[f"Cluster {n}" for n in range(c.size)], columns=['Size %'])
df_c.plot.bar()

## [Optional] Deployment to Vertex AI Model Registry

In [None]:
import pickle
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f) 

In [None]:
MODEL_NAME = f"interest-cluster-model"
GCS_BUCKET=f'{PROJECT_ID}-maj-models'
ARTIFACT_GCS_URI = f"gs://{GCS_BUCKET}/{MODEL_NAME}"
PREBUILT_CONTAINER_URI = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest"

In [None]:
!gsutil mb -p $PROJECT_ID -l $REGION gs://$GCS_BUCKET

In [None]:
!gsutil cp model.pkl $ARTIFACT_GCS_URI/model.pkl
!rm model.pkl

In [None]:
!gcloud ai models upload --region=$REGION --display-name=$MODEL_NAME --container-image-uri=$PREBUILT_CONTAINER_URI --artifact-uri=$ARTIFACT_GCS_URI

In [None]:
!gcloud ai models list --region=$REGION --filter=display_name=$MODEL_NAME

## [Optional] Delete GCP Resources

In [None]:
import google.cloud.aiplatform as aiplatform
aiplatform.init(project=PROJECT_ID, location=REGION)

In [None]:
for m in aiplatform.Model.list(filter=f"display_name={MODEL_NAME}", order_by=f"create_time desc"):
    m.delete()