In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Anomaly detection in security logs with BQML

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/bigquery_ml/Anomaly_detection_in_Cloud_Audit_logs_with_BQML.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fvertex-ai-samples%2Fmain%2Fnotebooks%2Fcommunity%2Fbigquery_ml%2FAnomaly_detection_in_Cloud_Audit_logs_with_BQML.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/vertex-ai-samples/main/notebooks/community/bigquery_ml/Anomaly_detection_in_Cloud_Audit_logs_with_BQML.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/bigquery_ml/Anomaly_detection_in_Cloud_Audit_logs_with_BQML.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

**_NOTE_**: This notebook has been tested in the following environment:

* Python version = 3.9

## Overview

This Colab notebook demonstrates how you use BigQuery ML to detect anomalies in Cloud Audit logs. You use two different pre-built ML models for unsupervised anomaly detection: K-means clustering and Autoencoders. These models help you identify outliers, such as uncommon API usage by any user identity. Identifying anomalies in audit logs is critical for cloud administrators and operators to spot potential threats, from privilege escalation to API abuse.

### Objective

In this tutorial, you learn how to:

* Apply feature engineering by preprocessing Cloud Audit logs
* Use BigQuery ML for unsupervised anomaly detection in Cloud Audit logs
* Train and evaluate ML models such as K-means clustering and Autoencoders
* Extract and analyze outliers

This tutorial uses the following Google Cloud ML services and resources:

- BigQuery
- Cloud Storage
- Log Analytics

### Prerequisite
 If you haven't already done so, the only requirement is to [upgrade your existing log bucket](https://cloud.google.com/logging/docs/buckets#upgrade-bucket) to use Log Analytics which provides you with a linked BigQuery dataset with your own queryable logs data. This is a **one-click step without incurring additional costs**. By default, Cloud Audit Admin Activity logs are enabled, ingested and stored in every project's `_Required` bucket without any charges.

![one click prerequisite](https://services.google.com/fh/files/misc/upgrade_log_bucket.png)

### Dataset

In this notebook, you analyze your own Cloud Audit logs, such as Admin Activity logs, which are enabled and stored by default in every Google Cloud project. Unlike synthetic data, analyzing your real data provides you with actual insights, but the results can vary.

### Costs


This tutorial uses billable components of Google Cloud:

* BigQuery

Learn about [BigQuery pricing](https://cloud.google.com/bigquery/pricing)
and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Get started

### Install required packages

In [None]:
! pip install --upgrade google-cloud-bigquery
! pip install pandas
! pip install db-dtypes

### Restart runtime (Colab only)

To use the newly installed packages, you must restart the runtime on Google Colab.

In [None]:
import sys

# A restart is only triggered on Colab, where the already-running kernel does
# not pick up the packages installed by the previous cell.
if "google.colab" in sys.modules:
    import IPython

    # Passing True asks the kernel to restart after shutting down.
    IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>

### Authenticate your notebook environment (Colab only)

Authenticate your environment on Google Colab.

In [None]:
import sys

# The interactive Google sign-in flow is only run when executing on Colab.
if "google.colab" in sys.modules:
    from google.colab import auth as colab_auth

    colab_auth.authenticate_user()

### Set Google Cloud project information

To get started using BigQuery, you must have an existing Google Cloud project and [enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com).

In [None]:
# Project that hosts the BigQuery dataset, UDFs, tables and models created by this notebook.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
# Region placeholder (no leading space, matching the other placeholders).
# NOTE(review): no visible cell reads LOCATION — confirm whether it is still needed.
LOCATION = "[your-location]"  # @param {type: "string"}
# Location used for the BigQuery dataset created in the "Create a BigQuery dataset" step.
DATA_LOCATION = "US"  # @param {type: "string"}

### Import libraries

In [None]:
import time

from google.cloud import bigquery

### Create a linked BigQuery dataset

When you want to use the capabilities of BigQuery to analyze your log data, upgrade a log bucket to use Log Analytics, and then create a [linked dataset](https://cloud.google.com/logging/docs/buckets#link-bq-dataset). With this configuration, Cloud Logging stores your log data but BigQuery can read the log data.

Learn more about [creating a linked BigQuery dataset](https://cloud.google.com/logging/docs/buckets#link-bq-dataset).

Provide the Project ID, BigQuery dataset and BigQuery table where the audit logs are stored. You can find the linked BigQuery dataset ID for your log bucket from the [Logs Storage page](https://console.cloud.google.com/logs/storage).

In [None]:
# Project that contains the linked BigQuery dataset holding your logs.
# Leave the placeholder untouched to fall back to PROJECT_ID in the defaults cell below.
logSourceProject = "[your-log-source-project-id]"  # @param {type:"string"} custom
# Linked BigQuery dataset that exposes the log bucket.
# Leave the placeholder untouched to fall back to "logs_linked_dataset".
logSourceBqDataset = "[your-log-source-dataset]"  # @param {type:"string"} custom
# Table (view) inside the linked dataset to read log entries from.
# Leave the placeholder untouched to fall back to "_AllLogs".
logSourceBqTable = "[your-log-source-table]"  # @param {type:"string"} custom

If you don't have a linked dataset, run the below cells to create one for a log bucket that is upgraded to use Log Analytics.

In [None]:
# ID of the Cloud Logging log bucket to link (this is a log bucket, not a
# Cloud Storage bucket). Leave the placeholder to fall back to "_Required".
bucket_id = "[your-log-source-bucket-id]"
# Location of that log bucket. Leave the placeholder to fall back to "global".
bucket_location = "[your-log-source-bucket-location]"

In [None]:
# Replace every setting that still holds its placeholder with a working default.
if logSourceProject == "[your-log-source-project-id]":
    logSourceProject = PROJECT_ID

if logSourceBqDataset == "[your-log-source-dataset]":
    logSourceBqDataset = "logs_linked_dataset"

if logSourceBqTable == "[your-log-source-table]":
    logSourceBqTable = "_AllLogs"

if bucket_id == "[your-log-source-bucket-id]":
    bucket_id = "_Required"

# Must be a comparison (`==`); an assignment (`=`) here is a SyntaxError that
# prevents the whole cell from running.
if bucket_location == "[your-log-source-bucket-location]":
    bucket_location = "global"

In [None]:
# Create the linked BigQuery dataset
! gcloud logging links create {logSourceBqDataset} --bucket={bucket_id} --location={bucket_location}

Set the BigQuery dataset and BigQuery table where the preprocessed training dataset is stored.

In [None]:
# Dataset (inside PROJECT_ID) that stores the UDFs, training table and models.
BQ_DATASET_NAME = "bqml_approach"  # @param {type:"string"} custom
# Table inside BQ_DATASET_NAME that receives the preprocessed training data.
BQ_TABLE_NAME = "training_data"  # @param {type:"string"} custom

### Create a BigQuery dataset

Create a dataset or load an existing dataset in BigQuery.

In [None]:
BQ_DATASET_PATH = ".".join([PROJECT_ID, BQ_DATASET_NAME])

client = bigquery.Client(project=PROJECT_ID)
bq_dataset_pre = bigquery.Dataset(BQ_DATASET_PATH)
# NOTE(review): the training table is later built with a query that reads the
# linked log dataset, so DATA_LOCATION presumably has to match that dataset's
# location — confirm for your log bucket.
bq_dataset_pre.location = DATA_LOCATION

# exists_ok=True returns the existing dataset when it is already there, while
# any other failure (permissions, invalid ID, ...) still surfaces instead of
# being swallowed by a bare `except:`.
bq_dataset = client.create_dataset(bq_dataset_pre, exists_ok=True)

# Report the dataset's actual location: an already-existing dataset keeps the
# location it was created with, which may differ from DATA_LOCATION.
print(f"BigQuery dataset {BQ_DATASET_PATH} is ready in {bq_dataset.location}")

Provide the BQML model names. These models are saved in the BigQuery dataset created above.

In [None]:
# Name of the k-means model created inside BQ_DATASET_NAME.
KMEANS_MODEL = "KMEANS_HTUNED"  # @param {type:"string"} custom
# Name of the autoencoder model created inside BQ_DATASET_NAME.
AUTO_ENCODER_MODEL = "AUTOENCODER_HTUNED"  # @param {type:"string"} custom

## Training data preparation and analysis

Cloud Audit logs contain a wealth of important information, but their volume, velocity, and variety make them challenging to analyze at scale. Each log entry has a relatively [complex schema](https://cloud.google.com/logging/docs/reference/v2/rest/v2/LogEntry), which makes the logs even harder to analyze in their raw format.

Before running the ML models, you extract the relevant fields from these logs and aggregate (count) the **actions** by **day**, **actor**, **action**, and **source IP**. The objective behind training a machine learning model is to identify anomalous user behaviors for which the provided features are relevant and collectively sufficient.

In [None]:
def execute_sql(sql_query: str):
    """Run a SQL statement in BigQuery and block until it has finished.

    Progress messages and the elapsed wall-clock time are printed.

    Args:
        sql_query (str): SQL statement to execute.

    Returns:
        A pandas DataFrame with the result rows, or ``None`` when the statement
        yields no rows (for example a DDL statement, or a SELECT that matches
        nothing). Callers that index into the result must handle ``None``.

    Raises:
        RuntimeError: If the statement cannot be submitted, fails, or its
            result cannot be converted to a DataFrame. The underlying
            exception is chained as ``__cause__``.
    """
    # Bind the client to PROJECT_ID like the other clients in this notebook; a
    # project-less client depends on an ambient default project being set.
    bq_client = bigquery.Client(project=PROJECT_ID)

    try:
        start = time.time()
        query_job = bq_client.query(sql_query)  # Make an API request.
        print("Query Executed.Waiting for completion")
        results = query_job.result()  # Waits for job to complete.
        end = time.time()
        print("Query Execution completed")
        print("Time taken to execute:", end - start)
        # A truthiness check also covers a missing (None) row count.
        if results.total_rows:
            return results.to_dataframe()
        return None
    except Exception as e:
        # Chain the cause so the original BigQuery error and traceback are
        # shown together with the failing statement.
        raise RuntimeError(f"Can't execute the query {sql_query}") from e

The following user-defined function (UDF) extracts the ID of the resource that was acted on, as recorded in the audit log entry. In the audit log entry, the resource ID is stored in a resource label field whose name depends on the resource type. This UDF normalizes those label fields into a single resource ID value.

In [None]:
# Create (or replace) the UDF that deduces a single resource ID from a log
# entry's resource labels. The SQL returns the first non-NULL label among the
# candidates listed below, in that order, and the literal "other" when none is
# set. The `type` argument is accepted but not referenced in the function body.
UDF_NAME = "getResourceId"

# The three `{}` placeholders are filled with PROJECT_ID, BQ_DATASET_NAME and
# UDF_NAME, giving the fully-qualified function name.
sql = """
CREATE OR REPLACE FUNCTION `{}.{}.{}`(
  type STRING,
  labels JSON
)
RETURNS STRING
AS (
 COALESCE(
  JSON_VALUE(labels.email_id),     # service_account
  JSON_VALUE(labels.pod_id),       # container
  JSON_VALUE(labels.instance_id),  # gce_instance, spanner_instance, redis_instance, ...
  JSON_VALUE(labels.subnetwork_id),# gce_subnetwork,
  JSON_VALUE(labels.network_id),   # gce_network, gce_network_region, ...
  JSON_VALUE(labels.topic_id),     # pubsub_topic
  JSON_VALUE(labels.subscription_id), # pubsub_subscription
  JSON_VALUE(labels.endpoint_id),  # aiplatform.googleapis.com/Endpoint
  JSON_VALUE(labels.job_id),       # dataflow_step
  JSON_VALUE(labels.dataset_id),   # bigquery_dataset
  JSON_VALUE(labels.project_id),
  JSON_VALUE(labels.organization_id),
  JSON_VALUE(labels.id),
  "other")
);""".format(
    PROJECT_ID, BQ_DATASET_NAME, UDF_NAME
)

# Run the DDL statement; its return value is not used here.
execute_sql(sql)
print(f"Created UDF {PROJECT_ID}.{BQ_DATASET_NAME}.{UDF_NAME}")

The following UDF deduces where a user or system action originated, based on the audit log entry. For example, an action might have occurred through the Cloud Console, the gcloud CLI, a Terraform script, or another unknown client or channel.

In [None]:
# Create (or replace) the UDF that maps a request's caller-supplied user agent
# to a channel label: 'Cloud Console', 'gcloud CLI', 'Terraform', or 'other'
# when no pattern matches. Note that UDF_NAME is reassigned here.
UDF_NAME = "getChannelType"

# The three `{}` placeholders are filled with PROJECT_ID, BQ_DATASET_NAME and
# UDF_NAME, giving the fully-qualified function name.
sql = """CREATE OR REPLACE FUNCTION `{}.{}.{}`(
  caller_supplied_user_agent STRING
)
RETURNS STRING
AS (
  CASE
    WHEN caller_supplied_user_agent LIKE "Mozilla/%" THEN 'Cloud Console'
    WHEN caller_supplied_user_agent LIKE "google-cloud-sdk gcloud/%" THEN 'gcloud CLI'
    WHEN caller_supplied_user_agent LIKE "google-api-go-client/% Terraform/%" THEN 'Terraform'
    ELSE 'other'
  END
);""".format(
    PROJECT_ID, BQ_DATASET_NAME, UDF_NAME
)

# Run the DDL statement; its return value is not used here.
execute_sql(sql)
print(f"Created UDF {PROJECT_ID}.{BQ_DATASET_NAME}.{UDF_NAME}")

Query the log source to extract the training data with fields of interest.

In [None]:
# Build the query that extracts the training data: one row per combination of
# day, principal, action, resource, container, channel and caller IP, with
# `counter` holding the number of matching log entries over the last 360 days.
#
# Placeholders: {0} = logSourceProject, {1} = logSourceBqDataset,
# {2} = logSourceBqTable, {3} = BQ_DATASET_NAME (dataset holding the two UDFs).
# The UDF references are qualified by dataset only, so they are resolved in the
# project of the client that runs the query.
#
# NOTE(review): `log_name` is part of GROUP BY but is not selected, so entries
# that differ only in log_name produce separate rows with identical visible
# columns — confirm this is intended.
# The leading space in the literal matters: a later cell appends this string
# directly after "... AS".
query_str = """ SELECT
    EXTRACT(DATE FROM timestamp) AS day,
    IFNULL(proto_payload.audit_log.authentication_info.principal_email, "unknown") as principal_email,
    IFNULL(proto_payload.audit_log.method_name, "unknown") as action,
    IFNULL(resource.type, "unknown") as resource_type,
    {3}.getResourceId(resource.type, resource.labels) AS resource_id,
    -- proto_payload.audit_log.resource_name as resource_name,
    SPLIT(log_name, '/')[SAFE_OFFSET(0)] as container_type,
    SPLIT(log_name, '/')[SAFE_OFFSET(1)] as container_id,
    {3}.getChannelType(proto_payload.audit_log.request_metadata.caller_supplied_user_agent) AS channel,
    IFNULL(proto_payload.audit_log.request_metadata.caller_ip, "unknown") as ip,
    COUNT(*) counter,
    -- ANY_VALUE(resource) as resource,           -- for debugging
    -- ANY_VALUE(proto_payload) as proto_payload  -- for debugging
  FROM  `{0}.{1}.{2}`
  WHERE
    -- log_id = "cloudaudit.googleapis.com/activity" AND
    timestamp > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 360 DAY)
  GROUP BY
    day, principal_email, action, resource_type, resource_id, container_type, container_id, channel, ip, log_name
  ORDER BY
    day DESC, principal_email, action""".format(
    logSourceProject, logSourceBqDataset, logSourceBqTable, BQ_DATASET_NAME
)

View the training data dataframe.

In [None]:
# Run the extraction query in PROJECT_ID and pull the result into pandas.
client = bigquery.Client(project=PROJECT_ID)
training_query_job = client.query(query_str)
df = training_query_job.to_dataframe()

# Preview the first rows of the training data.
df.head()

In [None]:
# Summarize the training dataframe (columns, dtypes and non-null counts).
df.info()

Create a table in BQ with the extracted data.

In [None]:
# Prefix the extraction query with CREATE OR REPLACE TABLE so the same rows are
# materialized as the training table (query_str starts with a space, so the
# concatenation yields "... AS SELECT ...").
create_training_data_table = (
    """ CREATE OR REPLACE TABLE `{}.{}.{}` AS""".format(
        PROJECT_ID, BQ_DATASET_NAME, BQ_TABLE_NAME
    )
    + query_str
)

# client.query() only submits the job. Wait for it with .result() so the
# training cells cannot start before the table exists, and so a failed job
# raises here instead of going unnoticed.
create_table_job = client.query(create_training_data_table)
create_table_job.result()
print(f"Created table {PROJECT_ID}.{BQ_DATASET_NAME}.{BQ_TABLE_NAME}")

## K-Means Clustering

Here's a brief description of each parameter for training a `k-means model`:

* `MODEL_TYPE`: Specify 'KMEANS' to use k-means clustering for data segmentation.
* `NUM_CLUSTERS`: Define the number of clusters. You can set a specific number, use a range (e.g., HPARAM_RANGE(2, 25)), or provide discrete values (e.g., HPARAM_CANDIDATES([5, 10, 50])).
* `KMEANS_INIT_METHOD`: Choose the initialization method for clusters. Options include 'RANDOM', 'KMEANS++', or 'CUSTOM'. If 'CUSTOM', specify a BOOL column with TRUE values as initial centroids using KMEANS_INIT_COL.
* `DISTANCE_TYPE`: Select the metric for distance computation. Options are 'EUCLIDEAN' (default) or 'COSINE'.
* `STANDARDIZE_FEATURES`: Decide whether to standardize numerical features. The default is TRUE.
* `MAX_ITERATIONS`: Set the maximum number of training iterations. The default is 20.
* `EARLY_STOP`: Choose whether to stop training early if the improvement in loss is below a threshold. The default is TRUE.
* `NUM_TRIALS`: Set the maximum number of trials for hyperparameter tuning. The default is 10, and you should use at least (number_of_hyperparameters * 10) trials.

For a full list of parameters for training the k-means model, see the [bigqueryml documentation](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-kmeans).

### Model Training

In [None]:
# Build the CREATE MODEL statement for the k-means model.
# The model and the training table are fully qualified with PROJECT_ID (as in
# the ML.DETECT_ANOMALIES query) so they never depend on the default project
# of whichever client runs the statement.
# Placeholders: {0} = PROJECT_ID, {1} = BQ_DATASET_NAME, {2} = KMEANS_MODEL,
# {3} = BQ_TABLE_NAME.
# IF NOT EXISTS means re-running this does not retrain an existing model.
train_kmeans = """CREATE MODEL IF NOT EXISTS `{0}.{1}.{2}`
OPTIONS(MODEL_TYPE = 'KMEANS',
NUM_CLUSTERS = HPARAM_RANGE(2, 10),
KMEANS_INIT_METHOD = 'KMEANS++',
DISTANCE_TYPE = 'COSINE',
STANDARDIZE_FEATURES = TRUE,
MAX_ITERATIONS = 10,
EARLY_STOP = TRUE,
NUM_TRIALS = 10
) AS
SELECT * FROM `{0}.{1}.{3}`;""".format(
    PROJECT_ID, BQ_DATASET_NAME, KMEANS_MODEL, BQ_TABLE_NAME
)

In [None]:
# Train the k-means model; execute_sql blocks until the training job finishes
# and prints the elapsed time.
execute_sql(train_kmeans)

### Model Evaluation

In [None]:
# Build the ML.EVALUATE query for the k-means model. The model is fully
# qualified with PROJECT_ID, consistent with the ML.DETECT_ANOMALIES query.
eval_kmeans = """SELECT * FROM ML.EVALUATE(MODEL `{}.{}.{}`);""".format(
    PROJECT_ID, BQ_DATASET_NAME, KMEANS_MODEL
)
# Run the evaluation and fetch the metrics as a dataframe
model_evalution = execute_sql(eval_kmeans)
# Show the results
model_evalution

### Outlier Analysis

Here’s what you get from the `K-means model` output:

* `is_anomaly`: A BOOL value that shows whether the value is flagged as anomalous.
* `normalized_distance`: A FLOAT64 value representing the shortest distance among the normalized distances from your input data to each cluster centroid. You compute normalized distances by taking the absolute distance from the input data to a cluster centroid and dividing it by the cluster's radius. The cluster radius is the root mean square of all distances from the cluster's assigned data points to its centroid. You use normalized distance instead of absolute distance to detect anomalies more effectively, as normalized distances account for the cluster radius. The distance type depends on the DISTANCE_TYPE value set during model training.
* `centroid_id`: An INT64 value that indicates the ID of the centroid.


Here’s how to use `ML.DETECT_ANOMALIES` with the following arguments:

* `project_id`: Your project ID.
* `dataset`: The BigQuery dataset that contains the model.
* `model`: The name of the model.
* `table`: The name of the table to use to perform anomaly detection.
* `query_statement`: The GoogleSQL query that generates the data to use to perform anomaly detection.
* `contamination`: a FLOAT64 value that identifies the proportion of anomalies in the training dataset that are used to create the autoencoder, k-means, or PCA input models. The value must be in the range [0, 0.5]. For example, contamination value of 0.1 means that 10% of the training data that was used to create the input model is anomalous. The contamination value determines the cutoff threshold of the target metric to become anomalous, and any input data with a target metric greater than the cutoff threshold is identified as anomalous. The target metric is mean squared error for autoencoder and PCA models, and the target metric is normalized distance for k-means models.

Learn more about [outlier analysis](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-detect-anomalies).

In [None]:
# Build the ML.DETECT_ANOMALIES query for the k-means model over the training
# table, keeping only the rows flagged as anomalous, ordered from the largest
# normalized distance down.
# Placeholders: {0} = PROJECT_ID, {1} = BQ_DATASET_NAME, {2} = KMEANS_MODEL,
# {3} = BQ_TABLE_NAME. The contamination value passed to the model is 0.001.
detect_anomaly = """SELECT * FROM ML.DETECT_ANOMALIES(MODEL `{0}.{1}.{2}`,
STRUCT(0.001 AS contamination),
TABLE `{0}.{1}.{3}`)
WHERE is_anomaly=true
ORDER BY normalized_distance DESC;""".format(
    PROJECT_ID, BQ_DATASET_NAME, KMEANS_MODEL, BQ_TABLE_NAME
)
# Run the query and fetch the outliers.
# execute_sql returns None (not an empty dataframe) when no rows come back.
kmeans_outliers = execute_sql(detect_anomaly)

In [None]:
# Show the outliers flagged by the k-means model
kmeans_outliers

## Auto Encoders

Here's a brief description of each parameter for training an `autoencoder model`:

* `MODEL_TYPE`: Specify 'AUTOENCODER' to indicate you're creating an autoencoder model.
* `L1_REG_ACTIVATION`: Apply L1 regularization to the activations in the latent space to promote sparsity. You can set a fixed value or tune it with a range or candidate values.
* `LEARN_RATE`: Determine the initial learning rate for training. You can use a specific value or explore a range of values for tuning.
* `OPTIMIZER`: Choose the optimizer for training, such as 'ADAM', 'ADAGRAD', 'FTRL', 'RMSPROP', or 'SGD'. For hyperparameter tuning, you can select multiple options from candidates.
* `ACTIVATION_FN`: Set the activation function for the neural network. Options include 'RELU', 'RELU6', 'ELU', 'SELU', 'SIGMOID', or 'TANH'. You can also use hyperparameter tuning to test different functions.
* `BATCH_SIZE`: Specify the mini-batch size for training. Use a fixed value or tune it with a range or list of candidates.
* `DROPOUT`: Define the dropout rate for the neural network units. You can set a specific value or explore a range or candidates for tuning.
* `HIDDEN_UNITS`: List the number of units in each hidden layer of the neural network. This can be an array of integers, or you can use hyperparameter tuning to select from candidate architectures.
* `TF_VERSION`: Choose the TensorFlow version for model training, either '1.15' or '2.8.0'.
* `EARLY_STOP`: Decide if training should stop when the relative loss improvement is below a threshold. Set to TRUE or FALSE.
* `MIN_REL_PROGRESS`: Set the minimum relative loss improvement required to continue training if EARLY_STOP is TRUE.
* `MAX_ITERATIONS`: Define the maximum number of training iterations.
* `WARM_START`: Choose whether to retrain the model with new data or options, while retaining previous model settings. Set to TRUE or FALSE.
* `NUM_TRIALS`: Specify the maximum number of submodels to train for hyperparameter tuning.
* `MAX_PARALLEL_TRIALS`: Set the maximum number of trials to run concurrently during hyperparameter tuning.
* `HPARAM_TUNING_ALGORITHM`: Choose the algorithm for hyperparameter tuning, such as 'VIZIER_DEFAULT', 'RANDOM_SEARCH', or 'GRID_SEARCH'.
* `HPARAM_TUNING_OBJECTIVES`: Define the objective for hyperparameter tuning, like 'MEAN_SQUARED_ERROR'.

For a full list of parameters for training the autoencoder model, see the [bigqueryml documentation](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-autoencoder).

### Model Training

In [None]:
# Build the CREATE MODEL statement for the autoencoder.
# The model and the training table are fully qualified with PROJECT_ID (as in
# the ML.DETECT_ANOMALIES query) so they never depend on the default project
# of whichever client runs the statement.
# Placeholders: {0} = PROJECT_ID, {1} = BQ_DATASET_NAME,
# {2} = AUTO_ENCODER_MODEL, {3} = BQ_TABLE_NAME.
# IF NOT EXISTS means re-running this does not retrain an existing model.
train_auto_encoder = """
CREATE MODEL IF NOT EXISTS `{0}.{1}.{2}`
OPTIONS(
MODEL_TYPE='autoencoder',
L1_REG_ACTIVATION = HPARAM_CANDIDATES([0.001, 0.01, 0.1]),
LEARN_RATE = HPARAM_CANDIDATES([0.001, 0.01, 0.1]),
OPTIMIZER = HPARAM_CANDIDATES(['ADAGRAD', 'ADAM', 'FTRL', 'RMSPROP', 'SGD']),
ACTIVATION_FN='relu',
BATCH_SIZE = HPARAM_CANDIDATES([16, 32, 64]),
DROPOUT = HPARAM_CANDIDATES([0.1, 0.2]),
HIDDEN_UNITS = HPARAM_CANDIDATES([
    STRUCT([16, 8, 4, 8, 16]),
    STRUCT([32, 16, 4, 16, 32])
  ]),
TF_VERSION = '2.8.0',
EARLY_STOP = TRUE,
MIN_REL_PROGRESS = 0.01,
MAX_ITERATIONS=20,
WARM_START = TRUE,
NUM_TRIALS = 60,
MAX_PARALLEL_TRIALS = 1,
HPARAM_TUNING_ALGORITHM =  'VIZIER_DEFAULT',
HPARAM_TUNING_OBJECTIVES = ['MEAN_SQUARED_ERROR']
) AS
SELECT
*
FROM `{0}.{1}.{3}`;""".format(
    PROJECT_ID, BQ_DATASET_NAME, AUTO_ENCODER_MODEL, BQ_TABLE_NAME
)

In [None]:
# Train the autoencoder; execute_sql blocks until the training job finishes.
# The statement requests NUM_TRIALS = 60 with MAX_PARALLEL_TRIALS = 1, so this
# is expected to be the longest-running cell of the notebook.
execute_sql(train_auto_encoder)

### Model Evaluation

In [None]:
# Build the ML.EVALUATE query for the autoencoder model. The model is fully
# qualified with PROJECT_ID, consistent with the ML.DETECT_ANOMALIES query.
eval_auto_encoder = """SELECT * FROM ML.EVALUATE(MODEL `{}.{}.{}`);""".format(
    PROJECT_ID, BQ_DATASET_NAME, AUTO_ENCODER_MODEL
)
# Run the evaluation and fetch the metrics as a dataframe
model_evalution = execute_sql(eval_auto_encoder)

# Show the results
model_evalution

### Outlier Analysis

Here’s what you get from the `Autoencoder model` output:

* `is_anomaly`: a BOOL value that indicates whether the value is anomalous.
* `mean_squared_error`: a FLOAT64 value that contains the mean squared error.

Here’s how to use `ML.DETECT_ANOMALIES` with the following arguments:

* `project_id`: Your project ID.
* `dataset`: The BigQuery dataset that contains the model.
* `model`: The name of the model.
* `table`: The name of the table to use to perform anomaly detection.
* `query_statement`: The GoogleSQL query that generates the data to use to perform anomaly detection.
* `contamination`: a FLOAT64 value that identifies the proportion of anomalies in the training dataset that are used to create the autoencoder, k-means, or PCA input models. The value must be in the range [0, 0.5]. For example, contamination value of 0.1 means that 10% of the training data that was used to create the input model is anomalous. The contamination value determines the cutoff threshold of the target metric to become anomalous, and any input data with a target metric greater than the cutoff threshold is identified as anomalous. The target metric is mean squared error for autoencoder and PCA models, and the target metric is normalized distance for k-means models.

Learn more about [outlier analysis](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-detect-anomalies).

In [None]:
# Build the ML.DETECT_ANOMALIES query for the autoencoder model over the
# training table, keeping only the rows flagged as anomalous, ordered from the
# largest mean squared error down.
# Placeholders: {0} = PROJECT_ID, {1} = BQ_DATASET_NAME,
# {2} = AUTO_ENCODER_MODEL, {3} = BQ_TABLE_NAME. The contamination value passed
# to the model is 0.001.
detect_anomaly_auto_encoder = """SELECT * FROM ML.DETECT_ANOMALIES(MODEL `{0}.{1}.{2}`,
STRUCT(0.001 AS contamination),
TABLE `{0}.{1}.{3}`)
WHERE is_anomaly=true order by mean_squared_error desc;""".format(
    PROJECT_ID, BQ_DATASET_NAME, AUTO_ENCODER_MODEL, BQ_TABLE_NAME
)
# Uncomment to inspect the rendered SQL before running it:
# print(detect_anomaly_auto_encoder)
# execute_sql returns None (not an empty dataframe) when no rows come back.
autoencoder_outliers = execute_sql(detect_anomaly_auto_encoder)

In [None]:
# Show the outliers flagged by the autoencoder model
autoencoder_outliers

## Common Outliers

You compare the anomalies detected by the autoencoder model with those detected by the k-means model to find common outliers. First, you extract the relevant columns from both the k-means and autoencoder results. Then, you perform an inner join to find the rows that are present in both dataframes.

In [None]:
# Feature columns present in both models' outputs. Selecting only these drops
# every other column returned by ML.DETECT_ANOMALIES, so the two results can be
# compared row by row.
outlier_feature_columns = [
    "day",
    "principal_email",
    "action",
    "resource_type",
    "resource_id",
    "container_type",
    "container_id",
    "channel",
    "ip",
    "counter",
]

# df1: k-means outliers, df2: autoencoder outliers.
df1 = kmeans_outliers[outlier_feature_columns]
df2 = autoencoder_outliers[outlier_feature_columns]

In [None]:
# Keep only the rows flagged by both models: an inner join of the k-means
# outliers (df1) with the autoencoder outliers (df2) on every feature column.
join_keys = [
    "day",
    "principal_email",
    "action",
    "resource_type",
    "resource_id",
    "container_type",
    "container_id",
    "channel",
    "ip",
    "counter",
]
common_outliers = df1.merge(df2, on=join_keys, how="inner")

In [None]:
# Show the outliers that both models agree on
common_outliers

In [None]:
# Summarize the common outliers dataframe (columns, dtypes and non-null counts)
common_outliers.info()

## Uploading detected outliers to BQ table for further analysis

Create a new table in BigQuery to store the common outliers and then upload the dataframe to this table.

In [None]:
# Name of the table (inside BQ_DATASET_NAME) that receives the common outliers.
# Replace the placeholder with the table name you want to use.
OUTLIERS_TABLE = "[your-common-outliers-table]"  # @param {type:"string"}

In [None]:
from google.cloud import bigquery


def create_table(client, table_id, schema):
    """Create the BigQuery table ``table_id`` with ``schema`` if it is missing.

    Args:
        client: BigQuery client used for the API request.
        table_id: Fully-qualified table ID (``project.dataset.table``).
        schema: List of ``bigquery.SchemaField`` describing the columns.
    """
    table = bigquery.Table(table_id, schema=schema)
    # exists_ok=True keeps the cell re-runnable when the table already exists.
    table = client.create_table(table, exists_ok=True)  # Make an API request
    print(
        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
    )


def upload_df_into_bq(client, table_id, df):
    """Load ``df`` into ``table_id``, replacing whatever the table contained.

    The load job uses the module-level ``schema`` defined further down in this
    cell, so ``schema`` must be assigned before this function is called.

    Args:
        client: BigQuery client used for the load job.
        table_id: Fully-qualified table ID (``project.dataset.table``).
        df: pandas DataFrame whose columns match ``schema``.
    """
    job_config = bigquery.LoadJobConfig(schema=schema)
    # If the table already exists, BigQuery overwrites the data, removes the
    # constraints and uses the schema from the load job.
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    job_config.autodetect = False
    job = client.load_table_from_dataframe(df, table_id, job_config=job_config)
    job.result()  # Wait for the load job to finish.
    # table_id is already fully qualified, so it is printed as-is; prefixing it
    # with PROJECT_ID again would show the project twice.
    print("Uploaded dataframe into table {}".format(table_id))


# Column layout of the common-outliers table; it mirrors the feature columns of
# the training data.
schema = [
    bigquery.SchemaField("day", "DATE", mode="REQUIRED"),
    bigquery.SchemaField("principal_email", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("action", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("resource_type", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("resource_id", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("container_type", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("container_id", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("channel", "STRING", mode="NULLABLE"),
    bigquery.SchemaField("ip", "STRING", mode="REQUIRED"),
    bigquery.SchemaField("counter", "INTEGER", mode="REQUIRED"),
]
client = bigquery.Client(project=PROJECT_ID)

# The untouched placeholder contains square brackets and is not a usable table
# name, so fall back to a default, as done for the other placeholders.
if OUTLIERS_TABLE == "[your-common-outliers-table]":
    OUTLIERS_TABLE = "common_outliers"

table_id = "{}.{}.{}".format(PROJECT_ID, BQ_DATASET_NAME, OUTLIERS_TABLE)

create_table(client, table_id, schema)

upload_df_into_bq(client, table_id, common_outliers)

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

In [None]:
# Delete the BigQuery dataset (including the models created & the tables)
!bq rm -r -f {PROJECT_ID}:{BQ_DATASET_NAME}

# Delete the bigquery dataset linked to logs
! gcloud logging links delete {logSourceBqDataset} --bucket={bucket_id} --location={bucket_location}