# BigQuery ML Anomaly Detection

In [None]:
import os

PROJECT = !(gcloud config get-value core/project)
PROJECT = PROJECT[0]
BUCKET = PROJECT

os.environ["PROJECT"] = PROJECT
os.environ["BUCKET"] = BUCKET

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import bigquery
from IPython import get_ipython
from IPython.core.magic import register_cell_magic
from matplotlib import pyplot as plt

# BigQuery client bound to PROJECT (the project ID read from the gcloud config).
bq = bigquery.Client(project=PROJECT)

In [None]:
# Cell magic that substitutes Python global variables into a cell before running it.
@register_cell_magic("with_globals")
def with_globals(line, cell):
    """Fill ``{name}`` placeholders in the cell from ``globals()`` and run the result.

    Args:
        line: Text following ``%%with_globals`` on the magic line. If it
            contains the substring ``print``, the rendered cell is printed
            before it is executed.
        cell: Cell body, used as a ``str.format`` template. Literal braces
            must therefore be doubled (``{{`` / ``}}``).
    """
    rendered = cell.format(**globals())
    echo_requested = "print" in line
    if echo_requested:
        print(rendered)
    shell = get_ipython()
    shell.run_cell(rendered)

In [None]:
# Run the `bq mk` CLI command for `anomaly_demo`; the models below are created as `anomaly_demo.<model_name>`.
!bq mk anomaly_demo

## ARIMA

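This section trains an `ARIMA_PLUS` time-series model using BigQuery ML.
The model uses `start_station_name` as its time series ID column, so a separate time series is fitted for each Citi Bike station.
The queries read from the public table `bigquery-public-data.new_york.citibike_trips`; replace it with your own BigQuery table if you want to use different data.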

In [None]:
%%bigquery --project {PROJECT}
-- Preview ten trips whose start station name contains "Central Park".
SELECT
  *
FROM
  `bigquery-public-data.new_york.citibike_trips`
WHERE
  start_station_name LIKE '%Central Park%'
LIMIT 10

In [None]:
%%bigquery --project {PROJECT}
-- Train an ARIMA_PLUS model on daily trip counts per start station.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE MODEL fails
-- once the model already exists in the dataset.
CREATE OR REPLACE MODEL anomaly_demo.nyc_citibike_arima_model
OPTIONS
  (model_type = 'ARIMA_PLUS',
   time_series_timestamp_col = 'date',
   time_series_data_col = 'num_trips',
   time_series_id_col = 'start_station_name'
  ) AS
SELECT
   start_station_name,
   -- Collapse the trip start timestamp to a calendar date (one row per station per day).
   EXTRACT(DATE from starttime) AS date,
   COUNT(*) AS num_trips
FROM
  `bigquery-public-data`.new_york.citibike_trips
-- Only stations whose name contains "Central Park" are used for training.
WHERE start_station_name LIKE '%Central Park%'
GROUP BY start_station_name, date


In [None]:
%%bigquery --project {PROJECT}
-- Run anomaly detection with the ARIMA_PLUS model (no input table is passed)
-- and keep only the rows flagged as anomalous.
SELECT
  *
FROM
  ML.DETECT_ANOMALIES(
    MODEL anomaly_demo.nyc_citibike_arima_model,
    STRUCT(0.95 AS anomaly_prob_threshold)
  )
WHERE
  is_anomaly = TRUE

In [None]:
%%bigquery --project {PROJECT}
# Run anomaly detection on explicitly supplied data: daily trip counts per start station.
# NOTE(review): unlike the training query, this input is not filtered to
# '%Central Park%' stations - confirm that scoring every station is intended.
SELECT
  *
FROM
  ML.DETECT_ANOMALIES(
    MODEL anomaly_demo.nyc_citibike_arima_model,
    STRUCT(0.95 AS anomaly_prob_threshold),
    (
      SELECT
        start_station_name,
        EXTRACT(DATE FROM starttime) AS date,
        COUNT(*) AS num_trips
      FROM
        `bigquery-public-data.new_york.citibike_trips`
      GROUP BY
        start_station_name, date
    )
  )

## k-means

### Explore dataset:

In [None]:
%%bigquery --project {PROJECT}
-- Preview ten rows of the fraud dataset, leaving out the Time and Class columns.
SELECT
  * EXCEPT (Time, Class)
FROM
  `bigquery-public-data.ml_datasets.ulb_fraud_detection`
LIMIT 10

### Train k-means model

In [None]:
%%bigquery --project {PROJECT}
-- Train a k-means model with 8 clusters on the fraud dataset.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE MODEL fails
-- once the model already exists in the dataset.
CREATE OR REPLACE MODEL anomaly_demo.kmeans_model
OPTIONS(
  model_type='kmeans',
  num_clusters= 8,
  kmeans_init_method = 'kmeans++'
)
AS
-- Time and Class are left out of the training features.
SELECT * EXCEPT(Time, Class)
FROM 
  `bigquery-public-data.ml_datasets.ulb_fraud_detection`;

## BQML Autoencoder 

### Training

In [None]:
%%bigquery --project {PROJECT}
-- Train an autoencoder on the fraud dataset.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE MODEL fails
-- once the model already exists in the dataset.
CREATE OR REPLACE MODEL anomaly_demo.autoencoder_model2
OPTIONS(
  model_type='autoencoder',
  activation_fn='relu',
  batch_size=8,
  dropout=0.2,  
  -- Symmetric layer sizes with a 4-unit middle layer.
  hidden_units=[32, 16, 4, 16, 32],
  learn_rate=0.001,
  l1_reg_activation=0.0001,
  max_iterations=10,
  optimizer='adam'
)
AS
-- Time and Class are left out of the training features.
SELECT * EXCEPT(Time, Class)
FROM
  `bigquery-public-data.ml_datasets.ulb_fraud_detection`;

### Predictions

In [None]:
%%bigquery --project {PROJECT}
-- Score the fraud dataset with the autoencoder and show ten rows flagged as anomalies.
-- The model must be the one the training cell creates (anomaly_demo.autoencoder_model2);
-- referencing anomaly_demo.autoencoder_model fails on a fresh run because this
-- notebook never creates a model with that name.
SELECT
  *
FROM
ML.DETECT_ANOMALIES(
  MODEL anomaly_demo.autoencoder_model2,
  STRUCT(0.005 AS contamination),
  TABLE `bigquery-public-data.ml_datasets.ulb_fraud_detection`
)
WHERE is_anomaly=True
LIMIT 10;

Copyright 2019 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License