# Data Classification with AutoML for Structured Data

#### Import Libraries

In [None]:
! pip install google-cloud-automl

In [27]:
from IPython.core.display import HTML

# Emit a <script> element that asks the notebook front end to restart the
# kernel (presumably so the package installed above becomes importable —
# confirm).
restart_kernel_js = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(restart_kernel_js)

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
# AutoML library.
from google.cloud import automl_v1beta1 as automl
from google.cloud import bigquery
from google.cloud import storage

In [None]:
#import google.cloud.automl_v1beta1.proto.data_types_pb2 as data_types

In [None]:
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets

#### Set Up GCP Parameters

In [35]:
# GCP project handed to the AutoML Tables client when it is constructed.
PROJECT_ID = "crazy-hippo-01" #@param {type:"string"}
# Region handed to the AutoML Tables client when it is constructed.
COMPUTE_REGION = "us-central1" # Currently only supported region.

In [37]:
# Display name for the AutoML Tables dataset to create.
DATASET_DISPLAY_NAME = 'earnings_predictions' #@param {type: 'string'}
# Name of the input CSV data (presumably the base name of the file in GCS —
# TODO confirm; no visible cell reads this value apart from the check below).
INPUT_CSV_NAME = 'clv_census' #@param {type: 'string'}
# Display name for the AutoML Tables model to create.
MODEL_DISPLAY_NAME = 'earnings_model' #@param {type: 'string'}

# Fail fast if any required parameter is empty, and name the offending
# parameter(s) in the AssertionError instead of failing with no message.
_required_params = {
    'PROJECT_ID': PROJECT_ID,
    'COMPUTE_REGION': COMPUTE_REGION,
    'DATASET_DISPLAY_NAME': DATASET_DISPLAY_NAME,
    'INPUT_CSV_NAME': INPUT_CSV_NAME,
    'MODEL_DISPLAY_NAME': MODEL_DISPLAY_NAME,
}
_empty_params = sorted(
    name for name, value in _required_params.items() if not value
)
assert not _empty_params, (
    'Required parameters are empty: {}'.format(', '.join(_empty_params))
)

#### Initialize AutoML Clients

In [38]:
# Build the API clients: the generic AutoML client, plus the Tables client
# configured with this notebook's project and region.
automl_client = automl.AutoMlClient()
tables_client = automl.TablesClient(
    project=PROJECT_ID,
    region=COMPUTE_REGION,
)

#### List Datasets

In [39]:
# Ask the Tables client for its datasets and map each display name to the
# corresponding resource name.
list_datasets = tables_client.list_datasets()
datasets = dict((ds.display_name, ds.name) for ds in list_datasets)
datasets

{'earnings_predictions': 'projects/433654631026/locations/us-central1/datasets/TBL8051398744517640192',
 'construction_ver1': 'projects/433654631026/locations/us-central1/datasets/IOD4538266129482645504',
 'untitled_1591272188580': 'projects/433654631026/locations/us-central1/datasets/VOT4762109104711794688',
 'flowers': 'projects/433654631026/locations/us-central1/datasets/ICN6417780102095962112',
 'hippo_test_01': 'projects/433654631026/locations/us-central1/datasets/TBL4559696710619627520'}

#### List Models

In [40]:
# Ask the Tables client for its models and map each display name to the
# corresponding resource name.
list_models = tables_client.list_models()
models = dict((mdl.display_name, mdl.name) for mdl in list_models)
models

{'flowers_20200527082025': 'projects/433654631026/locations/us-central1/models/ICN8327157910031302656',
 'construction_ver1_20200602083940': 'projects/433654631026/locations/us-central1/models/IOD9136961418027859968'}

#### Create Dataset

In [56]:
# Create the AutoML Tables dataset under the configured display name and keep
# its resource name for later lookups.
dataset = tables_client.create_dataset(dataset_display_name=DATASET_DISPLAY_NAME)
dataset_name = dataset.name
dataset

name: "projects/433654631026/locations/us-central1/datasets/TBL6184656698972569600"
display_name: "earnings_predictions"
create_time {
  seconds: 1612533311
  nanos: 204514000
}
etag: "AB3BwFqQEnlnsdngiOBwMi_Fj9Sn55CJe7AtlVIUpiF7jNwkLOhP3F9EuLJssqoBvz4="
tables_dataset_metadata {
  stats_update_time {
  }
}

#### Load Data into Dataset

In [57]:
# GCS URI of the CSV file that the next cell imports into the dataset.
# NOTE(review): this URI is hard-coded and INPUT_CSV_NAME from the parameters
# cell is not used here — confirm whether the URI should be derived from it.
GCS_DATASET_URI = 'gs://automl-data-demo/clv_census.csv'

In [58]:
# Start importing the CSV at GCS_DATASET_URI into the dataset.
import_data_response = tables_client.import_data(
    gcs_input_uris=GCS_DATASET_URI,
    dataset=dataset,
)
print('Dataset import operation: {}'.format(import_data_response.operation))

# Wait on the operation's result so the import has finished before moving on.
import_result = import_data_response.result()
print('Dataset import response: {}'.format(import_result))

# Re-fetch the dataset so that its example_count field can be inspected.
dataset = tables_client.get_dataset(dataset_name=dataset_name)
dataset

Dataset import operation: name: "projects/433654631026/locations/us-central1/operations/TBL3926338705479958528"
metadata {
  type_url: "type.googleapis.com/google.cloud.automl.v1beta1.OperationMetadata"
  value: "\032\014\010\303\234\365\200\006\020\330\232\366\264\003\"\014\010\303\234\365\200\006\020\330\232\366\264\003z\000"
}

Dataset import response: 


name: "projects/433654631026/locations/us-central1/datasets/TBL6184656698972569600"
display_name: "earnings_predictions"
create_time {
  seconds: 1612533311
  nanos: 204514000
}
etag: "AB3BwFrrJwFpUH1WzO7DEweTuXscfNqd2JLeG31s423IVPRvWlp5M4PPoWChAOW3uN37"
example_count: 32461
tables_dataset_metadata {
  primary_table_spec_id: "951055567793684480"
  stats_update_time {
    seconds: 1612533359
    nanos: 52828000
  }
}

#### Set Target Column

In [62]:
column_spec_display_name = 'income' #@param {type:'string'}

# Register the column with that display name as the dataset's target column
# and show the dataset returned by the call.
update_dataset_response = tables_client.set_target_column(
    column_spec_display_name=column_spec_display_name,
    dataset=dataset,
)
update_dataset_response

name: "projects/433654631026/locations/us-central1/datasets/TBL6184656698972569600"
display_name: "earnings_predictions"
create_time {
  seconds: 1612533311
  nanos: 204514000
}
etag: "AB3BwFrPawlNbTCTBpQ3oUvx2g36ImzamW7CylynxBxekRa84S2LidnXwr3pHXtJLvkj"
example_count: 32461
tables_dataset_metadata {
  primary_table_spec_id: "951055567793684480"
  target_column_spec_id: "4833814305273020416"
  stats_update_time {
    seconds: 1612533359
    nanos: 52828000
  }
}

#### Set Training Parameters and Start Training

In [64]:
# Training budget, expressed in whole hours.
model_train_hours = 1 #@param {type:'integer'}

# Start model creation. The budget argument is given in milli node hours,
# hence the factor of 1000; the 'fnlwgt' and 'income' columns are passed as
# exclusions.
create_model_response = tables_client.create_model(
    dataset=dataset,
    model_display_name=MODEL_DISPLAY_NAME,
    exclude_column_spec_names=['fnlwgt', 'income'],
    train_budget_milli_node_hours=1000 * model_train_hours,
)

print('Create model operation: {}'.format(create_model_response.operation))

# Keep the operation's name for later reference.
operation_id = create_model_response.operation.name

Create model operation: name: "projects/433654631026/locations/us-central1/operations/TBL6447650809365659648"
metadata {
  type_url: "type.googleapis.com/google.cloud.automl.v1beta1.OperationMetadata"
  value: "\032\014\010\261\247\365\200\006\020\250\301\230\265\003\"\014\010\261\247\365\200\006\020\250\301\230\265\003R\000"
}



#### Wait for Training to Complete

In [None]:
# Wait until model training is done: .result() is called on the create-model
# operation and its return value is kept as `model`.
model = create_model_response.result()
# Keep the model's resource name for later lookups by name.
model_name = model.name
model

In [66]:
# Display the model object obtained from the create-model operation's result.
model

name: "projects/433654631026/locations/us-central1/models/TBL713937089170767872"

#### Deploying Model

In [67]:
# Request deployment of the trained model, then wait on the returned operation.
deploy_operation = tables_client.deploy_model(model=model)
deploy_operation.result()




#### Get Model Info

In [68]:
# Re-fetch the model by its resource name so the displayed metadata (for
# example deployment_state) reflects its current state.
model = tables_client.get_model(model_name=model_name)
model

name: "projects/433654631026/locations/us-central1/models/TBL713937089170767872"
display_name: "earnings_model"
dataset_id: "TBL6184656698972569600"
create_time {
  seconds: 1612534705
  nanos: 916857000
}
deployment_state: DEPLOYED
update_time {
  seconds: 1612701780
  nanos: 86151000
}
tables_model_metadata {
  target_column_spec {
    name: "projects/433654631026/locations/us-central1/datasets/TBL6184656698972569600/tableSpecs/951055567793684480/columnSpecs/4833814305273020416"
    data_type {
      type_code: CATEGORY
    }
    display_name: "income"
  }
  input_feature_column_specs {
    name: "projects/433654631026/locations/us-central1/datasets/TBL6184656698972569600/tableSpecs/951055567793684480/columnSpecs/2527971296059326464"
    data_type {
      type_code: FLOAT64
    }
    display_name: "age"
  }
  input_feature_column_specs {
    name: "projects/433654631026/locations/us-central1/datasets/TBL6184656698972569600/tableSpecs/951055567793684480/columnSpecs/8436694007169417216