# Initial Model Training & Data Labeling

In [1]:
# standard library
import codecs
import os
import re
import sys
import warnings
from pathlib import Path

# 3rd party
import googlemaps as gmaps
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import datetime as dt
from google.cloud import storage
from google.cloud import automl

import matplotlib.pyplot as plt

In [2]:
# --- project configuration -------------------------------------------------
# Repository root: the parent of the current working directory.
ROOT_DIR = Path.cwd().parent

# Cloud Storage settings (placeholders -- replace with real values before running).
URI_PREFIX = "gs://"
BUCKET_NAME = "YOUR-BUCKET"
INPUT_BUCKET_PREFIX = "YOUR/DIRECTORIES/"
OUTPUT_BUCKET_PATH = "YOUR/DIRECTORIES/"

In [3]:
%load_ext autoreload
%autoreload 2

# add src library to module path
sys.path.append(str(ROOT_DIR))

In [4]:
# import modules from src
from src.utils import to_snake_case, clean_byte_unicode_chars, fix_encoding

# import secrets
from creds import API_KEY

#### Set project-wide parameters

In [6]:
# Google Cloud project id passed to every AutoML path helper below
# (placeholder -- replace with the real project id before running).
project_id = "YOUR-ID"

### Iteration 1 - Base Model

#### Create the dataset in AutoML

This notebook uses the AutoML Python client as an alternative to using the UI within GCP.

In [8]:
# AutoML client shared by the dataset, import, training, and evaluation cells below.
client = automl.AutoMlClient()

In [None]:
# Build a date-stamped display name of the form "<m_name>_<YYYYMMDD>".
# Set m_name to the prefix you want for the dataset.
fmt = '{mname}_%Y%m%d'
m_name = "YOUR-NAME"
display_name = dt.datetime.now().strftime(fmt).format(mname=m_name)

# Parent location (project + region) under which the dataset is created.
project_location = client.location_path(project_id, "us-central1")

# Describe the dataset: text classification with the MULTILABEL classification type.
metadata = automl.types.TextClassificationDatasetMetadata(
    classification_type=automl.enums.ClassificationType.MULTILABEL
)
dataset = automl.types.Dataset(
    display_name=display_name,
    text_classification_dataset_metadata=metadata,
)

# Submit the create request; .result() yields the created dataset.
response = client.create_dataset(project_location, dataset)
created_dataset = response.result()

# Show the full resource name and its last path segment (the dataset id).
print("Dataset name:", created_dataset.name)
print("Dataset id:", created_dataset.name.rsplit("/", 1)[-1])

#### Import the training data into the created dataset

In [None]:
# Target dataset (id taken from the dataset created above) and the labeled
# training CSV to load. `path` may list several URIs separated by commas.
dataset_id = created_dataset.name.rsplit("/", 1)[-1]
path = "gs://YOUR-BUCKET/TrainingDataLabeled.csv"

# Fully qualified resource name of the dataset.
dataset_full_id = client.dataset_path(project_id, "us-central1", dataset_id)

# Split the comma-separated path into individual URIs and wrap them in the
# input configuration passed to import_data.
input_uris = path.split(",")
gcs_source = automl.types.GcsSource(input_uris=input_uris)
input_config = automl.types.InputConfig(gcs_source=gcs_source)

# Submit the import request, then report the operation's result.
response = client.import_data(dataset_full_id, input_config)

print("Processing import...")
print("Data imported.", response.result())

Processing import...


#### Train the initial model with manually labeled data

In [None]:
# Build a date-stamped display name of the form "<m_name>_<YYYYMMDD>".
fmt = '{mname}_%Y%m%d'
m_name = "YOUR-NAME"
display_name = dt.datetime.now().strftime(fmt).format(mname=m_name)

# A resource that represents Google Cloud Platform location.
project_location = client.location_path(project_id, "us-central1")

# Describe the model: text classification with the MULTILABEL classification
# type, trained on the dataset referenced by `dataset_id` (set in the
# data-import cell above).
metadata = automl.types.TextClassificationModelMetadata(
    classification_type=automl.enums.ClassificationType.MULTILABEL
)
model = automl.types.Model(
    display_name=display_name,
    dataset_id=dataset_id,
    text_classification_model_metadata=metadata,
)

# Create a model with the model metadata in the region.
# The result is not awaited here; only the operation name is reported.
response = client.create_model(project_location, model)

print(u"Training operation name: {}".format(response.operation.name))
print("Training started...")

#### Model Evaluation

In [None]:
# Id of the trained model to inspect (placeholder -- replace before running).
model_id = "YOUR-MODEL-ID"

# Get the full path of the model.
model_full_id = client.model_path(project_id, "us-central1", model_id)

# Walk the evaluations returned for the model; the second argument is the
# filter expression, passed here as an empty string.
print("List of model evaluations:")
for evaluation in client.list_model_evaluations(model_full_id, ""):
    print("Model evaluation name: {}".format(evaluation.name))
    print(
        "Model annotation spec id: {}".format(
            evaluation.annotation_spec_id
        )
    )
    print("Create Time:")
    print("\tseconds: {}".format(evaluation.create_time.seconds))
    # Print the raw nanosecond component so the value matches its label
    # (it was previously divided by 1e9, i.e. shown as fractional seconds).
    print("\tnanos: {}".format(evaluation.create_time.nanos))
    print(
        "Evaluation example count: {}".format(
            evaluation.evaluated_example_count
        )
    )
    # The model trained in this notebook is a text classification model, so
    # report the classification metrics rather than the translation metrics.
    print(
        "Classification model evaluation metrics: {}".format(
            evaluation.classification_evaluation_metrics
        )
    )

In [None]:
# Get the full path of the model and print it for reference.
# Built from the same project, region, and model_id as `model_full_id` above.
model_full_id1 = client.model_path(project_id, "us-central1", model_id)
print(model_full_id1)

#### Prediction test

In [17]:
# Sample review text to classify with the trained model.
content = "I really love the El Mirage location of Rio Mirage. The food tastes great the service is awesome the atmosphere is nice and the Monday margarita deals are phenomenal! I will mention the Surprise location does not earn the same rating but the El Mirage location is the best Mexican around!"

prediction_client = automl.PredictionServiceClient()

# Fully qualified resource name of the model to query.
model_full_id = prediction_client.model_path(project_id, "us-central1", model_id)

# Wrap the text in a snippet and payload for the predict call.
# Supported mime_types: 'text/plain', 'text/html'
# https://cloud.google.com/automl/docs/reference/rpc/google.cloud.automl.v1#textsnippet
text_snippet = automl.types.TextSnippet(content=content, mime_type="text/plain")
payload = automl.types.ExamplePayload(text_snippet=text_snippet)

response = prediction_client.predict(model_full_id, payload)

# Print each returned label together with its classification score.
for annotation_payload in response.payload:
    class_name = annotation_payload.display_name
    class_score = annotation_payload.classification.score
    print(u"Predicted class name: {}".format(class_name))
    print(u"Predicted class score: {}".format(class_score))

Predicted class name: Ambience
Predicted class score: 0.868212103843689
Predicted class name: Service
Predicted class score: 0.7831429243087769
Predicted class name: Location
Predicted class score: 0.12061762809753418
Predicted class name: Value
Predicted class score: 0.007106959819793701


In [18]:
#### Get dataset ready for batch prediction for labeling
str = 'So the only place on the strip that makes them hot is at Planet Hollywood Casino. This one does actually have a real location with variety as opposed to some gift shop in a casino that will sell you a donut.' 

In [19]:
content = str

prediction_client = automl.PredictionServiceClient()

# Get the full path of the model.
model_full_id = prediction_client.model_path(
    project_id, "us-central1", model_id
)

# Supported mime_types: 'text/plain', 'text/html'
# https://cloud.google.com/automl/docs/reference/rpc/google.cloud.automl.v1#textsnippet
text_snippet = automl.types.TextSnippet(
    content=content, mime_type="text/plain"
)
payload = automl.types.ExamplePayload(text_snippet=text_snippet)

response = prediction_client.predict(model_full_id, payload)

for annotation_payload in response.payload:
    print(
        u"Predicted class name: {}".format(annotation_payload.display_name)
    )
    print(
        u"Predicted class score: {}".format(
            annotation_payload.classification.score
        )
    )

Predicted class name: Location
Predicted class score: 0.8133878707885742
Predicted class name: Value
Predicted class score: 0.19717225432395935
Predicted class name: Ambience
Predicted class score: 0.19159570336341858
Predicted class name: Service
Predicted class score: 0.004075139760971069
