In [1]:
#@title ###### Licensed to the Apache Software Foundation (ASF), Version 2.0 (the "License")

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Creating a Vertex Pipeline to extract training data

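This notebook (the second in a five-part series) creates a Vertex AI pipeline that scrapes images from an online source (e.g. Reddit) and stores the image metadata in Firestore. Here, you will build a pipeline that collects posts from a subreddit, saves the matching images to Cloud Storage, and records metadata about those images in Firestore.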

This notebook covers the following steps:

1. Creating a pipeline component to collect images from Reddit
1. Creating a pipeline component to store images in Cloud Storage
1. Creating a pipeline component to store metadata about the images in Firestore.

### Set IAM permissions

When you run a notebook on Vertex AI Workbench, the notebook runs in a Compute Engine context that has its own service account. Before the pipeline can read the Reddit API credentials from Secret Manager, you need to grant that service account IAM permission to access secrets, for example the Secret Manager Secret Accessor role (`roles/secretmanager.secretAccessor`).



### Enable the Cloud resources

For this notebook, you must have a Google Cloud project with the following resources:

+ A Cloud Storage bucket
+ The following APIs enabled:
  - Cloud Firestore
  - Vertex AI
  - Storage
  - Secret Manager
  
If you completed the [first](1_firestore.ipynb) notebook in this series, you should have these APIs already enabled.

In [17]:
# Get your GCP project id from gcloud
shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID=shell_output[0]
print("Project ID: ", PROJECT_ID)

Project ID:  fantasymaps-334622


In [34]:
# --- Notebook configuration -------------------------------------------------
# Region passed to the pipeline client and to the pipeline's `location` default.
LOCATION = "us-central1"

# Storage targets: the bucket, the object-name prefix used inside it, and the
# Firestore collection name.
BUCKET = "fantasy-maps"
GCS_PREFIX = "ScrapedData"
COLLECTION_NAME = "FantasyMapsTest"

# What to scrape, and the maximum number of posts to request.
SUBREDDIT_NAME = "battlemaps"
LIMIT = 300

### Install the required Python libraries

In [19]:
# Delete any existing requirements.txt before the next cell writes a new one.
! rm -rfd requirements.txt

In [20]:
%%writefile requirements.txt
google-cloud-secret-manager
google-cloud-aiplatform
google-cloud-pipeline-components
kfp
praw
pandas
spacy
pillow

Writing requirements.txt


In [21]:
! pip install -r requirements.txt



In [22]:
# Download spaCy's small English model (en_core_web_sm).
# NOTE(review): no cell visible in this notebook loads the model -- confirm
# that this download is still needed.
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     |████████████████████████████████| 12.8 MB 5.1 MB/s            
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Create a custom Reddit pipeline component

The pipeline and all of its components need to be compiled into a runnable format. We use the Kubeflow Pipelines (`kfp`) SDK to define the components and to compile the pipeline into a job specification file that can be submitted to Vertex AI Pipelines.

In [23]:
import os
from typing import NamedTuple

import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, ClassificationMetrics, Metrics, component)
from kfp.v2.google.client import AIPlatformClient

from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip

Now we can define the first component. In this component, we store the `pandas.DataFrame` that we build from the Reddit posts as a CSV file in Cloud Storage, and return the name of that file so it can be passed on to the next step of the pipeline.

In [29]:
@component(packages_to_install=["praw",
                                "google-cloud-secret-manager",
                                "google-cloud-storage",
                                "numpy",
                                "pandas",
                                "spacy"])
def reddit(
    secret_name: str,
    subreddit_name: str,
    gcs_bucket_name: str,
    gcs_prefix_name: str,
    project_id: str,
    limit: int,
) -> str:
    """Scrape a subreddit and save the matching posts as a CSV file in Cloud Storage.

    Requests up to `limit` posts from the subreddit's "hot" listing, keeps the
    posts whose URL contains "jpg" and whose title contains a grid size
    (digits, an "x", then a digit -- e.g. "30x40"), and uploads the result as
    a CSV object.

    Args:
        secret_name: Name of the Secret Manager secret holding the Reddit API
            credentials as JSON with the keys `client_id`, `secret` and
            `user_agent`.
        subreddit_name: Subreddit to read posts from.
        gcs_bucket_name: Cloud Storage bucket that receives the CSV file.
        gcs_prefix_name: Object-name prefix ("folder") for the CSV file.
        project_id: Google Cloud project that owns the secret and the bucket.
        limit: Maximum number of posts to request from Reddit.

    Returns:
        The object name of the uploaded CSV inside `gcs_bucket_name`
        (`<gcs_prefix_name>/reddit-scraped-<subreddit_name>-<timestamp>.csv`).
        This is the blob name, not a `gs://` URI.
    """
    # Imports stay inside the function body: KFP function-based components
    # must be self-contained, so module-level imports of the notebook are not
    # available when the component runs.
    from datetime import datetime
    import json

    import pandas as pd
    import praw

    from google.cloud import secretmanager
    from google.cloud import storage

    COLUMNS = ['Title', 'Post', 'ID', 'URL']

    def get_reddit_credentials(project_id):
        """Gets the Reddit API key out of Secret Manager.

        Arguments:
            project_id (str): the current project ID

        Returns:
            JSON object (dict)
        """
        client = secretmanager.SecretManagerServiceClient()

        # NOTE(review): the secret version is fixed to 1; confirm whether
        # "latest" should be used once the secret is rotated.
        secret_resource_name = f"projects/{project_id}/secrets/{secret_name}/versions/1"
        response = client.access_secret_version(request={"name": secret_resource_name})
        payload = response.payload.data.decode("UTF-8")

        return json.loads(payload)

    def get_reddit_posts(reddit_credentials, subreddit_name, limit):
        """Gets posts from a subreddit.

        Arguments:
            reddit_credentials (dict): a dictionary with client_id, secret, and user_agent
            subreddit_name (str): the name of the subreddit to scrape posts from
            limit (int): the maximum number of posts to grab

        Returns:
            Iterable of Reddit API objects from the subreddit's "hot" listing
        """
        reddit_client = praw.Reddit(client_id=reddit_credentials["client_id"],
                                    client_secret=reddit_credentials["secret"],
                                    user_agent=reddit_credentials["user_agent"])

        return reddit_client.subreddit(subreddit_name).hot(limit=limit)

    def convert_posts_to_dataframe(posts, columns):
        """Builds a DataFrame with one row per post.

        Arguments:
            posts (iterable): Reddit API objects exposing title, selftext, id and url
            columns (list): the four column names, in that order

        Returns:
            pandas.DataFrame (empty, but with the given columns, if there are no posts)
        """
        rows = [[s.title, s.selftext, s.id, s.url] for s in posts]
        # Build the frame straight from the list: going through np.array()
        # turns an empty result into a 1-D array, which does not fit four
        # columns and makes the DataFrame constructor raise.
        return pd.DataFrame(rows, columns=columns)

    # Get the data from Reddit
    credentials = get_reddit_credentials(project_id=project_id)
    posts = get_reddit_posts(reddit_credentials=credentials,
                             subreddit_name=subreddit_name,
                             limit=limit)
    reddit_posts_df = convert_posts_to_dataframe(posts=posts, columns=COLUMNS)

    # Remove all of the posts that don't meet our criteria: the URL must
    # mention "jpg" and the title must contain a grid size such as "30x40".
    # The pattern is a raw string so that "\d" is not an invalid escape.
    is_jpg = reddit_posts_df["URL"].str.contains("jpg")
    has_grid_size = reddit_posts_df["Title"].str.contains(pat=r"\d+x\d")
    jpg_df = reddit_posts_df[is_jpg & has_grid_size]

    # Save the dataframe as CSV in Storage. The index column is written on
    # purpose: the `firestore` component in this notebook inserts columns by
    # position and relies on the resulting column count.
    csv_str = jpg_df.to_csv()

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(gcs_bucket_name)

    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    csv_file_uri = f"{gcs_prefix_name}/reddit-scraped-{subreddit_name}-{timestamp}.csv"

    file_blob = bucket.blob(csv_file_uri)
    file_blob.upload_from_string(csv_str)

    return csv_file_uri
    

## Create the Cloud Storage component

In [None]:
# TODO: implement the Cloud Storage component.
# This cell intentionally contains no executable code so that the notebook
# still runs from top to bottom.

## Create the Firestore component

In [125]:
from typing import NamedTuple

@component(packages_to_install=["Pillow",
                                "google-cloud-firestore",
                                "google-cloud-storage",
                                "numpy",
                                "pandas",
                                "requests"])
def firestore(
    subreddit_name: str,
    collection_name: str,
    gcs_bucket_name: str,
    gcs_prefix_name: str,
    csv_input_file: str,
    project_id: str,
) -> NamedTuple(
    "Outputs",
    [
        ("batch_predict_file_uri", str),
        ("bp_inputs_count", int),
    ]
):
    """Download scraped images, store them in Cloud Storage and record them in Firestore.

    Reads the CSV named by `csv_input_file` from the bucket, downloads every
    image URL listed in it, and uses the SHA-1 of the image bytes as the
    Firestore document ID. Images that have no document yet are uploaded to
    Cloud Storage; those whose title yields a usable grid size additionally
    get a Firestore document and a line in a batch-prediction input file.

    Args:
        subreddit_name: Accepted for interface compatibility; not used by the
            component body.
        collection_name: Firestore collection that receives the documents.
        gcs_bucket_name: Bucket holding the input CSV and receiving the images
            and the batch-prediction file.
        gcs_prefix_name: Object-name prefix for the uploaded images and the
            batch-prediction file; also stored as the document's `source`.
        csv_input_file: Object name of the CSV inside the bucket. It must
            provide the columns `URL` and `Title`.
        project_id: Google Cloud project for the Storage and Firestore clients.

    Returns:
        A tuple `(batch_predict_file_uri, bp_inputs_count)`: the `gs://` URI
        of the uploaded JSONL file and the number of lines in it, or
        `("", 0)` when no new document was written.
    """
    # Imports stay inside the function body: KFP function-based components
    # must be self-contained.
    from datetime import datetime
    import hashlib
    from io import BytesIO
    import json
    import re

    import pandas as pd
    from PIL import Image
    import requests

    from google.cloud import firestore
    from google.cloud import storage

    def make_nice_filename(name, *, rows=None, cols=None):
        """Turn a post title into a short, lower-case `.jpg` file name.

        Whitespace, `|`, parentheses and double quotes become underscores,
        the name is cut to 30 characters, and `.{cols}x{rows}` is appended
        when both grid dimensions are given.
        """
        new_name = re.sub(r'[\s|\(|"|\)]', "_", name)
        new_name = new_name.lower()[:30]
        new_name = new_name.replace("__", "_")

        if rows is not None and cols is not None:
            new_name += f".{cols}x{rows}"
        return f"{new_name}.jpg"

    def create_vtt_json(content, title):
        """Derive grid metadata from the image bytes and the post title.

        The first "<digits>x<digits>" in the title is read as columns x rows.
        Returns None when the title has no grid size, when a dimension is
        zero, or when the resulting cells are not square; otherwise a dict
        with the grid and image dimensions.
        """
        img = Image.open(BytesIO(content))
        w, h = img.size

        dims = re.findall(r"\d+x\d+", title)
        if not dims:
            return None

        dims = dims[0].split("x")
        if len(dims) != 2:
            return None

        cols = int(dims[0])
        rows = int(dims[1])
        if cols == 0 or rows == 0:
            # A "0xN" title would otherwise divide by zero below.
            return None

        # Columns run across the width and rows down the height, which is
        # also how compute_bboxes() combines cellWidth with cols and
        # cellHeight with rows.
        cell_w = w / cols
        cell_h = h / rows
        if cell_w != cell_h:
            return None

        return {
            "cols": cols,
            "rows": rows,
            "imageWidth": w,
            "imageHeight": h,
            "cellOffsetX": 0,
            'cellOffsetY': 0,
            'cellWidth': cell_w,
            'cellHeight': cell_h,
        }

    def compute_bboxes(vtt_data):
        """Return one normalised bounding box per grid cell, padded by 2 px per side.

        NOTE(review): both loops start at 1, so column 0 and row 0 get no
        box -- confirm that this is intended.
        """
        bboxes = []

        cols = vtt_data["cols"]
        rows = vtt_data["rows"]

        for x in range(1, cols):
            for y in range(1, rows):
                x_min_tmp = vtt_data["cellOffsetX"] + (vtt_data["cellWidth"] * x) - 2
                x_max_tmp = x_min_tmp + vtt_data["cellWidth"] + 4
                y_min_tmp = vtt_data["cellOffsetY"] + (vtt_data["cellHeight"] * y) - 2
                y_max_tmp = y_min_tmp + vtt_data["cellHeight"] + 4

                # Scale pixel coordinates to fractions of the image size.
                bboxes.append({
                    "xMin": x_min_tmp / vtt_data["imageWidth"],
                    "yMin": y_min_tmp / vtt_data["imageHeight"],
                    "xMax": x_max_tmp / vtt_data["imageWidth"],
                    "yMax": y_max_tmp / vtt_data["imageHeight"],
                    "displayName": "cell"
                })

        return bboxes

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(gcs_bucket_name)

    firestore_client = firestore.Client(project=project_id)
    collection_ref = firestore_client.collection(collection_name)

    # Load the CSV produced by the scraping step.
    blob = bucket.blob(csv_input_file)
    csv_bytes = blob.download_as_string()
    jpg_df = pd.read_csv(BytesIO(csv_bytes))

    # Add the bookkeeping columns. "GcsURI" is appended at the end instead of
    # at a fixed position so that the column count of the CSV does not matter.
    empty_column = [None] * len(jpg_df.index)
    jpg_df.insert(1, "HashId", empty_column, True)
    jpg_df.insert(len(jpg_df.columns), "GcsURI", empty_column, True)

    # One JSON line per image that is new and has usable grid metadata.
    bp_lines = []

    # Iterate over the JPG URLs, download each image and hash its bytes.
    for i, r in jpg_df.iterrows():
        jpg_url = r["URL"]
        title = r["Title"]

        try:
            req = requests.get(jpg_url, timeout=30)
        except requests.RequestException as err:
            # One unreachable URL should not abort the whole run.
            print(f"Skipping {jpg_url}: {err}")
            continue

        if req.status_code != 200:
            continue

        content = req.content
        jpg_hash = hashlib.sha1(content).hexdigest()

        # .loc avoids chained assignment, which may write to a temporary copy.
        jpg_df.loc[i, "HashId"] = jpg_hash

        # Images that already have a Firestore document were handled by an
        # earlier run; skip them.
        doc_ref = collection_ref.document(jpg_hash)
        if doc_ref.get().exists:
            continue

        # Get image grid metadata
        img_data = create_vtt_json(content, title)
        print(img_data)

        if img_data is not None:
            file_name = make_nice_filename(title,
                                           rows=img_data["rows"],
                                           cols=img_data["cols"])
        else:
            file_name = make_nice_filename(title)

        blob_name = f"{gcs_prefix_name}/{file_name}"
        img_gcs_uri = f"gs://{gcs_bucket_name}/{blob_name}"

        # The image is uploaded whether or not grid metadata was found.
        bucket.blob(blob_name).upload_from_file(BytesIO(content))

        # Only images with grid metadata are recorded in Firestore and queued
        # for batch prediction.
        if img_data is None:
            continue

        data = {
            u"filename": file_name,
            u"gcsURI": img_gcs_uri,
            u"source": gcs_prefix_name,
            u"userId": "None",
            u"sourceUrl": jpg_url,
            "vtt": img_data,
            "computedBBoxes": compute_bboxes(img_data),
        }

        doc_ref.set(data)
        print(f"Set data: {data}")
        bp_lines.append(json.dumps({ "content": img_gcs_uri, "mimeType": "image/jpeg"}))

    # No fresh JPGs in this scraping; return empty string
    if not bp_lines:
        print("no inputs")
        return ("", 0)

    print(f"First ten: {jpg_df.head(10)}")

    # Save the batch_predict file (JSON Lines, each line newline-terminated)
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    bp_blob_name = f"{gcs_prefix_name}/bp_input_{timestamp}.jsonl"
    batch_predict_file_uri = f"gs://{gcs_bucket_name}/{bp_blob_name}"

    bp_inputs = "\n".join(bp_lines) + "\n"
    bucket.blob(bp_blob_name).upload_from_string(bp_inputs)

    return (batch_predict_file_uri, len(bp_lines))

## Build a simple pipeline

In [30]:
@dsl.pipeline(
    name="reddit-scraper-pipeline",
    description="Gets data from a subreddit",
    pipeline_root=f"gs://{BUCKET}/pipeline_root",
)
def reddit_pipeline(
    collection_name: str = COLLECTION_NAME,
    secret_name: str = "reddit-api-key",
    subreddit_name: str = SUBREDDIT_NAME,
    gcs_bucket: str = BUCKET,
    gcs_prefix: str = GCS_PREFIX,
    project_id: str = PROJECT_ID,
    location: str = LOCATION,
    limit: int = LIMIT,
):
    # Single-step pipeline: run the `reddit` scraping component.
    # `collection_name` and `location` are accepted as parameters but are not
    # consumed by any step defined below.

    # Step 1 -- get the images from Reddit.
    scrape_task = reddit(
        secret_name=secret_name,
        subreddit_name=subreddit_name,
        gcs_bucket_name=gcs_bucket,
        gcs_prefix_name=gcs_prefix,
        project_id=project_id,
        limit=limit,
    )

    # Return value of the scrape step (the name of the CSV object it wrote);
    # nothing consumes it yet.
    scraped_csv_blob = scrape_task.output

    # Firestore operation (not wired in yet)

In [31]:
# The compiled job spec is written to artifacts/; create the directory first
# so that a fresh checkout does not fail because it is missing.
os.makedirs("artifacts", exist_ok=True)

# Compile the pipeline function into a job spec file that can be submitted.
compiler.Compiler().compile(
    pipeline_func=reddit_pipeline,
    package_path="artifacts/reddit_scraper_pipeline_job.json",
)

In [32]:
# Client for the project and region configured above; the run-submission cell
# uses it to start the pipeline.
api_client = AIPlatformClient(project_id=PROJECT_ID, region=LOCATION)

When we run the pipeline, we disable caching. With caching enabled, a step whose inputs have not changed is skipped and its previous output is reused, so the scraper would keep returning the same results instead of fetching new posts.

In [33]:
# Submit the compiled job spec; caching is switched off for this run.
response = api_client.create_run_from_job_spec(
    enable_caching=False,
    job_spec_path="artifacts/reddit_scraper_pipeline_job.json",
)