# Call an external API from a managed pipeline

This notebook uses Cloud Secret Manager to import an API key into a

## Install all dependencies

In [2]:
import os

# The Google Cloud Notebook product has specific requirements
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user'
USER_FLAG = ""
if IS_GOOGLE_CLOUD_NOTEBOOK:
    USER_FLAG = "--user"

In [3]:
!pip3 install {USER_FLAG} google-cloud-secret-manager --upgrade
!pip3 install {USER_FLAG} google-cloud-aiplatform --upgrade
!pip3 install {USER_FLAG} kfp google-cloud-pipeline-components --upgrade

Collecting google-cloud-secret-manager
  Downloading google_cloud_secret_manager-2.5.0-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 2.4 MB/s eta 0:00:011
Installing collected packages: google-cloud-secret-manager
Successfully installed google-cloud-secret-manager-2.5.0


### Set project information

In [6]:
# Get your GCP project id from gcloud
shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID=shell_output[0]
print("Project ID: ", PROJECT_ID)

Project ID:  erschmid-test-291318


### Set IAM permissions on your service account

`secretmanager.versions.access`

## Store your API key in Cloud Secret Manager

Although you can [create a new secret in Cloud Secret Manager programmatically](https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets#create), in this notebook you must create it using the Cloud Console.

To create a new secret in the Cloud Console, do the following:

  1. Open the [Cloud Console](https://console.cloud.google.com/security/secret-manager).
  1. Click **Create secret**.
  1. In the **Create secret** page, do the following:
     
     + Give your secret a memorable name. This notebook uses the Reddit API, so the name of the secret
       is 'reddit-api-key'.
     + Upload the credentials file. In this example, the `client_id`, `secret`, and `user_agent` credentials
       provided by Reddit are stored as JSON in a single file.
  
  1. Click **Create secret** at the bottom of the page.
  

## Access the key programmatically

In [4]:
! pip install {USER_FLAG} praw

Collecting praw
  Downloading praw-7.3.0-py3-none-any.whl (165 kB)
[K     |████████████████████████████████| 165 kB 5.0 MB/s eta 0:00:01
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.2.0-py3-none-any.whl (15 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update-checker, prawcore, praw
Successfully installed praw-7.3.0 prawcore-2.2.0 update-checker-0.18.0


In [11]:
from google.cloud import secretmanager
import json

client = secretmanager.SecretManagerServiceClient()

secret_resource_name = f"projects/{PROJECT_ID}/secrets/reddit-api-key/versions/1"

response = client.access_secret_version(request={"name": secret_resource_name})

payload = response.payload.data.decode("UTF-8")

reddit_key_json = json.loads(payload)

### Construct a request to the Reddit API

In [13]:
import praw

reddit = praw.Reddit(client_id=reddit_key_json["client_id"], 
                     client_secret=reddit_key_json["secret"],
                     user_agent=reddit_key_json["user_agent"])
print(f'Reddit is in read-only mode: {reddit.read_only}')

Reddit is in read-only mode: True


In [14]:
import numpy as np
import pandas as pd

nan_value = float("NaN")
sciatica_sub = "sciatica"

In [20]:
posts = reddit.subreddit(sciatica_sub).hot(limit=100)
filtered_posts = [[s.title, s.selftext, s.id] for s in posts]

filtered_posts = np.array(filtered_posts)
reddit_posts_df = pd.DataFrame(filtered_posts,
                               columns=['Title', 'Posts', 'ID'])

# Drop all the rows with empty values
reddit_posts_df.replace("", nan_value, inplace=True)
reddit_posts_df = reddit_posts_df[reddit_posts_df.Posts != nan_value]


# Print 
reddit_posts_df.head(10)

print(reddit_posts_df.iloc[8]['Title'])

A student with sciatic


## Create a custom pipelines component

In [23]:
from typing import NamedTuple

import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, ClassificationMetrics, Metrics, component)
from kfp.v2.google.client import AIPlatformClient

from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip

In [None]:
! pip install {USER_FLAG} google-auth

In [57]:
from typing import NamedTuple
from google.cloud import secretmanager
import json

def get_google_cloud_credentials():
    from google import auth
    creds, project = google.auth.default()

    LocalCredentials = NamedTuple("LocalCredentials",
    [
        ("creds", str),
        ("project", str),
    ])
    return LocalCredentials(creds, project)

local_creds = get_google_cloud_credentials()

client = secretmanager.SecretManagerServiceClient(credentials=local_creds.creds)

secret_resource_name = f"projects/{project}/secrets/reddit-api-key/versions/1"
response = client.access_secret_version(request={"name": secret_resource_name})
payload = response.payload.data.decode("UTF-8")

print(json.loads(payload))

{'secret': '_XDRI2jgcVAJ6xKIWmA46yz8CZw', 'client_id': 'Z0g7xbmKNB9Mew', 'user_agent': 'script:ScrapeForNLP:v1.0 (by u/Telpirion-78)', 'user_name': 'Telpirion-78'}


In [68]:
@component(packages_to_install=["google-auth", "praw", "google-cloud-secret-manager", "numpy", "pandas"],
           output_component_file="reddit.yaml")
def reddit(
    secret_name: str,
    subreddit_name: str,
) -> str:
    import praw
    import pandas as pd
    import numpy as np

    
    def get_google_cloud_credentials():
        import google.auth
        creds, project = google.auth.default()

        LocalCredentials = NamedTuple("LocalCredentials",
        [
            ("creds", str),
            ("project", str),
        ])
        return LocalCredentials(creds, project)

    
    def get_reddit_credentials(creds, project_id):
        from google.cloud import secretmanager
        import json

        client = secretmanager.SecretManagerServiceClient(credentials=creds)

        secret_resource_name = f"projects/{project_id}/secrets/{secret_name}/versions/1"
        response = client.access_secret_version(request={"name": secret_resource_name})
        payload = response.payload.data.decode("UTF-8")

        return json.loads(payload)
    
    def get_reddit_posts(reddit_credentials):
        import praw

        reddit = praw.Reddit(client_id=reddit_credentials["client_id"], 
                     client_secret=reddit_credentials["secret"],
                     user_agent=reddit_credentials["user_agent"])
        print(f"Reddit is in read-only mode: {reddit.read_only}")
        return reddit.subreddit(subreddit_name).hot(limit=100)
    
    nan_value = float("NaN")
    
    google_cloud_credentials = get_google_cloud_credentials()
    project_id = google_cloud_credentials.project
    service_account_credentials = google_cloud_credentials.creds
    
    credentials = get_reddit_credentials(service_account_credentials, project_id)
    posts = get_reddit_posts(credentials)
    
    filtered_posts = [[s.title, s.selftext, s.id] for s in posts]

    filtered_posts = np.array(filtered_posts)
    reddit_posts_df = pd.DataFrame(filtered_posts,
                                   columns=['Title', 'Posts', 'ID'])

    reddit_posts_df.replace("", nan_value, inplace=True)
    reddit_posts_df = reddit_posts_df[reddit_posts_df.Posts != nan_value]

    output = reddit_posts_df.iloc[6]['Title']
    print(f"Output should be: {output}")
    
    return output
    

## Build a simple pipeline

In [69]:
@dsl.pipeline(
    name="simple-reddit",
    description="Gets data from a subreddit",
    pipeline_root="gs://erschmid-test-291318-bucket/pipeline_root",
)

def simple_reddit_pipeline(
    secret_name: str = "reddit-api-key",
    subreddit_name: str = "googlecloud"
):
    consumer_task = reddit(
        secret_name,
        subreddit_name,
    )

In [70]:
compiler.Compiler().compile(
    pipeline_func=simple_reddit_pipeline, package_path="simple_reddit_pipeline_job.json"
)

In [71]:
api_client = AIPlatformClient(
    project_id=PROJECT_ID,
    region='us-central1',
)

In [72]:
response = api_client.create_run_from_job_spec(
    job_spec_path="simple_reddit_pipeline_job.json",
)