# BDCC project - Loader Cloud Function development

**[Big Data and Cloud Computing](https://www.dcc.fc.up.pt/~edrdo/aulas/bdcc), Project 1**

Make sure you go through the __[Google Cloud Functions Pub/Sub tutorial](https://cloud.google.com/functions/docs/tutorials/pubsub)__ before you start developing the LCF.



## GCP authentication function

In [0]:
# Mount Google Drive under /content/drive so that files stored there
# are reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# The authentication method 
def google_colab_authenticate(projectId, keyFile=None, debug=True):  
    import os
    from google.colab import auth
    if keyFile == None:
      keyFile='/content/bdcc-colab.json'
    if os.access(keyFile,os.R_OK):
      if debug:
        print('Using key file "%s"' % keyFile)
      os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '%s' % keyFile
      os.environ['GCP_PROJECT'] = projectId 
      os.environ['GCP_ACCOUNT'] = 'bdcc-colab@' + projectId + '.iam.gserviceaccount.com'
      !gcloud auth activate-service-account "$GCP_ACCOUNT" --key-file="$GOOGLE_APPLICATION_CREDENTIALS" --project="$GCP_PROJECT"
    else:
      if debug:
        print('No key file given. You may be redirected to the verification code procedure.')
      auth.authenticate_user()
      !gcloud config set project $projectId
    !gcloud info | grep -e Account -e Project

# Copy the key file from Google Drive, if it is available there,
# to a path without spaces (spaces in paths usually cause problems)
!test -f "/content/drive/My Drive/bdcc-colab.json" && cp "/content/drive/My Drive/bdcc-colab.json" /content/bdcc-colab.json



## Cloud function code

This should be placed in a single cell to facilitate deployment as a cloud function.

Note that you cannot use "magic" notebook extensions such as `! shell command` or `%%bigquery`.

In [0]:
# Imports
import base64
import pandas as pd
import os
import tempfile
import time
from zipfile import ZipFile
import google.cloud.bigquery as bq
import google.cloud.storage as gcs


# Parameters
PROJECT_ID = 'bigdata-269209'  # TODO change to your project id
BUCKET_NAME = 'zaoutputbucket' # TODO change to your bucket name
PUBSUB_TOPIC = 'new_output'    # topic the notebook publishes dataset names to
OUTPUT_ZIP_FILE = 'output.zip' # archive name, both in the bucket and locally

# When true, debug() prints its messages.
DEBUG = True
# The presence of the COLAB_GPU environment variable is used as the signal
# that the code is running inside Colab rather than as a cloud function.
RUNNING_IN_COLAB = os.environ.get('COLAB_GPU') is not None

# Scratch directory, created once at import time, for the downloaded archive
# and the files extracted from it.
TMP_DIR = tempfile.mkdtemp(prefix='LCF_')

# Debug method
def debug(message):
  """Print ``message`` to stdout, but only while the DEBUG flag is set."""
  if not DEBUG:
    return
  print(message)

# Authenticate to GCP if running in Colab
# (google_colab_authenticate is not defined in this cell; it must already
# exist in the notebook session when RUNNING_IN_COLAB is true).
if RUNNING_IN_COLAB:
  google_colab_authenticate(PROJECT_ID)

# Initialize interface to BigQuery and GCS
# These module-level objects are created once and shared by all the
# functions below; BUCKET is a handle bound to BUCKET_NAME.
BQ_CLIENT = bq.Client(PROJECT_ID)
GCS_CLIENT = gcs.Client(PROJECT_ID)
BUCKET = gcs.Bucket(GCS_CLIENT, BUCKET_NAME)

def get_data_from_cloud_storage(dataset_id):
  """Download the zip archive of ``dataset_id`` from the bucket into TMP_DIR.

  The object ``<dataset_id>/<OUTPUT_ZIP_FILE>`` of BUCKET is written to
  ``<TMP_DIR>/<OUTPUT_ZIP_FILE>``, replacing any file already at that path.
  """
  remote_object = '%s/%s' % (dataset_id, OUTPUT_ZIP_FILE)
  destination = '%s/%s' % (TMP_DIR, OUTPUT_ZIP_FILE)
  debug('Downloading gs://%s/%s to %s' % (BUCKET_NAME,remote_object,destination))
  archive_blob = gcs.Blob(remote_object, BUCKET)
  with open(destination, 'wb') as sink:
    archive_blob.download_to_file(sink)
  

def unzip_data_file():
  """Extract ``<TMP_DIR>/<OUTPUT_ZIP_FILE>`` into TMP_DIR."""
  # NOTE(review): TMP_DIR is created once per process and nothing here clears
  # it, so files extracted by an earlier call stay on disk unless the new
  # archive overwrites them -- confirm that this is harmless.
  archive_path = '%s/%s' % (TMP_DIR, OUTPUT_ZIP_FILE)
  debug('Unzipping %s' % archive_path)
  with ZipFile(archive_path) as archive:
    archive.extractall(path=TMP_DIR)
  debug('Unzipping done')

def load_movie_agg_data(dataset_id):
  """Create the ``movies_agg`` BigQuery table and load its Parquet data.

  Reads ``<TMP_DIR>/movies_agg.parquet`` into a pandas DataFrame, creates
  the table ``<PROJECT_ID>.<dataset_id>.movies_agg`` with an explicit schema
  and loads the DataFrame into it, blocking until the load job has finished.

  Args:
    dataset_id: id of the BigQuery dataset that will hold the table.

  Raises:
    Any error raised while reading the Parquet data or creating the table,
    and the load job's error if the load itself fails.
  """
  tid = 'movies_agg'
  table_name = '%s.%s.%s' % (PROJECT_ID, dataset_id, tid)

  # Read parquet file
  parquet_files_path = '%s/%s.parquet' % (TMP_DIR, tid)
  debug('Reading Parquet files from %s' % parquet_files_path)
  pdf = pd.read_parquet(parquet_files_path)
  debug(str(pdf.head(5)))

  # Create BigQuery table
  # NOTE(review): the sample output recorded in this notebook shows `year`
  # as a float column (1995.0) while the schema declares INTEGER -- confirm
  # that the load converts it as intended.
  table = bq.Table(table_name)
  table.schema = (
        bq.SchemaField("movieId", "INTEGER", mode="REQUIRED"),
        bq.SchemaField("title", "STRING", mode="REQUIRED"),
        bq.SchemaField("year", "INTEGER", mode="REQUIRED"),
        bq.SchemaField("imdbId", "INTEGER", mode="REQUIRED"),
        bq.SchemaField("numRatings", "INTEGER", mode="REQUIRED"),
        bq.SchemaField("avgRating", "FLOAT", mode="REQUIRED"),
  )
  debug('Creating %s' % table_name)
  BQ_CLIENT.create_table(table)

  debug('Populating %s with %d rows' % (table_name, len(pdf)))
  load_job = BQ_CLIENT.load_table_from_dataframe(pdf, table)

  # result() blocks until the job is done and raises if it failed. Polling
  # running() only detected that the job had stopped, so a failed load was
  # reported as 'Done' without any error.
  debug('waiting for load job to complete')
  load_job.result()

  debug('Done with table %s' % table_name)

def load_tfidf_data(dataset_id):
  """Create the ``tfidf`` BigQuery table and load its Parquet data.

  Reads ``<TMP_DIR>/tfidf.parquet`` into a pandas DataFrame, casts the
  ``movieId`` column to int32, creates the table
  ``<PROJECT_ID>.<dataset_id>.tfidf`` with an explicit schema and loads the
  DataFrame into it, blocking until the load job has finished.

  Args:
    dataset_id: id of the BigQuery dataset that will hold the table.

  Raises:
    Any error raised while reading the Parquet data or creating the table,
    and the load job's error if the load itself fails.
  """
  tid = 'tfidf'
  table_name = '%s.%s.%s' % (PROJECT_ID, dataset_id, tid)

  # Read parquet file
  parquet_files_path = '%s/%s.parquet' % (TMP_DIR, tid)
  debug('Reading Parquet files from %s' % parquet_files_path)
  pdf = pd.read_parquet(parquet_files_path)
  # The table declares movieId as INTEGER, so make the column integral.
  pdf.movieId = pdf.movieId.astype("int32")
  debug(str(pdf.head(5)))

  # Create BigQuery table
  table = bq.Table(table_name)
  table.schema = (
        bq.SchemaField("movieId", "INTEGER", mode="REQUIRED"),
        bq.SchemaField("word", "STRING", mode="REQUIRED"),
        bq.SchemaField("tf_idf", "FLOAT", mode="REQUIRED"),
  )
  debug('Creating %s' % table_name)
  BQ_CLIENT.create_table(table)

  debug('Populating %s with %d rows' % (table_name, len(pdf)))
  load_job = BQ_CLIENT.load_table_from_dataframe(pdf, table)

  # result() blocks until the job is done and raises if it failed. Polling
  # running() only detected that the job had stopped, so a failed load was
  # reported as 'Done' without any error.
  debug('waiting for load job to complete')
  load_job.result()

  debug('Done with table %s' % table_name)

def load_ratings_tags_data(dataset_id):
  """Create the ``ratings_tags`` BigQuery table and load its Parquet data.

  Reads ``<TMP_DIR>/ratings_tags.parquet`` into a pandas DataFrame, creates
  the table ``<PROJECT_ID>.<dataset_id>.ratings_tags`` with an explicit
  schema and loads the DataFrame into it, blocking until the load job has
  finished.

  Args:
    dataset_id: id of the BigQuery dataset that will hold the table.

  Raises:
    Any error raised while reading the Parquet data or creating the table,
    and the load job's error if the load itself fails.
  """
  tid = 'ratings_tags'
  table_name = '%s.%s.%s' % (PROJECT_ID, dataset_id, tid)

  # Read parquet file
  parquet_files_path = '%s/%s.parquet' % (TMP_DIR, tid)
  debug('Reading Parquet files from %s' % parquet_files_path)
  pdf = pd.read_parquet(parquet_files_path)
  debug(str(pdf.head(5)))

  # Create BigQuery table
  table = bq.Table(table_name)
  table.schema = (
        bq.SchemaField("movieId", "INTEGER", mode="REQUIRED"),
        bq.SchemaField("title", "STRING", mode="REQUIRED"),
        bq.SchemaField("userId", "INTEGER", mode="REQUIRED"),
  )
  debug('Creating %s' % table_name)
  BQ_CLIENT.create_table(table)

  debug('Populating %s with %d rows' % (table_name, len(pdf)))
  load_job = BQ_CLIENT.load_table_from_dataframe(pdf, table)

  # result() blocks until the job is done and raises if it failed. Polling
  # running() only detected that the job had stopped, so a failed load was
  # reported as 'Done' without any error.
  debug('waiting for load job to complete')
  load_job.result()

  debug('Done with table %s' % table_name)

def handle_pubsub_message(event, context):
  """Rebuild the BigQuery dataset named by a Pub/Sub message.

  The dataset id is taken from ``event['data']``: used as-is when running in
  Colab, base64-decoded as UTF-8 otherwise. The dataset's archive is
  downloaded and unpacked, any existing BigQuery dataset with that id is
  dropped and recreated, and the three tables are loaded into it.

  Args:
    event: mapping with a 'data' entry carrying the dataset id.
    context: only logged; not otherwise used here.
  """
  debug('Event: %s' % event)
  debug('Context: %s' % context)

  payload = event['data']
  # The local Colab test passes the name in clear text; otherwise the
  # payload is expected to be base64-encoded.
  dataset_id = payload if RUNNING_IN_COLAB else base64.b64decode(payload).decode('utf-8')

  debug('Dataset: %s' % dataset_id)

  # Fetch and unpack the input before touching the existing dataset.
  get_data_from_cloud_storage(dataset_id)
  unzip_data_file()

  debug('Deleting previous BiqQuery dataset (if any)')
  BQ_CLIENT.delete_dataset(dataset_id, delete_contents=True, not_found_ok=True)
  BQ_CLIENT.create_dataset(dataset_id)
  debug('Created BiqQuery dataset')

  # Same order as before: movies_agg, tfidf, ratings_tags.
  for load_table in (load_movie_agg_data, load_tfidf_data, load_ratings_tags_data):
    load_table(dataset_id)

  debug('Done for data set %s' % dataset_id)



No key file given. You may be redirected to the verification code procedure.
Updated property [core/project].
Account: [jprfpa@gmail.com]
Project: [bigdata-269209]


## Test cloud function locally

In [0]:
# Call the handler directly, in this process, with the dataset name passed
# as plain text in the 'data' field and no context object.
dataset = 'large5' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
handle_pubsub_message({ 'data': dataset}, None)

Event: {'data': 'large5'}
Context: None
Dataset: large5
Downloading gs://zaoutputbucket/large5/output.zip to /tmp/LCF_cj_ff41j/output.zip
Unzipping /tmp/LCF_cj_ff41j/output.zip
Unzipping done
Deleting previous BiqQuery dataset (if any)
Created BiqQuery dataset
Reading Parquet files from /tmp/LCF_cj_ff41j/movies_agg.parquet
   movieId                        title    year  imdbId  numRatings  avgRating
0        1                    Toy Story  1995.0  114709       57309   3.893708
1        2                      Jumanji  1995.0  113497       24228   3.251527
2        3             Grumpier Old Men  1995.0  113228       11804   3.142028
3        4            Waiting to Exhale  1995.0  114885        2523   2.853547
4        5  Father of the Bride Part II  1995.0  113041       11714   3.058434
Creating bigdata-269209.large5.movies_agg
Populating bigdata-269209.large5.movies_agg with 62345 rows
waiting for load job to complete
waiting for load job to complete
waiting for load job to complete


## Trigger cloud function once it is deployed

Before deployment do not forget to add the following dependencies to __requirements.txt__ in the function definition (note that __pyarrow__ is required for Parquet data handling using Pandas):

```
pyarrow 
pandas
google.cloud.bigquery
google.cloud.storage
```

In [0]:
# Publish the dataset name as a message on PUBSUB_TOPIC
# (presumably the topic that triggers the deployed function -- verify against
# the deployment settings).
dataset = 'large5' #@param ["tiny1", "tiny2", "tiny3", "tiny4", "medium1", "medium2", "medium3", "medium4", "large1", "large2", "large3", "large4", "large5"] {allow-input: true}
!gcloud pubsub topics publish "$PUBSUB_TOPIC" --message "$dataset"

messageIds:
- '1101138481946456'


In [0]:
# TODO You may now check, as in previous notebooks:
# - your BigQuery data is ok with some queries in the notebook and/or in the BigQuery Web UI
# - inspect function logs

from google.cloud import bigquery
client = bigquery.Client(project=PROJECT_ID)

#check if tfidf was loaded properly 
tfidf_test = client.query(
  '''
  SELECT *
  FROM  `%s.tfidf` 
  ORDER BY tf_idf DESC
  LIMIT 10
  ''' % (dataset)
)

print(tfidf_test.to_dataframe())
print("\n\n")

#check if movies_agg was loaded properly
movies_agg_test = tfidf_test = client.query(
  '''
  SELECT *
  FROM  `%s.movies_agg` 
  LIMIT 10
  ''' % (dataset)
)

ratings_tags_test = client.query(
  '''
  SELECT *
  FROM  `%s.ratings_tags` 
  LIMIT 10
  ''' % (dataset)
)

print(ratings_tags_test.to_dataframe())
print("\n\n")

cloudFunctionName = "LCF"
!gcloud functions logs read $cloudFunctionName --limit 1000 

   movieId      word    tf_idf
0    65894    blutch  15.92979
1   133437     bicek  15.92979
2     1764   brouwer  15.92979
3   192981  bromhead  15.92979
4   193629   belugou  15.92979
5   169388  cammaert  15.92979
6   161054     bärin  15.92979
7    52551   bjorlin  15.92979
8    92514  beradino  15.92979
9   157316   arnetia  15.92979



   movieId      title  userId
0        1  Toy Story       2
1        1  Toy Story       3
2        1  Toy Story       4
3        1  Toy Story       5
4        1  Toy Story       8
5        1  Toy Story      10
6        1  Toy Story      12
7        1  Toy Story      13
8        1  Toy Story      18
9        1  Toy Story      26



LEVEL  NAME  EXECUTION_ID      TIME_UTC                 LOG
D      LCF   1081643503488292  2020-04-02 14:15:59.264  Function execution started
I      LCF   1081643503488292  2020-04-02 14:15:59.276  Event: {'@type': 'type.googleapis.com/google.pubsub.v1.PubsubMessage', 'attributes': None, 'data': 'dGlueTE='}
I      LCF   