<a href="https://colab.research.google.com/github/JingliSHI0206/finetune_bert_sentiment_analysis/blob/main/cba_bert_finetune_sentiment_analysis_gcp_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune BERT for Sentiment Analysis on Google Cloud Platform

## 1. Setup Environment

In [13]:
import os
import sys
from datetime import datetime

# Project-wide identifiers reused by the later cells.
PROJECT_ID = "ajcai2021-text-summarization"
APP_NAME = "finetuned-bert-sentiment-analysis"

# Exported for child processes and libraries that read this variable
# (presumably the Hugging Face tokenizers library -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The Google Cloud Notebook product has specific requirements; the presence of
# this metadata file is used as the marker for that environment.
IS_GOOGLE_CLOUD_NOTEBOOK = os.path.exists("/opt/deeplearning/metadata/env_version")

# Google Cloud Notebook requires dependencies to be installed with '--user';
# everywhere else no extra pip flag is passed.
USER_FLAG = "--user" if IS_GOOGLE_CLOUD_NOTEBOOK else ""


def get_timestamp():
    """Return the current local time as a 14-digit ``YYYYMMDDHHMMSS`` string."""
    return f"{datetime.now():%Y%m%d%H%M%S}"


# Take one timestamp for this notebook session and echo it for reference.
TIMESTAMP = get_timestamp()
print("TIMESTAMP = " + TIMESTAMP)

TIMESTAMP = 20211118130043


In [14]:
!pip -q install {USER_FLAG} --upgrade transformers
!pip -q install {USER_FLAG} --upgrade datasets
!pip -q install {USER_FLAG} --upgrade tqdm
!pip -q install {USER_FLAG} --upgrade cloudml-hypertune
!pip -q install {USER_FLAG} --upgrade google-cloud-aiplatform

### 1.1 Setup Google Authentication for Google Colab Notebook

***
<font color="red">(<b>!!!</b> Skip section 1.1 if you are running this notebook in a GCP Notebook.)</font>
***


In [15]:
# Alternative when not running in Colab: set the GOOGLE_APPLICATION_CREDENTIALS
# environment variable to the path of a service-account key file instead.
RUNNING_IN_COLAB = "google.colab" in sys.modules

if RUNNING_IN_COLAB:
    # Imported lazily: this import only runs once `google.colab` is already loaded.
    from google.colab import auth

    auth.authenticate_user()

### 1.2 Setup GCP Storage

In [16]:
# create bucket to save model
BUCKET_NAME = "gs://bert-sentiment-analysis" 
REGION = "us-central1" 

! gsutil mb -p $PROJECT_ID -l $REGION $BUCKET_NAME
! gsutil ls -al $BUCKET_NAME

Creating gs://bert-sentiment-analysis/...
ServiceException: 409 A Cloud Storage bucket named 'bert-sentiment-analysis' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


## 2. Prepare Python Package Distribution

In [22]:
# Directory from which the source distribution and the Docker image are built.
PYTHON_PACKAGE_CODE_DIR = "finetune_bert_sentiment_analysis"

# The sdist archive: where it is produced locally and where it is copied to in the bucket.
_TRAINER_ARCHIVE = "trainer-0.1.tar.gz"
PATH_PYTHON_DISTRIBUATION_LOCAL = "/".join([PYTHON_PACKAGE_CODE_DIR, "dist", _TRAINER_ARCHIVE])
PATH_PYTHON_DISTRIBUATION_GCS = "/".join([BUCKET_NAME, "code", _TRAINER_ARCHIVE])

# Python module name of the trainer (not referenced by the other visible cells).
MODULE_NAME = "trainer.task"
# Name of the custom training image in the project's gcr.io registry.
CUSTOM_TRAIN_IMAGE_URI = "gcr.io/{}/{}_train_k80".format(PROJECT_ID, APP_NAME)

In [18]:
# Build a gzipped source distribution of the training code; setup.py is run
# from inside the package directory, so the archive lands in its dist/ folder.
!cd {PYTHON_PACKAGE_CODE_DIR} && python3 setup.py sdist --formats=gztar
# Copy the freshly built archive from the local dist/ folder to the GCS bucket.
!gsutil cp {PATH_PYTHON_DISTRIBUATION_LOCAL} {PATH_PYTHON_DISTRIBUATION_GCS}


running sdist
running egg_info
creating trainer.egg-info
writing trainer.egg-info/PKG-INFO
writing dependency_links to trainer.egg-info/dependency_links.txt
writing requirements to trainer.egg-info/requires.txt
writing top-level names to trainer.egg-info/top_level.txt
writing manifest file 'trainer.egg-info/SOURCES.txt'
reading manifest file 'trainer.egg-info/SOURCES.txt'
writing manifest file 'trainer.egg-info/SOURCES.txt'

running check


creating trainer-0.1
creating trainer-0.1/trainer.egg-info
copying files to trainer-0.1...
copying setup.py -> trainer-0.1
copying trainer.egg-info/PKG-INFO -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/SOURCES.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/dependency_links.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/requires.txt -> trainer-0.1/trainer.egg-info
copying trainer.egg-info/top_level.txt -> trainer-0.1/trainer.egg-info
Writing trainer-0.1/setup.cfg
creating dist
Creating tar archive
removing 'tra

## 3. Build Custom Docker Image and Push to GCP

In [29]:

!cd {PYTHON_PACKAGE_CODE_DIR}/ && docker build -f Dockerfile -t $CUSTOM_TRAIN_IMAGE_URI {PYTHON_PACKAGE_CODE_DIR}

/bin/bash: docker: command not found


In [None]:
!docker push $CUSTOM_TRAIN_IMAGE_URI

## 4. Start Training Job

In [None]:
# `aiplatform` is not imported by any earlier cell; import it here (after the
# install cell has run) so this cell works on a fresh kernel.
from google.cloud import aiplatform

# A timestamp suffix keeps the display name unique across submissions.
JOB_NAME = f"job-{APP_NAME}-{get_timestamp()}"

print(f"APP_NAME={APP_NAME}")
print(f"CUSTOM_TRAIN_IMAGE_URI={CUSTOM_TRAIN_IMAGE_URI}")
print(f"JOB_NAME={JOB_NAME}")

# Run in the same region the staging bucket was created in.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_NAME)

# Configure the job with the container image spec.
job = aiplatform.CustomContainerTrainingJob(
    display_name=JOB_NAME,
    container_uri=CUSTOM_TRAIN_IMAGE_URI,
)

# Command-line arguments handed to the training code inside the container.
training_args = ["--num-epochs", "2", "--model-name", "finetuned-bert-classifier"]

# Submit the custom job to the Vertex training service. sync=False returns
# without waiting for the job to finish.
model = job.run(
    replica_count=1,
    machine_type="n1-standard-4",
    accelerator_type="NVIDIA_TESLA_K80",
    accelerator_count=1,
    args=training_args,
    sync=False,
)


## 5. Training Job Monitoring

In the Google Cloud Console, go to **Vertex AI → Training** to monitor the submitted job.