### 1/ Make sure you have activated the Python virtual environment that is used as the notebook kernel

In [1]:
# Print the path of the `python` executable that the shell resolves.
# It should point inside the project's virtual environment (the one used as the notebook kernel).
!which python

/Users/kenly/Documents/Work/ISS-IS02PT/PRS-PM-ISY5002-GROUP5/SystemCode/.venv/sandbox/bin/python


### 2/ Install the google-cloud-storage package

In [2]:
!pip install google-cloud-storage



### 3/ Assume we have a numpy object whose state we want to save. The steps are as follows:
#### * 3a: Serialize the object in the notebook to a binary file
#### * 3b: Upload that binary file to Google Cloud Storage (make sure you have the service account's private JSON key file ready, and never commit that JSON file to git --> I do this by putting the file in a '.env' folder, which is ignored by git)

In [3]:
import numpy as np
import pickle

# Two sample arrays that stand in for the notebook state we want to persist:
# the integers 0..9 and the even integers 0..8.
my_arr_1 = np.arange(0, 10, 1)
my_arr_2 = np.arange(0, 10, 2)

# Show both arrays, in the order they were created.
for sample_arr in (my_arr_1, my_arr_2):
    print(sample_arr)

[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]


In [4]:
# Collect everything to export in one dictionary (object name -> object).
# This gives a single place to see which objects are being exported.
export_dict = {
    "my_arr_1": my_arr_1,
    "my_arr_2": my_arr_2,
}

# Pickle every object into its own "<name>.pkl" file in the current working directory.
for obj_name, obj in export_dict.items():
    with open(obj_name + ".pkl", "wb") as pkl_file:
        pickle.dump(obj, pkl_file)

In [5]:
### Upload the serialized objects to Google Cloud Storage
from datetime import datetime
from google.cloud import storage

# Explicitly use service account credentials by specifying the private key file.
# Keep this JSON key file out of version control.
gcs_storage_client = storage.Client.from_service_account_json('.env/my-spark-iss-0cc3a9e9a54d.json')

# Bucket used for our project - it has already been created.
gcs_bucket_name = "my-spark-iss-us-central1"

# Each of us uploads under an individual folder (like a git branch).
gcs_folder_name = "ken.ldk"

# Take the timestamp once, before the loop, so that every file of this export
# shares the same version suffix even if the uploads straddle a second boundary.
export_timestamp = datetime.now().strftime("%d-%b-%Y_%H:%M:%S")

# Upload each "<name>.pkl" file written for the objects in export_dict.
bucket = gcs_storage_client.bucket(gcs_bucket_name)
for key in export_dict:
    source_file_name = key + ".pkl"
    # The destination blob name is suffixed with the export time to keep different versions apart.
    gcs_blob_name = gcs_folder_name + "/" + source_file_name + "." + export_timestamp
    blob = bucket.blob(gcs_blob_name)
    blob.upload_from_filename(source_file_name)
    print("File {} uploaded to {}".format(source_file_name, gcs_blob_name))

File my_arr_1.pkl uploaded to ken.ldk/my_arr_1.pkl.04-Sep-2020_10:11:06
File my_arr_2.pkl uploaded to ken.ldk/my_arr_2.pkl.04-Sep-2020_10:11:06


### 4/ Restore the state of the object for further processing in the notebook. The steps are as follows:
#### * 4a: Download the binary files from Google Cloud Storage
#### * 4b: Recreate each object in Notebook from the binary files

In [6]:
# Map each object name to the Cloud Storage blob that holds its pickled state.
# Keeping the mapping in one dictionary gives an overview of what is being imported.
# The blob names are the ones printed by the upload step -- share them with the
# other members so they can download the files on their own.
import_dict = {
    "my_arr_1": "ken.ldk/my_arr_1.pkl.04-Sep-2020_10:11:06",
    "my_arr_2": "ken.ldk/my_arr_2.pkl.04-Sep-2020_10:11:06",
}

In [7]:
from google.cloud import storage

# Explicitly use service account credentials by specifying the private key file.
gcs_storage_client = storage.Client.from_service_account_json('.env/my-spark-iss-0cc3a9e9a54d.json')

# Bucket used for our project - it has already been created.
gcs_bucket_name = "my-spark-iss-us-central1"

# Download every blob listed in import_dict from the bucket into the current
# folder, saving each one as "<name>.pkl".
bucket = gcs_storage_client.bucket(gcs_bucket_name)
for key, source_blob_name in import_dict.items():
    destination_file_name = "./" + key + ".pkl"
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)

In [8]:
# Load each downloaded "<name>.pkl" file back into a Python object.
# The results go into a separate dictionary so that import_dict keeps its blob
# names; overwriting them would leave the download cell without blob names to
# fetch if it were re-run afterwards.
# NOTE: pickle.load can execute arbitrary code - only unpickle files that come
# from a source you trust.
restored_objects = {}
for key in import_dict:
    with open(key + ".pkl", "rb") as import_pkl:
        restored_objects[key] = pickle.load(import_pkl)

# We are getting back the same objects
my_arr_1 = restored_objects["my_arr_1"]
my_arr_2 = restored_objects["my_arr_2"]

print(my_arr_1)
print(my_arr_2)

[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]
