# Welcome to the Labelbox <> Databricks Connector

This notebook guides you through setting up the Labelbox <> Databricks connector. Once set up, it will ingest data from Databricks into Labelbox.

## Step 0: Installs

In [2]:
!pip install ipywidgets --quiet
import ipywidgets as widgets
from IPython.display import display, clear_output


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


<b>

## Step 1: User inputs

#### Data is ingested from a Databricks table: enter your Databricks info

In [None]:
# @title
# Databricks instance URL.
# The template value has no leading whitespace: this text is interpolated
# into the REST endpoint URL later in the notebook, where a stray space
# would produce an invalid host.
databricks_instance_label = widgets.Label('Enter your Databricks cloud instance URL:')
databricks_instance = widgets.Text(value='<workspace_id>.<cloud>.databricks.com')
display(databricks_instance_label, databricks_instance)

# Personal access token for Databricks authentication (masked input).
databricks_api_key_label = widgets.Label('Enter your Databricks personal access token:')
databricks_api_key = widgets.Password(value='')
display(databricks_api_key_label, databricks_api_key)

# Path of the Databricks table to ingest.
table_path_label = widgets.Label('Enter the path to the Databricks table to be ingested:')
table_path = widgets.Text(value='<metastore>.<database>.<table>')
display(table_path_label, table_path)

#### Data is connected to a Labelbox dataset: enter your Labelbox info

In [None]:
# @title
# --- Labelbox credentials ---
# Prompt plus a masked input for the Labelbox API key; shown immediately.
labelbox_api_key_label = widgets.Label(
    value='Labelbox API Key. This can be generated from the Labelbox settings page:'
)
labelbox_api_key = widgets.Password(value='')
display(labelbox_api_key_label, labelbox_api_key)


# --- Target dataset ---
# Question and dropdown deciding between a new dataset and an existing one
# (defaults to "no", i.e. an existing dataset).
create_new_dataset_label = widgets.Label(
    value='Do you wish to create a new dataset in Labelbox for this data ingestion process?'
)
create_new_dataset = widgets.Dropdown(options=["yes", "no"], value="no")

# Prompt + text input for the name of a dataset to create, grouped in a box.
dataset_name_label = widgets.Label(
    value='Enter the name of the dataset to create in Labelbox for this data ingestion process:'
)
dataset_name = widgets.Text(value='')
dataset_name_box = widgets.VBox(children=[dataset_name_label, dataset_name])

# Prompt + text input for the ID of an existing dataset, grouped in a box.
dataset_id_label = widgets.Label(
    value='Enter the Labelbox dataset ID to use for this data ingestion process:'
)
dataset_id = widgets.Text(value='')
dataset_id_box = widgets.VBox(children=[dataset_id_label, dataset_id])

# Output area into which one of the two boxes above is rendered.
output_box = widgets.Output()

def on_change(change):
    """Render the dataset input matching the "create new dataset?" choice.

    `change` is a dict whose 'new' entry holds the dropdown's current value
    (as delivered by `observe`, or built by hand for the initial render).
    "yes" shows the dataset-name box; anything else shows the dataset-ID box.
    """
    with output_box:
        # Clear first, then render: with wait=True the previous widget is
        # replaced only once the new one is displayed, so there is no flicker
        # and no clear left pending after this function returns.
        output_box.clear_output(wait=True)
        if change['new'] == "yes":
            display(dataset_name_box)
        else:
            display(dataset_id_box)

# Lay out the question, its dropdown and the conditional output area.
display(create_new_dataset_label, create_new_dataset, output_box)

# Re-render the conditional area every time the dropdown's value changes.
create_new_dataset.observe(on_change, names='value')

# Render once by hand so the box for the default selection is visible
# before the user touches the dropdown.
initial_change = {'new': create_new_dataset.value}
on_change(initial_change)

#### Set up the parameters of the data ingestion connector

In [None]:
# @title
# Schedule type input
schedule_type_label = widgets.Label('Do you wish to trigger the data ingestion: manually from Labelbox and/or Databricks or, on a set interval (eg weekly)?')
schedule_type = widgets.Dropdown(options=["manual", "interval"], value="interval")

# Frequency input
frequency_label = widgets.Label('If you have selected "interval", enter the interval to use to trigger the data ingestion pipeline:')
frequency = widgets.Dropdown(options=["daily", "weekly"], value="weekly", description='Frequency:')

# Start and end date inputs
start_ingestion = widgets.DatePicker(value=None, description='Start Date:')
end_ingestion = widgets.DatePicker(value=None, description='End Date:')

# Box for interval details, and an empty box shown for the "manual" choice
interval_box = widgets.VBox([frequency_label, frequency, start_ingestion, end_ingestion])
empty_box = widgets.VBox([])

# Output area for the conditional interval widgets. It has its own name so
# that running this cell does not rebind `output_box`: the dataset cell's
# `on_change` callback looks that global up at call time and would otherwise
# start rendering the dataset inputs into this schedule area.
schedule_output_box = widgets.Output()

def on_schedule_type_change(change):
    """Show the interval settings for "interval", otherwise an empty box.

    `change` is a dict whose 'new' entry holds the dropdown's current value
    (as delivered by `observe`, or built by hand for the initial render).
    """
    with schedule_output_box:
        # A single deferred clear is enough: the display call below replaces
        # whatever was rendered previously.
        schedule_output_box.clear_output(wait=True)
        if change['new'] == "interval":
            display(interval_box)
        else:
            display(empty_box)

# Attach the function as observer to the dropdown's value
schedule_type.observe(on_schedule_type_change, names='value')

display(schedule_type_label, schedule_type, schedule_output_box)

# Trigger the observer to display the correct box initially
on_schedule_type_change({'new': schedule_type.value})


## Step 2: Dry run of the data ingestion pipeline

TODO

Check that it worked: links links links

## Step 3: Set up the data ingestion pipeline

In [None]:
# User name for the `run_as` entry of the job payload built in the next cell.
# NOTE(review): presumably the Databricks login e-mail of the job owner —
# confirm, and set it here before creating the job.
email = None

In [None]:
import requests
import json

# ----- USER INPUTS -----

def _widget_value(widget_or_value):
    """Return `.value` of an ipywidgets input, or the argument itself if it has no `.value`."""
    return getattr(widget_or_value, "value", widget_or_value)

# The input cells bind these names to widget objects, not to plain values.
# Resolve them once here: widget objects are not JSON serializable and would
# be interpolated as their repr in the URL and the Authorization header.
databricks_host = str(_widget_value(databricks_instance)).strip()
# Tolerate a pasted full URL: keep only the host part.
if databricks_host.startswith("https://"):
    databricks_host = databricks_host[len("https://"):]
databricks_host = databricks_host.rstrip("/")

databricks_token = str(_widget_value(databricks_api_key)).strip()

if not databricks_host or not databricks_token:
    raise ValueError(
        "Enter the Databricks instance URL and personal access token in Step 1 "
        "before running this cell."
    )

selected_schedule_type = _widget_value(schedule_type)
selected_frequency = _widget_value(frequency)

# ----- JOB SCHEDULING LOGIC -----

# Quartz cron expressions (sec min hour day-of-month month day-of-week) for
# the choices offered by the "Frequency" dropdown. They are evaluated in the
# "timezone_id" set in the schedule block below.
CRON_BY_FREQUENCY = {
    "daily": "0 0 0 * * ?",      # every day at 00:00
    "weekly": "0 0 0 ? * MON",   # every Monday at 00:00
}

# "manual" schedule type: create the job without any trigger.
# If the job needs to run continuously, use the "continuous" block.
# Else, use the "schedule" block with the cron expression for the frequency;
# a value that is not a known frequency is passed through unchanged so an
# explicit cron expression keeps working.
# NOTE(review): the Start Date / End Date pickers are not applied here.
if selected_schedule_type == "manual":
    schedule_block = {}
elif selected_frequency == "continuous":
    schedule_block = {
        "continuous": {
            "pause_status": "UNPAUSED"
        }
    }
else:
    schedule_block = {
        "schedule": {
            "quartz_cron_expression": CRON_BY_FREQUENCY.get(selected_frequency, selected_frequency),
            "timezone_id": "UTC",
            "pause_status": "UNPAUSED"
        }
    }

# ----- JOB DEFINITION -----

# Define the parameters and structure of the job to be created in Databricks
payload = {
    "name": "upload_to_labelbox",
    "email_notifications": {"no_alert_for_skipped_runs": False},
    "webhook_notifications": {},
    "timeout_seconds": 0,
    "max_concurrent_runs": 1,
    "tasks": [
        {
            "task_key": "upload_to_labelbox",
            "run_if": "ALL_SUCCESS",
            "notebook_task": {
                "notebook_path": "notebooks/databricks_pipeline_creator/upload_to_labelbox",
                # NOTE(review): when "create a new dataset" is selected, the
                # entered dataset name is collected but not used by this cell.
                # NOTE(review): the Labelbox API key is sent as a plain job
                # parameter; consider whether a secret store should be used.
                "base_parameters": {
                    "dataset_id": _widget_value(dataset_id),
                    "table_path": _widget_value(table_path),
                    "labelbox_api_key": _widget_value(labelbox_api_key),
                },
                "source": "GIT"
            },
            "job_cluster_key": "Job_cluster",
            "libraries": [
                {"pypi": {"package": "labelspark"}},
                {"pypi": {"package": "labelbox==3.49.1"}},
                {"pypi": {"package": "numpy==1.25"}},
                {"pypi": {"package": "opencv-python==4.8.0.74"}}
            ],
            "timeout_seconds": 0,
            "email_notifications": {},
            "notification_settings": {
                "no_alert_for_skipped_runs": False,
                "no_alert_for_canceled_runs": False,
                "alert_on_last_attempt": False
            }
        }
    ],
    "job_clusters": [
        {
            "job_cluster_key": "Job_cluster",
            "new_cluster": {
                "cluster_name": "",
                "spark_version": "13.3.x-scala2.12",
                "gcp_attributes": {
                    "use_preemptible_executors": False,
                    "availability": "ON_DEMAND_GCP",
                    "zone_id": "HA"
                },
                "node_type_id": "n2-highmem-4",
                "enable_elastic_disk": True,
                "data_security_mode": "SINGLE_USER",
                "runtime_engine": "STANDARD",
                "autoscale": {
                    "min_workers": 1,
                    "max_workers": 10
                }
            }
        }
    ],
    "git_source": {
        "git_url": "https://github.com/Labelbox/labelspark.git",
        "git_provider": "gitHub",
        "git_branch": "master"
    },
    "format": "MULTI_TASK"
}

# Only send "run_as" when a user name has been configured, so the payload
# never carries a null user name.
if email:
    payload["run_as"] = {"user_name": email}

# Merge the scheduling configuration into the main job payload
payload.update(schedule_block)

# ----- JOB CREATION -----

# Formulate the endpoint URL for the Databricks REST API job creation
url = f"https://{databricks_host}/api/2.0/jobs/create"
# Define the authentication headers
headers = {
    "Authorization": f"Bearer {databricks_token}",
    "Content-Type": "application/json",
}

# Send the POST request to Databricks to create the job.
# A timeout keeps the cell from hanging forever on an unreachable host.
response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=30)

# ----- RESPONSE HANDLING -----

# Print the response
# If the response code is 200, it means the job was created successfully.
# Otherwise, print the error message received.
if response.status_code == 200:
    print("Job created successfully.")
else:
    print(f"Failed to create job. Error: {response.text}")

Check that it worked: links links links

## Step 4: Monitoring and troubleshooting the data ingestion pipeline

We print all the links for users

In [None]:
# Placeholder link lines ('xxxx' stands in for the real URLs); emitted one
# per line in the order below.
placeholder_link_lines = (
    f"Link to Databricks instance: {'xxxx'}",
    f"Link to Databricks table: {'xxxx'}",
    f"Link to view ingestion job: {'xxxx'}",
    f"Link to failed ingestion job run: {'xxxx'}",
)
for link_line in placeholder_link_lines:
    print(link_line)