# VegaFusion

## Environment Setup

### Import Libraries

In [1]:
# Standard Libraries
import pathlib

# External Libraries
import altair as alt
import vegafusion as vf
import delta_sharing
import requests

In [2]:
# Remote location of the profile file for the example Delta Sharing server
url = 'https://databricks-datasets-oregon.s3-us-west-2.amazonaws.com/delta-sharing/share/open-datasets.share'

# Absolute, POSIX-style local path where a copy of that profile is kept
path = (
  pathlib.Path('./profile.share')
  .resolve()
  .as_posix()
)

### Read Delta Table

In [3]:
# Download the sharing profile. A timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() stops the notebook on an
# HTTP error instead of letting an error page be saved as the profile.
response = requests.get(url, timeout=30)
response.raise_for_status()
profile = response.text

# NOTE: the printed profile includes the server's bearer token; clear this
# output before sharing a notebook that points at a non-public profile.
print(profile)

{
  "shareCredentialsVersion": 1,
  "endpoint": "https://sharing.delta.io/delta-sharing/",
  "bearerToken": "faaie590d541265bcab1f2de9813274bf233"
}


In [4]:
# Save the downloaded profile to `path`. The encoding is given explicitly so
# the bytes on disk do not depend on the platform's locale default.
with open(path, 'w', encoding='utf-8') as file:
  file.write(profile)

In [5]:
# Create a sharing client from the profile file saved at `path`
client = delta_sharing.SharingClient(path)
# Ask the client for every table it can list
tables = client.list_all_tables()
# Display the listing as the cell's output
tables

[Table(name='COVID_19_NYT', share='delta_sharing', schema='default'),
 Table(name='boston-housing', share='delta_sharing', schema='default'),
 Table(name='flight-asa_2008', share='delta_sharing', schema='default'),
 Table(name='lending_club', share='delta_sharing', schema='default'),
 Table(name='nyctaxi_2019', share='delta_sharing', schema='default'),
 Table(name='nyctaxi_2019_part', share='delta_sharing', schema='default'),
 Table(name='owid-covid-data', share='delta_sharing', schema='default')]

In [6]:
# Select the table by name instead of by its position in the listing, so the
# cell keeps loading the same data if the server adds, removes, or reorders
# tables. A missing table raises a KeyError that names it.
tables_by_name = {t.name: t for t in tables}
table = tables_by_name['nyctaxi_2019_part']  # NYC Taxi Dataset

# The table is addressed as <profile path>#<share>.<schema>.<table>;
# `limit` caps the number of rows requested.
pdf = delta_sharing.load_as_pandas(
  url=f'{path}#{table.share}.{table.schema}.{table.name}',
  limit=1_000_000,
)

pdf

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code_id,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,total_amount,yyyyMM
0,1,2019-10-01 00:19:55,2019-10-01 00:23:57,1,0.40,1.0,,48,163,2.0,4.5,3,0.5,0.00,0.0,0.3,8.30,2.5,2019-10
1,1,2019-10-01 00:06:52,2019-10-01 00:21:23,1,5.00,1.0,,137,80,1.0,17.0,3,0.5,5.20,0.0,0.3,26.00,2.5,2019-10
2,2,2019-10-01 00:36:08,2019-10-01 00:36:15,1,0.00,1.0,,25,25,4.0,-2.5,-0.5,-0.5,0.00,0.0,-0.3,-3.80,0.0,2019-10
3,2,2019-10-01 00:36:08,2019-10-01 00:36:15,1,0.00,1.0,,25,25,2.0,2.5,0.5,0.5,0.00,0.0,0.3,3.80,0.0,2019-10
4,2,2019-10-01 00:20:15,2019-10-01 00:20:29,1,0.00,1.0,,193,193,1.0,2.5,0.5,0.5,0.00,0.0,0.3,3.80,0.0,2019-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2,2019-10-07 20:47:56,2019-10-07 21:00:32,1,1.74,1.0,,107,233,1.0,9.5,0.5,0.5,2.66,0.0,0.3,15.96,2.5,2019-10
999996,2,2019-10-07 20:05:14,2019-10-07 20:12:43,1,1.41,1.0,,262,237,1.0,7.0,0.5,0.5,1.30,0.0,0.3,12.10,2.5,2019-10
999997,2,2019-10-07 20:13:55,2019-10-07 20:22:57,1,1.29,1.0,,237,263,1.0,8.0,0.5,0.5,2.36,0.0,0.3,14.16,2.5,2019-10
999998,2,2019-10-07 20:47:30,2019-10-07 20:55:58,1,1.85,1.0,,140,233,1.0,9.0,0.5,0.5,2.56,0.0,0.3,15.36,2.5,2019-10


## Plot Dataset

By default, Vega-Altair restricts datasets to 5,000 rows. It can handle larger datasets of up to ~50K rows without trouble, but to plot our 1 million rows we'll need VegaFusion. VegaFusion provides server-side scaling for Vega-Altair visualizations by offloading expensive computations to Rust, which can handle the added load.

In [11]:
# Enable VegaFusion for Vega-Altair and lift the row limit to 5 million,
# so the Rust-backed server-side scaling can take on the full dataset
vf.enable(row_limit=5_000_000)

vegafusion.enable(mimetype='html', row_limit=5000000, embed_options=None)

In [8]:
# Numeric trip columns handed to the chart
columns = [
  'trip_distance',
  'fare_amount',
  'tip_amount',
  'tolls_amount',
  'total_amount',
]

# Bar chart of record counts per trip-distance bin (at most 20 bins)
chart = (
  alt.Chart(pdf[columns])
  .mark_bar()
  .encode(
    x=alt.X('trip_distance', type='quantitative', bin=alt.Bin(maxbins=20)),
    y='count()',
  )
  .properties(width=600, height=200)
)

chart