# 07-02 : Trino To Dataset

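Test the Trino and MLRun integration: render an SQL query from a query file, run it against Trino to ingest the result as an MLRun dataset, and generate TFDV statistics for the ingested data.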

In [1]:
import os

import mlrun

## 1. Configuration

In [2]:
# --- MLRun project and runtime image ---
project_name = "trino-demo"
mlrun_image = "registry-service.mlrun.svc.cluster.local/mlrun/mlrun:1.9.1"

# --- Location of the function sources (relative to this notebook) ---
function_path = "../../functions/development"

# --- Trino source: catalog / schema and the SQL file used for ingestion ---
catalog = "iceberg"
schema = "lakehouse"
query_file = "queries/ingest.sql"

# --- TFDV schema file logged as the "ingest-schema" artifact ---
ingest_schema_file = "assets/tfdv_schema_training.pbtxt"

## 2. Setup Project

### 2.1 Load The Project

In [3]:
# Fetch the project by name, creating it if it does not exist yet.
# user_project=False is passed through unchanged.
project = mlrun.get_or_create_project(name=project_name, user_project=False)

# From here on, use the name reported by the loaded project's metadata.
project_name = project.metadata.name
print(f"Full project name: {project_name}")

> 2025-08-22 16:38:31,372 [info] Loading project from path: {"path":"./","project_name":"trino-demo","user_project":false}
> 2025-08-22 16:38:31,390 [info] Project loaded successfully: {"path":"./","project_name":"trino-demo","stored_in_db":true}
Full project name: trino-demo


### 2.2 Project Secrets

In [4]:
# Store the Trino connection settings as project secrets.
# Each value can be overridden with an environment variable of the same name;
# the fallbacks are the values that were previously hard-coded in this cell.
# NOTE(review): presumably read by the query_to_dataset job - confirm.
project.set_secrets({
    "TRINO_HOST": os.environ.get("TRINO_HOST", "dragon.lan"),
    "TRINO_PORT": os.environ.get("TRINO_PORT", "9191"),
    "TRINO_USER": os.environ.get("TRINO_USER", "johnny"),
})

### 2.3 Prepare Project 

In [5]:
# Read the query file. The encoding is explicit so that decoding does not
# depend on the locale of the machine running the notebook.
# NOTE(review): `query` is not used by the cells visible in this part of the
# notebook - confirm whether a later cell needs it.
with open(query_file, "r", encoding="utf-8") as f:
    query = f.read()

# Log the query file as a project artifact; its URI is the input of the
# generate_query run.
query_artifact = project.log_artifact(
    item="ingest-query",
    local_path=query_file,
    format="sql",
)

# Log the schema file as a project artifact; its URI is the `schema_file`
# input of the statistics run.
schema_artifact = project.log_artifact(
    item="ingest-schema",
    local_path=ingest_schema_file,
    format="pbtxt",
)

## 3. Generate Query

In [6]:
# Wrap the generate_query source file as an MLRun job function.
generate_query_dir = f"{function_path}/sql/generate_query"

generate_query = mlrun.code_to_function(
    name="generate_query",
    kind="job",
    handler="generate_query",
    filename=f"{generate_query_dir}/generate_query.py",
    requirements_file=f"{generate_query_dir}/requirements.txt",
    image=mlrun_image,
)

In [7]:
# Values passed to the job under the "replacements" parameter.
query_replacements = {
    "catalog": catalog,
    "schema": schema,
    "source_table": "taxi_trips",
    "filter_column": "trip_start_timestamp",
    "filter_start_value": "2023-06-01",
    "target_column": "total_fare",
}

# Run the job remotely (local=False); auto_build=True lets MLRun build the
# function image when it is not deployed yet.
generate_query_run = generate_query.run(
    inputs={"input_file": query_artifact.uri},
    params={"replacements": query_replacements},
    local=False,
    auto_build=True,
)

# The generated SQL text is published under the "sql" output of the run.
ingest_query = generate_query_run.outputs["sql"]

> 2025-08-22 16:38:31,556 [error] error getting build status: details: MLRunNotFoundError('Function tag not found trino-demo/generate-query'), caused by: 404 Client Error: Not Found for url: http://dragon.local:30070/api/v1/build/status?name=generate-query&project=trino-demo&tag=&logs=no&offset=0&events_offset=0&last_log_timestamp=0.0&verbose=no
> 2025-08-22 16:38:31,556 [info] Function is not deployed and auto_build flag is set, starting deploy...
> 2025-08-22 16:38:31,562 [error] error getting build status: details: MLRunNotFoundError('Function tag not found trino-demo/generate-query'), caused by: 404 Client Error: Not Found for url: http://dragon.local:30070/api/v1/build/status?name=generate-query&project=trino-demo&tag=&logs=no&offset=0&events_offset=0&last_log_timestamp=0.0&verbose=no
> 2025-08-22 16:38:31,632 [info] Started building image: .mlrun/func-trino-demo-generate-query:latest
> 2025-08-22 16:39:26,109 [info] Storing function: {"db":"http://dragon.local:30070","name":"gene

project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results
trino-demo,...141906,0,Aug 22 14:39:28,2025-08-22 14:39:28.452644+00:00,completed,run,generate-query-generate-query,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.10.18host=generate-query-generate-query-wnk9x,input_file,"replacements={'catalog': 'iceberg', 'schema': 'lakehouse', 'source_table': 'taxi_trips', 'filter_column': 'trip_start_timestamp', 'filter_start_value': '2023-06-01', 'target_column': 'total_fare'}","sql=WITH filtered_data AS (\n SELECT\n *\n FROM\n ""iceberg"".""lakehouse"".""taxi_trips""\n WHERE\n -- Trino uses CAST(... AS DATE) and the standard subtraction operator for intervals\n CAST(trip_start_timestamp AS DATE) BETWEEN\n (CAST('2023-06-01' AS DATE) - INTERVAL '3' MONTH) AND\n (CAST('2023-06-01' AS DATE) - INTERVAL '2' MONTH)\n)\n\n, mean_time AS (\n SELECT\n -- Trino's equivalent of INT64 is BIGINT\n CAST(AVG(trip_seconds) AS BIGINT) AS avg_trip_seconds\n FROM\n filtered_data\n)\n\nSELECT\n -- Function names for date/time extraction are slightly different\n CAST(day_of_week(trip_start_timestamp) AS DOUBLE) AS dayofweek, -- Note: Sunday=7 in Trino\n CAST(hour(trip_start_timestamp) AS DOUBLE) AS hourofday,\n \n -- Geospatial functions use a different syntax\n ST_Distance(\n to_spherical_geography(ST_Point(pickup_longitude, pickup_latitude)),\n to_spherical_geography(ST_Point(dropoff_longitude, dropoff_latitude))\n ) AS trip_distance,\n \n trip_miles,\n \n -- Trino's equivalent of FLOAT64 is DOUBLE\n CAST(\n CASE\n WHEN trip_seconds IS NULL THEN m.avg_trip_seconds\n WHEN trip_seconds <= 0 THEN m.avg_trip_seconds\n ELSE trip_seconds\n END AS DOUBLE\n ) AS trip_seconds,\n \n payment_type,\n company,\n \n -- Use double quotes for identifiers if needed, not backticks\n (fare + tips + tolls + extras) AS ""total_fare""\nFROM\n filtered_data AS t\n-- Explicit CROSS JOIN is clearer than a comma\nCROSS JOIN mean_time AS m\nWHERE\n trip_miles > 0 AND fare > 0 AND fare < 1500\n -- The Jinja templating part 
does not need to change at all\n \n AND ""fare"" IS NOT NULL\n \n AND ""trip_start_timestamp"" IS NOT NULL\n \n AND ""pickup_longitude"" IS NOT NULL\n \n AND ""pickup_latitude"" IS NOT NULL\n \n AND ""dropoff_longitude"" IS NOT NULL\n \n AND ""dropoff_latitude"" IS NOT NULL\n \n AND ""payment_type"" IS NOT NULL\n \n AND ""company"" IS NOT NULL\n"





> 2025-08-22 16:39:33,385 [info] Run execution finished: {"name":"generate-query-generate-query","status":"completed"}


## 4. Ingest Data

In [8]:
# Wrap the query_to_dataset source file as an MLRun job function.
query_to_dataset_dir = f"{function_path}/trino/query_to_dataset"

query_to_dataset = mlrun.code_to_function(
    name="query_to_dataset",
    kind="job",
    handler="query_to_dataset",
    filename=f"{query_to_dataset_dir}/query_to_dataset.py",
    requirements_file=f"{query_to_dataset_dir}/requirements.txt",
    image=mlrun_image,
)

In [9]:
# The same name is passed as the `dataset_name` parameter and used as the key
# when looking up the run output.
train_dataset_name = "train-source"

# Execute the generated query as a remote job named "ingest".
ingest_run = query_to_dataset.run(
    name="ingest",
    params={
        "query": ingest_query,
        "schema": schema,
        "catalog": catalog,
        "dataset_name": train_dataset_name,
    },
    local=False,
    auto_build=True,
)

dataset_uri = ingest_run.outputs[train_dataset_name]

> 2025-08-22 16:39:33,411 [error] error getting build status: details: MLRunNotFoundError('Function tag not found trino-demo/query-to-dataset'), caused by: 404 Client Error: Not Found for url: http://dragon.local:30070/api/v1/build/status?name=query-to-dataset&project=trino-demo&tag=&logs=no&offset=0&events_offset=0&last_log_timestamp=0.0&verbose=no
> 2025-08-22 16:39:33,411 [info] Function is not deployed and auto_build flag is set, starting deploy...
> 2025-08-22 16:39:33,415 [error] error getting build status: details: MLRunNotFoundError('Function tag not found trino-demo/query-to-dataset'), caused by: 404 Client Error: Not Found for url: http://dragon.local:30070/api/v1/build/status?name=query-to-dataset&project=trino-demo&tag=&logs=no&offset=0&events_offset=0&last_log_timestamp=0.0&verbose=no
> 2025-08-22 16:39:33,499 [info] Started building image: .mlrun/func-trino-demo-query-to-dataset:latest
> 2025-08-22 16:40:38,098 [info] Storing function: {"db":"http://dragon.local:30070","n

project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results,artifacts
trino-demo,...68dd8d,0,Aug 22 14:40:40,2025-08-22 14:40:44.497991+00:00,completed,run,ingest,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.10.18host=ingest-8rkzk,,"query=WITH filtered_data AS (\n SELECT\n *\n FROM\n ""iceberg"".""lakehouse"".""taxi_trips""\n WHERE\n -- Trino uses CAST(... AS DATE) and the standard subtraction operator for intervals\n CAST(trip_start_timestamp AS DATE) BETWEEN\n (CAST('2023-06-01' AS DATE) - INTERVAL '3' MONTH) AND\n (CAST('2023-06-01' AS DATE) - INTERVAL '2' MONTH)\n)\n\n, mean_time AS (\n SELECT\n -- Trino's equivalent of INT64 is BIGINT\n CAST(AVG(trip_seconds) AS BIGINT) AS avg_trip_seconds\n FROM\n filtered_data\n)\n\nSELECT\n -- Function names for date/time extraction are slightly different\n CAST(day_of_week(trip_start_timestamp) AS DOUBLE) AS dayofweek, -- Note: Sunday=7 in Trino\n CAST(hour(trip_start_timestamp) AS DOUBLE) AS hourofday,\n \n -- Geospatial functions use a different syntax\n ST_Distance(\n to_spherical_geography(ST_Point(pickup_longitude, pickup_latitude)),\n to_spherical_geography(ST_Point(dropoff_longitude, dropoff_latitude))\n ) AS trip_distance,\n \n trip_miles,\n \n -- Trino's equivalent of FLOAT64 is DOUBLE\n CAST(\n CASE\n WHEN trip_seconds IS NULL THEN m.avg_trip_seconds\n WHEN trip_seconds <= 0 THEN m.avg_trip_seconds\n ELSE trip_seconds\n END AS DOUBLE\n ) AS trip_seconds,\n \n payment_type,\n company,\n \n -- Use double quotes for identifiers if needed, not backticks\n (fare + tips + tolls + extras) AS ""total_fare""\nFROM\n filtered_data AS t\n-- Explicit CROSS JOIN is clearer than a comma\nCROSS JOIN mean_time AS m\nWHERE\n trip_miles > 0 AND fare > 0 AND fare < 1500\n -- The Jinja templating part does not need to change at all\n \n AND ""fare"" IS NOT NULL\n \n AND ""trip_start_timestamp"" IS NOT NULL\n \n AND ""pickup_longitude"" IS NOT NULL\n \n AND ""pickup_latitude"" IS NOT NULL\n \n AND ""dropoff_longitude"" IS NOT NULL\n \n AND 
""dropoff_latitude"" IS NOT NULL\n \n AND ""payment_type"" IS NOT NULL\n \n AND ""company"" IS NOT NULL\n schema=lakehousecatalog=icebergdataset_name=train-source",rows=473285,train-source





> 2025-08-22 16:40:49,321 [info] Run execution finished: {"name":"ingest","status":"completed"}


### 4.1 Results

In [10]:
# Resolve the dataset URI to a data item and materialise it as a DataFrame.
train_item = mlrun.get_dataitem(dataset_uri)
df_train = train_item.as_df()

# Report the row count, then preview the first rows.
print(f"rows read: {len(df_train):,}")
display(df_train.head())

rows read: 473,285


Unnamed: 0,dayofweek,hourofday,trip_distance,trip_miles,trip_seconds,payment_type,company,total_fare
0,5.0,16.0,21343.087544,15.0,3360.0,Unknown,Taxi Affiliation Services,39.5
1,5.0,8.0,21812.983595,15.61,1249.0,Cash,Flash Cab,39.0
2,5.0,9.0,15437.22943,0.7,1440.0,Cash,Taxi Affiliation Services,32.0
3,5.0,16.0,11425.401388,7.1,1140.0,Unknown,Taxi Affiliation Services,19.5
4,5.0,10.0,15228.663561,12.23,1620.0,Prcard,Flash Cab,32.75


## 5. Generate Statistics

In [11]:
# Wrap the generate_statistics source file as an MLRun job function.
# The variable keeps the generic name `function` because the following cell
# calls `function.run(...)`.
generate_statistics_dir = f"{function_path}/tfdv/generate_statistics"

function = mlrun.code_to_function(
    name="generate_statistics",
    kind="job",
    handler="generate_statistics",
    filename=f"{generate_statistics_dir}/generate_statistics.py",
    requirements_file=f"{generate_statistics_dir}/requirements.txt",
    image=mlrun_image,
)

In [12]:
# Inputs of the statistics run: the ingested dataset and the logged schema.
statistics_inputs = {
    "dataset_uri": dataset_uri,
    "schema_file": schema_artifact.uri,
}

# Compute statistics for the ingested dataset.
# NOTE(review): this is the only run with local=True; the other steps run with
# local=False. auto_build=True presumably has no effect for a local run -
# confirm that running this step locally is intended.
result = function.run(
    name="ingest-statistics",
    inputs=statistics_inputs,
    params={"stats_options": {"infer_type_from_schema": True}},
    local=True,
    auto_build=True,
)

> 2025-08-22 16:40:49,617 [info] Storing function: {"db":"http://dragon.local:30070","name":"ingest-statistics","uid":"e3fd394adccf4ca29c91feba305a805f"}


2025-08-22 16:40:50.253186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-22 16:40:50.265817: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-22 16:40:50.265835: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-22 16:40:50.274912: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


> 2025-08-22 16:40:51,236 [info] downloading s3://mlrun/projects/trino-demo/artifacts/ingest/0/train-source.parquet to local temp file
> 2025-08-22 16:40:51,249 [info] downloading s3://mlrun/projects/trino-demo/artifacts/ingest-schema.pbtxt to local temp file
> 2025-08-22 16:40:51,252 [info] Loading Dataset
> 2025-08-22 16:40:51,293 [info] Computing statistics
> 2025-08-22 16:40:51,614 [info] Generating statistics HTML


project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results,artifact_uris
trino-demo,...5a805f,0,Aug 22 14:40:49,NaT,completed,run,ingest-statistics,v3io_user=johanneskind=localowner=johanneshost=dragon,dataset_urischema_file,stats_options={'infer_type_from_schema': True},"statistics=datasets {\n num_examples: 473285\n features {\n type: FLOAT\n num_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n mean: 3.8733194586771185\n std_dev: 1.8365823500379994\n min: 1.0\n median: 4.0\n max: 7.0\n histograms {\n buckets {\n low_value: 1.0\n high_value: 1.6\n sample_count: 61396.74222222216\n }\n buckets {\n low_value: 1.6\n high_value: 2.2\n sample_count: 64100.58836601286\n }\n buckets {\n low_value: 2.2\n high_value: 2.8\n sample_count: 282.9917647058822\n }\n buckets {\n low_value: 2.8\n high_value: 3.4\n sample_count: 79993.54056840701\n }\n buckets {\n low_value: 3.4\n high_value: 4.0\n sample_count: 84299.13707865178\n }\n buckets {\n low_value: 4.0\n high_value: 4.6\n sample_count: 283.931707317073\n }\n buckets {\n low_value: 4.6\n high_value: 5.2\n sample_count: 77418.42889874363\n }\n buckets {\n low_value: 5.2\n high_value: 5.8\n sample_count: 
283.081818181818\n }\n buckets {\n low_value: 5.8\n high_value: 6.3999999999999995\n sample_count: 62088.931488801194\n }\n buckets {\n low_value: 6.3999999999999995\n high_value: 7.0\n sample_count: 43137.626086956545\n }\n }\n histograms {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 61111.0\n }\n buckets {\n low_value: 1.0\n high_value: 2.0\n sample_count: 64292.0\n }\n buckets {\n low_value: 2.0\n high_value: 3.0\n sample_count: 40090.5\n }\n buckets {\n low_value: 3.0\n high_value: 3.0\n sample_count: 40090.5\n }\n buckets {\n low_value: 3.0\n high_value: 4.0\n sample_count: 42244.5\n }\n buckets {\n low_value: 4.0\n high_value: 4.0\n sample_count: 42244.5\n }\n buckets {\n low_value: 4.0\n high_value: 5.0\n sample_count: 77608.0\n }\n buckets {\n low_value: 5.0\n high_value: 6.0\n sample_count: 31139.0\n }\n buckets {\n low_value: 6.0\n high_value: 6.0\n sample_count: 31139.0\n }\n buckets {\n low_value: 6.0\n high_value: 7.0\n sample_count: 43326.0\n }\n type: QUANTILES\n }\n }\n path {\n step: ""dayofweek""\n }\n }\n features {\n type: FLOAT\n num_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n 
}\n tot_num_values: 473285\n }\n mean: 13.826514679315846\n std_dev: 5.08374514335329\n num_zeros: 7239\n median: 14.0\n max: 23.0\n histograms {\n buckets {\n high_value: 2.3\n sample_count: 14491.775\n }\n buckets {\n low_value: 2.3\n high_value: 4.6\n sample_count: 4690.705\n }\n buckets {\n low_value: 4.6\n high_value: 6.8999999999999995\n sample_count: 13383.805294117643\n }\n buckets {\n low_value: 6.8999999999999995\n high_value: 9.2\n sample_count: 66466.30756302536\n }\n buckets {\n low_value: 9.2\n high_value: 11.5\n sample_count: 55223.11026785724\n }\n buckets {\n low_value: 11.5\n high_value: 13.799999999999999\n sample_count: 61417.19990530283\n }\n buckets {\n low_value: 13.799999999999999\n high_value: 16.099999999999998\n sample_count: 96734.31994266971\n }\n buckets {\n low_value: 16.099999999999998\n high_value: 18.4\n sample_count: 68245.75480480457\n }\n buckets {\n low_value: 18.4\n high_value: 20.7\n sample_count: 52037.202222222186\n }\n buckets {\n low_value: 20.7\n high_value: 23.0\n sample_count: 40594.819999999934\n }\n }\n histograms {\n buckets {\n high_value: 7.0\n sample_count: 48097.0\n }\n buckets {\n low_value: 7.0\n high_value: 9.0\n sample_count: 50840.0\n }\n buckets {\n low_value: 9.0\n high_value: 11.0\n sample_count: 55080.0\n }\n buckets {\n low_value: 11.0\n high_value: 13.0\n sample_count: 61282.0\n }\n buckets {\n low_value: 13.0\n high_value: 14.0\n sample_count: 30847.0\n }\n buckets {\n low_value: 14.0\n high_value: 16.0\n sample_count: 66214.0\n }\n buckets {\n low_value: 16.0\n high_value: 17.0\n sample_count: 34945.0\n }\n buckets {\n low_value: 17.0\n high_value: 18.0\n sample_count: 33161.0\n }\n buckets {\n low_value: 18.0\n high_value: 20.0\n sample_count: 51892.0\n }\n buckets {\n low_value: 20.0\n high_value: 23.0\n sample_count: 40927.0\n }\n type: QUANTILES\n }\n }\n path {\n step: ""hourofday""\n }\n }\n features {\n type: FLOAT\n num_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n 
max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n mean: 8755.942752481174\n std_dev: 8782.683019274042\n num_zeros: 42911\n median: 4763.7936165410665\n max: 46829.875875031576\n histograms {\n buckets {\n high_value: 4682.987587503158\n sample_count: 235527.76245776968\n }\n buckets {\n low_value: 4682.987587503158\n high_value: 9365.975175006315\n sample_count: 66801.71560503563\n }\n buckets {\n low_value: 9365.975175006315\n high_value: 14048.962762509473\n sample_count: 42733.06918088119\n }\n buckets {\n low_value: 14048.962762509473\n high_value: 18731.95035001263\n sample_count: 37425.688886202086\n }\n buckets {\n low_value: 18731.95035001263\n high_value: 23414.937937515788\n sample_count: 30227.76722692914\n }\n buckets {\n low_value: 23414.937937515788\n high_value: 28097.925525018945\n sample_count: 58335.98696044794\n }\n buckets {\n low_value: 28097.925525018945\n high_value: 32780.9131125221\n sample_count: 841.7539263438031\n }\n buckets {\n low_value: 32780.9131125221\n high_value: 37463.90070002526\n sample_count: 999.0793536197137\n }\n buckets {\n low_value: 37463.90070002526\n high_value: 42146.88828752842\n sample_count: 
196.08820138541085\n }\n buckets {\n low_value: 42146.88828752842\n high_value: 46829.875875031576\n sample_count: 196.08820138541085\n }\n }\n histograms {\n buckets {\n high_value: 666.330132154062\n sample_count: 47390.0\n }\n buckets {\n low_value: 666.330132154062\n high_value: 1423.0238023277125\n sample_count: 47668.0\n }\n buckets {\n low_value: 1423.0238023277125\n high_value: 2330.1799183163716\n sample_count: 46852.0\n }\n buckets {\n low_value: 2330.1799183163716\n high_value: 3000.2416650061255\n sample_count: 47437.0\n }\n buckets {\n low_value: 3000.2416650061255\n high_value: 4763.7936165410665\n sample_count: 47752.0\n }\n buckets {\n low_value: 4763.7936165410665\n high_value: 7701.916151507995\n sample_count: 49252.0\n }\n buckets {\n low_value: 7701.916151507995\n high_value: 12433.801512657337\n sample_count: 44929.0\n }\n buckets {\n low_value: 12433.801512657337\n high_value: 17783.845765676084\n sample_count: 47366.0\n }\n buckets {\n low_value: 17783.845765676084\n high_value: 24838.99889692287\n sample_count: 50579.0\n }\n buckets {\n low_value: 24838.99889692287\n high_value: 46829.875875031576\n sample_count: 44060.0\n }\n type: QUANTILES\n }\n }\n path {\n step: ""trip_distance""\n }\n }\n features {\n type: FLOAT\n num_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 
47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n mean: 6.386785636561481\n std_dev: 6.545140408650087\n min: 0.01\n median: 3.2\n max: 662.9\n histograms {\n buckets {\n low_value: 0.01\n high_value: 66.299\n sample_count: 472836.9833092541\n }\n buckets {\n low_value: 66.299\n high_value: 132.588\n sample_count: 49.77963230507882\n }\n buckets {\n low_value: 132.588\n high_value: 198.877\n sample_count: 49.77963230507884\n }\n buckets {\n low_value: 198.877\n high_value: 265.166\n sample_count: 49.77963230507882\n }\n buckets {\n low_value: 265.166\n high_value: 331.455\n sample_count: 49.77963230507882\n }\n buckets {\n low_value: 331.455\n high_value: 397.744\n sample_count: 49.779632305078856\n }\n buckets {\n low_value: 397.744\n high_value: 464.033\n sample_count: 49.77963230507882\n }\n buckets {\n low_value: 464.033\n high_value: 530.322\n sample_count: 49.77963230507882\n }\n buckets {\n low_value: 530.322\n high_value: 596.611\n sample_count: 49.77963230507882\n }\n buckets {\n low_value: 596.611\n high_value: 662.9\n sample_count: 49.77963230507882\n }\n }\n histograms {\n buckets {\n low_value: 0.01\n high_value: 0.65\n sample_count: 47544.0\n }\n buckets {\n low_value: 0.65\n high_value: 1.0\n sample_count: 48054.0\n }\n buckets {\n low_value: 1.0\n high_value: 1.43\n sample_count: 46699.0\n }\n buckets {\n low_value: 1.43\n high_value: 2.05\n sample_count: 47264.0\n }\n buckets {\n low_value: 2.05\n high_value: 3.2\n sample_count: 47187.0\n }\n buckets {\n low_value: 3.2\n high_value: 5.66\n sample_count: 47254.0\n }\n buckets {\n low_value: 5.66\n high_value: 9.68\n sample_count: 47337.0\n }\n buckets {\n low_value: 9.68\n high_value: 12.6\n sample_count: 47743.0\n }\n buckets {\n low_value: 12.6\n high_value: 17.12\n sample_count: 46966.0\n }\n buckets {\n low_value: 17.12\n high_value: 
662.9\n sample_count: 47237.0\n }\n type: QUANTILES\n }\n }\n path {\n step: ""trip_miles""\n }\n }\n features {\n type: FLOAT\n num_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n mean: 1199.387033182966\n std_dev: 1613.054956467258\n min: 1.0\n median: 903.0\n max: 82570.0\n histograms {\n buckets {\n low_value: 1.0\n high_value: 8257.9\n sample_count: 472725.0957662696\n }\n buckets {\n low_value: 8257.9\n high_value: 16514.8\n sample_count: 136.46772594395674\n }\n buckets {\n low_value: 16514.8\n high_value: 24771.699999999997\n sample_count: 52.929563473240535\n }\n buckets {\n low_value: 24771.699999999997\n high_value: 33028.6\n sample_count: 52.92956347324055\n }\n buckets {\n low_value: 33028.6\n high_value: 41285.5\n sample_count: 52.92956347324055\n }\n buckets {\n low_value: 41285.5\n high_value: 49542.399999999994\n sample_count: 52.929563473240506\n }\n buckets {\n low_value: 49542.399999999994\n high_value: 57799.299999999996\n sample_count: 52.92956347324055\n }\n buckets {\n low_value: 57799.299999999996\n high_value: 66056.2\n sample_count: 52.92956347324055\n }\n 
buckets {\n low_value: 66056.2\n high_value: 74313.09999999999\n sample_count: 52.929563473240506\n }\n buckets {\n low_value: 74313.09999999999\n high_value: 82570.0\n sample_count: 52.9295634732406\n }\n }\n histograms {\n buckets {\n low_value: 1.0\n high_value: 300.0\n sample_count: 48336.0\n }\n buckets {\n low_value: 300.0\n high_value: 433.0\n sample_count: 46374.0\n }\n buckets {\n low_value: 433.0\n high_value: 568.0\n sample_count: 47541.0\n }\n buckets {\n low_value: 568.0\n high_value: 720.0\n sample_count: 49131.0\n }\n buckets {\n low_value: 720.0\n high_value: 903.0\n sample_count: 45381.0\n }\n buckets {\n low_value: 903.0\n high_value: 1163.0\n sample_count: 47303.0\n }\n buckets {\n low_value: 1163.0\n high_value: 1452.0\n sample_count: 47340.0\n }\n buckets {\n low_value: 1452.0\n high_value: 1814.0\n sample_count: 47276.0\n }\n buckets {\n low_value: 1814.0\n high_value: 2400.0\n sample_count: 47934.0\n }\n buckets {\n low_value: 2400.0\n high_value: 82570.0\n sample_count: 46669.0\n }\n type: QUANTILES\n }\n }\n path {\n step: ""trip_seconds""\n }\n }\n features {\n type: STRING\n string_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n 
sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n unique: 7\n top_values {\n value: ""Credit Card""\n frequency: 177804.0\n }\n top_values {\n value: ""Cash""\n frequency: 125399.0\n }\n top_values {\n value: ""Mobile""\n frequency: 79449.0\n }\n top_values {\n value: ""Prcard""\n frequency: 63610.0\n }\n top_values {\n value: ""Unknown""\n frequency: 26742.0\n }\n top_values {\n value: ""No Charge""\n frequency: 169.0\n }\n top_values {\n value: ""Dispute""\n frequency: 112.0\n }\n avg_length: 7.406304836273193\n rank_histogram {\n buckets {\n label: ""Credit Card""\n sample_count: 177804.0\n }\n buckets {\n low_rank: 1\n high_rank: 1\n label: ""Cash""\n sample_count: 125399.0\n }\n buckets {\n low_rank: 2\n high_rank: 2\n label: ""Mobile""\n sample_count: 79449.0\n }\n buckets {\n low_rank: 3\n high_rank: 3\n label: ""Prcard""\n sample_count: 63610.0\n }\n buckets {\n low_rank: 4\n high_rank: 4\n label: ""Unknown""\n sample_count: 26742.0\n }\n buckets {\n low_rank: 5\n high_rank: 5\n label: ""No Charge""\n sample_count: 169.0\n }\n buckets {\n low_rank: 6\n high_rank: 6\n label: ""Dispute""\n sample_count: 112.0\n }\n }\n }\n path {\n step: ""payment_type""\n }\n }\n features {\n type: STRING\n string_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n 
buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n unique: 31\n top_values {\n value: ""Flash Cab""\n frequency: 87060.0\n }\n top_values {\n value: ""Taxi Affiliation Services""\n frequency: 80975.0\n }\n top_values {\n value: ""Sun Taxi""\n frequency: 57790.0\n }\n top_values {\n value: ""City Service""\n frequency: 53809.0\n }\n top_values {\n value: ""Taxicab Insurance Agency Llc""\n frequency: 47687.0\n }\n top_values {\n value: ""Chicago Independents""\n frequency: 29105.0\n }\n top_values {\n value: ""Globe Taxi""\n frequency: 18634.0\n }\n top_values {\n value: ""5 Star Taxi""\n frequency: 17866.0\n }\n top_values {\n value: ""Blue Ribbon Taxi Association""\n frequency: 16210.0\n }\n top_values {\n value: ""Medallion Leasin""\n frequency: 15879.0\n }\n top_values {\n value: ""Star North Taxi Management Llc""\n frequency: 12443.0\n }\n top_values {\n value: ""Taxicab Insurance Agency, LLC""\n frequency: 9380.0\n }\n top_values {\n value: ""Choice Taxi Association""\n frequency: 8597.0\n }\n top_values {\n value: ""Top Cab Affiliation""\n frequency: 5918.0\n }\n top_values {\n value: ""U Taxicab""\n frequency: 2928.0\n }\n top_values {\n value: ""Koam Taxi Association""\n frequency: 1577.0\n }\n top_values {\n value: ""Choice Taxi Association Inc""\n frequency: 1400.0\n }\n top_values {\n value: ""Chicago Taxicab""\n frequency: 1148.0\n }\n top_values {\n value: ""Patriot Taxi Dba Peace Taxi Associat""\n frequency: 1074.0\n }\n top_values {\n value: ""Top Cab""\n frequency: 887.0\n }\n avg_length: 17.130142211914062\n rank_histogram {\n buckets {\n label: ""Flash Cab""\n sample_count: 87060.0\n }\n buckets {\n low_rank: 1\n high_rank: 1\n label: ""Taxi Affiliation Services""\n sample_count: 80975.0\n }\n buckets {\n low_rank: 2\n high_rank: 2\n label: ""Sun Taxi""\n sample_count: 57790.0\n }\n buckets {\n 
low_rank: 3\n high_rank: 3\n label: ""City Service""\n sample_count: 53809.0\n }\n buckets {\n low_rank: 4\n high_rank: 4\n label: ""Taxicab Insurance Agency Llc""\n sample_count: 47687.0\n }\n buckets {\n low_rank: 5\n high_rank: 5\n label: ""Chicago Independents""\n sample_count: 29105.0\n }\n buckets {\n low_rank: 6\n high_rank: 6\n label: ""Globe Taxi""\n sample_count: 18634.0\n }\n buckets {\n low_rank: 7\n high_rank: 7\n label: ""5 Star Taxi""\n sample_count: 17866.0\n }\n buckets {\n low_rank: 8\n high_rank: 8\n label: ""Blue Ribbon Taxi Association""\n sample_count: 16210.0\n }\n buckets {\n low_rank: 9\n high_rank: 9\n label: ""Medallion Leasin""\n sample_count: 15879.0\n }\n buckets {\n low_rank: 10\n high_rank: 10\n label: ""Star North Taxi Management Llc""\n sample_count: 12443.0\n }\n buckets {\n low_rank: 11\n high_rank: 11\n label: ""Taxicab Insurance Agency, LLC""\n sample_count: 9380.0\n }\n buckets {\n low_rank: 12\n high_rank: 12\n label: ""Choice Taxi Association""\n sample_count: 8597.0\n }\n buckets {\n low_rank: 13\n high_rank: 13\n label: ""Top Cab Affiliation""\n sample_count: 5918.0\n }\n buckets {\n low_rank: 14\n high_rank: 14\n label: ""U Taxicab""\n sample_count: 2928.0\n }\n buckets {\n low_rank: 15\n high_rank: 15\n label: ""Koam Taxi Association""\n sample_count: 1577.0\n }\n buckets {\n low_rank: 16\n high_rank: 16\n label: ""Choice Taxi Association Inc""\n sample_count: 1400.0\n }\n buckets {\n low_rank: 17\n high_rank: 17\n label: ""Chicago Taxicab""\n sample_count: 1148.0\n }\n buckets {\n low_rank: 18\n high_rank: 18\n label: ""Patriot Taxi Dba Peace Taxi Associat""\n sample_count: 1074.0\n }\n buckets {\n low_rank: 19\n high_rank: 19\n label: ""Top Cab""\n sample_count: 887.0\n }\n buckets {\n low_rank: 20\n high_rank: 20\n label: ""Setare Inc""\n sample_count: 562.0\n }\n buckets {\n low_rank: 21\n high_rank: 21\n label: ""Taxi Affiliation Services Llc - Yell""\n sample_count: 545.0\n }\n buckets {\n low_rank: 22\n high_rank: 
22\n label: ""312 Medallion Management Corp""\n sample_count: 530.0\n }\n buckets {\n low_rank: 23\n high_rank: 23\n label: ""Metro Jet Taxi A.""\n sample_count: 312.0\n }\n buckets {\n low_rank: 24\n high_rank: 24\n label: ""3556 - 36214 RC Andrews Cab""\n sample_count: 197.0\n }\n buckets {\n low_rank: 25\n high_rank: 25\n label: ""6574 - Babylon Express Inc.""\n sample_count: 188.0\n }\n buckets {\n low_rank: 26\n high_rank: 26\n label: ""5167 - 71969 5167 Taxi Inc""\n sample_count: 168.0\n }\n buckets {\n low_rank: 27\n high_rank: 27\n label: ""Petani Cab Corp""\n sample_count: 130.0\n }\n buckets {\n low_rank: 28\n high_rank: 28\n label: ""2733 - 74600 Benny Jona""\n sample_count: 118.0\n }\n buckets {\n low_rank: 29\n high_rank: 29\n label: ""Leonard Cab Co""\n sample_count: 110.0\n }\n buckets {\n low_rank: 30\n high_rank: 30\n label: ""4053 - 40193 Adwar H. Nikola""\n sample_count: 58.0\n }\n }\n }\n path {\n step: ""company""\n }\n }\n features {\n type: FLOAT\n num_stats {\n common_stats {\n num_non_missing: 473285\n min_num_values: 1\n max_num_values: 1\n avg_num_values: 1.0\n num_values_histogram {\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n buckets {\n low_value: 1.0\n high_value: 1.0\n sample_count: 47328.5\n }\n type: QUANTILES\n }\n tot_num_values: 473285\n }\n mean: 24.2937007088752\n std_dev: 23.998327846575645\n min: 0.01\n 
median: 16.16\n max: 6716.41\n histograms {\n buckets {\n low_value: 0.01\n high_value: 671.65\n sample_count: 472853.77242079785\n }\n buckets {\n low_value: 671.65\n high_value: 1343.29\n sample_count: 47.91417546691575\n }\n buckets {\n low_value: 1343.29\n high_value: 2014.93\n sample_count: 47.91417546691575\n }\n buckets {\n low_value: 2014.93\n high_value: 2686.57\n sample_count: 47.91417546691575\n }\n buckets {\n low_value: 2686.57\n high_value: 3358.21\n sample_count: 47.914175466915744\n }\n buckets {\n low_value: 3358.21\n high_value: 4029.8500000000004\n sample_count: 47.91417546691577\n }\n buckets {\n low_value: 4029.8500000000004\n high_value: 4701.49\n sample_count: 47.91417546691571\n }\n buckets {\n low_value: 4701.49\n high_value: 5373.13\n sample_count: 47.91417546691577\n }\n buckets {\n low_value: 5373.13\n high_value: 6044.77\n sample_count: 47.91417546691577\n }\n buckets {\n low_value: 6044.77\n high_value: 6716.41\n sample_count: 47.91417546691571\n }\n }\n histograms {\n buckets {\n low_value: 0.01\n high_value: 7.0\n sample_count: 50442.0\n }\n buckets {\n low_value: 7.0\n high_value: 9.120000000000001\n sample_count: 44288.0\n }\n buckets {\n low_value: 9.120000000000001\n high_value: 11.15\n sample_count: 47417.0\n }\n buckets {\n low_value: 11.15\n high_value: 12.52\n sample_count: 47284.0\n }\n buckets {\n low_value: 12.52\n high_value: 16.16\n sample_count: 47215.0\n }\n buckets {\n low_value: 16.16\n high_value: 23.24\n sample_count: 47132.0\n }\n buckets {\n low_value: 23.24\n high_value: 30.0\n sample_count: 47606.0\n }\n buckets {\n low_value: 30.0\n high_value: 43.25\n sample_count: 47608.0\n }\n buckets {\n low_value: 43.25\n high_value: 54.5\n sample_count: 47203.0\n }\n buckets {\n low_value: 54.5\n high_value: 6716.41\n sample_count: 47090.0\n }\n type: QUANTILES\n }\n }\n path {\n step: ""total_fare""\n }\n 
}\n}\n",html=store://artifacts/trino-demo/ingest-statistics_html#0@e3fd394adccf4ca29c91feba305a805f^6136ca0ceb641b425f8cd043cb406cd7708f628a





> 2025-08-22 16:40:51,767 [info] Run execution finished: {"name":"ingest-statistics","status":"completed"}
