# 07-02 : Trino To Dataset

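Tests the Trino and MLRun integration end to end: a templated SQL query is rendered by an MLRun job, executed against Trino by a second job, and the result is logged as an MLRun dataset that is loaded back at the end of the notebook.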

In [1]:
import mlrun

## 1. Configuration

In [2]:
# --- MLRun project and location of the function sources ---
project_name = "trino-demo"
function_path = "../../functions/development"

# --- SQL template and the Trino catalog/schema it is rendered for ---
query_file = "queries/ingest.sql"
catalog, schema = "iceberg", "lakehouse"

# --- Container image passed to every MLRun function in this notebook ---
mlrun_image = "registry-service.mlrun.svc.cluster.local/mlrun/mlrun:1.9.1"

## 2. Setup Project

### 2.1 Load The Project

In [3]:
# Load the configured project, creating it if it does not exist yet
project = mlrun.get_or_create_project(name=project_name, user_project=False)

# From here on, use the name reported by the loaded project itself
project_name = project.metadata.name
print(f"Full project name: {project_name}")

> 2025-08-22 00:22:48,413 [info] Project loaded successfully: {"project_name":"trino-demo"}
Full project name: trino-demo


### 2.2 Project Secrets

In [4]:
# Trino connection settings, stored on the project as secrets
trino_connection = {
    "TRINO_HOST": "dragon.lan",
    "TRINO_PORT": "9191",
    "TRINO_USER": "johnny",
}
project.set_secrets(trino_connection)

### 2.3 Prepare Project 

In [5]:
# Read the SQL template. The encoding is given explicitly so the text that is
# read does not depend on the platform's default locale encoding.
# NOTE(review): `query` is not referenced by any later cell visible in this
# notebook; it is kept in case other cells rely on it.
with open(query_file, "r", encoding="utf-8") as f:
    query = f.read()

# Register the template file itself as a project artifact; its URI is what
# later cells hand to jobs as an input.
query_artifact = project.log_artifact(
    item="ingest-query",
    local_path=query_file,
    format="sql",
)

## 3. Generate Query

In [6]:
# Wrap the generate_query source file as an MLRun job function
generate_query = mlrun.code_to_function(
    name="generate_query",
    kind="job",
    handler="generate_query",
    image=mlrun_image,
    filename=f"{function_path}/sql/generate_query/generate_query.py",
    requirements_file=f"{function_path}/sql/generate_query/requirements.txt",
)

In [7]:
# Values passed to the job as its "replacements" parameter
query_replacements = {
    "catalog": catalog,
    "schema": schema,
    "source_table": "taxi_trips",
    "filter_column": "trip_start_timestamp",
    "filter_start_value": "2023-06-01",
    "target_column": "total_fare",
}

# Run the job (not locally), feeding it the logged query artifact as input
generate_query_run = generate_query.run(
    inputs={"input_file": query_artifact.uri},
    params={"replacements": query_replacements},
    local=False,
    auto_build=True,
)

# Pick up the run's "sql" output for use as the ingest query
ingest_query = generate_query_run.outputs["sql"]

> 2025-08-22 00:22:48,610 [info] Storing function: {"db":"http://dragon.local:30070","name":"generate-query-generate-query","uid":"f911f59333794ab4b7d1fc294c1e6e46"}
> 2025-08-22 00:22:48,697 [info] Job is running in the background, pod: generate-query-generate-query-5s585
> 2025-08-21 22:22:50,597 [info] downloading s3://mlrun/projects/trino-demo/artifacts/ingest-query.sql to local temp file
> 2025-08-21 22:22:50,601 [info] Loading SQL template from artifact
> 2025-08-21 22:22:50,601 [info] Performing replacements in the SQL template
> 2025-08-21 22:22:50,631 [info] To track results use the CLI: {"info_cmd":"mlrun get run f911f59333794ab4b7d1fc294c1e6e46 -p trino-demo","logs_cmd":"mlrun logs f911f59333794ab4b7d1fc294c1e6e46 -p trino-demo"}
> 2025-08-21 22:22:50,631 [info] Run execution finished: {"name":"generate-query-generate-query","status":"completed"}


project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results
trino-demo,...1e6e46,0,Aug 21 22:22:50,2025-08-21 22:22:50.625020+00:00,completed,run,generate-query-generate-query,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.10.18host=generate-query-generate-query-5s585,input_file,"replacements={'catalog': 'iceberg', 'schema': 'lakehouse', 'source_table': 'taxi_trips', 'filter_column': 'trip_start_timestamp', 'filter_start_value': '2023-06-01', 'target_column': 'total_fare'}","sql=WITH filtered_data AS (\n SELECT\n *\n FROM\n ""iceberg"".""lakehouse"".""taxi_trips""\n WHERE\n -- Trino uses CAST(... AS DATE) and the standard subtraction operator for intervals\n CAST(trip_start_timestamp AS DATE) BETWEEN\n (CAST('2023-06-01' AS DATE) - INTERVAL '3' MONTH) AND\n (CAST('2023-06-01' AS DATE) - INTERVAL '2' MONTH)\n)\n\n, mean_time AS (\n SELECT\n -- Trino's equivalent of INT64 is BIGINT\n CAST(AVG(trip_seconds) AS BIGINT) AS avg_trip_seconds\n FROM\n filtered_data\n)\n\nSELECT\n -- Function names for date/time extraction are slightly different\n CAST(day_of_week(trip_start_timestamp) AS DOUBLE) AS dayofweek, -- Note: Sunday=7 in Trino\n CAST(hour(trip_start_timestamp) AS DOUBLE) AS hourofday,\n \n -- Geospatial functions use a different syntax\n ST_Distance(\n to_spherical_geography(ST_Point(pickup_longitude, pickup_latitude)),\n to_spherical_geography(ST_Point(dropoff_longitude, dropoff_latitude))\n ) AS trip_distance,\n \n trip_miles,\n \n -- Trino's equivalent of FLOAT64 is DOUBLE\n CAST(\n CASE\n WHEN trip_seconds IS NULL THEN m.avg_trip_seconds\n WHEN trip_seconds <= 0 THEN m.avg_trip_seconds\n ELSE trip_seconds\n END AS DOUBLE\n ) AS trip_seconds,\n \n payment_type,\n company,\n \n -- Use double quotes for identifiers if needed, not backticks\n (fare + tips + tolls + extras) AS ""total_fare""\nFROM\n filtered_data AS t\n-- Explicit CROSS JOIN is clearer than a comma\nCROSS JOIN mean_time AS m\nWHERE\n trip_miles > 0 AND fare > 0 AND fare < 1500\n -- The Jinja templating part 
does not need to change at all\n \n AND ""fare"" IS NOT NULL\n \n AND ""trip_start_timestamp"" IS NOT NULL\n \n AND ""pickup_longitude"" IS NOT NULL\n \n AND ""pickup_latitude"" IS NOT NULL\n \n AND ""dropoff_longitude"" IS NOT NULL\n \n AND ""dropoff_latitude"" IS NOT NULL\n \n AND ""payment_type"" IS NOT NULL\n \n AND ""company"" IS NOT NULL\n"





> 2025-08-22 00:23:00,747 [info] Run execution finished: {"name":"generate-query-generate-query","status":"completed"}


## 4. Ingest Data

In [8]:
# Wrap the query_to_dataset source file as an MLRun job function
query_to_dataset = mlrun.code_to_function(
    name="query_to_dataset",
    kind="job",
    handler="query_to_dataset",
    image=mlrun_image,
    filename=f"{function_path}/trino/query_to_dataset/query_to_dataset.py",
    requirements_file=f"{function_path}/trino/query_to_dataset/requirements.txt",
)

In [9]:
# Parameters for the ingest job: the query to run plus where/what to log
ingest_params = {
    "query": ingest_query,
    "schema": schema,
    "catalog": catalog,
    "dataset_name": "train-source",
}

# Run the job (not locally) under the run name "ingest"
ingest_run = query_to_dataset.run(
    name="ingest",
    params=ingest_params,
    local=False,
    auto_build=True,
)

# Keep the value of the run's "train-source" output for the results section
dataset_uri = ingest_run.outputs["train-source"]

> 2025-08-22 00:23:00,791 [info] Storing function: {"db":"http://dragon.local:30070","name":"ingest","uid":"74a0e8e769be4ec7a2552ceb6e59f696"}
> 2025-08-22 00:23:00,876 [info] Job is running in the background, pod: ingest-lglbs
> 2025-08-21 22:23:02,609 [info] Connecting to Trino
> 2025-08-21 22:23:02,610 [info] Executing SQL query
> 2025-08-21 22:23:06,279 [info] Logging dataset
> 2025-08-21 22:23:06,861 [info] To track results use the CLI: {"info_cmd":"mlrun get run 74a0e8e769be4ec7a2552ceb6e59f696 -p trino-demo","logs_cmd":"mlrun logs 74a0e8e769be4ec7a2552ceb6e59f696 -p trino-demo"}
> 2025-08-21 22:23:06,861 [info] Run execution finished: {"name":"ingest","status":"completed"}


project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results,artifacts
trino-demo,...59f696,0,Aug 21 22:23:02,2025-08-21 22:23:06.854964+00:00,completed,run,ingest,v3io_user=johanneskind=jobowner=johannesmlrun/client_version=1.9.1mlrun/client_python_version=3.10.18host=ingest-lglbs,,"query=WITH filtered_data AS (\n SELECT\n *\n FROM\n ""iceberg"".""lakehouse"".""taxi_trips""\n WHERE\n -- Trino uses CAST(... AS DATE) and the standard subtraction operator for intervals\n CAST(trip_start_timestamp AS DATE) BETWEEN\n (CAST('2023-06-01' AS DATE) - INTERVAL '3' MONTH) AND\n (CAST('2023-06-01' AS DATE) - INTERVAL '2' MONTH)\n)\n\n, mean_time AS (\n SELECT\n -- Trino's equivalent of INT64 is BIGINT\n CAST(AVG(trip_seconds) AS BIGINT) AS avg_trip_seconds\n FROM\n filtered_data\n)\n\nSELECT\n -- Function names for date/time extraction are slightly different\n CAST(day_of_week(trip_start_timestamp) AS DOUBLE) AS dayofweek, -- Note: Sunday=7 in Trino\n CAST(hour(trip_start_timestamp) AS DOUBLE) AS hourofday,\n \n -- Geospatial functions use a different syntax\n ST_Distance(\n to_spherical_geography(ST_Point(pickup_longitude, pickup_latitude)),\n to_spherical_geography(ST_Point(dropoff_longitude, dropoff_latitude))\n ) AS trip_distance,\n \n trip_miles,\n \n -- Trino's equivalent of FLOAT64 is DOUBLE\n CAST(\n CASE\n WHEN trip_seconds IS NULL THEN m.avg_trip_seconds\n WHEN trip_seconds <= 0 THEN m.avg_trip_seconds\n ELSE trip_seconds\n END AS DOUBLE\n ) AS trip_seconds,\n \n payment_type,\n company,\n \n -- Use double quotes for identifiers if needed, not backticks\n (fare + tips + tolls + extras) AS ""total_fare""\nFROM\n filtered_data AS t\n-- Explicit CROSS JOIN is clearer than a comma\nCROSS JOIN mean_time AS m\nWHERE\n trip_miles > 0 AND fare > 0 AND fare < 1500\n -- The Jinja templating part does not need to change at all\n \n AND ""fare"" IS NOT NULL\n \n AND ""trip_start_timestamp"" IS NOT NULL\n \n AND ""pickup_longitude"" IS NOT NULL\n \n AND ""pickup_latitude"" IS NOT NULL\n \n AND ""dropoff_longitude"" IS NOT NULL\n \n AND 
""dropoff_latitude"" IS NOT NULL\n \n AND ""payment_type"" IS NOT NULL\n \n AND ""company"" IS NOT NULL\n schema=lakehousecatalog=icebergdataset_name=train-source",rows=473285,train-source





> 2025-08-22 00:23:11,914 [info] Run execution finished: {"name":"ingest","status":"completed"}


## 5. Results

In [10]:
# Resolve the dataset URI to a data item and materialise it as a DataFrame
train_item = mlrun.get_dataitem(dataset_uri)
df_train = train_item.as_df()

# Report the row count, then preview the first rows
row_count, _ = df_train.shape
print(f"rows read: {row_count:,}")
display(df_train.head())

rows read: 473,285


Unnamed: 0,dayofweek,hourofday,trip_distance,trip_miles,trip_seconds,payment_type,company,total_fare
0,5.0,16.0,21343.087544,15.0,3360.0,Unknown,Taxi Affiliation Services,39.5
1,5.0,8.0,21812.983595,15.61,1249.0,Cash,Flash Cab,39.0
2,5.0,9.0,15437.22943,0.7,1440.0,Cash,Taxi Affiliation Services,32.0
3,5.0,16.0,11425.401388,7.1,1140.0,Unknown,Taxi Affiliation Services,19.5
4,5.0,10.0,15228.663561,12.23,1620.0,Prcard,Flash Cab,32.75
