# query_to_dataset

In [1]:
import mlrun
from os import environ

## 1. Configuration

In [2]:
# MLRun project name and the container image the job function is defined with.
project_name = "debug"
image = "registry-service.mlrun.svc.cluster.local/mlrun/mlrun:1.9.1"

# SQL file to read, plus the catalog/schema handed to the job as parameters.
query_file = "sample.sql"
catalog = "iceberg"
schema = "lakehouse"

# Tag passed both to function.run() and to the job's "tag" parameter.
tag = "0.0.1"

In [3]:
# Load the project, creating it if it does not exist yet. With
# user_project=True the effective project name carries the user name as a
# suffix (the log output of this cell shows "debug-johannes").
project = mlrun.get_or_create_project(
    context="./",
    name=project_name,
    user_project=True,
)

> 2025-08-21 16:42:43,182 [info] Project loaded successfully: {"project_name":"debug-johannes"}


In [4]:
# Load the sample SQL query text. The encoding is given explicitly so the
# file is decoded the same way regardless of the platform's default locale.
with open(query_file, "r", encoding="utf-8") as sql_file:
    query = sql_file.read()

### 1.1 Project Secrets

In [5]:
# Trino connection settings, exported as environment variables of this
# process.
#
# Alternative kept for reference: store the same values as project secrets
# (presumably needed once the job no longer runs locally - TODO confirm).
# project.set_secrets({
#     "TRINO_HOST": "dragon.lan",
#     "TRINO_PORT": "9191",
#     "TRINO_USER": "johnny",
# })
environ.update(
    TRINO_HOST="dragon.lan",
    TRINO_PORT="9191",
    TRINO_USER="johnny",
)

## 2. Run Locally

In [6]:
# Define an MLRun function of kind "job" from query_to_dataset.py. Its handler
# is the function of the same name; it uses the configured image and the
# packages listed in requirements.txt.
function = mlrun.code_to_function(
    name="query_to_dataset",
    kind="job",
    filename="query_to_dataset.py",
    handler="query_to_dataset",
    image=image,
    requirements_file="requirements.txt",
)

In [7]:
# Execute the handler in this process (local=True) without building an image
# first (auto_build=False). The query text and its target names are handed to
# the handler through `params`.
result = function.run(
    name="example",
    tag=tag,
    local=True,
    auto_build=False,
    params={
        "query": query,
        "schema": schema,
        "catalog": catalog,
        "dataset_name": "sample-dataset",
        "table_name": "sample_table",
        "tag": tag,
    },
)

> 2025-08-21 16:42:43,208 [info] Storing function: {"db":"http://dragon.local:30070","name":"example","uid":"095b7a53fea84b9687aa483021c187a4"}
> 2025-08-21 16:42:43,260 [info] Connecting to Trino
> 2025-08-21 16:42:43,262 [info] Executing SQL query
> 2025-08-21 16:42:57,426 [info] Logging dataset


project,uid,iter,start,end,state,kind,name,labels,inputs,parameters,results,artifact_uris
debug-johannes,...c187a4,0,Aug 21 14:42:43,NaT,completed,run,example,v3io_user=johanneskind=localowner=johanneshost=dragon,,"query=SELECT\n *\nFROM\n ""iceberg"".""lakehouse"".""taxi_trips""\nWHERE\n CAST(trip_start_timestamp AS DATE) BETWEEN\n (CAST('2023-05-01' AS DATE) - INTERVAL '3' MONTH) AND\n (CAST('2023-05-01' AS DATE) - INTERVAL '2' MONTH)schema=lakehousecatalog=icebergdataset_name=sample-datasettable_name=sample_tabletag=0.0.1",rows=465977,sample-dataset=store://datasets/debug-johannes/example_sample-dataset#0:0.0.1@095b7a53fea84b9687aa483021c187a4^353a0b3e7db61586947b4d871d155051425bf1c7





> 2025-08-21 16:42:58,817 [info] Run execution finished: {"name":"example","status":"completed"}


In [8]:
# Print the store URI recorded under the run's "sample-dataset" output key.
print(result.outputs["sample-dataset"])

store://datasets/debug-johannes/example_sample-dataset:0.0.1@095b7a53fea84b9687aa483021c187a4^353a0b3e7db61586947b4d871d155051425bf1c7
