# End-to-end pipeline

We can combine the preprocessing and modeling steps in one end-to-end pipeline that can be run via an orchestration framework such as Airflow.

In [1]:
# NBVAL_IGNORE_OUTPUT
import lineapy
# Turn on lineapy's "is_demo" option for this walkthrough.
# NOTE(review): the option's exact effect is not visible here — confirm in the lineapy docs.
lineapy.options.set("is_demo", True) # Not for normal use

We can list the artifacts we saved previously using `lineapy.artifact_store()`.

In [2]:
# NBVAL_IGNORE_OUTPUT
# Display the artifact store; the output lists each entry as `name:version`
# together with its creation timestamp.
lineapy.artifact_store()

iris_diff_avg_length:0 created on 2022-08-29 17:54:51.834330
iris_diff_avg_length:1 created on 2022-08-29 17:54:51.973005
iris_diff_avg_width:0 created on 2022-08-29 17:54:52.151950
iris_preprocessed:0 created on 2022-08-29 17:56:00.929400
iris_model:0 created on 2022-08-29 17:56:00.997712
cleaned_data_housing_lineapy:0 created on 2022-08-29 17:56:34.312448
cleaned_data_housing:0 created on 2022-08-29 17:56:34.331562
cleaned_data_housing_lineapy:1 created on 2022-08-29 17:57:32.020983
cleaned_data_housing:1 created on 2022-08-29 17:57:32.043274
cleaned_data_housing_lineapy:2 created on 2022-08-29 18:00:17.831095
cleaned_data_housing:2 created on 2022-08-29 18:00:17.849345
linea_model_housing:0 created on 2022-08-29 18:01:22.026070
cleaned_data_housing_lineapy:3 created on 2022-08-29 18:02:42.707959
cleaned_data_housing:3 created on 2022-08-29 18:02:42.728037

Let's now create a pipeline containing the data-preprocessing artifact `cleaned_data_housing_lineapy` and the model-training artifact `linea_model_housing`.

In [3]:
# NBVAL_IGNORE_OUTPUT
# Look up the preprocessing artifact by name. No version is passed;
# presumably this resolves to the most recent saved version — TODO confirm.
preprocessing_art = lineapy.get("cleaned_data_housing_lineapy")
# Bare last expression so the notebook displays the artifact's repr.
preprocessing_art

LineaArtifact(name='cleaned_data_housing_lineapy', _version=3)

In [4]:
# NBVAL_IGNORE_OUTPUT
# Look up the model-training artifact by name. No version is passed;
# presumably this resolves to the most recent saved version — TODO confirm.
modeling_art = lineapy.get("linea_model_housing")
# Bare last expression so the notebook displays the artifact's repr.
modeling_art

LineaArtifact(name='linea_model_housing', _version=0)

In [5]:
# NBVAL_SKIP
# `os` is used by the following cells to inspect the generated files.
import os
# Generate an Airflow pipeline from the two artifacts. The call writes a
# module file, a requirements file, a DAG file and a Dockerfile under
# `output_dir`; the returned value is kept in `directory` and used below as
# the folder that holds those files.
directory = lineapy.to_pipeline(
    # Names of the artifacts to include in the pipeline.
    [preprocessing_art.name, modeling_art.name], 
    framework="AIRFLOW",
    pipeline_name="data_housing_pipeline",
    # NOTE(review): presumably declares that the modeling artifact depends on
    # (i.e. runs after) the preprocessing artifact — confirm in the lineapy docs.
    dependencies={ modeling_art.name: { preprocessing_art.name } },
    output_dir="airflow/dags/data_housing_pipeline/"
)

Generated module file: airflow/dags/data_housing_pipeline/data_housing_pipeline_module.py                                                                                                                   
Generated requirements file: airflow/dags/data_housing_pipeline/data_housing_pipeline_requirements.txt                                                                                                      
Generated DAG file: airflow/dags/data_housing_pipeline/data_housing_pipeline_dag.py                                                                                                                         
Generated Docker file: airflow/dags/data_housing_pipeline/data_housing_pipeline_Dockerfile                                                                                                                  


In [6]:
# NBVAL_SKIP
# List the files that `lineapy.to_pipeline` wrote into the output directory.
# Note: `os.listdir` returns entries in arbitrary order.
os.listdir(directory)

['data_housing_pipeline_requirements.txt',
 'data_housing_pipeline_module.py',
 'data_housing_pipeline_Dockerfile',
 'data_housing_pipeline_dag.py']

In [7]:
# NBVAL_SKIP
# Print the generated Airflow DAG definition.
# The file is read directly in Python instead of shelling out to `cat`: this
# works on platforms without `cat`, is safe for paths containing spaces or
# shell metacharacters, and raises a clear exception if the file is missing
# (rather than silently discarding a non-zero exit status).
dag_file_path = os.path.join(directory, "data_housing_pipeline_dag.py")
with open(dag_file_path) as dag_file:
    # end="" keeps the output identical to `cat` (no extra trailing newline).
    print(dag_file.read(), end="")

import data_housing_pipeline_module
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

from airflow import DAG

default_dag_args = {
    "owner": "airflow",
    "retries": 2,
    "start_date": days_ago(1),
}

with DAG(
    dag_id="data_housing_pipeline_dag",
    schedule_interval="*/15 * * * *",
    max_active_runs=1,
    catchup=False,
    default_args=default_dag_args,
) as dag:

    run_session_including_cleaned_data_housing_lineapy = PythonOperator(
        task_id="run_session_including_cleaned_data_housing_lineapy_task",
        python_callable=data_housing_pipeline_module.run_session_including_cleaned_data_housing_lineapy,
    )

    run_session_including_linea_model_housing = PythonOperator(
        task_id="run_session_including_linea_model_housing_task",
        python_callable=data_housing_pipeline_module.run_session_including_linea_model_housing,
    )

    (
        run_session_including_cleaned_data_housing_lineapy
     