# End-to-end pipeline

We can combine the preprocessing and modeling steps in one end-to-end pipeline that can be run via an orchestration framework such as Airflow.

In [1]:
# NBVAL_IGNORE_OUTPUT
import lineapy

We can see artifacts we have saved previously using `lineapy.artifact_store()`

In [2]:
# NBVAL_IGNORE_OUTPUT
lineapy.artifact_store()

cleaned_data_housing_lineapy:2022-04-14T23:06:24 created on 2022-04-14 23:06:24.270191
cleaned_data_housing:2022-04-14T23:06:24 created on 2022-04-14 23:06:24.553136
linea_model_housing:2022-04-14T23:07:34 created on 2022-04-14 23:07:34.111009

Let's now create a pipeline containing the data preprocessing artifact `cleaned_data_housing_lineapy` and the model training artifact `linea_model_housing`

In [3]:
# NBVAL_IGNORE_OUTPUT
preprocessing_art = lineapy.get("cleaned_data_housing_lineapy")
preprocessing_art

LineaArtifact(name='cleaned_data_housing_lineapy', version='2022-04-14T23:06:24')

In [4]:
# NBVAL_IGNORE_OUTPUT
modeling_art = lineapy.get("linea_model_housing")
modeling_art

LineaArtifact(name='linea_model_housing', version='2022-04-14T23:07:34')

In [5]:
# NBVAL_SKIP
import os
directory = lineapy.to_pipeline(
    [preprocessing_art.name, modeling_art.name], 
    framework = 'AIRFLOW',
    pipeline_name = "data_housing_pipeline",
    dependencies = {'data_housing_pipeline_linea_model_housing':{'data_housing_pipeline_cleaned_data_housing'}},
    output_dir = os.environ.get("AIRFLOW_HOME","~/airflow")+"/dags")


In [6]:
# NBVAL_SKIP

os.listdir(directory)

['data_housing_pipeline_requirements.txt',
 'cleaned_data_housing_Dockerfile',
 'cleaned_data_housing_requirements.txt',
 'data_housing_pipeline_Dockerfile',
 'cleaned_data_housing.py',
 'data_housing_pipeline.py',
 'cleaned_data_housing_dag.py',
 'data_housing_pipeline_dag.py']

In [7]:
# NBVAL_SKIP

os.system(f"cat {directory}/data_housing_pipeline_dag.py");

import os

import data_housing_pipeline
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

default_dag_args = {"owner": "airflow", "retries": 2, "start_date": days_ago(1)}

dag = DAG(
    dag_id="data_housing_pipeline_dag",
    schedule_interval="*/15 * * * *",
    max_active_runs=1,
    catchup=False,
    default_args=default_dag_args,
)


cleaned_data_housing_lineapy = PythonOperator(
    dag=dag,
    task_id="cleaned_data_housing_lineapy_task",
    python_callable=data_housing_pipeline.cleaned_data_housing_lineapy,
)

linea_model_housing = PythonOperator(
    dag=dag,
    task_id="linea_model_housing_task",
    python_callable=data_housing_pipeline.linea_model_housing,
)


data_housing_pipeline_cleaned_data_housing >> data_housing_pipeline_linea_model_housing
