# End-to-end pipeline

We can combine the preprocessing and modeling steps in one end-to-end pipeline that can be run via an orchestration framework such as Airflow.

In [1]:
# NBVAL_IGNORE_OUTPUT
import lineapy
lineapy.options.set("is_demo", True) # Not for normal use

We can list the artifacts we have saved previously using `lineapy.artifact_store()`.

In [2]:
# NBVAL_IGNORE_OUTPUT
# Bare last expression: the notebook displays the store's listing, one
# `name:version created on <timestamp>` entry per saved artifact version.
lineapy.artifact_store()

iris_diff_avg_length:0 created on 2022-08-29 18:57:13.045471
iris_diff_avg_length:1 created on 2022-08-29 18:57:13.191326
iris_diff_avg_width:0 created on 2022-08-29 18:57:13.367284
iris_preprocessed:0 created on 2022-08-29 18:57:39.134790
iris_model:0 created on 2022-08-29 18:57:39.203364
iris_diff_avg_length:2 created on 2022-08-29 18:58:14.715932
iris_diff_avg_length:3 created on 2022-08-29 18:58:14.868242
iris_diff_avg_width:1 created on 2022-08-29 18:58:15.040715
cleaned_data_housing_lineapy:0 created on 2022-08-29 18:58:51.433123
cleaned_data_housing:0 created on 2022-08-29 18:58:51.454668
linea_model_housing:0 created on 2022-08-29 18:59:13.418737

Let's now create a pipeline containing the data preprocessing artifact `cleaned_data_housing_lineapy` and the model training artifact `linea_model_housing`. First, we retrieve both artifacts from the artifact store.

In [3]:
# NBVAL_IGNORE_OUTPUT
# Fetch the preprocessing artifact by name; no version is passed here, and the
# displayed result reports which version was returned.
preprocessing_art = lineapy.get("cleaned_data_housing_lineapy")
preprocessing_art

LineaArtifact(name='cleaned_data_housing_lineapy', _version=0)

In [4]:
# NBVAL_IGNORE_OUTPUT
# Fetch the model-training artifact by name; no version is passed here, and the
# displayed result reports which version was returned.
modeling_art = lineapy.get("linea_model_housing")
modeling_art

LineaArtifact(name='linea_model_housing', _version=0)

In [5]:
# NBVAL_SKIP
import os
# Generate Airflow pipeline files for the two artifacts. The return value is
# kept in `directory` and used by the following cells as the location of the
# generated files.
directory = lineapy.to_pipeline(
    [preprocessing_art.name, modeling_art.name], 
    framework="AIRFLOW",
    pipeline_name="data_housing_pipeline",
    # Task dependency graph: presumably each key maps to the set of artifacts
    # it depends on, i.e. the model task runs after the preprocessing task
    # (TODO confirm the direction against the lineapy documentation).
    dependencies={ modeling_art.name: { preprocessing_art.name } },
    output_dir="airflow/dags/data_housing_pipeline/"
)

Generated module file: airflow/dags/data_housing_pipeline/data_housing_pipeline_module.py                                                                                                                   
Generated requirements file: airflow/dags/data_housing_pipeline/data_housing_pipeline_requirements.txt                                                                                                      
Generated DAG file: airflow/dags/data_housing_pipeline/data_housing_pipeline_dag.py                                                                                                                         
Generated Docker file: airflow/dags/data_housing_pipeline/data_housing_pipeline_Dockerfile                                                                                                                  


In [6]:
# NBVAL_SKIP
# `directory` is the value returned by `lineapy.to_pipeline`; list the files
# that were generated there.
os.listdir(directory)

['data_housing_pipeline_requirements.txt',
 'data_housing_pipeline_module.py',
 'data_housing_pipeline_Dockerfile',
 'data_housing_pipeline_dag.py']

In [7]:
# NBVAL_SKIP
# Show the generated DAG definition. The file is read directly in Python rather
# than by shelling out to `cat`: that avoids interpolating the path into a
# shell command, works on platforms without `cat`, and does not depend on the
# kernel forwarding subprocess output to the notebook.
dag_path = os.path.join(directory, "data_housing_pipeline_dag.py")
with open(dag_path) as dag_file:
    # end="" keeps the output identical to the file's content (no extra newline).
    print(dag_file.read(), end="")

import pathlib
import pickle

import data_housing_pipeline_module
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago

from airflow import DAG


def dag_setup():
    pickle_folder = pathlib.Path("/tmp").joinpath("data_housing_pipeline")
    if not pickle_folder.exists():
        pickle_folder.mkdir()


def dag_teardown():
    pickle_files = (
        pathlib.Path("/tmp").joinpath("data_housing_pipeline").glob("*.pickle")
    )
    for f in pickle_files:
        f.unlink()


def task_cleaned_data_housing_lineapy():

    training_data = data_housing_pipeline_module.get_cleaned_data_housing_lineapy()

    pickle.dump(
        training_data,
        open("/tmp/data_housing_pipeline/variable_training_data.pickle", "wb"),
    )


def task_linea_model_housing():

    linear_model = data_housing_pipeline_module.get_linea_model_housing()

    pickle.dump(
        linear_model,
        open("/tmp/data_housing_pipeline/variable_linear_model.pickle