<a href="https://colab.research.google.com/github/HenryZumaeta/DataScienceMasters/blob/main/CICLO03/MLOPS/Level_0_P02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Adaptación manual

In [None]:
USER_FLAG = "--user"
!pip3 install {USER_FLAG} google-cloud-aiplatform --upgrade
!pip3 install {USER_FLAG} kfp==2.4.0 google-cloud-pipeline-components
!pip3 install {USER_FLAG} gcsfs



In [None]:
import os

# Restart the kernel so the packages installed above become importable.
# Skipped when the IS_TESTING environment variable is set.
if not os.getenv("IS_TESTING"):
    import IPython
    IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# Print the installed KFP SDK and google_cloud_pipeline_components versions from a fresh interpreter.
!python3 -c "import kfp; print('KFP SDK version: {}'.format(kfp.__version__))"
!python3 -c "import google_cloud_pipeline_components; print('google_cloud_pipeline_components version: {}'.format(google_cloud_pipeline_components.__version__))"


KFP SDK version: 2.4.0
google_cloud_pipeline_components version: 2.8.0


In [None]:
import os

PROJECT_ID = ""
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID: ", PROJECT_ID)


Project ID:  famous-tree-448915-m8


In [None]:
# Definir el nombre del bucket de forma automática
BUCKET_NAME = "gs://" + PROJECT_ID + "-bucket"

# Obtener la lista de buckets existentes
existing_buckets = !gsutil ls

# Verificar si el bucket ya existe; si no, crearlo
if BUCKET_NAME not in existing_buckets:
    print("Creando el bucket:", BUCKET_NAME)
    !gsutil mb -l us-central1 {BUCKET_NAME}
else:
    print("El bucket ya existe:", BUCKET_NAME)

Creando el bucket: gs://famous-tree-448915-m8-bucket
Creating gs://famous-tree-448915-m8-bucket/...


In [None]:
# Copia el archivo 'train.csv' al bucket de GCS y lista el contenido del bucket para verificar la copia.
!gsutil cp ./train.csv $BUCKET_NAME
!gsutil ls -al $BUCKET_NAME

Copying file://./train.csv [Content-Type=text/csv]...
/ [1 files][449.9 KiB/449.9 KiB]                                                
Operation completed over 1 objects/449.9 KiB.                                    
    460676  2025-02-09T11:26:55Z  gs://famous-tree-448915-m8-bucket/train.csv#1739100415978976  metageneration=1
TOTAL: 1 objects, 460676 bytes (449.88 KiB)


In [None]:
import os
import pprint as pp
import sys
import pickle
import argparse

# Se importa el modelo de regresión lineal para el problema de House Prices
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error
import numpy as np
import pandas as pd

from google.cloud import storage
from google.cloud import aiplatform

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Location of 'train.csv' inside the GCS bucket
url = f"{BUCKET_NAME}/train.csv"

# Load the dataset straight from GCS and preview the first rows
data = pd.read_csv(url)
data.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Columns of the House Prices dataset kept for modelling (predictors first, target last)
modelling_columns = [
    "OverallQual",   # Overall quality of the house
    "GrLivArea",     # Living area in square feet
    "GarageCars",    # Number of cars in the garage
    "TotalBsmtSF",   # Total basement area in square feet
    "FullBath",      # Number of full bathrooms
    "YearBuilt",     # Construction year
    "YearRemodAdd",  # Remodelling year
    "SalePrice"      # Sale price (target variable)
]

# Keep only those columns, in the order listed above
data = data.loc[:, modelling_columns]

data.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,TotalBsmtSF,FullBath,YearBuilt,YearRemodAdd,SalePrice
0,7,1710,2,856,2,2003,2003,208500
1,6,1262,2,1262,2,1976,1976,181500
2,7,1786,2,920,2,2001,2002,223500
3,7,1717,3,756,1,1915,1970,140000
4,8,2198,3,1145,2,2000,2000,250000


In [None]:
train_size = 0.8
test_size = 0.1
valid_size = 0.1  # informational: the validation block is whatever lies between the two boundaries below

# Shuffle the dataset once (fixed seed) and cut it into three consecutive blocks:
# - first block: rows [0, train_size * n)               -> training
# - second block: rows [train_size * n, (1 - test_size) * n) -> validation
# - third block: the remaining rows                     -> test
# Plain positional slicing is used instead of np.split, which goes through the
# deprecated DataFrame.swapaxes when given a DataFrame.
shuffled = data.sample(frac=1, random_state=42)
train_end = int(train_size * len(data))
valid_end = int((1 - test_size) * len(data))

train_ds = shuffled.iloc[:train_end]
valid_ds = shuffled.iloc[train_end:valid_end]
test_ds = shuffled.iloc[valid_end:]

In [None]:
# Target variable of the House Prices dataset
target = "SalePrice"

# For each subset: predictors are every column except the target; the label is the target column
x_train, y_train = train_ds.drop(columns=[target]), train_ds[target]
x_valid, y_valid = valid_ds.drop(columns=[target]), valid_ds[target]
x_test, y_test = test_ds.drop(columns=[target]), test_ds[target]

In [None]:
# Fit a linear regression that predicts the sale price from the selected predictors
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# Predict on the validation set with the fitted model
y_pred = model.predict(x_valid)

In [None]:
# Model evaluation on the validation predictions
r2 = r2_score(y_true=y_valid, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_valid, y_pred=y_pred)
mse = mean_squared_error(y_true=y_valid, y_pred=y_pred)
mape = mean_absolute_percentage_error(y_true=y_valid, y_pred=y_pred)
rmse = np.sqrt(mse)

print(f"R2: {r2}")
print(f"Mean Absolute Error: {mae}")
# Format the percentage directly: `round(mape, 4) * 100` can print binary floating-point
# noise (e.g. 7.000000000000001%) because the multiplication happens after rounding.
print(f"Mean Absolute Percentage Error: {mape * 100:.2f}%")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

R2: 0.812156441673599
Mean Absolute Error: 23281.12128604443
Mean Absolute Percentage Error: 14.52%
Mean Squared Error: 959344988.2221619
Root Mean Squared Error: 30973.29475890742


In [None]:
# Definir la ruta en el bucket para almacenar el modelo
MODEL_PATH = BUCKET_NAME + "/models/"
model_path = "./model.pkl"  # Se guarda el modelo en formato pickle

# Serializar y guardar el modelo entrenado (modelo de regresión lineal)
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

# Copiar el archivo del modelo a GCS
!gsutil cp "model.pkl" $MODEL_PATH

Copying file://model.pkl [Content-Type=application/octet-stream]...
/ [1 files][  693.0 B/  693.0 B]                                                
Operation completed over 1 objects/693.0 B.                                      


In [None]:
# Prebuilt Vertex AI prediction container for scikit-learn (also serves regression models)
serving_container_uri = "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest"

# GCS location of the model artifacts
artifact_uri = MODEL_PATH

# Upload the model to the Vertex AI Model Registry through the Python SDK.
# NOTE(review): this rebinds `model`, which until now held the fitted LinearRegression;
# re-running the earlier `model.predict(x_valid)` cell after this one will not use the estimator.
model = aiplatform.Model.upload(
    display_name="House-Prices-Linear-Regression-Model",
    artifact_uri=artifact_uri,
    serving_container_image_uri=serving_container_uri
)

Creating Model
Create Model backing LRO: projects/436225358028/locations/us-central1/models/6540155746267430912/operations/7377562390600089600
Model created. Resource name: projects/436225358028/locations/us-central1/models/6540155746267430912@1
To use this Model in another session:
model = aiplatform.Model('projects/436225358028/locations/us-central1/models/6540155746267430912@1')


In [None]:
# Deploy the registered model to a Vertex AI endpoint on a single n1-standard-4 replica
# (min and max replica count are both 1).
endpoint = model.deploy(
    machine_type="n1-standard-4",
    min_replica_count=1,
    max_replica_count=1
)

Creating Endpoint
Create Endpoint backing LRO: projects/436225358028/locations/us-central1/endpoints/379913153233813504/operations/5035690584367431680
Endpoint created. Resource name: projects/436225358028/locations/us-central1/endpoints/379913153233813504
To use this Endpoint in another session:
endpoint = aiplatform.Endpoint('projects/436225358028/locations/us-central1/endpoints/379913153233813504')
Deploying model to Endpoint : projects/436225358028/locations/us-central1/endpoints/379913153233813504
Deploy Endpoint model backing LRO: projects/436225358028/locations/us-central1/endpoints/379913153233813504/operations/8495018048141393920
Endpoint model deployed. Resource name: projects/436225358028/locations/us-central1/endpoints/379913153233813504


In [None]:
# Single prediction request; values are listed in the same order as the training predictors:
# [OverallQual, GrLivArea, GarageCars, TotalBsmtSF, FullBath, YearBuilt, YearRemodAdd]
instances = [
    [7, 2000, 2, 1500, 2, 1975, 2000]
]

# Send the instances to the Vertex AI endpoint and print the response
prediction = endpoint.predict(instances=instances)
print("Predicción:", prediction)

Predicción: Prediction(predictions=[243099.1529034788], deployed_model_id='255241728783548416', metadata=None, model_version_id='1', model_resource_name='projects/436225358028/locations/us-central1/models/6540155746267430912', explanations=None)


In [None]:
# Free resources: undeploy everything from the endpoint, delete the endpoint,
# then delete the registered model.
endpoint.undeploy_all()
endpoint.delete()
model.delete()

Undeploying Endpoint model: projects/436225358028/locations/us-central1/endpoints/379913153233813504
Undeploy Endpoint model backing LRO: projects/436225358028/locations/us-central1/endpoints/379913153233813504/operations/1009472517498208256
Endpoint model undeployed. Resource name: projects/436225358028/locations/us-central1/endpoints/379913153233813504
Deleting Endpoint : projects/436225358028/locations/us-central1/endpoints/379913153233813504
Endpoint deleted. . Resource name: projects/436225358028/locations/us-central1/endpoints/379913153233813504
Deleting Endpoint resource: projects/436225358028/locations/us-central1/endpoints/379913153233813504
Delete Endpoint backing LRO: projects/436225358028/locations/us-central1/operations/6828123236060889088
Endpoint resource projects/436225358028/locations/us-central1/endpoints/379913153233813504 deleted.
Deleting Model : projects/436225358028/locations/us-central1/models/6540155746267430912
Model deleted. . Resource name: projects/43622535

# Adaptación del Pipeline Automatizado (Kubeflow Pipelines)

In [None]:
# Importación de librerías necesarias para construir el pipeline automatizado
from typing import NamedTuple

import kfp
from kfp import dsl
from kfp.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                     OutputPath, component, ClassificationMetrics, Metrics)

from kfp import compiler
from kfp.components import load_component_from_file

import json
import yaml

In [None]:
# Create the 'components' directory that stores the YAML files of the pipeline components
!mkdir -p components

In [None]:
@component(
    packages_to_install=["pandas", "pyarrow", "fsspec", "gcsfs"],
    base_image="python:3.9",
    output_component_file="./components/download_data.yaml"
)
def download_data(input_data_path: str,
                  input_data_filename: str,
                  downloaded_data: Output[Dataset]):
    """Read a CSV file and store a copy of it as the ``downloaded_data`` artifact.

    Args:
        input_data_path: Location (e.g. a GCS path) that contains the input file.
        input_data_filename: Name of the CSV file inside ``input_data_path``.
        downloaded_data: Output dataset artifact; the CSV is written to its path plus ".csv".
    """
    import os

    import pandas as pd

    # Echo the inputs for debugging
    print(f"Ruta de datos de entrada (GCS): {input_data_path}")
    print(f"Nombre del archivo (por ejemplo, 'train.csv'): {input_data_filename}")
    print(f"Ruta de salida para los datos descargados: {downloaded_data.path}")

    # Full location of the source file and of the copy to produce
    source_url = os.path.join(input_data_path, input_data_filename)
    destination = downloaded_data.path + ".csv"

    # Read the source CSV and write it back out without the index column
    frame = pd.read_csv(source_url)
    frame.to_csv(destination, index=False, encoding='utf-8-sig')

  @component(
  def download_data(input_data_path: str,


In [None]:
@component(
    packages_to_install=["pandas", "pyarrow", "fsspec", "gcsfs"],
    base_image="python:3.9",
    output_component_file="./components/preprocess_data.yaml"
)
def preprocess_data(train_size: float,
                    test_size: float,
                    valid_size: float,
                    train_data: Output[Dataset],
                    valid_data: Output[Dataset],
                    test_data: Output[Dataset],
                    input_data: Input[Dataset]):
    """Select the modelling columns, shuffle the rows and write train/valid/test CSV splits.

    Args:
        train_size: Fraction of rows assigned to the training split (taken from the start).
        test_size: Fraction of rows assigned to the test split (taken from the end).
        valid_size: Accepted for interface compatibility; the validation split is the block
            between the training and test boundaries, so this value is only printed.
        train_data: Output artifact; the training split is written to its path plus ".csv".
        valid_data: Output artifact; the validation split is written to its path plus ".csv".
        test_data: Output artifact; the test split is written to its path plus ".csv".
        input_data: Input artifact; the dataset is read from its path plus ".csv".
    """
    import pandas as pd

    # Echo the split parameters and output locations for debugging
    print(f"train_size: {train_size}")
    print(f"test_size: {test_size}")
    print(f"valid_size: {valid_size}")
    print(f"Ruta de salida train_data: {train_data.path}")
    print(f"Ruta de salida valid_data: {valid_data.path}")
    print(f"Ruta de salida test_data: {test_data.path}")
    print(f"input_data: {input_data}")

    # Read the downloaded dataset
    data = pd.read_csv(input_data.path + ".csv")

    # Columns kept for modelling (predictors first, target last)
    modelling_columns = [
        "OverallQual",   # Overall quality of the house
        "GrLivArea",     # Living area in square feet
        "GarageCars",    # Number of cars that fit in the garage
        "TotalBsmtSF",   # Total basement area in square feet
        "FullBath",      # Number of full bathrooms
        "YearBuilt",     # Construction year
        "YearRemodAdd",  # Remodelling year
        "SalePrice"      # Sale price (target variable)
    ]

    data = data[modelling_columns]

    # Shuffle with a fixed seed so the split is reproducible
    data_shuffled = data.sample(frac=1, random_state=42)

    # Cut the shuffled rows into three consecutive blocks. Positional slicing replaces
    # np.split, which goes through the deprecated DataFrame.swapaxes for DataFrames.
    n_rows = len(data_shuffled)
    train_end = int(train_size * n_rows)
    valid_end = int((1 - test_size) * n_rows)

    train_ds = data_shuffled.iloc[:train_end]
    valid_ds = data_shuffled.iloc[train_end:valid_end]
    test_ds = data_shuffled.iloc[valid_end:]

    # Write each split to the location given by its output artifact
    train_ds.to_csv(train_data.path + ".csv", index=False, encoding='utf-8-sig')
    valid_ds.to_csv(valid_data.path + ".csv", index=False, encoding='utf-8-sig')
    test_ds.to_csv(test_data.path + ".csv", index=False, encoding='utf-8-sig')

  @component(
  def preprocess_data(train_size: float,


In [None]:
@component(
    packages_to_install=[
        "kfp==2.4.0",
        "pandas",
        "pyarrow",
        "scikit-learn==1.3.2",
        "fsspec",
        "gcsfs",
        "click==8.1.7",
        "docstring-parser==0.16",
        "kfp-pipeline-spec==0.2.2",
        "kfp-server-api==2.0.5",
        "kubernetes==26.1.0",
        "PyYAML==6.0.2",
        "requests-toolbelt==0.10.1",
        "tabulate==0.9.0",
        "protobuf==3.20.3",
        "urllib3==1.26.20"
    ],
    base_image="python:3.9",
    output_component_file="./components/train.yaml"
)
def train_model(
    train_data: Input[Dataset],
    model: Output[Model],
):
    """Fit a LinearRegression on the training split and pickle it at the model artifact path.

    Args:
        train_data: Input artifact; the training CSV is read from its path plus ".csv".
        model: Output artifact; receives metadata about the estimator and the pickled
            model is written to its path plus ".pkl".
    """
    # Echo the artifacts for debugging
    print(f"train_data: {train_data}")
    print(f"model: {model}")

    import pickle

    import pandas as pd
    import sklearn
    from sklearn.linear_model import LinearRegression

    # Target column; every other column is used as a predictor
    target = "SalePrice"

    training_frame = pd.read_csv(train_data.path + ".csv")
    features = training_frame.drop(columns=[target])
    labels = training_frame[target]

    # Train the estimator
    regressor = LinearRegression()
    regressor.fit(features, labels)

    # Record what was trained and with which library version
    model.metadata["model_name"] = "LinearRegression"
    model.metadata["framework"] = "sklearn"
    model.metadata["framework_version"] = sklearn.__version__

    # Persist the fitted estimator next to the artifact path
    pickle_path = model.path + ".pkl"
    with open(pickle_path, 'wb') as handle:
        pickle.dump(regressor, handle)

  @component(
  def train_model(


In [None]:
@component(
    packages_to_install=["pandas", "pyarrow", "scikit-learn==1.3.2", "fsspec", "gcsfs"],
    base_image="python:3.9",
    output_component_file="./components/evaluate_model.yaml"
)
def evaluate_model(
    test_data: Input[Dataset],
    model: Input[Model],
    target_column_name: str,
    deployment_metric: str,
    deployment_metric_threshold: float,
    kpi: Output[Metrics]
) -> NamedTuple(
    "Outputs",
    [
        ("deploy_flag", str),  # Deployment indicator: "True" or "False".
    ],
):
    """Score the trained model on the test split and decide whether it may be deployed.

    Args:
        test_data: Input artifact; the test CSV is read from its path plus ".csv".
        model: Input artifact; the pickled estimator is read from its path plus ".pkl".
        target_column_name: Name of the label column in the test CSV.
        deployment_metric: Metric that gates deployment; one of
            "r2", "mae", "mape", "mse", "rmse".
        deployment_metric_threshold: Threshold for ``deployment_metric``. For "r2" the
            metric must be at least the threshold; for the error metrics
            ("mae", "mape", "mse", "rmse") it must be at most the threshold.
        kpi: Output metrics artifact that receives all computed metrics.

    Returns:
        A one-element tuple ``(deploy_flag,)`` where ``deploy_flag`` is "True" or "False".

    Raises:
        ValueError: If ``deployment_metric`` is not one of the supported metric names.
    """
    # Echo the inputs for debugging
    print(f"test_data: {test_data}")
    print(f"model: {model}")
    print(f"kpi: {kpi}")
    print(f"deployment_metric: {deployment_metric}")
    print(f"deployment_metric_threshold: {deployment_metric_threshold}")

    from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, mean_absolute_percentage_error
    import pandas as pd
    import pickle
    import numpy as np

    # Read the test split
    test_ds = pd.read_csv(test_data.path + ".csv")
    target = target_column_name  # Example: "SalePrice"

    # Separate predictors and label
    x_test = test_ds.drop(columns=target, axis=1)
    y_test = test_ds[target]

    print(f"model.path: {model.path}")
    file_name = model.path + ".pkl"
    print(f"file_name: {file_name}")

    # Load the trained estimator written by the training step.
    # NOTE: pickle.load executes arbitrary code; only load artifacts produced by this pipeline.
    with open(file_name, 'rb') as file:
        trained_model = pickle.load(file)

    # Predict on the test split
    y_pred = trained_model.predict(x_test)

    # Compute the evaluation metrics
    r2 = r2_score(y_true=y_test, y_pred=y_pred)
    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
    rmse = np.sqrt(mse)

    model_metrics = {
        "r2": r2,
        "mae": mae,
        "mape": mape,
        "mse": mse,
        "rmse": rmse
    }

    print(f"R2: {r2}")
    print(f"Mean Absolute Error: {mae}")
    # Format directly; `round(mape, 4) * 100` can print floating-point noise.
    print(f"Mean Absolute Percentage Error: {mape * 100:.2f}%")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")

    # Log the metrics on the KPI output for monitoring
    kpi.log_metric("R2", float(r2))
    kpi.log_metric("Mean Absolute Error", float(mae))
    kpi.log_metric("Mean Absolute Percentage Error", float(mape))
    kpi.log_metric("Mean Squared Error", float(mse))
    kpi.log_metric("Root Mean Squared Error", float(rmse))

    # Reject unknown metric names explicitly instead of failing on `None >= threshold`.
    if deployment_metric not in model_metrics:
        raise ValueError(
            f"Unknown deployment_metric {deployment_metric!r}; "
            f"expected one of {sorted(model_metrics)}"
        )
    actual_metric_value = model_metrics[deployment_metric]

    # r2 improves as it grows, whereas the error metrics improve as they shrink,
    # so the comparison direction depends on the metric.
    lower_is_better = {"mae", "mape", "mse", "rmse"}
    if deployment_metric in lower_is_better:
        meets_threshold = actual_metric_value <= deployment_metric_threshold
    else:
        meets_threshold = actual_metric_value >= deployment_metric_threshold

    deploy_flag = "True" if meets_threshold else "False"

    return (deploy_flag,)

  @component(
  def evaluate_model(


In [None]:
@component(
    packages_to_install=["pandas", "pyarrow", "scikit-learn==1.3.2", "fsspec", "gcsfs", "google-cloud-aiplatform"],
    base_image="python:3.9",
    output_component_file="./components/register_model.yaml"
)
def register_model(
    serving_container_uri: str,
    project_id: str,
    region: str,
    model_name: str,
    model: Input[Model],
) -> NamedTuple(
    "Outputs",
    [
        ("model_resource_name", str),  # Resource name of the registered model.
    ],
):
    """Upload the trained model to the Vertex AI Model Registry.

    Args:
        serving_container_uri: Container image used to serve the model.
        project_id: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.
        model_name: Display name given to the uploaded model.
        model: Input model artifact; the directory that contains its URI is used as
            the artifact location of the upload.

    Returns:
        A one-element tuple with the resource name of the registered model.
    """
    # Echo the inputs for debugging
    print(f"serving_container_uri: {serving_container_uri}")
    print(f"project_id: {project_id}")
    print(f"region: {region}")
    print(f"model: {model}")

    from google.cloud import aiplatform

    # Directory that holds the artifact: everything up to and including the last "/".
    # The original code used `model.uri[:-5]`, which gives the same value only while the
    # last path segment is exactly five characters long ("model").
    artifact_dir = model.uri.rsplit("/", 1)[0] + "/"
    print(f"model.uri: {artifact_dir}")

    # Initialise the Vertex AI SDK
    aiplatform.init(project=project_id, location=region)

    # Upload the model to the Vertex AI Model Registry
    registered_model = aiplatform.Model.upload(
        display_name=model_name,
        artifact_uri=artifact_dir,
        serving_container_image_uri=serving_container_uri
    )

    # Return the resource name of the registered model
    return (registered_model.resource_name,)

  @component(
  def register_model(


In [None]:
@component(
    packages_to_install=[
        "kfp==2.4.0",
        "pandas",
        "pyarrow",
        "scikit-learn==1.3.2",
        "fsspec",
        "gcsfs",
        "google-cloud-aiplatform",
        "click==8.1.7",
        "kfp-pipeline-spec==0.2.2",
        "kfp-server-api==2.0.5",
        "kubernetes==26.1.0",
        "PyYAML==6.0.2",
        "requests-toolbelt==0.10.1",
        "tabulate==0.9.0",
        "protobuf==3.20.3",
        "urllib3==1.26.20",
        "numpy==1.24.4",
        "google-cloud-pipeline-components==2.8.0"
    ],
    base_image="python:3.9",
    output_component_file="./components/deploy_model.yaml"
)
def deploy_model(
    model_resource_name: str,
    project_id: str,
    region: str
) -> NamedTuple(
    "Outputs",
    [
        ("endpoint_resource_name", str),  # Resource name of the endpoint the model was deployed to
    ]
):
    """Deploy a registered Vertex AI model to an endpoint.

    Args:
        model_resource_name: Resource name of the model to deploy.
        project_id: Project passed to ``aiplatform.init``.
        region: Location passed to ``aiplatform.init``.

    Returns:
        A one-element tuple with the resource name of the endpoint returned by the deployment.
    """
    # Echo the inputs for debugging
    print(f"model_resource_name: {model_resource_name}")
    print(f"project_id: {project_id}")
    print(f"region: {region}")

    from google.cloud import aiplatform

    # Point the SDK at the requested project and region
    aiplatform.init(project=project_id, location=region)

    # Look up the registered model and deploy it on a single n1-standard-4 replica
    registered_model = aiplatform.Model(model_resource_name)
    deployed_endpoint = registered_model.deploy(
        machine_type="n1-standard-4",
        min_replica_count=1,
        max_replica_count=1
    )

    return (deployed_endpoint.resource_name,)

  @component(
  def deploy_model(


In [None]:
import os

PROJECT_ID = ""
if not os.getenv("IS_TESTING"):
    shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

Project ID: famous-tree-448915-m8


In [None]:
# Definir el nombre del bucket de forma automática
BUCKET_NAME = "gs://" + PROJECT_ID + "-bucket"

# Obtener la lista de buckets existentes
existing_buckets = !gsutil ls

# Verificar si el bucket ya existe; si no, crearlo
if BUCKET_NAME not in existing_buckets:
    print("Creando el bucket:", BUCKET_NAME)
    !gsutil mb -l us-central1 {BUCKET_NAME}
else:
    print("El bucket ya existe:", BUCKET_NAME)

Creando el bucket: gs://famous-tree-448915-m8-bucket
Creating gs://famous-tree-448915-m8-bucket/...
ServiceException: 409 A Cloud Storage bucket named 'famous-tree-448915-m8-bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [None]:
# Copia el archivo 'train.csv' al bucket de GCS y lista el contenido del bucket para verificar la copia.
!gsutil cp ./train.csv $BUCKET_NAME
!gsutil ls -al $BUCKET_NAME

Copying file://./train.csv [Content-Type=text/csv]...
/ [1 files][449.9 KiB/449.9 KiB]                                                
Operation completed over 1 objects/449.9 KiB.                                    
       768  2025-02-09T11:59:34Z  gs://famous-tree-448915-m8-bucket/config.json#1739102374621048  metageneration=2
     34745  2025-02-09T11:59:36Z  gs://famous-tree-448915-m8-bucket/house-prices-regression-pipeline.json#1739102376776832  metageneration=2
    460676  2025-02-09T12:06:53Z  gs://famous-tree-448915-m8-bucket/train.csv#1739102813296356  metageneration=1
                                 gs://famous-tree-448915-m8-bucket/models/
TOTAL: 3 objects, 496189 bytes (484.56 KiB)


In [None]:
# Obtener la cuenta de servicio configurada en gcloud y mostrar información de configuración
shell_output = !gcloud auth list 2>/dev/null
SERVICE_ACCOUNT = shell_output[2].replace("*", "").strip()
print("Service Account:", SERVICE_ACCOUNT)
print("Project ID:", PROJECT_ID)
print("staging_bucket_uri:", BUCKET_NAME)
print("input_data_path:", BUCKET_NAME)

Service Account: 436225358028-compute@developer.gserviceaccount.com
Project ID: famous-tree-448915-m8
staging_bucket_uri: gs://famous-tree-448915-m8-bucket
input_data_path: gs://famous-tree-448915-m8-bucket


In [None]:
import json
import os

# Pipeline configuration for the House Prices – Advanced Regression Techniques dataset.
# Every key is passed to the pipeline as a parameter value by run_pipeline.py.
config = {
    "project": PROJECT_ID,
    "region": "us-central1",  # Region.
    "service_account": SERVICE_ACCOUNT,
    "staging_bucket_uri": BUCKET_NAME,
    "pipeline_name": "house-prices-regression-pipeline",
    "pipeline_package_path": "house-prices-regression-pipeline.json",
    "input_data_path": BUCKET_NAME,
    "input_data_filename": "train.csv",       # House Prices dataset file.
    "target_column_name": "SalePrice",        # Target variable of the dataset.
    "train_size": 0.8,
    "test_size": 0.1,
    "valid_size": 0.1,
    "deployment_metric": "r2",
    "deployment_metric_threshold": 0.8,
    "model_name": "house-prices-linear-regression-model",
    # The pipeline components train with scikit-learn==1.3.2, so serve with the matching
    # 1.3 prebuilt container (the previous value pointed at the 1.5 image).
    "serving_container_uri": "us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-3:latest"
}

# Write the dictionary to config.json
with open("config.json", "w") as f:
    json.dump(config, f, indent=4)

print("Archivo config.json generado automáticamente.")

Archivo config.json generado automáticamente.


In [None]:
%%writefile build_pipeline.py

import json
import yaml

import kfp
from kfp import dsl
from kfp import compiler
from kfp.components import load_component_from_file

# Load every pipeline step from the YAML file written by its @component definition
download_data = load_component_from_file("./components/download_data.yaml")
preprocess_data = load_component_from_file("./components/preprocess_data.yaml")
train_model = load_component_from_file("./components/train.yaml")
evaluate_model = load_component_from_file("./components/evaluate_model.yaml")
register_model = load_component_from_file("./components/register_model.yaml")
deploy_model = load_component_from_file("./components/deploy_model.yaml")

# Read the shared settings from config.json
with open("config.json") as config_file:
    config = json.load(config_file)

BUCKET_URI = config.get("staging_bucket_uri")
PIPELINE_NAME = config.get("pipeline_name")               # e.g. "house-prices-regression-pipeline"
PACKAGE_PATH = config.get("pipeline_package_path")        # e.g. "house-prices-regression-pipeline.json"

# Root folder for the pipeline artifacts of the House Prices pipeline
PIPELINE_ROOT = f"{BUCKET_URI}/pipeline_root/kfp_house_prices_regression"
print(f"PIPELINE_ROOT: {PIPELINE_ROOT}")

@dsl.pipeline(
    pipeline_root=PIPELINE_ROOT,
    name=PIPELINE_NAME,
)
def pipeline(project: str = "",
             region: str = "",
             service_account: str = "",
             staging_bucket_uri: str = "",
             pipeline_name: str = "",
             pipeline_package_path: str = "",
             input_data_path: str = "",
             input_data_filename: str = "",
             target_column_name: str = "",
             train_size: float = 0.8,
             test_size: float = 0.1,
             valid_size: float = 0.1,
             hypertune_container_image_uri: str = "",
             hypertune_machine_type: str = "",
             hypertune_machine_replica_count: int = 1,
             hypertune_max_trial_count: int = 1,
             hypertune_parallel_trial_count: int = 1,
             hypertune_metric: str = "",
             hypertune_metric_objective: str = "",
             hypertune_job_name: str = "",
             deployment_metric: str = "",
             deployment_metric_threshold: float = 0.8,
             serving_container_uri: str = "",
             model_name: str = "",
             user_email: str = "",
             monitoring_job_name: str = "",
             predict_instance_schema_uri: str = ""
            ):
    # Flow: download -> preprocess -> train -> evaluate -> (register -> deploy) when the
    # evaluation step returns deploy_flag == "True". Parameters that no step consumes
    # (hypertune_*, user_email, monitoring_job_name, ...) are accepted but not used here.

    # Step 1: fetch the raw dataset
    download_task = download_data(
        input_data_path=input_data_path,
        input_data_filename=input_data_filename
    )

    # Step 2: select columns and split into train/valid/test
    preprocess_task = preprocess_data(
        train_size=train_size,
        test_size=test_size,
        valid_size=valid_size,
        input_data=download_task.outputs["downloaded_data"]
    )

    # Step 3: train on the training split
    train_task = train_model(
        train_data=preprocess_task.outputs["train_data"]
    )
    trained_model = train_task.outputs["model"]

    # Step 4: evaluate on the test split
    evaluate_task = evaluate_model(
        test_data=preprocess_task.outputs["test_data"],
        model=trained_model,
        target_column_name=target_column_name,
        deployment_metric=deployment_metric,
        deployment_metric_threshold=deployment_metric_threshold
    )

    # Step 5: register and deploy only when the evaluation approved the model
    with dsl.If(evaluate_task.outputs["deploy_flag"] == "True"):
        register_task = register_model(
            serving_container_uri=serving_container_uri,
            model=trained_model,
            model_name=model_name,
            project_id=project,
            region=region
        )

        deploy_model(
            model_resource_name=register_task.outputs["model_resource_name"],
            project_id=project,
            region=region
        )

# Compile the pipeline function into the package file named by PACKAGE_PATH
compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path=PACKAGE_PATH
)

Overwriting build_pipeline.py


In [None]:
%%writefile run_pipeline.py

from google.cloud import aiplatform
import yaml
import json

# Load the shared settings from config.json
with open("config.json") as config_file:
    config = json.load(config_file)

BUCKET_URI = config.get("staging_bucket_uri")
PACKAGE_PATH = config.get("pipeline_package_path")
DISPLAY_NAME = config.get("pipeline_name")
SERVICE_ACCOUNT = config.get("service_account")

# Root folder for the artifacts of the House Prices pipeline
PIPELINE_ROOT = "{}/pipeline_root/kfp_house_prices_regression".format(BUCKET_URI)
print(f"PIPELINE_ROOT: {PIPELINE_ROOT}")

# Build the PipelineJob from the compiled package; the whole config dictionary is
# passed as the pipeline parameter values.
job = aiplatform.PipelineJob(
    display_name=DISPLAY_NAME,
    template_path=PACKAGE_PATH,
    pipeline_root=PIPELINE_ROOT,
    parameter_values=config,
)

# Submit the run under the configured service account
job.submit(service_account=SERVICE_ACCOUNT)

Overwriting run_pipeline.py


In [None]:
# Run build_pipeline.py, which compiles the pipeline into the package file named in config.json
!python3 build_pipeline.py

PIPELINE_ROOT: gs://famous-tree-448915-m8-bucket/pipeline_root/kfp_house_prices_regression


In [None]:
# Run run_pipeline.py, which creates and submits the PipelineJob
!python3 run_pipeline.py

PIPELINE_ROOT: gs://famous-tree-448915-m8-bucket/pipeline_root/kfp_house_prices_regression
Creating PipelineJob
PipelineJob created. Resource name: projects/436225358028/locations/us-central1/pipelineJobs/house-prices-regression-pipeline-20250209120700
To use this PipelineJob in another session:
pipeline_job = aiplatform.PipelineJob.get('projects/436225358028/locations/us-central1/pipelineJobs/house-prices-regression-pipeline-20250209120700')
View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/house-prices-regression-pipeline-20250209120700?project=436225358028


In [None]:
# Upload config.json and the compiled pipeline spec to the bucket, then grant read
# access on both objects to AllUsers.
# NOTE(review): AllUsers:R makes these objects publicly readable, and config.json contains
# the project ID and the service-account e-mail — confirm this exposure is intended.
!gsutil cp ./config.json $BUCKET_NAME
!gsutil cp ./house-prices-regression-pipeline.json $BUCKET_NAME
!gsutil acl ch -u AllUsers:R $BUCKET_NAME/config.json
!gsutil acl ch -u AllUsers:R $BUCKET_NAME/house-prices-regression-pipeline.json

Copying file://./config.json [Content-Type=application/json]...
/ [1 files][  768.0 B/  768.0 B]                                                
Operation completed over 1 objects/768.0 B.                                      
Copying file://./house-prices-regression-pipeline.json [Content-Type=application/json]...
/ [1 files][ 33.9 KiB/ 33.9 KiB]                                                
Operation completed over 1 objects/33.9 KiB.                                     
Updated ACL on gs://famous-tree-448915-m8-bucket/config.json
Updated ACL on gs://famous-tree-448915-m8-bucket/house-prices-regression-pipeline.json
