In [None]:
#Limpieza de carpetas para reiniciar contenido (Solo usar en caso de volver a ejecutar todo el script)

import shutil
import os

#Ruta base
base_dir = "/app"

# Directorios a limpiar (pero sin borrar archivos .py ni .ipynb)
dirs_to_clean = [
    os.path.join(base_dir, "data"),
    os.path.join(base_dir, "tfx_pipeline"),
    os.path.join(base_dir, "schema_for_example_validator"),
    os.path.join(base_dir, "schema_artifact"),
]

for directory in dirs_to_clean:
    if os.path.exists(directory):
        shutil.rmtree(directory, ignore_errors=True)  # Eliminar todo dentro
        os.makedirs(directory, exist_ok=True)  # Recrear vacío
        print(f"✅ Se ha limpiado y recreado: {directory}")
    else:
        print(f"⚠️ La carpeta {directory} no existe, no es necesario limpiarla.")

# También eliminar la base de datos de metadatos si existe
metadata_db = os.path.join(base_dir, "tfx_pipeline", "metadata.db")
if os.path.exists(metadata_db):
    os.remove(metadata_db)
    print("Se ha eliminado la base de datos de metadatos.")

print("Limpieza completada.")




In [1]:
import os
import requests
## Descargar dataset
# Directorio
_data_root = './data/covertype'
# Path 
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')
# Descaragar data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export= \
    download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath, 'wb').write(r.content)

In [2]:
import pandas as pd

data = pd.read_csv('data/covertype/covertype_train.csv')
data.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2991,119,7,67,11,1015,233,234,133,1570,Commanche,C7202,1
1,2876,3,18,485,71,2495,192,202,144,1557,Commanche,C7757,1
2,3171,315,2,277,9,4374,213,237,162,1052,Rawah,C7745,0
3,3087,342,13,190,31,4774,193,221,166,752,Rawah,C7745,0
4,2835,158,10,212,41,3596,231,242,141,3280,Rawah,C4744,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116203 entries, 0 to 116202
Data columns (total 13 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   Elevation                           116203 non-null  int64 
 1   Aspect                              116203 non-null  int64 
 2   Slope                               116203 non-null  int64 
 3   Horizontal_Distance_To_Hydrology    116203 non-null  int64 
 4   Vertical_Distance_To_Hydrology      116203 non-null  int64 
 5   Horizontal_Distance_To_Roadways     116203 non-null  int64 
 6   Hillshade_9am                       116203 non-null  int64 
 7   Hillshade_Noon                      116203 non-null  int64 
 8   Hillshade_3pm                       116203 non-null  int64 
 9   Horizontal_Distance_To_Fire_Points  116203 non-null  int64 
 10  Wilderness_Area                     116203 non-null  object
 11  Soil_Type                           116

In [5]:
import tensorflow as tf

In [6]:
#SELECCIÓN DE CARACTERISTICAS

from sklearn.feature_selection import SelectKBest, f_classif
# Separar características y la variable objetivo
X = data.drop(columns=["Cover_Type"]) 
y = data["Cover_Type"] 

#Seleccionar solo las características numéricas
X_numeric = X.select_dtypes(include=['number'])


In [7]:
# Aplicar SelectKBest para selección de características
k = 5  # Número de características a seleccionar
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(X_numeric, y)

selected_features = X_numeric.columns[selector.get_support()]

print("Características seleccionadas:")
print(selected_features)

Características seleccionadas:
Index(['Elevation', 'Slope', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Horizontal_Distance_To_Fire_Points'],
      dtype='object')


In [8]:
#Crear DataFrame con True/False para cada columna
feature_selection_result = pd.DataFrame({
    "Columns": X_numeric.columns,
    "Retain": X_numeric.columns.isin(selected_features)
})

print(feature_selection_result.to_string(index=False))

                           Columns  Retain
                         Elevation    True
                            Aspect   False
                             Slope    True
  Horizontal_Distance_To_Hydrology   False
    Vertical_Distance_To_Hydrology   False
   Horizontal_Distance_To_Roadways    True
                     Hillshade_9am    True
                    Hillshade_Noon   False
                     Hillshade_3pm   False
Horizontal_Distance_To_Fire_Points    True


In [9]:
#DATA PIPELINE
## Configurar el contexto interactivo

import tfx
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext

In [10]:
#Ingesta de datos
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
from tfx.orchestration.metadata import sqlite_metadata_connection_config

# Definir la ruta del directorio de metadatos
PIPELINE_ROOT = os.path.join(os.getcwd(), "tfx_pipeline")
METADATA_PATH = os.path.join(PIPELINE_ROOT, "metadata.db")

# Crear la configuración de la base de datos SQLite en el formato correcto
metadata_connection_config = sqlite_metadata_connection_config(METADATA_PATH)

# Iniciar el contexto interactivo con la configuración corregida
context = InteractiveContext(pipeline_root=PIPELINE_ROOT, metadata_connection_config=metadata_connection_config)

print(f"Pipeline configurado en: {PIPELINE_ROOT}")
print(f"Base de datos de metadatos en: {METADATA_PATH}")


Pipeline configurado en: /app/tfx_pipeline
Base de datos de metadatos en: /app/tfx_pipeline/metadata.db


In [11]:
from tfx.components import CsvExampleGen

# Definir la ruta de los datos 
DATA_DIR = os.path.abspath("./data/covertype")  # Ruta absoluta del directorio donde está el CSV
print(f"Usando DATA_DIR: {DATA_DIR}")

#Crear el componente ExampleGen
example_gen = CsvExampleGen(input_base=DATA_DIR)

#Ejecutar ExampleGen en el contexto interactivo
context.run(example_gen)

#Verificar la salida de los datos ingeridos
artifact_uri = example_gen.outputs['examples'].get()[0].uri
print(f"Datos ingeridos y convertidos a TFRecord en: {artifact_uri}")



Usando DATA_DIR: /app/data/covertype






Datos ingeridos y convertidos a TFRecord en: /app/tfx_pipeline/CsvExampleGen/examples/1


In [12]:
#ESTADISTICAS

from tfx.components import StatisticsGen

#Crear el componente StatisticsGen con la salida de ExampleGen
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples']
)

#Ejecutar StatisticsGen en el contexto interactivo
context.run(statistics_gen)

#Obtener la URI de las estadísticas generadas
artifact_uri = statistics_gen.outputs['statistics'].get()[0].uri
print(f"Estadísticas generadas en: {artifact_uri}")


Estadísticas generadas en: /app/tfx_pipeline/StatisticsGen/statistics/2


In [13]:
print("Contenido de StatisticsGen:", os.listdir(artifact_uri))

Contenido de StatisticsGen: ['Split-eval', 'Split-train']


In [14]:
train_stats_dir = os.path.join(artifact_uri, "Split-train")
print("Contenido de Split-train:", os.listdir(train_stats_dir))


Contenido de Split-train: ['FeatureStats.pb']


In [15]:
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import statistics_pb2

#Ruta del archivo de estadísticas en Split-train
stats_path = os.path.join(artifact_uri, "Split-train", "FeatureStats.pb")

#Verificar si el archivo existe antes de cargarlo
if os.path.exists(stats_path):
    #Crear un objeto ProtoBuf para almacenar las estadísticas
    stats_proto = statistics_pb2.DatasetFeatureStatisticsList()

    #Leer el archivo como un ProtoBuf binario
    with open(stats_path, "rb") as f:
        stats_proto.ParseFromString(f.read())

    #Mostrar las estadísticas cargadas
    #print(stats_proto)
else:
    print(f"El archivo {stats_path} no existe. Verifica el directorio de estadísticas.")

import tensorflow_data_validation as tfdv
from tensorflow_metadata.proto.v0 import statistics_pb2

# Convertir el ProtoBuf cargado en el formato correcto para visualización
stats_dataset = statistics_pb2.DatasetFeatureStatisticsList()
stats_dataset.datasets.extend([stats_proto.datasets[0]])

# Visualizar las estadísticas con FACETS
tfdv.visualize_statistics(stats_dataset)



In [16]:
from tfx.components import SchemaGen

#Inferir Esquema
# Crear componente SchemaGen utilizando la salida de StatisticsGen
schema_gen = SchemaGen(
    statistics=statistics_gen.outputs['statistics']
)

# Ejecutar SchemaGen en el contexto interactivo
context.run(schema_gen)

# Obtener la URI del esquema generado
schema_uri = schema_gen.outputs['schema'].get()[0].uri
print(f"Esquema generado en: {schema_uri}")

# Verificar el contenido del directorio del esquema
print("Contenido de SchemaGen:", os.listdir(schema_uri))

context.show(schema_gen.outputs['schema'])



Esquema generado en: /app/tfx_pipeline/SchemaGen/schema/3
Contenido de SchemaGen: ['schema.pbtxt']


Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Aspect',INT,required,,-
'Cover_Type',INT,required,,-
'Elevation',INT,required,,-
'Hillshade_3pm',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Soil_Type',"'C2702', 'C2703', 'C2704', 'C2705', 'C2706', 'C2717', 'C3501', 'C3502', 'C4201', 'C4703', 'C4704', 'C4744', 'C4758', 'C5101', 'C6101', 'C6102', 'C6731', 'C7101', 'C7102', 'C7103', 'C7201', 'C7202', 'C7700', 'C7701', 'C7702', 'C7709', 'C7710', 'C7745', 'C7746', 'C7755', 'C7756', 'C7757', 'C7790', 'C8703', 'C8707', 'C8708', 'C8771', 'C8772', 'C8776', 'C5151'"
'Wilderness_Area',"'Cache', 'Commanche', 'Neota', 'Rawah'"


In [17]:
import tensorflow_data_validation as tfdv 

In [18]:
from tensorflow_metadata.proto.v0 import schema_pb2

#CURANDO EL ESQUEMA

#Definir rangos para las caracteristicas Hillshade_9am,Hillshade_Noon, Slope, Cover_Type

# Cargar el esquema inferido desde el directorio generado por SchemaGen
schema = tfdv.load_schema_text(os.path.join(schema_uri, "schema.pbtxt"))

# Definir valores aceptables
feature_domains = {
    "Hillshade_9am": (0, 255),
    "Hillshade_Noon": (0, 255),
    "Slope": (0, 90),
    "Cover_Type": (0, 6)
}

# Aplicar restricciones de dominio a las características
for feature_name, (min_val, max_val) in feature_domains.items():
    tfdv.set_domain(schema, feature_name, 
                    schema_pb2.IntDomain(min=min_val, max=max_val))

# Marcar "Cover_Type" como una variable categórica
cover_type_feature = next(f for f in schema.feature if f.name == "Cover_Type")
cover_type_feature.int_domain.is_categorical = True

# Guardar el esquema actualizado
updated_schema_dir = os.path.join(os.path.dirname(schema_uri), "updated_schema")
os.makedirs(updated_schema_dir, exist_ok=True)  # Crear el directorio si no existe
updated_schema_path = os.path.join(updated_schema_dir, "updated_schema.pbtxt")

tfdv.write_schema_text(schema, updated_schema_path)

print(f"Esquema actualizado y guardado en: {updated_schema_path}")


Esquema actualizado y guardado en: /app/tfx_pipeline/SchemaGen/schema/updated_schema/updated_schema.pbtxt


In [19]:
#ENTORNOS DE ESQUEMA

#Simular un conjunto de datos de inferencia o serving, eliminando la columna Cover_Type.

# Definir directorio para datos de inferencia
INFERENCE_DATA_DIR = os.path.abspath("./data/covertype/inference")

# Crear carpeta si no existe
os.makedirs(INFERENCE_DATA_DIR, exist_ok=True)

# Cargar los datos de entrenamiento
data_path = os.path.join(_data_root, 'covertype_train.csv')
df = pd.read_csv(data_path)

# Crear un subconjunto para inferencia eliminando "Cover_Type"
df_inference = df.drop(columns=["Cover_Type"])

# Guardar el archivo de inferencia
inference_data_path = os.path.join(INFERENCE_DATA_DIR, "covertype_inference.csv")
df_inference.to_csv(inference_data_path, index=False)

print(f"Conjunto de datos de inferencia guardado en: {inference_data_path}")


Conjunto de datos de inferencia guardado en: /app/data/covertype/inference/covertype_inference.csv


In [20]:
#Ingesta del conjunto de inferencia

from tfx.components import CsvExampleGen

#Crear el componente ExampleGen para datos de inferencia
inference_example_gen = CsvExampleGen(input_base=INFERENCE_DATA_DIR)

#Ejecutar ExampleGen en el contexto de TFX
context.run(inference_example_gen)



0,1
.execution_id,4
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7fe9618a7430.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']/app/data/covertype/inference['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:6173042,xor_checksum:1740702724,sum_checksum:1740702724"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/app/data/covertype/inference['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['output_file_format']5['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:6173042,xor_checksum:1740702724,sum_checksum:1740702724"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/app/tfx_pipeline/CsvExampleGen/examples/4
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/app/data/covertype/inference
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['output_file_format'],5
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:6173042,xor_checksum:1740702724,sum_checksum:1740702724"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/app/tfx_pipeline/CsvExampleGen/examples/4
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


In [21]:
# Generar estadísticas para los datos de inferencia
statistics_gen_inference = StatisticsGen(
    examples=inference_example_gen.outputs['examples']
)

#Ejecutar StatisticsGen en el contexto de TFX
context.run(statistics_gen_inference)

0,1
.execution_id,5
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } StatisticsGen at 0x7fe960c3f970.inputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0.outputs['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fe960bddaf0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""].exec_properties['stats_options_json']None['exclude_splits'][]"
.component.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"
.component.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fe960bddaf0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
.inputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"
.outputs,"['statistics'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fe960bddaf0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"
.exec_properties,['stats_options_json']None['exclude_splits'][]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/app/tfx_pipeline/CsvExampleGen/examples/4
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fe960bddaf0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/app/tfx_pipeline/StatisticsGen/statistics/5
.span,0
.split_names,"[""train"", ""eval""]"

0,1
['stats_options_json'],
['exclude_splits'],[]

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7fe96341bd90.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /app/tfx_pipeline/CsvExampleGen/examples/4) at 0x7fe9738c0b20.type<class 'tfx.types.standard_artifacts.Examples'>.uri/app/tfx_pipeline/CsvExampleGen/examples/4.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/app/tfx_pipeline/CsvExampleGen/examples/4
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['statistics'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'ExampleStatistics' (1 artifact) at 0x7fe960bddaf0.type_nameExampleStatistics._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
.type_name,ExampleStatistics
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'ExampleStatistics' (uri: /app/tfx_pipeline/StatisticsGen/statistics/5) at 0x7fe960c1a670.type<class 'tfx.types.standard_artifacts.ExampleStatistics'>.uri/app/tfx_pipeline/StatisticsGen/statistics/5.span0.split_names[""train"", ""eval""]"

0,1
.type,<class 'tfx.types.standard_artifacts.ExampleStatistics'>
.uri,/app/tfx_pipeline/StatisticsGen/statistics/5
.span,0
.split_names,"[""train"", ""eval""]"


In [22]:
import os
import tensorflow_data_validation as tfdv

# Ruta del esquema curado previamente generado por SchemaGen
updated_schema_path = "/app/tfx_pipeline/SchemaGen/schema/updated_schema/updated_schema.pbtxt"

# Verificar que el esquema corregido existe antes de cargarlo
if not os.path.exists(updated_schema_path):
    raise FileNotFoundError(f"ERROR: No se encontró el esquema curado en {updated_schema_path}")

# Cargar el esquema corregido
schema = tfdv.load_schema_text(updated_schema_path)

# Configurar los entornos de esquema: TRAINING y SERVING
schema.default_environment.append("TRAINING")
schema.default_environment.append("SERVING")

# Hacer que "Cover_Type" sea obligatoria en TRAINING pero opcional en SERVING
try:
    cover_type_feature = next(f for f in schema.feature if f.name == "Cover_Type")
    cover_type_feature.not_in_environment.append("SERVING")
except StopIteration:
    print("⚠️ Advertencia: La característica 'Cover_Type' no se encontró en el esquema.")

# Definir el directorio donde se guardará el esquema con entornos
PIPELINE_ROOT = "/app/tfx_pipeline"
final_schema_dir = os.path.join(PIPELINE_ROOT, "SchemaGen/schema/final_schema")
os.makedirs(final_schema_dir, exist_ok=True)  # Crear directorio si no existe
final_schema_path = os.path.join(final_schema_dir, "final_schema.pbtxt")

# Guardar el esquema actualizado con entornos de esquema
tfdv.write_schema_text(schema, final_schema_path)

print(f"Esquema con entornos actualizado y guardado en: {final_schema_path}")


Esquema con entornos actualizado y guardado en: /app/tfx_pipeline/SchemaGen/schema/final_schema/final_schema.pbtxt


In [23]:
from tfx.components import StatisticsGen
from tfx.v1.components import ImportSchemaGen
from tfx.v1.components import ExampleValidator
from tfx.v1.types import standard_artifacts
import tensorflow_data_validation as tfdv
import os

# Ruta del esquema con entornos actualizado
final_schema_path = "/app/tfx_pipeline/SchemaGen/schema/final_schema/final_schema.pbtxt"

# Verificar que el esquema final existe antes de usarlo
if not os.path.exists(final_schema_path):
    raise FileNotFoundError(f"ERROR: No se encontró el esquema final en {final_schema_path}")

# Crear `ImportSchemaGen` para registrar el esquema en el pipeline
schema_importer = ImportSchemaGen(
    schema_file=final_schema_path  # Pasamos el esquema corregido con entornos
)

#Ejecutar `ImportSchemaGen` para registrar el esquema en TFX
context.run(schema_importer)

# Usar el esquema registrado
example_validator = ExampleValidator(
    statistics=statistics_gen_inference.outputs['statistics'],
    schema=schema_importer.outputs['schema']  # Usamos la salida de `ImportSchemaGen`
)

# Ejecutar `ExampleValidator` en el contexto de TFX
context.run(example_validator)

print("Validación completada. Se compararon los datos de inferencia con el esquema.")

# Obtener la URI de las anomalías detectadas
anomalies_uri = example_validator.outputs['anomalies'].get()[0].uri
print(f"Resultados de validación guardados en: {anomalies_uri}")

# Cargar y visualizar anomalías
from tensorflow_metadata.proto.v0 import anomalies_pb2

anomalies_path = os.path.join(anomalies_uri, "Split-inference", "Anomalies.pb")
if os.path.exists(anomalies_path):
    anomalies_proto = anomalies_pb2.Anomalies()
    with open(anomalies_path, "rb") as f:
        anomalies_proto.ParseFromString(f.read())

    # Visualizar anomalías detectadas
    tfdv.display_anomalies(anomalies_proto)
else:
    print(f"No se encontraron anomalías en {anomalies_path}.")

Validación completada. Se compararon los datos de inferencia con el esquema.
Resultados de validación guardados en: /app/tfx_pipeline/ExampleValidator/anomalies/7
No se encontraron anomalías en /app/tfx_pipeline/ExampleValidator/anomalies/7/Split-inference/Anomalies.pb.


In [24]:
#Validar el esquema mostrando los cambios introducidos

#Ruta del esquema final
final_schema_path = final_schema_path = "/app/tfx_pipeline/SchemaGen/schema/final_schema/final_schema.pbtxt"


#Cargar el esquema final
schema = tfdv.load_schema_text(final_schema_path)

#Mostrar el esquema con los cambios
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Aspect',INT,required,,-
'Cover_Type',INT,required,,min: 0; max: 6
'Elevation',INT,required,,-
'Hillshade_3pm',INT,required,,-
'Hillshade_9am',INT,required,,min: 0; max: 255
'Hillshade_Noon',INT,required,,min: 0; max: 255
'Horizontal_Distance_To_Fire_Points',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Slope',INT,required,,min: 0; max: 90


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Soil_Type',"'C2702', 'C2703', 'C2704', 'C2705', 'C2706', 'C2717', 'C3501', 'C3502', 'C4201', 'C4703', 'C4704', 'C4744', 'C4758', 'C5101', 'C6101', 'C6102', 'C6731', 'C7101', 'C7102', 'C7103', 'C7201', 'C7202', 'C7700', 'C7701', 'C7702', 'C7709', 'C7710', 'C7745', 'C7746', 'C7755', 'C7756', 'C7757', 'C7790', 'C8703', 'C8707', 'C8708', 'C8771', 'C8772', 'C8776', 'C5151'"
'Wilderness_Area',"'Cache', 'Commanche', 'Neota', 'Rawah'"


In [25]:
#Mostrar los entornos

print(f"\Entornos definidos en el esquema: {schema.default_environment}")

\Entornos definidos en el esquema: ['TRAINING', 'SERVING']


In [26]:
#Generar nuevas estadisticas

#Importar el esquema con importSchemaGen

from tfx.v1.components import ImportSchemaGen
import os

# Ruta del esquema final con entornos TRAINING y SERVING
final_schema_path = "/app/tfx_pipeline/SchemaGen/schema/final_schema/final_schema.pbtxt"

# Crear el componente ImportSchemaGen para importar el esquema final
import_schema_gen = ImportSchemaGen(schema_file=final_schema_path)

# Ejecutar ImportSchemaGen en el contexto de TFX
context.run(import_schema_gen)

# Confirmar la importación
imported_schema_uri = import_schema_gen.outputs['schema'].get()[0].uri
print(f"Esquema importado y registrado en los metadatos de ML en: {imported_schema_uri}")



Esquema importado y registrado en los metadatos de ML en: /app/tfx_pipeline/ImportSchemaGen/schema/6


In [27]:
from tfx.components import StatisticsGen

# Verificar que el esquema importado está disponible en los metadatos
imported_schema_uri = import_schema_gen.outputs['schema'].get()[0].uri
if not os.path.exists(imported_schema_uri):
    raise FileNotFoundError(f"ERROR: No se encontró el esquema importado en {imported_schema_uri}. Verifica que se importó correctamente.")

# Usar StatisticsGen con el esquema curado para generar estadísticas
statistics_gen = StatisticsGen(
    examples=example_gen.outputs['examples'],  # Datos de entrenamiento
    schema=import_schema_gen.outputs['schema']  # Esquema curado
)

# Ejecutar StatisticsGen en el contexto de TFX
context.run(statistics_gen)

# Confirmar la ejecución y mostrar la URI de las estadísticas generadas
statistics_uri = statistics_gen.outputs['statistics'].get()[0].uri
print(f"Estadísticas generadas y guardadas en: {statistics_uri}")


Estadísticas generadas y guardadas en: /app/tfx_pipeline/StatisticsGen/statistics/9


In [28]:
# Ruta del directorio donde se guardaron las estadísticas
statistics_dir = statistics_uri

# Ruta del archivo de estadísticas en Split-train
stats_path = os.path.join(statistics_dir, "Split-train", "FeatureStats.pb")

# Verificar si el archivo existe antes de cargarlo
if os.path.exists(stats_path):
    # Crear un objeto ProtoBuf para almacenar las estadísticas
    stats_proto = statistics_pb2.DatasetFeatureStatisticsList()

    # Leer el archivo como un ProtoBuf binario
    with open(stats_path, "rb") as f:
        stats_proto.ParseFromString(f.read())

    # Convertir el ProtoBuf cargado en el formato correcto para visualización
    stats_dataset = statistics_pb2.DatasetFeatureStatisticsList()
    stats_dataset.datasets.extend([stats_proto.datasets[0]])

    #Visualizar las estadísticas con FACETS
    tfdv.visualize_statistics(stats_dataset)

    print("Estadísticas cargadas y visualizadas correctamente.")

else:
    print(f"ERROR: El archivo {stats_path} no existe. Verifica el directorio de estadísticas.")



Estadísticas cargadas y visualizadas correctamente.


In [29]:
from tfx.components import ExampleValidator

# Crear el componente ExampleValidator para detectar anomalías
example_validator = ExampleValidator(
    statistics=statistics_gen.outputs['statistics'],  # Estadísticas generadas
    schema=import_schema_gen.outputs['schema']  # Esquema corregido e importado
)

# Ejecutar ExampleValidator
context.run(example_validator)

# Obtener la URI de las anomalías detectadas
anomalies_uri = example_validator.outputs['anomalies'].get()[0].uri
print(f"Análisis de anomalías completado. Resultados guardados en: {anomalies_uri}")


Análisis de anomalías completado. Resultados guardados en: /app/tfx_pipeline/ExampleValidator/anomalies/10


In [30]:
#INGENIERIA DE CARACTERISTICAS

#Declaración de constantes

import tensorflow_transform as tft
from tfx.components import Transform


In [None]:
#La función de preprocesameinto se guarda en un archivo .py en la misma ruta que este notebook para que peuda ser leido


In [31]:
import os

# Definir la ruta del archivo en la misma carpeta que el notebook
module_file = os.path.abspath("preprocessing.py") 

# Verificar que el archivo existe
if not os.path.exists(module_file):
    raise FileNotFoundError(f"ERROR: No se encontró el archivo en {module_file}")

print(f"Archivo de preprocesamiento encontrado en: {module_file}")

Archivo de preprocesamiento encontrado en: /app/preprocessing.py


In [32]:
from tfx.v1.components import Transform

# Crear el componente Transform
transform = Transform(
    examples=example_gen.outputs['examples'],  # Datos de entrada
    schema=import_schema_gen.outputs['schema'],  # Esquema curado
    module_file="/app/preprocessing.py" 
)

# Ejecutar Transform en el contexto de TFX
context.run(transform)

# Obtener la URI de los datos transformados
transform_graph_uri = transform.outputs['transform_graph'].get()[0].uri
transformed_examples_uri = transform.outputs['transformed_examples'].get()[0].uri

print(f"Transformación completada.")
print(f"Transform Graph guardado en: {transform_graph_uri}")
print(f"Ejemplos transformados guardados en: {transformed_examples_uri}")


running bdist_wheel
running build
running build_py
creating build
creating build/lib
copying preprocessing.py -> build/lib
copying util.py -> build/lib
installing to /tmp/tmpde49bhhr
running install
running install_lib
copying build/lib/preprocessing.py -> /tmp/tmpde49bhhr
copying build/lib/util.py -> /tmp/tmpde49bhhr
running install_egg_info
running egg_info
creating tfx_user_code_Transform.egg-info
writing tfx_user_code_Transform.egg-info/PKG-INFO
writing dependency_links to tfx_user_code_Transform.egg-info/dependency_links.txt
writing top-level names to tfx_user_code_Transform.egg-info/top_level.txt
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
reading manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
writing manifest file 'tfx_user_code_Transform.egg-info/SOURCES.txt'
Copying tfx_user_code_Transform.egg-info to /tmp/tmpde49bhhr/tfx_user_code_Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd-py3.9.egg-info
running instal



Processing ./tfx_pipeline/_wheels/tfx_user_code_Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd-py3-none-any.whl
Installing collected packages: tfx-user-code-Transform
Successfully installed tfx-user-code-Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd
Processing ./tfx_pipeline/_wheels/tfx_user_code_Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd-py3-none-any.whl
Installing collected packages: tfx-user-code-Transform
Successfully installed tfx-user-code-Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd
Processing ./tfx_pipeline/_wheels/tfx_user_code_Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd-py3-none-any.whl
Installing collected packages: tfx-user-code-Transform
Successfully installed tfx-user-code-Transform-0.0+b062e1aede7334124d9cb2707f5d6d1869e582588fae284fbf1755c7f6e465cd
INFO:tensorflow:Assets written to: /app/tfx_pipeline/Tr

INFO:tensorflow:Assets written to: /app/tfx_pipeline/Transform/transform_graph/11/.temp_path/tftransform_tmp/25e61b3e414b4ea48010cbc62d52e399/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:Assets written to: /app/tfx_pipeline/Transform/transform_graph/11/.temp_path/tftransform_tmp/4a0cea9d6638449c957f3d10e55c5930/assets


INFO:tensorflow:Assets written to: /app/tfx_pipeline/Transform/transform_graph/11/.temp_path/tftransform_tmp/4a0cea9d6638449c957f3d10e55c5930/assets


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:struct2tensor is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_decision_forests is not available.


INFO:tensorflow:tensorflow_text is not available.


INFO:tensorflow:tensorflow_text is not available.


Transformación completada.
Transform Graph guardado en: /app/tfx_pipeline/Transform/transform_graph/11
Ejemplos transformados guardados en: /app/tfx_pipeline/Transform/transformed_examples/11


In [33]:
# Ruta del Transform Graph generado
transform_graph_uri = transform_graph_uri

# Cargar el Transform Graph
transform_graph = tft.TFTransformOutput(transform_graph_uri)

# Obtener la ruta del modelo de transformación guardado (CORREGIDO)
saved_model_dir = transform_graph.transform_savedmodel_dir

# Mostrar las operaciones en el modelo de transformación
print(f"Transform Graph cargado desde: {saved_model_dir}")
print(f"Operaciones en el modelo de transformación:")

# Cargar el modelo de transformación
model = tf.saved_model.load(saved_model_dir)

# Visualizar las transformaciones aplicadas
print(model.signatures)



Transform Graph cargado desde: /app/tfx_pipeline/Transform/transform_graph/11/transform_fn
Operaciones en el modelo de transformación:
_SignatureMap({'serving_default': <ConcreteFunction (*, inputs: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs'), inputs_1: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_1'), inputs_10: TensorSpec(shape=(None, 1), dtype=tf.string, name='inputs_10'), inputs_11: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_11'), inputs_12: TensorSpec(shape=(None, 1), dtype=tf.string, name='inputs_12'), inputs_2: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_2'), inputs_3: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_3'), inputs_4: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_4'), inputs_5: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_5'), inputs_6: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_6'), inputs_7: TensorSpec(shape=(None, 1), dtype=tf.int64, name='inputs_7'), inputs_8: Ten

In [34]:
print(f"Ruta de los ejemplos transformados: {transformed_examples_uri}")


Ruta de los ejemplos transformados: /app/tfx_pipeline/Transform/transformed_examples/11


In [35]:
import os

# Verificar qué archivos hay en la carpeta de datos transformados
if os.path.exists(transformed_examples_uri):
    print(f"Contenido en {transformed_examples_uri}:")
    print(os.listdir(transformed_examples_uri))
else:
    print(f"ERROR: No se encontró la carpeta {transformed_examples_uri}. Verifica que Transform se ejecutó correctamente.")


Contenido en /app/tfx_pipeline/Transform/transformed_examples/11:
['Split-eval', 'Split-train']


In [36]:
import os

# Ruta de los datos transformados
transformed_statistics_dir = transformed_examples_uri

# Listar archivos en la carpeta
if os.path.exists(transformed_statistics_dir):
    print(f"Contenido en {transformed_statistics_dir}:")
    print(os.listdir(transformed_statistics_dir))
else:
    print(f"ERROR: No se encontró la carpeta {transformed_statistics_dir}. Verifica que Transform se ejecutó correctamente.")



Contenido en /app/tfx_pipeline/Transform/transformed_examples/11:
['Split-eval', 'Split-train']


In [37]:

from tensorflow.train import Example

# Ruta del archivo transformado en formato TFRecord comprimido (.gz)
transformed_data_path = "/app/tfx_pipeline/Transform/transformed_examples/11/Split-train"
transformed_files = tf.io.gfile.glob(os.path.join(transformed_data_path, "*.gz"))  # Buscar archivos .gz

# Verificar que hay archivos transformados
if not transformed_files:
    raise FileNotFoundError(f"ERROR: No se encontraron archivos transformados en {transformed_data_path}")

# Cargar el dataset transformado
raw_dataset = tf.data.TFRecordDataset(transformed_files, compression_type="GZIP")  # Especificamos que está comprimido

print("Inspeccionando ejemplos transformados...\n")

# Mostrar los primeros 5 ejemplos transformados
for raw_record in raw_dataset.take(5):
    example = Example()
    example.ParseFromString(raw_record.numpy())  # Decodificar el TFRecord
    print(example)
    print("\n" + "-"*80 + "\n")  # Separador entre ejemplos


Inspeccionando ejemplos transformados...

features {
  feature {
    key: "Aspect"
    value {
      float_list {
        value: 0.008333333767950535
      }
    }
  }
  feature {
    key: "Cover_Type"
    value {
      int64_list {
        value: 1
      }
    }
  }
  feature {
    key: "Elevation"
    value {
      float_list {
        value: 0.5072901248931885
      }
    }
  }
  feature {
    key: "Hillshade_3pm"
    value {
      float_list {
        value: 0.5691699385643005
      }
    }
  }
  feature {
    key: "Hillshade_9am"
    value {
      float_list {
        value: 0.7559055089950562
      }
    }
  }
  feature {
    key: "Hillshade_Noon"
    value {
      float_list {
        value: 0.7952755689620972
      }
    }
  }
  feature {
    key: "Horizontal_Distance_To_Fire_Points"
    value {
      float_list {
        value: 0.2172154039144516
      }
    }
  }
  feature {
    key: "Horizontal_Distance_To_Hydrology"
    value {
      float_list {
        value: 0.3540146052

2025-02-28 00:37:21.771417: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [38]:
#Metadatos de aprendizaje automatico
from ml_metadata.metadata_store import metadata_store
from ml_metadata.proto import metadata_store_pb2
from ml_metadata.proto import metadata_store_service_pb2

# Definir la configuración del almacén de metadatos
metadata_config = metadata_store_pb2.ConnectionConfig()
metadata_config.sqlite.filename_uri = METADATA_PATH  # Usar la misma ruta de metadatos definida antes
metadata_config.sqlite.connection_mode = metadata_store_pb2.SqliteMetadataSourceConfig.READWRITE

# Crear una instancia del almacén de metadatos
metadata_store = metadata_store.MetadataStore(metadata_config)

print("Conexión al almacén de metadatos establecida correctamente.")



Conexión al almacén de metadatos establecida correctamente.


In [39]:
print(METADATA_PATH)


/app/tfx_pipeline/metadata.db


In [40]:
#Listar todos los tipos de artefactos
from ml_metadata.proto import metadata_store_pb2

# Obtener todos los tipos de artefactos registrados
artifact_types = metadata_store.get_artifact_types()

# Mostrar todos los tipos de artefactos registrados
for artifact_type in artifact_types:
    print(f"ID: {artifact_type.id}, Nombre: {artifact_type.name}, Propiedades: {artifact_type.properties}")


ID: 14, Nombre: Examples, Propiedades: {'version': 1, 'split_names': 3, 'span': 1}
ID: 16, Nombre: ExampleStatistics, Propiedades: {'split_names': 3, 'span': 1}
ID: 18, Nombre: Schema, Propiedades: {}
ID: 21, Nombre: ExampleAnomalies, Propiedades: {'span': 1, 'split_names': 3}
ID: 23, Nombre: TransformGraph, Propiedades: {}
ID: 24, Nombre: TransformCache, Propiedades: {}


In [41]:
#lista de artefactos para schema

schema_type = metadata_store.get_artifact_type(type_name="Schema")

# Mostrar información del tipo de artefacto
print(f"ID: {schema_type.id}")
print(f"Nombre: {schema_type.name}")
print(f"Propiedades: {schema_type.properties}")



ID: 18
Nombre: Schema
Propiedades: {}


In [42]:
# Obtener todos los artefactos de tipo "Schema"
schemas = metadata_store.get_artifacts_by_type("Schema")

# Mostrar la información de los esquemas
for schema in schemas:
    print(f"ID: {schema.id}")
    print(f"URI: {schema.uri}")
    print(f"Creado en: {schema.create_time_since_epoch}")
    print(f"Propiedades: {schema.properties}")
    print("-" * 40)

ID: 3
URI: /app/tfx_pipeline/SchemaGen/schema/3
Creado en: 1740702709387
Propiedades: {}
----------------------------------------
ID: 6
URI: /app/tfx_pipeline/ImportSchemaGen/schema/6
Creado en: 1740702827031
Propiedades: {}
----------------------------------------
ID: 13
URI: /app/tfx_pipeline/Transform/pre_transform_schema/11
Creado en: 1740702973134
Propiedades: {}
----------------------------------------
ID: 15
URI: /app/tfx_pipeline/Transform/post_transform_schema/11
Creado en: 1740702973135
Propiedades: {}
----------------------------------------


In [43]:
#Obtener propiedades de artefacto (ID=15)
artifact_id = 15  
artifact_type = "Schema" 

# Obtener artefactos solo del tipo específico
artifacts = metadata_store.get_artifacts_by_type(artifact_type)

# Filtrar por ID
artifact = next((a for a in artifacts if a.id == artifact_id), None)

if artifact:
    print(f"Artefacto encontrado: {artifact}")
else:
    print(f"No se encontró un artefacto con ID {artifact_id} del tipo {artifact_type}")


Artefacto encontrado: id: 15
type_id: 18
uri: "/app/tfx_pipeline/Transform/post_transform_schema/11"
custom_properties {
  key: "name"
  value {
    string_value: "post_transform_schema:2025-02-28T00:36:11.871002"
  }
}
custom_properties {
  key: "producer_component"
  value {
    string_value: "Transform"
  }
}
custom_properties {
  key: "tfx_version"
  value {
    string_value: "1.16.0"
  }
}
state: LIVE
name: "post_transform_schema:2025-02-28T00:36:11.871002"
type: "Schema"
create_time_since_epoch: 1740702973135
last_update_time_since_epoch: 1740703015092



In [44]:
# Verificar si el artefacto tiene custom_properties
if artifact.custom_properties:
    print("\n📌 Propiedades personalizadas del artefacto:")
    for key, value in artifact.custom_properties.items():
        if value.HasField("int_value"):
            print(f"{key}: {value.int_value}")
        elif value.HasField("double_value"):
            print(f"{key}: {value.double_value}")
        elif value.HasField("string_value"):
            print(f"{key}: {value.string_value}")
else:
    print("No hay propiedades personalizadas en este artefacto.")



📌 Propiedades personalizadas del artefacto:
tfx_version: 1.16.0
name: post_transform_schema:2025-02-28T00:36:11.871002
producer_component: Transform


In [45]:
# Estado del artefacto
artifact_state = artifact.state

# Producer Component 
producer_component = artifact.custom_properties["producer_component"].string_value \
    if "producer_component" in artifact.custom_properties else "N/A"

# Mostrar la información
print(f"Estado del artefacto: {artifact_state}")
print(f"Componente que lo generó: {producer_component}")


Estado del artefacto: 2
Componente que lo generó: Transform


In [46]:
# Obtener todos los artefactos del tipo "ExampleStatistics"
example_statistics_artifacts = metadata_store.get_artifacts_by_type("ExampleStatistics")

# Revisar cada artefacto y extraer los split_names
for artifact in example_statistics_artifacts:
    print(f"\Artefacto ID: {artifact.id}")

    # Extraer los split_names desde custom_properties
    if "split_names" in artifact.custom_properties:
        split_names = artifact.custom_properties["split_names"].string_value
        print(f"Split Names: {split_names}")
    else:
        print("No se encontraron split_names en este artefacto.")


\Artefacto ID: 2
No se encontraron split_names en este artefacto.
\Artefacto ID: 5
No se encontraron split_names en este artefacto.
\Artefacto ID: 8
No se encontraron split_names en este artefacto.
\Artefacto ID: 14
No se encontraron split_names en este artefacto.
\Artefacto ID: 16
No se encontraron split_names en este artefacto.


In [47]:
# Obtener todos los artefactos de tipo "TransformGraph"
graphs = metadata_store.get_artifacts_by_type("TransformGraph")

# Mostrar la información de los esquemas
for TransformGraph in graphs:
    print(f"ID: {TransformGraph.id}")
    print(f"URI: {TransformGraph.uri}")


ID: 10
URI: /app/tfx_pipeline/Transform/transform_graph/11


In [48]:
#Crear una función para devolver artefaactos de transformGraph
from ml_metadata.proto import metadata_store_pb2

# ID del artefacto `TransformGraph` que queremos analizar
transform_graph_id = 10  

# Obtener los eventos asociados al artefacto `TransformGraph`
events = metadata_store.get_events_by_artifact_ids([transform_graph_id])

# Obtener la ejecución que generó este artefacto
execution_id = None
for event in events:
    if event.type == metadata_store_pb2.Event.OUTPUT:  # La ejecución que lo generó
        execution_id = event.execution_id
        break

if execution_id:
    print(f"✅ Artefacto TransformGraph fue generado por la ejecución ID: {execution_id}")

    # Obtener los eventos de entrada de esa ejecución
    execution_events = metadata_store.get_events_by_execution_ids([execution_id])

    # Filtrar solo los artefactos de entrada
    input_artifact_ids = [event.artifact_id for event in execution_events if event.type == metadata_store_pb2.Event.INPUT]

    print("\Artefactos principales utilizados para generar TransformGraph:")
    for artifact_id in input_artifact_ids:
        artifact = metadata_store.get_artifacts_by_id([artifact_id])[0]
        print(f"- ID: {artifact.id}, Tipo: {artifact.type_id}, URI: {artifact.uri}")
else:
    print("No se encontró la ejecución que generó este TransformGraph.")



✅ Artefacto TransformGraph fue generado por la ejecución ID: 11
\Artefactos principales utilizados para generar TransformGraph:
- ID: 1, Tipo: 14, URI: /app/tfx_pipeline/CsvExampleGen/examples/1
- ID: 6, Tipo: 18, URI: /app/tfx_pipeline/ImportSchemaGen/schema/6


In [49]:
#Rastree las entradas de un artefacto en particular.

from ml_metadata.proto import metadata_store_pb2

# ID del artefacto 
artifact_id = 10  

# Obtener los eventos asociados al artefacto
events = metadata_store.get_events_by_artifact_ids([artifact_id])

# Encontrar la ejecución que generó el artefacto
execution_id = None
for event in events:
    if event.type == metadata_store_pb2.Event.OUTPUT:  #
        execution_id = event.execution_id
        break

if execution_id:
    print(f"Artefacto ID {artifact_id} fue generado por la ejecución ID: {execution_id}")

    # Obtener los eventos de entrada de esa ejecución
    execution_events = metadata_store.get_events_by_execution_ids([execution_id])

    # Filtrar los artefactos que sirvieron de entrada
    input_artifact_ids = [event.artifact_id for event in execution_events if event.type == metadata_store_pb2.Event.INPUT]

    print("\Artefactos de entrada utilizados para generar este artefacto:")
    for input_id in input_artifact_ids:
        input_artifact = metadata_store.get_artifacts_by_id([input_id])[0]
        print(f"- ID: {input_artifact.id}, Tipo ID: {input_artifact.type_id}, URI: {input_artifact.uri}")
else:
    print("No se encontró la ejecución que generó este artefacto.")



Artefacto ID 10 fue generado por la ejecución ID: 11
\Artefactos de entrada utilizados para generar este artefacto:
- ID: 1, Tipo ID: 14, URI: /app/tfx_pipeline/CsvExampleGen/examples/1
- ID: 6, Tipo ID: 18, URI: /app/tfx_pipeline/ImportSchemaGen/schema/6
