In [7]:
import boto3
import pandas as pd
import great_expectations as gx
from pathlib import Path
import tempfile
import os

In [8]:
# MinIO configuration.
# Endpoint and credentials are read from environment variables so that real
# secrets never have to be stored in the notebook; the fallback values are the
# local-development defaults this notebook originally used.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "admin123")
# Bucket and object key of the parquet file that will be validated.
BUCKET_NAME = "trusted-zone"
OBJECT_KEY = "storage/social_media/bluesky/posts_clean.parquet"

In [9]:
# Local destination for the downloaded .parquet file:
# <system temp directory>/posts_clean.parquet
parquet_file_name = "posts_clean.parquet"
temp_dir = tempfile.gettempdir()
local_file_path = os.path.join(temp_dir, parquet_file_name)

In [10]:
# MinIO client (MinIO speaks the S3 API, so the regular boto3 S3 client is used
# with a custom endpoint URL and the MinIO credentials).
minio_url = "http://" + MINIO_ENDPOINT
s3 = boto3.client(
    service_name="s3",
    endpoint_url=minio_url,
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
)

In [19]:
# Print the key of every object stored under the Bluesky prefix.
# A single list_objects_v2 call returns at most 1000 keys, so a paginator is
# used to walk through all result pages instead of silently truncating the
# listing. `response` is kept as the loop variable so the name stays bound
# (to the last page) for any later cell that still reads it.
paginator = s3.get_paginator("list_objects_v2")
for response in paginator.paginate(Bucket=BUCKET_NAME, Prefix="storage/social_media/bluesky/"):
    # "Contents" is absent when a page holds no objects.
    for obj in response.get("Contents", []):
        print("File: ", obj["Key"])

File:  storage/social_media/bluesky/failed_log.json
File:  storage/social_media/bluesky/images/catalannews.com_20250606_115521_0.jpg
File:  storage/social_media/bluesky/images/catalannews.com_20250606_151445_0.jpg
File:  storage/social_media/bluesky/images/catalannews.com_20250606_171031_0.jpg
File:  storage/social_media/bluesky/images/catalannews.com_20250607_111911_0.jpg
File:  storage/social_media/bluesky/images/catalannews.com_20250607_131108_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250520_155654_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250520_161808_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250520_163814_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250606_124000_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250606_124150_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250606_140223_0.jpg
File:  storage/social_media/bluesky/images/elpais.com_20250606_141710_0.jpg
File:  stor

  datetime_now = datetime.datetime.utcnow()


In [20]:
# Download the parquet object from MinIO into the local temporary path
s3.download_file(
    Bucket=BUCKET_NAME,
    Key=OBJECT_KEY,
    Filename=local_file_path,
)
print(f" Archivo descargado de MinIO: {local_file_path}")

 Archivo descargado de MinIO: /tmp/posts_clean.parquet


In [21]:
# Read the downloaded parquet file into a pandas DataFrame
df = pd.read_parquet(path=local_file_path)

In [22]:
# Create an ephemeral Great Expectations context, register a pandas data
# source with a DataFrame asset, and fetch a batch covering the whole `df`.
context = gx.get_context(mode="ephemeral")
data_source = context.data_sources.add_pandas(name="minio_source")
data_asset = data_source.add_dataframe_asset(name="posts_clean")
batch_def = data_asset.add_batch_definition_whole_dataframe(name="minio_batch")
batch = batch_def.get_batch(batch_parameters={"dataframe": df})

In [23]:
# Define the expectations, in evaluation order:
#   1. uri / timestamp / text must not contain nulls
#   2. likes must be of type float64
#   3. likes / reposts / replies must be >= 0
not_null_columns = ("uri", "timestamp", "text")
non_negative_columns = ("likes", "reposts", "replies")
expectations = [
    *(
        gx.expectations.ExpectColumnValuesToNotBeNull(column=column_name)
        for column_name in not_null_columns
    ),
    gx.expectations.ExpectColumnValuesToBeOfType(column="likes", type_="float64"),
    *(
        gx.expectations.ExpectColumnValuesToBeBetween(column=column_name, min_value=0)
        for column_name in non_negative_columns
    ),
]

In [29]:
# Run every expectation against the batch, keeping the results in the same
# order as `expectations`
results = list(map(batch.validate, expectations))

Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 529.37it/s]
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 951.66it/s] 
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 1172.82it/s]
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 413.39it/s] 
Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 913.79it/s] 
Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 502.94it/s]
Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 969.24it/s] 


In [27]:
# Show, for each validation result (numbered from 1), its expectation
# configuration, whether it succeeded, and the detailed result payload.
for i, result in enumerate(results, start=1):
    print(
        f"\n Expectation {i}: {result['expectation_config']}",
        f" Success: {result['success']}",
        f" Details: {result['result']}",
        sep="\n",
    )


 Expectation 1: {
  "type": "expect_column_values_to_not_be_null",
  "kwargs": {
    "batch_id": "minio_source-posts_clean",
    "column": "uri"
  },
  "meta": {}
}
 Success: True
 Details: {'element_count': 92, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': [], 'partial_unexpected_counts': [], 'partial_unexpected_index_list': []}

 Expectation 2: {
  "type": "expect_column_values_to_not_be_null",
  "kwargs": {
    "batch_id": "minio_source-posts_clean",
    "column": "timestamp"
  },
  "meta": {}
}
 Success: True
 Details: {'element_count': 92, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'partial_unexpected_list': [], 'partial_unexpected_counts': [], 'partial_unexpected_index_list': []}

 Expectation 3: {
  "type": "expect_column_values_to_not_be_null",
  "kwargs": {
    "batch_id": "minio_source-posts_clean",
    "column": "text"
  },
  "meta": {}
}
 Success: True
 Details: {'element_count': 92, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'p

In [28]:
# (Optional) Remove the temporary file.
# The cell is safe to re-run: if the file is already gone (e.g. the cell was
# executed twice or the download was skipped) it reports that instead of
# raising FileNotFoundError.
try:
    os.remove(local_file_path)
except FileNotFoundError:
    print(" Archivo temporal no encontrado; nada que eliminar.")
else:
    print(" Archivo temporal eliminado.")

 Archivo temporal eliminado.
