<a href="https://colab.research.google.com/github/JuanDiaz77/Proyecto-colab/blob/main/Validaci%C3%B3n_de_datos_con_Great_Expectations_actividad_pr%C3%A1ctica.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =========================================
#  Clean reset and pinned reinstall of the notebook dependencies
# =========================================

# Remove the currently installed copies first, then install pinned versions.
# NOTE(review): great_expectations is installed here, but the validation cell
# further down does not import it — confirm it is still needed.
!pip uninstall -y numpy pandas great_expectations
!pip install numpy==1.26.4 pandas==2.2.2 great_expectations==0.18.12 jinja2==3.1.4

# Restart the kernel so the changes take effect (required in Colab).
import IPython
IPython.Application.instance().kernel.do_shutdown(True)


In [None]:
# Self-contained script that validates a DataFrame (a stand-in for Great Expectations).
# It creates a CSV, checks a set of rules, saves the results as JSON and renders an HTML report.
# Run in Google Colab (or locally).

# Make sure jinja2 is available; it is used to render the HTML report.
!pip install jinja2 --quiet

import pandas as pd
import re
import json
from jinja2 import Template
from pathlib import Path

# -------------------------
# 1) Build the sample dataset
# -------------------------
# One tuple per customer, laid out in the same order as `column_names`.
column_names = ("id", "nombre", "edad", "email", "pais")
customer_rows = [
    (1, "Laura Gómez", 25, "laura.gomez@example.com", "Colombia"),
    (2, "Carlos Pérez", 17, "carlos.perez@example.com", "México"),
    (3, "Ana Torres", 30, None, "Chile"),
    (4, "Juan Díaz", 45, "juan.diaz@example.com", "Argentina"),
    (5, "María López", 22, "maria.lopez@example.com", "Colombia"),
]

# Transpose the rows into a column-name -> list-of-values mapping.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*customer_rows))
}
df = pd.DataFrame(data)

# Write the dataset to disk; `csv_path` is also shown in the HTML report.
csv_path = "clientes.csv"
df.to_csv(csv_path, index=False)
print(f"CSV creado: {csv_path}")
display(df)

# -------------------------
# 2) Define the "expectations" (rules)
# -------------------------
# Every rule is a plain dict with a name, a description, the target column
# and a rule type; some types carry extra parameters (min/max, pattern).
_email_required = {
    "name": "email_not_null",
    "description": "La columna 'email' no debe tener valores nulos",
    "column": "email",
    "type": "not_null",
}
_name_required = {
    "name": "nombre_not_null",
    "description": "La columna 'nombre' no debe tener valores nulos",
    "column": "nombre",
    "type": "not_null",
}
_age_in_range = {
    "name": "edad_between_18_99",
    "description": "La columna 'edad' debe estar entre 18 y 99",
    "column": "edad",
    "type": "between",
    "min": 18,
    "max": 99,
}
_email_format = {
    "name": "email_regex",
    "description": "La columna 'email' debe tener formato de correo válido",
    "column": "email",
    "type": "regex",
    "pattern": r"[^@]+@[^@]+\.[^@]+",
}
_id_unique = {
    "name": "id_unique",
    "description": "La columna 'id' debe ser única",
    "column": "id",
    "type": "unique",
}

# The rules are evaluated in this order.
expectations = [
    _email_required,
    _name_required,
    _age_in_range,
    _email_format,
    _id_unique,
]

# -------------------------
# 3) Run the validations
# -------------------------
# `results` holds a global success flag plus one entry per expectation.
results = {"success": True, "results": []}

for exp in expectations:
    col = exp["column"]
    etype = exp["type"]
    res = {"expectation": exp["name"], "description": exp["description"], "success": None, "details": {}}

    mask = None   # boolean Series flagging the rows that violate the rule
    extra = {}    # rule-specific parameters echoed back in the details
    error = None  # set when the rule cannot be evaluated at all

    if col not in df.columns:
        # Record a failed expectation instead of aborting the whole run with
        # a KeyError, mirroring how unknown rule types are reported.
        error = f"Columna '{col}' no encontrada en el DataFrame"
    elif etype == "not_null":
        mask = df[col].isnull()
    elif etype == "between":
        mn, mx = exp["min"], exp["max"]
        mask = ~df[col].between(mn, mx)
        extra = {"min": mn, "max": mx}
    elif etype == "regex":
        pattern = re.compile(exp["pattern"])
        # fullmatch: the whole value must match the pattern. With match()
        # only a prefix had to match, so "a@b.com@evil" was accepted.
        # Nulls are treated as failures (unless you want to skip them).
        mask = df[col].isnull() | ~df[col].astype(str).apply(lambda v: bool(pattern.fullmatch(v)))
        extra = {"pattern": exp["pattern"]}
    elif etype == "unique":
        # keep=False flags every member of a duplicated group, not just the repeats.
        mask = df[col].duplicated(keep=False)
    else:
        error = "Tipo de expectativa desconocido"

    if error is not None:
        res["success"] = False
        res["details"] = {"error": error}
    else:
        failed_rows = df[mask]
        res["success"] = failed_rows.shape[0] == 0
        res["details"] = {
            "failed_count": int(failed_rows.shape[0]),
            "failed_rows_index": failed_rows.index.tolist(),
            **extra,
        }

    results["results"].append(res)
    # A single failed expectation fails the whole run.
    if not res["success"]:
        results["success"] = False

# Print a one-line summary per expectation.
for r in results["results"]:
    print(f"{'✅' if r['success'] else '❌'} {r['expectation']}: {r['description']} (failed_count={r['details'].get('failed_count',0)})")

# -------------------------
# 4) Save the results as JSON (equivalent to GE's output)
# -------------------------
json_path = "reporte_validacion.json"
# ensure_ascii=False writes accented characters verbatim, so open the file as
# UTF-8 explicitly rather than relying on the platform's default encoding
# (this also matches how the HTML report is written).
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4, ensure_ascii=False)

print(f"\n Resultados guardados en: {json_path}")

# -------------------------
# 5) Generate the HTML report (summary + failed rows)
# -------------------------
# Flatten the failed expectations into one entry per offending row, each
# carrying the expectation name, the row's index label and the row contents.
failed_rows_summary = [
    {
        "expectation": outcome["expectation"],
        "index": int(row_label),
        "row": df.loc[row_label].to_dict(),
    }
    for outcome in results["results"]
    if not outcome["success"]
    for row_label in outcome["details"].get("failed_rows_index", [])
]

# Jinja2 template for the report: a per-expectation summary table followed by
# a table listing every row that failed at least one expectation.
html_template = """
<!doctype html>
<html>
<head>
  <meta charset="utf-8">
  <title>Reporte de Validación - Datos</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 24px; color: #111; }
    h1 { color: #1f618d; }
    table { border-collapse: collapse; width: 100%; margin-bottom: 18px; }
    th, td { border: 1px solid #ddd; padding: 8px; text-align:left; }
    th { background:#f2f2f2; }
    .ok { color: green; font-weight: bold; }
    .fail { color: red; font-weight: bold; }
    .small { font-size: 0.9em; color:#666; }
    .box { padding: 12px; border-radius: 6px; background:#fafafa; border:1px solid #eee; }
  </style>
</head>
<body>
  <h1>Reporte de Validación de Datos</h1>
  <p class="small">Archivo: <strong>{{ csv_path }}</strong></p>
  <div class="box">
    <h2>Resumen</h2>
    <p>Resultado global: <strong class="{{ 'ok' if success else 'fail' }}">{{ 'Éxito' if success else 'Fallo' }}</strong></p>
    <table>
      <thead><tr><th>Expectation</th><th>Descripción</th><th>Resultado</th><th>Failed count</th></tr></thead>
      <tbody>
      {% for r in results %}
        <tr>
          <td>{{ r.expectation }}</td>
          <td>{{ r.description }}</td>
          <td class="{{ 'ok' if r.success else 'fail' }}">{{ 'OK' if r.success else 'Falló' }}</td>
          <td>{{ r.details.failed_count }}</td>
        </tr>
      {% endfor %}
      </tbody>
    </table>
  </div>

  <h2>Filas que no cumplieron una o más expectativas</h2>
  {% if failed_rows %}
    <table>
      <thead>
        <tr>
          <th>Expectation</th><th>Index</th><th>Fila (JSON)</th>
        </tr>
      </thead>
      <tbody>
      {% for f in failed_rows %}
        <tr>
          <td>{{ f.expectation }}</td>
          <td>{{ f.index }}</td>
          <td><pre>{{ f.row | tojson(indent=2) }}</pre></td>
        </tr>
      {% endfor %}
      </tbody>
    </table>
  {% else %}
    <p>No se encontraron filas fallidas.</p>
  {% endif %}

  <p class="small">Generado con script de validación alternativo (sin Great Expectations) — resultado reproducible.</p>
</body>
</html>
"""

# The `tojson` filter only accepts `indent`; passing `ensure_ascii` to it
# raises TypeError at render time. Other json.dumps options are configured
# through the environment's "json.dumps_kwargs" policy, so use a dedicated
# Environment: accented characters stay readable and row keys keep the
# DataFrame's column order.
from jinja2 import Environment

report_env = Environment()
report_env.policies["json.dumps_kwargs"] = {"ensure_ascii": False, "sort_keys": False}
template = report_env.from_string(html_template)
html = template.render(csv_path=csv_path, success=results["success"], results=results["results"], failed_rows=failed_rows_summary)

# Write the rendered report to disk as UTF-8.
html_path = "reporte_validacion.html"
report_file = Path(html_path)
report_file.write_text(html, encoding="utf-8")

print(f" Reporte HTML generado: {html_path}")
# In Colab the file can be downloaded from the side panel; show its file:// URI.
display(report_file.absolute().as_uri())
