# Auditoría de integridad HDFS (entrega mínima)

Este notebook cumple los requisitos de entrega:
- Lectura de auditorías fsck.
- Tabla de métricas (tiempos/recursos).
- Conclusiones y recomendaciones.


## 1) Configuración y rutas

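El notebook intenta leer las auditorías desde la ruta indicada en la variable de entorno `AUDIT_DIR` (si está definida) y desde rutas locales comunes; si no las encuentra, intenta exportarlas desde HDFS al directorio local `audit_export/fsck`, dentro del directorio de trabajo.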


In [None]:
!pip install pandas numpy

Collecting pandas
  Downloading pandas-3.0.0-cp314-cp314-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.4.2-cp314-cp314-win_amd64.whl.metadata (6.6 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting tzdata (from pandas)
  Using cached tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading pandas-3.0.0-cp314-cp314-win_amd64.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ------------------------- -------------- 6.3/9.9 MB 51.5 MB/s eta 0:00:01
   ---------------------------------------- 9.9/9.9 MB 48.9 MB/s  0:00:00
Downloading numpy-2.4.2-cp314-cp314-win_amd64.whl (12.4 MB)
   ---------------------------------------- 0.0/12.4 MB ? eta -:--:--
   ---------------------------------------- 12.4/12.4 MB 62.2 


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: C:\Users\ferna\AppData\Local\Programs\Python\Python314\python.exe -m pip install --upgrade pip


In [None]:
from pathlib import Path
import os
import re
import shutil
import subprocess
import pandas as pd
import numpy as np

# Allow up to 120 characters per column when pandas renders a DataFrame.
pd.set_option('display.max_colwidth', 120)

WORKDIR = Path.cwd()
LOCAL_EXPORT_DIR = WORKDIR / "audit_export" / "fsck"

# Directories probed for fsck audits, in priority order. The AUDIT_DIR
# environment variable, when set to a non-empty value, is tried first.
CANDIDATE_AUDIT_DIRS = []
if os.environ.get("AUDIT_DIR"):
    CANDIDATE_AUDIT_DIRS.append(Path(os.environ["AUDIT_DIR"]))
CANDIDATE_AUDIT_DIRS.extend(
    [
        Path("/media/notebooks/audit/fsck"),
        WORKDIR / "audit" / "fsck",
        WORKDIR.parent / "audit" / "fsck",
        LOCAL_EXPORT_DIR,
    ]
)

print("WORKDIR:", WORKDIR)


## 2) Carga de auditorías fsck

Si no se encuentran archivos localmente, se intenta copiar `/audit/fsck` desde HDFS usando `hdfs dfs -get`.


In [None]:
def resolve_audit_dir():
    """Return the first candidate directory that holds fsck audit files.

    Candidates are taken from ``CANDIDATE_AUDIT_DIRS`` in order. A candidate
    qualifies when it exists and contains at least one
    ``<subdirectory>/fsck_data.txt`` file.

    Returns:
        The first qualifying ``Path``, or ``None`` when no candidate qualifies.
    """
    def has_audits(candidate):
        if not candidate.exists():
            return False
        # glob() is lazy: pulling a single match is enough to decide.
        return next(candidate.glob("*/fsck_data.txt"), None) is not None

    return next((c for c in CANDIDATE_AUDIT_DIRS if has_audits(c)), None)

def export_from_hdfs(local_dir: Path, hdfs_dir: str = "/audit/fsck") -> bool:
    """Copy the contents of an HDFS audit directory into ``local_dir``.

    Args:
        local_dir: Local destination directory; created (with parents) once
            the HDFS directory is known to exist.
        hdfs_dir: HDFS directory whose children are copied. Defaults to
            ``/audit/fsck``.

    Returns:
        ``True`` when ``hdfs dfs -get`` exits with status 0. ``False`` when the
        ``hdfs`` executable is not on PATH, when ``hdfs dfs -test -d`` exits
        non-zero for ``hdfs_dir``, when a command cannot be started, or when
        the copy fails. Copy and start-up failures print a diagnostic line.
    """
    hdfs_bin = shutil.which("hdfs")
    if hdfs_bin is None:
        return False

    def run_hdfs(*args):
        # Launch the path that shutil.which() resolved rather than the bare
        # name, so launchers found through PATHEXT are started too
        # (presumably `hdfs.cmd` on Windows -- confirm on the target host).
        try:
            return subprocess.run(
                [hdfs_bin, "dfs", *args], capture_output=True, text=True
            )
        except OSError as exc:
            print(f"No se pudo ejecutar 'hdfs dfs {args[0]}': {exc}")
            return None

    test_res = run_hdfs("-test", "-d", hdfs_dir)
    if test_res is None or test_res.returncode != 0:
        return False

    # Only create the destination once there is something to copy into it.
    local_dir.mkdir(parents=True, exist_ok=True)

    source_glob = hdfs_dir.rstrip("/") + "/*"
    get_res = run_hdfs("-get", "-f", source_glob, str(local_dir))
    if get_res is None:
        return False
    if get_res.returncode != 0:
        # Surface the CLI error instead of discarding the captured stderr.
        detail = (get_res.stderr or "").strip()
        print(f"Fallo 'hdfs dfs -get' (codigo {get_res.returncode}): {detail}")
        return False
    return True

# Prefer audits that are already on local disk; only when none are found,
# try to export them from HDFS into LOCAL_EXPORT_DIR.
AUDIT_DIR = resolve_audit_dir()
if AUDIT_DIR is None:
    ok = export_from_hdfs(LOCAL_EXPORT_DIR)
    if ok:
        AUDIT_DIR = LOCAL_EXPORT_DIR

print("AUDIT_DIR:", AUDIT_DIR)
if AUDIT_DIR is not None:
    # Each immediate subdirectory name is reported as a detected date.
    print(
        "Fechas detectadas:",
        sorted(entry.name for entry in AUDIT_DIR.glob("*") if entry.is_dir()),
    )
else:
    print("No se encontraron auditorias fsck. Ejecuta scripts/30_fsck_audit.sh y reintenta.")


In [None]:
def parse_fsck_text(text: str) -> dict:
    """Summarise the text of an ``hdfs fsck`` report into incident indicators.

    Args:
        text: Full contents of an fsck report.

    Returns:
        A dict with:
          * ``corrupt``: number of upper-case ``CORRUPT`` markers in the text.
          * ``missing``: number of upper-case ``MISSING`` markers in the text.
          * ``under_replicated``: number of ``Under replicated`` occurrences
            (case-insensitive).
          * ``status_healthy``: ``True`` when a ``Status: HEALTHY`` line exists.

        The counters are marker occurrences, not block totals; they are zero
        exactly when the report contains no such marker.
    """
    return {
        # Match case-sensitively: per the Hadoop fsck output format, the
        # summary always prints mixed-case labels such as "Corrupt blocks:",
        # "Missing blocks:" and "Missing replicas:", even when every count is
        # 0, while real incidents are flagged in upper case. A case-insensitive
        # match would therefore report incidents on a healthy filesystem.
        "corrupt": len(re.findall(r"\bCORRUPT\b", text)),
        "missing": len(re.findall(r"\bMISSING\b", text)),
        "under_replicated": len(re.findall(r"Under replicated", text, flags=re.IGNORECASE)),
        "status_healthy": bool(re.search(r"Status:\s*HEALTHY", text, flags=re.IGNORECASE)),
    }

# Column order of the fsck summary table.
fsck_columns = ["dt", "corrupt", "missing", "under_replicated", "status_healthy", "source_file"]

# One record per immediate subdirectory of AUDIT_DIR that contains a
# fsck_data.txt report; subdirectories without a report are skipped.
rows = []
audit_subdirs = (
    sorted(entry for entry in AUDIT_DIR.glob("*") if entry.is_dir())
    if AUDIT_DIR is not None
    else []
)
for audit_subdir in audit_subdirs:
    report_path = audit_subdir / "fsck_data.txt"
    if report_path.exists():
        # Undecodable bytes are dropped rather than aborting the load.
        report_text = report_path.read_text(encoding="utf-8", errors="ignore")
        rows.append(
            {
                **parse_fsck_text(report_text),
                "dt": audit_subdir.name,
                "source_file": str(report_path),
            }
        )

if rows:
    df_fsck = pd.DataFrame(rows)[fsck_columns].sort_values("dt")
else:
    # Keep the expected columns so later cells can rely on the schema.
    df_fsck = pd.DataFrame(columns=fsck_columns)

print("Tabla de lectura de auditorias fsck:")
df_fsck


## 3) Tabla de métricas (tiempos/recursos)

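Carga `metrics/metrics.csv` si existe; se busca en el directorio de trabajo, en su directorio padre y en `/media/notebooks/metrics/`. Si no se encuentra, crea una tabla base para completar con tus datos (por ejemplo, de `docker stats` y de los tiempos de ejecución por fase).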


In [None]:
# Locations probed for the metrics CSV, in priority order.
METRICS_PATHS = [
    base_dir / "metrics" / "metrics.csv" for base_dir in (WORKDIR, WORKDIR.parent)
] + [Path("/media/notebooks/metrics/metrics.csv")]

# First existing candidate wins; None when no CSV is available.
metrics_path = None
for candidate in METRICS_PATHS:
    if candidate.exists():
        metrics_path = candidate
        break

if metrics_path is None:
    # No CSV found: build an all-NaN template with one row per phase.
    resource_cols = ("duracion_seg", "cpu_promedio_pct", "mem_promedio_mib", "red_in_mib", "red_out_mib")
    df_metrics = pd.DataFrame(
        [
            {"fase": fase, **dict.fromkeys(resource_cols, np.nan), "source": "template"}
            for fase in ("ingesta", "auditoria_fsck", "backup_copy", "incidente_recuperacion")
        ]
    )
else:
    # Record which file the rows came from in a "source" column.
    df_metrics = pd.read_csv(metrics_path).assign(source=str(metrics_path))

print("Tabla de metricas (tiempos/recursos):")
df_metrics


## 4) Conclusiones y recomendaciones

Se generan conclusiones automáticas basadas en la auditoría más reciente y en la tabla de métricas cargada.


In [None]:
conclusiones = []

# --- Integrity: conclusions are drawn from the most recent audit only. ---
if df_fsck.empty:
    conclusiones.append("No hay evidencia fsck cargada: ejecutar scripts/30_fsck_audit.sh y volver a correr el notebook.")
else:
    # "dt" holds the audit subdirectory name; the last one in sort order is used.
    ult = df_fsck.sort_values("dt").iloc[-1]
    dt_ult = ult["dt"]
    corrupt_ult = int(ult["corrupt"])
    missing_ult = int(ult["missing"])
    under_ult = int(ult["under_replicated"])

    if corrupt_ult == 0 and missing_ult == 0:
        conclusiones.append(f"Integridad correcta en {dt_ult}: sin CORRUPT ni MISSING.")
    else:
        conclusiones.append(f"Se detectan incidencias en {dt_ult}: CORRUPT={corrupt_ult}, MISSING={missing_ult}.")

    if under_ult > 0:
        conclusiones.append("Hay bloques under_replicated; revisar numero de DataNodes vivos y factor de replicacion.")

# --- Cost: slowest phase, when the metrics table has usable durations. ---
# A user-supplied metrics.csv may lack the expected columns or hold
# non-numeric durations; treat both as "no data" instead of raising.
if {"fase", "duracion_seg"}.issubset(df_metrics.columns):
    duraciones = pd.to_numeric(df_metrics["duracion_seg"], errors="coerce")
else:
    duraciones = pd.Series(dtype="float64")

if duraciones.notna().any():
    fase_lenta = df_metrics.loc[duraciones.idxmax(), "fase"]
    tiempo_max = float(duraciones.max())
    conclusiones.append(f"La fase mas costosa en tiempo es {fase_lenta} ({tiempo_max:.1f} s).")
else:
    conclusiones.append("Completa la tabla de metricas para cerrar el analisis de coste/beneficio.")

# Static recommendations; they do not depend on the loaded data.
recomendaciones = [
    "Mantener auditoria fsck periodica y guardar resultados en /audit/fsck/<DT>/.",
    "Usar al menos 3 DataNodes para incidentes realistas y reducir under-replication.",
    "Definir frecuencia de auditoria segun riesgo y coste (diaria para datos criticos, semanal para datos frios).",
    "Registrar siempre tiempos y recursos (docker stats) para justificar decisiones de replicacion.",
]

print("Conclusiones:")
for i, c in enumerate(conclusiones, 1):
    print(f"{i}. {c}")

print("\nRecomendaciones:")
for i, r in enumerate(recomendaciones, 1):
    print(f"{i}. {r}")


## 5) Exportables para la entrega

- `fsck_resumen_notebook.csv` con la lectura de auditorías.
- `metricas_notebook.csv` con la tabla de tiempos/recursos.


In [None]:
# Destination files, written next to the notebook's working directory.
out_fsck = WORKDIR / "fsck_resumen_notebook.csv"
out_metrics = WORKDIR / "metricas_notebook.csv"

# Pair each table with its destination; the index is not written.
exports = ((df_fsck, out_fsck), (df_metrics, out_metrics))
for frame, destination in exports:
    frame.to_csv(destination, index=False)

print("Exportado:")
for _, destination in exports:
    print("-", destination)
