# Revisión de resultados

Revisión preliminar de los resultados y generación de visualizaciones para incluir en la memoria del TFM.

In [1]:
import os
from pyspark.sql import SparkSession, functions as F, Window
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("PAC2024-Visualizacion")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 11:36:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Folder the scored dataset is read from, and folder the figures are written to.
input_path = "data/pac_2024_anomalias_kmeans/"
output_dir = "data/pac_sample_viz"

# Make sure the figure folder exists; an already existing folder is not an error.
os.makedirs(name=output_dir, exist_ok=True)

In [4]:
# Load the parquet data found at input_path into a Spark DataFrame.
parquet_reader = spark.read
df = parquet_reader.parquet(input_path)

                                                                                

## Visualización básica de resultados 

In [5]:
# One-row global summary of anomaly_score: row count, mean, standard deviation,
# minimum, a fixed set of percentiles (1, 5, 25, 50, 75, 95, 99) and maximum.
summary_exprs = [
    F.count("*").alias("n"),
    F.mean("anomaly_score").alias("mean_anomaly"),
    F.stddev("anomaly_score").alias("std_anomaly"),
    F.min("anomaly_score").alias("min_anomaly"),
    F.expr("percentile(anomaly_score, array(0.01,0.05,0.25,0.5,0.75,0.95,0.99))").alias("percentiles"),
    F.max("anomaly_score").alias("max_anomaly"),
]
resumen = df.select(*summary_exprs)

# truncate=False so the full percentile array is printed.
resumen.show(truncate=False)

[Stage 3:>                                                          (0 + 1) / 1]

+-------+----------------------+------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|n      |mean_anomaly          |std_anomaly       |min_anomaly        |percentiles                                                                                                                                     |max_anomaly      |
+-------+----------------------+------------------+-------------------+------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+
|2217817|1.2836369699484627E-14|1.0000002254469875|-0.6480860485671216|[-0.6475237495601345, -0.6338821129451712, -0.407826010388501, -0.19928771779860754, 0.10776618854231303, 1.1799441126913337, 2.655021946564461]|278.9492448496908|
+-------+----------------------+------------------+---------

                                                                                

In [6]:
# Share of anomalous rows: mean of the ANOMALIA column cast to double
# (presumably a 0/1 flag, so the mean is the flagged fraction; confirm upstream).
anomaly_flag = F.col("ANOMALIA").cast("double")
prop_anom = df.agg(F.mean(anomaly_flag).alias("ratio_anomalos"))
prop_anom.show()

+-------------------+
|     ratio_anomalos|
+-------------------+
|0.02054768269879796|
+-------------------+



### Revisiones específicas

In [7]:
# Top-N rows by anomaly_score (highest first).
N = 50

# Columns kept for the manual inspection of the most extreme rows.
cols_top = [
    "BENEFICIARIO",
    "PROVINCIA_SAFE",
    "MEDIDA",
    "IMPORTE_EUROS",
    "cluster",
    "dist_to_center",
    "anomaly_score",
    "ANOMALIA",
]

ranked_by_score = df.orderBy(F.col("anomaly_score").desc())
top_outliers = ranked_by_score.select(*cols_top).limit(N)

# Print all N rows without truncating long text columns.
top_outliers.show(N, truncate=False)



+-------------------------------------------------------------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------+-------+------------------+------------------+--------+
|BENEFICIARIO                                                 |PROVINCIA_SAFE        |MEDIDA                                                                                                                                 |IMPORTE_EUROS|cluster|dist_to_center    |anomaly_score     |ANOMALIA|
+-------------------------------------------------------------+----------------------+---------------------------------------------------------------------------------------------------------------------------------------+-------------+-------+------------------+------------------+--------+
|AGROPECUARIA MERLI. S.L.                                     |SL                    |22400 - Monzón                        

                                                                                

In [8]:
# Per cluster: row count, mean and standard deviation of anomaly_score, and the
# mean of ANOMALIA cast to double (the anomaly ratio).
cluster_aggs = [
    F.count("*").alias("n"),
    F.mean("anomaly_score").alias("mean_anomaly"),
    F.stddev("anomaly_score").alias("std_anomaly"),
    F.mean(F.col("ANOMALIA").cast("double")).alias("ratio_anom"),
]
by_cluster = (
    df.groupBy("cluster")
      .agg(*cluster_aggs)
      .sort("cluster")
)
by_cluster.show(truncate=False)

+-------+-------+-----------------------+------------------+--------------------+
|cluster|n      |mean_anomaly           |std_anomaly       |ratio_anom          |
+-------+-------+-----------------------+------------------+--------------------+
|0      |2143645|6.639344840657035E-15  |1.0000002332476727|0.019919342988227996|
|1      |53773  |-1.2250566667999033E-15|1.0000092984764477|0.04915106094136463 |
|2      |20399  |-1.2067232790193548E-15|1.0000245119066613|0.011177018481298102|
+-------+-------+-----------------------+------------------+--------------------+



In [9]:
# By province, ranked by anomaly ratio (the first 20 are shown). Provinces with
# fewer than min_n rows are left out so the ratios are stable.
min_n = 1000

prov_aggs = [
    F.count("*").alias("n"),
    F.mean("anomaly_score").alias("mean_anomaly"),
    F.mean(F.col("ANOMALIA").cast("double")).alias("ratio_anom"),
]
by_prov = (
    df.groupBy("PROVINCIA_SAFE")
      .agg(*prov_aggs)
      .filter(F.col("n") >= min_n)
      .sort(F.col("ratio_anom").desc())
)
by_prov.show(20, truncate=False)

+----------------------+-----+-----------------------+--------------------+
|PROVINCIA_SAFE        |n    |mean_anomaly           |ratio_anom          |
+----------------------+-----+-----------------------+--------------------+
|Las-Palmas            |3940 |3.2888402418961338     |0.9446700507614213  |
|Santa-Cruz-de-Tenerife|10023|2.2520728803341705     |0.8113339319565     |
|Ourense               |9558 |1.3838318853808451     |0.12115505335844319 |
|Pontevedra            |10497|1.2250980923540717     |0.08726302753167571 |
|Alava                 |8059 |1.3734285541002857     |0.08698349671175083 |
|Cantabria             |20592|0.5820391226721915     |0.06837606837606838 |
|Asturias              |39782|0.2797218275202876     |0.06739228796943342 |
|Girona                |13388|0.850858597517955      |0.05123991634299373 |
|Albacete              |53773|-1.2250566667999033E-15|0.04915106094136463 |
|Guadalajara           |14321|0.7020008228147775     |0.03533272816144124 |
|Illes-Balea

In [10]:
# By measure (intervention), ranked by anomaly ratio (the first 20 are shown).
# Uses the same min_n size threshold defined in the province cell.
medida_aggs = [
    F.count("*").alias("n"),
    F.mean("anomaly_score").alias("mean_anomaly"),
    F.mean(F.col("ANOMALIA").cast("double")).alias("ratio_anom"),
]
by_medida = (
    df.groupBy("MEDIDA")
      .agg(*medida_aggs)
      .filter(F.col("n") >= min_n)
      .sort(F.col("ratio_anom").desc())
)
by_medida.show(20, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------+-----+------------------+---------------------+
|MEDIDA                                                                                                                 |n    |mean_anomaly      |ratio_anom           |
+-----------------------------------------------------------------------------------------------------------------------+-----+------------------+---------------------+
|VI.11   Prevención y reparación de los daños causados a los bosques por incendios, desastres naturales y catástrofes   |1580 |4.143900326038061 |1.0                  |
|III.2   En el sector de los productos apícolas                                                                         |2708 |3.9914205744335525|1.0                  |
|VI.12   Inversiones para incrementar la capacidad de adaptación y el valor medioambiental de los ecosistemas forestales|1109 |4.945092488557745 |1.0      

                                                                                

## Visualización gráfica

In [11]:
# Plot configuration shared by every figure produced below.
plot_defaults = {
    # Figure size (inches) and resolution: on screen vs. saved files.
    "figure.figsize": (8, 5),
    "figure.dpi": 100,
    "savefig.dpi": 300,
    # Font sizes for body text, titles, axis labels, legend and ticks.
    "font.size": 11,
    "axes.titlesize": 12,
    "axes.labelsize": 11,
    "legend.fontsize": 10,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    # Light background grid on every axes.
    "axes.grid": True,
    "grid.alpha": 0.25,
}
plt.rcParams.update(plot_defaults)

### Distribución general

In [None]:
# Collect the columns for the distribution plot into pandas.
# fraction=1.0 keeps the full dataset (no actual subsampling), so every row is
# brought to the driver; lower the fraction if memory becomes a problem.
# ANOMALIA is collected as well, although the boxplot below only uses anomaly_score.
score_sample = (
    df.select("anomaly_score", "ANOMALIA")
      .sample(withReplacement=False, fraction=1.0, seed=42)
)
pdf_dist = score_sample.toPandas()

# Boxplot of anomaly_score, saved as PNG and PDF in output_dir.
fig, ax = plt.subplots()
pdf_dist.boxplot(column="anomaly_score", ax=ax)
ax.set_title("Boxplot de la puntuación de anomalía")
ax.set_ylabel("anomaly_score")
fig.tight_layout()
for extension in ("png", "pdf"):
    fig.savefig(os.path.join(output_dir, "boxplot_anomaly_score." + extension))
plt.close(fig)

### Por clúster

In [13]:
def save_cluster_bar(stats, value_col, title, ylabel, file_stem):
    """Draw one bar per cluster for ``value_col`` and save it as PNG and PDF.

    Parameters
    ----------
    stats : pandas.DataFrame
        Per-cluster table; must contain a ``cluster`` column and ``value_col``.
    value_col : str
        Column of ``stats`` used for the bar heights.
    title : str
        Figure title.
    ylabel : str
        Label of the y axis.
    file_stem : str
        File name without extension; ``.png`` and ``.pdf`` files with this
        stem are written inside ``output_dir``.
    """
    fig, ax = plt.subplots()
    # Cluster ids are cast to str so they are drawn as categories, not numbers.
    ax.bar(stats["cluster"].astype(str), stats[value_col])
    ax.set_title(title)
    ax.set_xlabel("Clúster")
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    for extension in ("png", "pdf"):
        fig.savefig(os.path.join(output_dir, f"{file_stem}.{extension}"))
    plt.close(fig)


# Per-cluster statistics, computed on the full dataset (no sampling) and
# collected into pandas for plotting.
pdf_cluster = (df.groupBy("cluster")
                .agg(F.count("*").alias("n"),
                     F.mean("anomaly_score").alias("mean_anomaly"),
                     F.stddev("anomaly_score").alias("std_anomaly"),
                     F.mean(F.col("ANOMALIA").cast("double")).alias("ratio_anom"))
                .orderBy("cluster")).toPandas()

# Anomaly ratio per cluster
save_cluster_bar(
    pdf_cluster,
    value_col="ratio_anom",
    title="Proporción de anomalías por clúster",
    ylabel="Ratio de ANOMALIA=1",
    file_stem="bar_ratio_anom_por_cluster",
)

# Mean anomaly_score per cluster
# NOTE(review): the per-cluster table printed earlier in this notebook shows
# mean_anomaly ~ 0 and std_anomaly ~ 1 for every cluster, so this chart may be
# uninformative - confirm it is still wanted.
save_cluster_bar(
    pdf_cluster,
    value_col="mean_anomaly",
    title="Media de anomaly_score por clúster",
    ylabel="Media de anomaly_score",
    file_stem="bar_mean_anomaly_por_cluster",
)


In [None]:
# Scatter of IMPORTE_LOG vs anomaly_score, one panel per cluster.
# Rows with a null in any of the three columns are dropped, then a 20% random
# sample (fixed seed) is taken so the scatter stays readable and fits in memory.
df_scatter = (df.select("cluster", "IMPORTE_LOG", "anomaly_score")
                .dropna()
                .sample(False, 0.2, seed=42))
pdf_scatter = df_scatter.toPandas()

clusters = sorted(pdf_scatter["cluster"].unique())

# Grid size follows the number of clusters instead of a fixed 2x2 layout, so
# more than four clusters no longer overflow the axes array. Ceil division;
# at least one row so plt.subplots stays valid even if the sample is empty.
cols = 2
rows = max(1, -(-len(clusters) // cols))

# squeeze=False guarantees a 2-D array of axes for any grid size.
# 4 inches of height per row reproduces the previous (10, 8) size for two rows.
fig, axes = plt.subplots(rows, cols, figsize=(10, 4 * rows),
                         sharex=True, sharey=True, squeeze=False)
axes = axes.ravel()

for ax, c in zip(axes, clusters):
    sub = pdf_scatter[pdf_scatter["cluster"] == c]
    ax.scatter(sub["IMPORTE_LOG"], sub["anomaly_score"], s=6, alpha=0.35)
    ax.set_title(f"Clúster {c}")
    ax.set_xlabel("IMPORTE_LOG")
    ax.set_ylabel("anomaly_score")

# Remove the grid cells that have no cluster to show. Slicing by the number of
# clusters (rather than reusing the loop index) also works with zero clusters.
for unused_ax in axes[len(clusters):]:
    fig.delaxes(unused_ax)

# With sharex=True only the bottom row keeps its x tick labels. Restore them on
# any panel whose cell directly below was removed (or does not exist).
for k in range(len(clusters)):
    if k + cols >= len(clusters):
        axes[k].tick_params(axis="x", labelbottom=True)

fig.suptitle("Dispersión de Importe (log) vs anomaly score por clúster", y=0.98)
fig.tight_layout()
fig.savefig(os.path.join(output_dir, "scatter_importe_anomaly_por_cluster.png"), dpi=300)
fig.savefig(os.path.join(output_dir, "scatter_importe_anomaly_por_cluster.pdf"))
plt.close(fig)


In [15]:
# Release resources at the end of the notebook: first drop everything held in
# Spark's cache, then shut the session down.
spark.catalog.clearCache()
spark.stop()