In [2]:
import os
from pyspark.sql import SparkSession, functions as F
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
spark = SparkSession.builder.appName("PAC2024-Visualizacion").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/07 00:02:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/07 00:02:49 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/09/07 00:02:49 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/09/07 00:02:49 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [8]:
input_path = "data/espacio_datos/pac_2024_anomalias_kmeans"
output_dir = "data/pac_2024_viz"

In [9]:
df = spark.read.parquet(input_path)

In [18]:
df.show(100, truncate=False)

+---------------------------------+------------+-----------------------------------------------------------------------------------+---------------+------+--------+------------+------------+-------------+-----------+------------------+------------+-----------+------------+-----------+------------+-------+------------------+-------------------+--------+--------------+
|BENEFICIARIO                     |MUNICIPIO   |MEDIDA                                                                             |OBJETIVO_ESP   |FEAGA |FEADER  |IMPORTECOFIN|FEADER_COFIN|IMPORTE_EUROS|IMPORTE_ABS|IMPORTE_LOG       |IMPORTE_SIGN|FEAGA_ratio|FEADER_ratio|COFIN_ratio|IS_RECUP_ANY|cluster|dist_to_center    |anomaly_score      |ANOMALIA|PROVINCIA_SAFE|
+---------------------------------+------------+-----------------------------------------------------------------------------------+---------------+------+--------+------------+------------+-------------+-----------+------------------+------------+-----------+

In [19]:
df.groupBy(F.col("ANOMALIA")).count().show()

+--------+-------+
|ANOMALIA|  count|
+--------+-------+
|       1|  44631|
|       0|2173185|
|    null|      1|
+--------+-------+



In [10]:
# Anomalía por provincia
anom_prov = (
df.groupBy("PROVINCIA_SAFE")
.agg(F.count("*").alias("n"), F.sum(F.col("ANOMALIA").cast("int")).alias("anom"))
.withColumn("anom_rate", F.col("anom")/F.col("n"))
.orderBy(F.col("anom_rate").desc(), F.col("n").desc())
)

In [11]:
anom_prov.show(truncate=False)



+--------------+-----+----+--------------------+
|PROVINCIA_SAFE|n    |anom|anom_rate           |
+--------------+-----+----+--------------------+
|Ceuta         |84   |84  |1.0                 |
|Melilla       |84   |84  |1.0                 |
|SL            |1    |1   |1.0                 |
|Alava         |8059 |6077|0.7540637796252637  |
|Pontevedra    |10497|3575|0.3405734971896732  |
|Ourense       |9558 |3242|0.33919229964427705 |
|Guipuzcoa     |11093|2475|0.22311367529072387 |
|Vizcaya       |9755 |1955|0.20041004613018965 |
|Las-Palmas    |3940 |344 |0.08730964467005076 |
|La-Rioja      |19794|1709|0.08633929473577852 |
|Girona        |13388|843 |0.0629668359725127  |
|Leon          |30820|897 |0.029104477611940297|
|Teruel        |29622|709 |0.023934913240159342|
|Barcelona     |31299|748 |0.023898527109492317|
|A-Coruna      |29735|659 |0.02216243484109635 |
|Navarra       |41046|857 |0.020879013789407007|
|Caceres       |51830|1045|0.020162068300212232|
|Cantabria     |2059

                                                                                

In [12]:
# Anomalías por provincia y medida (para heatmap)
anom_prov_med = (
df.groupBy("PROVINCIA_SAFE","MEDIDA")
.agg(F.count("*").alias("n"), F.sum(F.col("ANOMALIA").cast("int")).alias("anom"))
.withColumn("anom_rate", F.col("anom")/F.col("n"))
)

In [13]:
# Top beneficiarios por anomaly_score (auditoría)
topN = (
df.orderBy(F.col("anomaly_score").desc_nulls_last())
.select("BENEFICIARIO","PROVINCIA_SAFE","MEDIDA","IMPORTE_EUROS","anomaly_score","IS_RECUP_ANY")
.limit(200)
)

In [14]:
by_cluster = (
df.groupBy("cluster")
.agg(
F.count("*").alias("n"),
F.avg("anomaly_score").alias("mean_score"),
F.expr("percentile(anomaly_score, 0.5)").alias("median_score"),
F.avg("IMPORTE_ABS").alias("mean_importe_abs"),
)
.orderBy("cluster")
)

In [21]:
by_cluster.show(truncate=False)



+-------+-------+-----------------------+--------------------+------------------+
|cluster|n      |mean_score             |median_score        |mean_importe_abs  |
+-------+-------+-----------------------+--------------------+------------------+
|0      |1190736|2.912861452648815E-14  |-0.1764727560433063 |2632.345441869625 |
|1      |697349 |7.512292241348402E-16  |-0.18481922858886124|5575.4478011153415|
|2      |40891  |9.007981076961579E-16  |-0.39405182396435223|2949.4293810373924|
|3      |283465 |-1.3321651709245227E-14|-0.17419299640409347|2355.550334609195 |
|4      |1      |null                   |null                |0.01              |
|5      |5375   |9.094947017729283E-16  |-0.19574022878277014|4224.78377488372  |
+-------+-------+-----------------------+--------------------+------------------+



                                                                                