In [1]:
!pip install pyspark seaborn pandas scikit-learn matplotlib skl2onnx



In [10]:
import json

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

import seaborn as sns
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [11]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = SparkSession.builder.appName("ComedorTrainingModel").getOrCreate()

# Leave the session as the cell's last expression so its summary is displayed.
spark

In [12]:
# Load the semicolon-delimited dataset; the first row holds the column names.
comedor_df = (
    spark.read
    .options(header=True, sep=";")
    .csv("FINAL_DATASET_V2.csv")
)

# Preview the first five rows.
comedor_df.show(5)

+----------------+----------+-----------------+-----------------+------------------+---------------+-----------+---------------------+--------------------+-----------------+---------------+-------------------+----------------+--------------+----------------+--------------+-----------------+--------------+------------------+------------------+-------+
|Codigo_municipio|Municipios|Temp_min_invierno|Prec_max_invierno|Calidad_vida_media|Poblacion_total|Renta_media|Total_paro_registrado|Paro_hombre_menor_25|Paro_hombre_25_45|Paro_hombre_45+|Paro_mujer_menor_25|Paro_mujer_25_45|Paro_mujer_45+|Paro_agricultura|Paro_industria|Paro_construccion|Paro_servicios|       demanda_raw|     demanda_score|Demanda|
+----------------+----------+-----------------+-----------------+------------------+---------------+-----------+---------------------+--------------------+-----------------+---------------+-------------------+----------------+--------------+----------------+--------------+-----------------+---

In [13]:
# Show the column types Spark assigned on load. The recorded output lists every
# column as string, so numeric columns have to be cast before modelling.
comedor_df.printSchema()

root
 |-- Codigo_municipio: string (nullable = true)
 |-- Municipios: string (nullable = true)
 |-- Temp_min_invierno: string (nullable = true)
 |-- Prec_max_invierno: string (nullable = true)
 |-- Calidad_vida_media: string (nullable = true)
 |-- Poblacion_total: string (nullable = true)
 |-- Renta_media: string (nullable = true)
 |-- Total_paro_registrado: string (nullable = true)
 |-- Paro_hombre_menor_25: string (nullable = true)
 |-- Paro_hombre_25_45: string (nullable = true)
 |-- Paro_hombre_45+: string (nullable = true)
 |-- Paro_mujer_menor_25: string (nullable = true)
 |-- Paro_mujer_25_45: string (nullable = true)
 |-- Paro_mujer_45+: string (nullable = true)
 |-- Paro_agricultura: string (nullable = true)
 |-- Paro_industria: string (nullable = true)
 |-- Paro_construccion: string (nullable = true)
 |-- Paro_servicios: string (nullable = true)
 |-- demanda_raw: string (nullable = true)
 |-- demanda_score: string (nullable = true)
 |-- Demanda: string (nullable = true)



In [14]:
# Predictor columns, in the order the models are trained on (and later exported with).
features = [
    # Climate and general indicators
    "Temp_min_invierno", "Prec_max_invierno", "Calidad_vida_media",
    "Poblacion_total", "Renta_media",
    # Registered unemployment: total, then by sex and age band
    "Total_paro_registrado",
    "Paro_hombre_menor_25", "Paro_hombre_25_45", "Paro_hombre_45+",
    "Paro_mujer_menor_25", "Paro_mujer_25_45", "Paro_mujer_45+",
    # Registered unemployment by sector
    "Paro_agricultura", "Paro_industria", "Paro_construccion", "Paro_servicios",
]

# Bring only the predictors plus the target column into pandas.
model_df = comedor_df.select(features + ["Demanda"]).toPandas()

# The columns arrive as text, so cast explicitly: float32 predictors, integer target.
x = model_df.loc[:, features].astype("float32")
y = model_df.loc[:, "Demanda"].astype("int")


In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target unless some class has fewer than two rows, in which case a plain
# random split is used instead.
stratify_labels = None if y.value_counts().min() < 2 else y
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Shallow, regularised decision tree.
tree_clf = Pipeline(
    steps=[
        ("tree", DecisionTreeClassifier(
            max_depth=4,              # try 3-6 for a more "natural" tree
            min_samples_leaf=8,       # avoids leaves with only 1-2 cases
            min_samples_split=16,     # avoids overly fine splits
            class_weight="balanced",  # useful when the classes are imbalanced
            random_state=42
        ))
    ]
)

# Gradient boosting model trained on the same split for comparison.
gradient_clf = Pipeline(
    steps=[
        ("gb", GradientBoostingClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=3,
            random_state=42
        ))
    ]
)

tree_clf.fit(x_train, y_train)
gradient_clf.fit(x_train, y_train)

y_pred = tree_clf.predict(x_test)
y_pred_gradient = gradient_clf.predict(x_test)

# Confusion matrices: decision tree first, then gradient boosting.
print(confusion_matrix(y_test, y_pred))
print(confusion_matrix(y_test, y_pred_gradient))

# Per-class reports in the same order. A class can be predicted without
# appearing in the test labels (zero support); zero_division=0 reports the
# undefined metric as 0.0 without emitting a warning for each occurrence.
print(classification_report(y_test, y_pred, zero_division=0))
print(classification_report(y_test, y_pred_gradient, zero_division=0))

[[ 1  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  1 10  6  0]
 [ 0  1  5 19  4]
 [ 0  0  0  0  7]]
[[ 1  0  0  0  0]
 [ 0  0  0  0  0]
 [ 0  1 11  5  0]
 [ 0  0  1 26  2]
 [ 0  0  0  1  6]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         0
           3       0.67      0.59      0.62        17
           4       0.76      0.66      0.70        29
           5       0.64      1.00      0.78         7

    accuracy                           0.69        54
   macro avg       0.61      0.65      0.62        54
weighted avg       0.72      0.69      0.69        54

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         1
           2       0.00      0.00      0.00         0
           3       0.92      0.65      0.76        17
           4       0.81      0.90      0.85        29
           5       0.75      0.86      0.80         7

   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# 5-fold cross-validation on the full dataset, scored with macro-averaged F1
# so that every class counts equally.
scores = cross_val_score(
    tree_clf, x, y,
    cv=5,
    scoring="f1_macro"
)

# Same protocol for the gradient boosting pipeline.
scores_boots = cross_val_score(
    gradient_clf, x, y,
    cv=5,
    scoring="f1_macro"
)

# Per-fold scores followed by their mean, decision tree first.
print(scores)
print("Media CV:", scores.mean())

print(scores_boots)
print("Media CV Boosting:", scores_boots.mean())



[0.68924731 0.58342082 0.48428049 0.606219   0.53290448]
Media CV: 0.5792144203492728
[0.65248447 0.89277389 0.66171552 0.7473483  0.9130203 ]
Media CV Boosting: 0.7734684986477176


In [18]:
# Input signature of the exported models: one float tensor named "input" with
# a free first dimension and one column per feature.
initial_type = [("input", FloatTensorType([None, len(features)]))]

onnx_tree_path = "tree_comedor.onnx"
onnx_boosting_path = "boosting_comedor.onnx"

# Convert both fitted pipelines to ONNX.
onnx_tree_model = convert_sklearn(tree_clf, initial_types=initial_type)
onnx_boosting_model = convert_sklearn(gradient_clf, initial_types=initial_type)

# Serialise each converted model to its own file.
for target_path, onnx_model in (
    (onnx_tree_path, onnx_tree_model),
    (onnx_boosting_path, onnx_boosting_model),
):
    with open(target_path, "wb") as onnx_file:
        onnx_file.write(onnx_model.SerializeToString())

print(f"Modelo ONNX guardado en: {onnx_tree_path}")
print(f"Modelo ONNX guardado en: {onnx_boosting_path}")

Modelo ONNX guardado en: tree_comedor.onnx
Modelo ONNX guardado en: boosting_comedor.onnx


In [19]:
# Record the feature order alongside the exported models so that whoever
# builds an input tensor can arrange the columns the same way.
metadata = {"feature_order": features}

with open("comedor_metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=2))

print("Metadata guardada en comedor_metadata.json")

Metadata guardada en comedor_metadata.json
