In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the raw file (';'-separated). Lines the parser cannot read are skipped.
df = pd.read_csv("datos_combinados_estaciones2125.csv", sep=';', engine='python', on_bad_lines='skip')

# A lone '-' marks a missing reading: turn it into NaN.
df = df.replace('-', np.nan)

# Decimal commas -> decimal points. Only object (text) columns can contain
# strings, so only those are visited; non-string cells are left untouched.
# Series.map is used instead of the deprecated DataFrame.applymap.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].map(lambda x: x.replace(',', '.') if isinstance(x, str) else x)

# Convert every column after the first three (code, name, date) to float.
# Best effort: a column that cannot be converted is kept as-is, but the
# failure is reported instead of being silently swallowed.
for col in df.columns[3:]:
    try:
        df[col] = df[col].astype(float)
    except (ValueError, TypeError):
        print(f"Aviso: la columna '{col}' no se pudo convertir a float; se deja sin cambios.")

# Drop columns with more than 50% nulls: keep a column only if it has at least
# this many non-null values. `thresh` is documented as an integer count, and
# rounding up keeps exactly the same columns as the fractional threshold.
threshold_col = int(np.ceil(len(df) * 0.5))
df = df.dropna(axis=1, thresh=threshold_col)

# Drop rows with more than 30% nulls (a row needs >= 70% non-null columns).
threshold_row = int(np.ceil(len(df.columns) * 0.7))
df = df.dropna(axis=0, thresh=threshold_row)

# Fill the remaining nulls with the mean of each numeric column.
df = df.fillna(df.mean(numeric_only=True))

# Show the cleaned summary. DataFrame.info() prints by itself and returns
# None, so it must not be wrapped in print().
print("Resumen tras limpieza:")
df.info()


Resumen tras limpieza:
<class 'pandas.core.frame.DataFrame'>
Index: 7052 entries, 0 to 7299
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   COD_ESTACION  7052 non-null   int64  
 1   NOM_ESTACION  7052 non-null   object 
 2   FECHA         7052 non-null   object 
 3   SO2           7052 non-null   float64
 4   CO            7052 non-null   float64
 5   NO            7052 non-null   float64
 6   NO2           7052 non-null   float64
 7   NOx           7052 non-null   float64
 8   O3            7052 non-null   float64
 9   PM10          7052 non-null   float64
 10  PM2.5         7052 non-null   float64
dtypes: float64(8), int64(1), object(2)
memory usage: 661.1+ KB
None


  df.replace('-', np.nan, inplace=True)
  df = df.applymap(lambda x: str(x).replace(',', '.') if isinstance(x, str) else x)


In [None]:
# 2) Definición de funciones ICA
def obtener_categoria_pm10(pm10, q1, q2, q3):
    """Classify a PM10 value against three caller-supplied thresholds.

    Args:
        pm10: The PM10 value to classify.
        q1, q2, q3: Inclusive upper limits of the first three categories,
            checked in that order.

    Returns:
        'Muy buena', 'Buena' or 'Mala' for the first limit that `pm10` does
        not exceed; 'Muy mala' when no limit matches.
    """
    escalones = ((q1, 'Muy buena'), (q2, 'Buena'), (q3, 'Mala'))
    for limite, etiqueta in escalones:
        if pm10 <= limite:
            return etiqueta
    # No `<=` test succeeded (value above q3, or not comparable such as NaN).
    return 'Muy mala'

def obtener_categoria_no2(no2):
    """Classify an NO2 value using the fixed limits 40 / 100 / 200.

    Returns 'Muy buena' (<= 40), 'Buena' (<= 100), 'Mala' (<= 200) or
    'Muy mala' when none of those inclusive limits matches.
    """
    tramos = ((40, 'Muy buena'), (100, 'Buena'), (200, 'Mala'))
    # First band whose upper limit is not exceeded; otherwise the worst label.
    return next((etiqueta for tope, etiqueta in tramos if no2 <= tope), 'Muy mala')

def obtener_categoria_o3(o3):
    """Classify an O3 value using the fixed limits 100 / 180 / 240.

    Returns 'Muy buena' (<= 100), 'Buena' (<= 180), 'Mala' (<= 240) or
    'Muy mala' when none of those inclusive limits matches.
    """
    return (
        'Muy buena' if o3 <= 100
        else 'Buena' if o3 <= 180
        else 'Mala' if o3 <= 240
        else 'Muy mala'
    )

def obtener_categoria_so2(so2):
    """Classify an SO2 value using the fixed limits 40 / 100 / 350.

    Returns 'Muy buena' (<= 40), 'Buena' (<= 100), 'Mala' (<= 350) or
    'Muy mala' when none of those inclusive limits matches.
    """
    if so2 <= 40:
        return 'Muy buena'
    if so2 <= 100:
        return 'Buena'
    if so2 <= 350:
        return 'Mala'
    return 'Muy mala'
# 3) Dynamic PM10 thresholds: the 25th, 50th and 75th percentiles of the PM10
# column of `df`, later read as globals by calcular_ica.
# NOTE(review): these are computed over all rows of `df`, i.e. before any
# train/test split is made.
cuartiles_pm10 = df['PM10'].quantile([0.25, 0.50, 0.75])
q1, q2, q3 = cuartiles_pm10.tolist()

# 4) Build the ICA_CATEGORIA label
def calcular_ica(row):
    """Return the worst per-pollutant category found in one row.

    The row's 'PM10', 'NO2', 'O3' and 'SO2' values are each classified with
    their own helper (PM10 against the module-level q1, q2, q3 thresholds) and
    the most severe label wins, with severity ordered
    'Muy buena' < 'Buena' < 'Mala' < 'Muy mala'.

    Args:
        row: Mapping-like object indexable by 'PM10', 'NO2', 'O3' and 'SO2'.

    Returns:
        One of 'Muy buena', 'Buena', 'Mala', 'Muy mala'.
    """
    severidad = {'Muy buena': 0, 'Buena': 1, 'Mala': 2, 'Muy mala': 3}
    categorias = (
        obtener_categoria_pm10(row['PM10'], q1, q2, q3),
        obtener_categoria_no2(row['NO2']),
        obtener_categoria_o3(row['O3']),
        obtener_categoria_so2(row['SO2']),
    )
    return max(categorias, key=severidad.__getitem__)

# Label every row of df with its overall category (row-wise apply).
df['ICA_CATEGORIA'] = df.apply(calcular_ica, axis=1)

# 5) Features and target. Candidate names that are not columns of df are skipped.
# NOTE(review): PM10, NO2, O3 and SO2 are model inputs and are also the only
# inputs calcular_ica uses to build ICA_CATEGORIA, so the target is a
# deterministic function of the features.
columnas_candidatas = ['PM10','PM2.5','NO2','O3','SO2','CO','NO','NOx']
features = [columna for columna in columnas_candidatas if columna in df.columns]

# 6) Keep only rows with no NaN in any feature or in the target
data = df[features + ['ICA_CATEGORIA']].dropna()
X = data[features]
y = data['ICA_CATEGORIA']


In [None]:
# 7) Ordered 80/20 split: shuffle=False keeps the current row order, so the
# last 20% of rows form the test set.
# NOTE(review): this is a temporal split only if the rows are sorted by date —
# TODO confirm; no sort is applied before this point in the visible cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# 8) Pipeline and grid search for RandomForest
preprocessor = ColumnTransformer([('num', StandardScaler(), features)])
pipeline = Pipeline([
    ('prep', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# 'auto' is no longer an accepted value for max_features in current
# scikit-learn, so every candidate using it failed to fit (half of the grid).
# For classifiers it used to be an alias of 'sqrt', so it added nothing;
# 'log2' gives the search a genuinely different second option.
param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [10, 20, None],
    'clf__max_features': ['sqrt', 'log2']
}

# Expanding-window CV: each fold validates on rows that come after its
# training rows.
tscv = TimeSeriesSplit(n_splits=5)
grid = GridSearchCV(pipeline, param_grid, cv=tscv, scoring='accuracy', n_jobs=-1)
grid.fit(X_train, y_train)
print("Mejores parámetros:", grid.best_params_)

# 9) Final evaluation on the held-out rows (grid.predict uses the refit best
# estimator).
y_pred = grid.predict(X_test)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in 

Mejores parámetros: {'clf__max_depth': 10, 'clf__max_features': 'sqrt', 'clf__n_estimators': 200}
Confusion Matrix:
[[329   0   0   0]
 [  0 328   0   0]
 [  0   0 139   0]
 [  0   0   0 615]]

Classification Report:
              precision    recall  f1-score   support

       Buena       1.00      1.00      1.00       329
        Mala       1.00      1.00      1.00       328
   Muy buena       1.00      1.00      1.00       139
    Muy mala       1.00      1.00      1.00       615

    accuracy                           1.00      1411
   macro avg       1.00      1.00      1.00      1411
weighted avg       1.00      1.00      1.00      1411



In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# Candidate regressors, all with default hyper-parameters (seeded where the
# estimator takes a seed).
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42, verbosity=0),
    "SVR": SVR(),
    "MLP": MLPRegressor(random_state=42, max_iter=1000)
}

# Preprocessing pipeline (same for all)
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), features)
])

# Time series split
tscv = TimeSeriesSplit(n_splits=5)

# Regressors and the r2 / MSE metrics need a numeric target, but y_train and
# y_test hold the ICA_CATEGORIA text labels. Map the labels to ordinal codes
# (0 = best ... 3 = worst); a target that is already numeric passes through
# unchanged.
# NOTE(review): if a continuous variable was the intended regression target,
# assign it to y upstream instead of relying on this encoding.
orden_ica = ['Muy buena', 'Buena', 'Mala', 'Muy mala']
codigo_ica = {categoria: posicion for posicion, categoria in enumerate(orden_ica)}


def _objetivo_numerico(serie):
    """Return `serie` as-is if numeric, otherwise its ICA labels as float codes."""
    if pd.api.types.is_numeric_dtype(serie):
        return serie
    return serie.map(codigo_ica).astype(float)


y_train_reg = _objetivo_numerico(y_train)
y_test_reg = _objetivo_numerico(y_test)

# Results, keyed by model name
results = {}

for name, model in models.items():
    pipe = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    # Cross-validated R² score (training rows only)
    scores = cross_val_score(pipe, X_train, y_train_reg, cv=tscv, scoring="r2", n_jobs=-1)
    results[name] = {
        "mean_r2_cv": scores.mean(),
        "std_r2_cv": scores.std()
    }

    # Fit on the full training set and evaluate on the test set
    pipe.fit(X_train, y_train_reg)
    y_pred = pipe.predict(X_test)
    results[name]["test_r2"] = r2_score(y_test_reg, y_pred)
    results[name]["test_mse"] = mean_squared_error(y_test_reg, y_pred)

# Show results
for model_name, res in results.items():
    print(f"\nModelo: {model_name}")
    print(f" R² CV promedio: {res['mean_r2_cv']:.3f} ± {res['std_r2_cv']:.3f}")
    print(f" R² en test:     {res['test_r2']:.3f}")
    print(f" MSE en test:    {res['test_mse']:.3f}")


  df = df.replace("-", pd.NA).applymap(


Mejores parámetros: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__n_estimators': 200}
MSE: 149.47762738123657
R² : 0.5074136725527845


In [None]:
# Print the current column names of df as a plain Python list.
print(list(df.columns))


['COD_ESTACION;NOM_ESTACION;FECHA;SO2;CO;NO;NO2;NOx;O3;PM10;PM2.5;PM1;NH3;C6H6;C7H8;C8H10;Direc.;H.Rel.;Precip.;Pres.;R.Sol.;Ruido;Temp.;UV-B;Veloc.;Veloc.max.;As;BaA;BaP;BbFA;BjFA;BkFA;Cd;DahA;FA;HMN;H2S;IcdP;Ni;Pb;PST']
