In [14]:
import pandas as pd

# Cargar el archivo con separador ';' y parsear fechas
df = pd.read_csv('aire.csv', sep=';', parse_dates=['timestamp_captura'], dayfirst=True)

# LIMPIEZA
# Quitar duplicados
df.drop_duplicates(inplace=True)

# Quitar filas completamente vacías
df.dropna(how='all', inplace=True)

# Quitar espacios en blanco de los campos tipo texto (si los hay)
df = df.apply(lambda col: col.str.strip() if col.dtype == 'object' else col)

# Asegurarse que todos los datos numéricos sean numéricos (y convertir errores a NaN)
columnas_numericas = df.columns.drop('timestamp_captura')
df[columnas_numericas] = df[columnas_numericas].apply(pd.to_numeric, errors='coerce')

# Guardar el resultado limpio
df.info()


  df = pd.read_csv('aire.csv', sep=';', parse_dates=['timestamp_captura'], dayfirst=True)


<class 'pandas.core.frame.DataFrame'>
Index: 34972 entries, 0 to 34971
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   objectid           34972 non-null  int64  
 1   nombre             0 non-null      float64
 2   direccion          0 non-null      float64
 3   tipozona           0 non-null      float64
 4   parametros         0 non-null      float64
 5   mediciones         0 non-null      float64
 6   so2                17616 non-null  float64
 7   no2                34972 non-null  float64
 8   o3                 17616 non-null  float64
 9   co                 4599 non-null   float64
 10  pm10               26294 non-null  float64
 11  pm25               26294 non-null  float64
 12  tipoemisio         0 non-null      float64
 13  fecha_carg         0 non-null      float64
 14  calidad_am         0 non-null      float64
 15  fiwareid           0 non-null      float64
 16  longitude          34972 no

In [15]:
import pandas as pd

df = df.rename(columns={'timestamp_captura': 'Fecha'})

# 2. Concatenar los DataFrames
df_total = df

# 3. Convertir la columna 'Fecha' a tipo datetime (NaT si hay errores) y eliminar filas con 'Fecha' nula
df_total['Fecha'] = pd.to_datetime(df_total['Fecha'], errors='coerce', dayfirst=True)
df_total.dropna(subset=['Fecha'], inplace=True)

# 4. Detectar columnas numéricas y convertir a numérico (con coerce para manejar errores)
numeric_cols = df_total.columns.drop('Fecha')  # Excluimos la columna 'Fecha'
df_total[numeric_cols] = df_total[numeric_cols].apply(pd.to_numeric, errors='coerce')

# 5. Rellenar valores nulos en las columnas numéricas con la media de cada columna
for col in numeric_cols:
    media = df_total[col].mean()
    df_total[col].fillna(media, inplace=True)

# 6. Eliminar duplicados
df_total.drop_duplicates(inplace=True)

# 7. Guardar el DataFrame limpio a un archivo CSV
df_total.to_csv('Contaminacion_combinado_limpio.csv', sep=';', index=False)

# 8. Resumen del DataFrame final
print(f"Total de filas: {df_total.shape[0]}")
print(f"Columnas: {df_total.columns.tolist()}")


Total de filas: 17078
Columnas: ['objectid', 'nombre', 'direccion', 'tipozona', 'parametros', 'mediciones', 'so2', 'no2', 'o3', 'co', 'pm10', 'pm25', 'tipoemisio', 'fecha_carg', 'calidad_am', 'fiwareid', 'longitude', 'latitude', 'geometry_type', 'geo_point_lon', 'geo_point_lat', 'Fecha', 'Identificador']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_total[col].fillna(media, inplace=True)


In [16]:
df_total.info()


<class 'pandas.core.frame.DataFrame'>
Index: 17078 entries, 0 to 33075
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   objectid       17078 non-null  int64         
 1   nombre         0 non-null      float64       
 2   direccion      0 non-null      float64       
 3   tipozona       0 non-null      float64       
 4   parametros     0 non-null      float64       
 5   mediciones     0 non-null      float64       
 6   so2            17078 non-null  float64       
 7   no2            17078 non-null  float64       
 8   o3             17078 non-null  float64       
 9   co             17078 non-null  float64       
 10  pm10           17078 non-null  float64       
 11  pm25           17078 non-null  float64       
 12  tipoemisio     0 non-null      float64       
 13  fecha_carg     0 non-null      float64       
 14  calidad_am     0 non-null      float64       
 15  fiwareid       0 non-nul

In [17]:
# Identificar filas con valores no numéricos
for column in df_total.columns:
    non_numeric_rows = df_total[~df_total[column].apply(pd.to_numeric, errors='coerce').notnull()]
    if not non_numeric_rows.empty:
        print(f"Column '{column}' tiene los siguientes valores no numéricos:")
        print(non_numeric_rows)


Column 'nombre' tiene los siguientes valores no numéricos:
       objectid  nombre  direccion  tipozona  parametros  mediciones  \
0            13     NaN        NaN       NaN         NaN         NaN   
1            13     NaN        NaN       NaN         NaN         NaN   
2            13     NaN        NaN       NaN         NaN         NaN   
3            13     NaN        NaN       NaN         NaN         NaN   
4            13     NaN        NaN       NaN         NaN         NaN   
...         ...     ...        ...       ...         ...         ...   
33071        21     NaN        NaN       NaN         NaN         NaN   
33072        21     NaN        NaN       NaN         NaN         NaN   
33073        21     NaN        NaN       NaN         NaN         NaN   
33074        21     NaN        NaN       NaN         NaN         NaN   
33075        21     NaN        NaN       NaN         NaN         NaN   

            so2   no2         o3   co  ...  fecha_carg  calidad_am  fiwareid

In [18]:
# Convertir todas las columnas a numéricas, forzando valores no numéricos a NaN
df_total_clean = df_total.apply(pd.to_numeric, errors='coerce')

# Eliminar filas con valores NaN
df_total_clean = df_total_clean.dropna()

# Verificar si hay filas eliminadas y confirmar
print(f"Se eliminaron {len(df_total) - len(df_total_clean)} filas con valores no numéricos")


Se eliminaron 17078 filas con valores no numéricos


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# --- Limpieza de datos ---
df_total_clean = df_total.apply(pd.to_numeric, errors='coerce')
df_total_clean = df_total_clean.dropna()

# --- Separación en X e y ---
X = df_total_clean.drop(columns=['Fecha', 'o3'])
y = df_total_clean['o3']
print("Filas x: " + X.Identificador.count)
print("Y : " + y.count)
# --- División train/test ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Escalado ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Importación de modelos ---
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

try:
    from xgboost import XGBRegressor
    xgb_installed = True
except ImportError:
    xgb_installed = False

# --- Modelos definidos con mejores hiperparámetros ---
modelos = {
    'Random Forest': RandomForestRegressor(n_estimators=300, max_depth=10, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42),
    'Red Neuronal (MLP)': MLPRegressor(hidden_layer_sizes=(128, 64, 32), alpha=0.001,
                                        learning_rate='adaptive', max_iter=1500,
                                        early_stopping=True, random_state=42),
    'SVR': SVR(kernel='rbf', C=150, gamma='scale')
}

if xgb_installed:
    modelos['XGBoost'] = XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        early_stopping_rounds=10,
        eval_metric='rmse',
        verbosity=0,
        random_state=42
    )

# --- Entrenamiento y evaluación ---
resultados = []
for nombre, modelo in modelos.items():
    if nombre == 'XGBoost':
        modelo.fit(X_train_scaled, y_train,
                   eval_set=[(X_test_scaled, y_test)],
                   verbose=False)
    else:
        modelo.fit(X_train_scaled, y_train)

    y_pred = modelo.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    resultados.append({'Modelo': nombre, 'MSE': mse, 'R2': r2})
    print(f"--- {nombre} ---")
    print(f"  MSE: {mse:.2f}")
    print(f"  R2: {r2:.4f}")
    print()

# --- Comparación de resultados ---
df_resultados = pd.DataFrame(resultados).sort_values(by='MSE')
print("Resumen de modelos ordenados por MSE (menor es mejor):")
print(df_resultados)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

# --- Limpieza de datos ---
df_total_clean = df_total.apply(pd.to_numeric, errors='coerce')
df_total_clean = df_total_clean.dropna()  # Eliminamos los valores NaN

# --- Separación en X e y ---
X = df_total_clean.drop(columns=['Fecha', 'Ozono(µg/m³)'])
y = df_total_clean['Ozono(µg/m³)']

# --- División train/test ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Escalado ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Optimización de Hiperparámetros con GridSearchCV ---
# Hiperparámetros de XGBoost
param_grid_xgb = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'n_estimators': [200, 300, 400],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model = XGBRegressor(random_state=42)

grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
grid_search_xgb.fit(X_train_scaled, y_train)

# Mejores hiperparámetros de XGBoost
print("Mejores parámetros para XGBoost:", grid_search_xgb.best_params_)

# Entrenamos con los mejores parámetros
best_xgb_model = grid_search_xgb.best_estimator_
best_xgb_model.fit(X_train_scaled, y_train)

# --- Predicciones y evaluación ---
y_pred_xgb = best_xgb_model.predict(X_test_scaled)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

# --- Cross-validation con XGBoost ---
cross_val_scores_xgb = cross_val_score(best_xgb_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validation MSE promedio (XGBoost): {-cross_val_scores_xgb.mean():.2f}")

# Mostrar resultados finales
print(f"MSE (XGBoost): {mse_xgb:.2f}")
print(f"R² (XGBoost): {r2_xgb:.4f}")

# --- Comparar con otros modelos ---
# Entrenar RandomForestRegressor como un modelo base
rf_model = RandomForestRegressor(n_estimators=300, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Mostrar comparación
print(f"MSE (Random Forest): {mse_rf:.2f}")
print(f"R² (Random Forest): {r2_rf:.4f}")

# --- Combinación de modelos (Ensemble) ---
# Crear un Ensemble simple de XGBoost y Random Forest
ensemble_predictions = (y_pred_xgb + y_pred_rf) / 2
ensemble_mse = mean_squared_error(y_test, ensemble_predictions)
ensemble_r2 = r2_score(y_test, ensemble_predictions)

print(f"MSE (Ensemble): {ensemble_mse:.2f}")
print(f"R² (Ensemble): {ensemble_r2:.4f}")


Mejores parámetros para XGBoost: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 400, 'subsample': 0.8}
Cross-validation MSE promedio (XGBoost): 106.11
MSE (XGBoost): 111.55
R² (XGBoost): 0.5938
MSE (Random Forest): 116.99
R² (Random Forest): 0.5740
MSE (Ensemble): 111.39
R² (Ensemble): 0.5944


In [None]:
columnas_seleccionadas = [
    'NO(µg/m³)',
    'Veloc.(m/s)',
    'SO2(µg/m³)',
    'Ruido(dBA)',
    'Tolueno(µg/m³)'
]


In [None]:
# --- Nuevo dataset con columnas seleccionadas ---
X_reducido = df_total_clean[columnas_seleccionadas]
y = df_total_clean['Ozono(µg/m³)']

# --- División y escalado ---
X_train, X_test, y_train, y_test = train_test_split(X_reducido, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Entrenamiento con XGBoost ---
from xgboost import XGBRegressor
modelo = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)
modelo.fit(X_train_scaled, y_train)

# --- Evaluación ---
from sklearn.metrics import mean_squared_error, r2_score
y_pred = modelo.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE (modelo reducido): {mse:.2f}")
print(f"R²: {r2:.4f}")


MSE (modelo reducido): 129.70
R²: 0.5277


In [None]:
Q1 = df['PM10'].quantile(0.25)
Q2 = df['PM10'].quantile(0.50)
Q3 = df['PM10'].quantile(0.75)

# Función adaptada para PM10
def obtener_categoria_pm10(pm10):
    if pm10 <= Q1:
        return 'Muy buena'
    elif pm10 <= Q2:
        return 'Buena'
    elif pm10 <= Q3:
        return 'Mala'
    else:
        return 'Muy mala'

# Funciones originales para NO2, O3 y SO2
def obtener_categoria_no2(no2):
    if no2 <= 40:
        return 'Muy buena'
    elif no2 <= 100:
        return 'Buena'
    elif no2 <= 200:
        return 'Mala'
    else:
        return 'Muy mala'

def obtener_categoria_o3(o3):
    if o3 <= 100:
        return 'Muy buena'
    elif o3 <= 180:
        return 'Buena'
    elif o3 <= 240:
        return 'Mala'
    else:
        return 'Muy mala'

def obtener_categoria_so2(so2):
    if so2 <= 40:
        return 'Muy buena'
    elif so2 <= 100:
        return 'Buena'
    elif so2 <= 350:
        return 'Mala'
    else:
        return 'Muy mala'

# Función que toma la peor categoría
def calcular_ica(row):
    categorias = [
        obtener_categoria_pm10(row['PM10']),
        obtener_categoria_no2(row['NO2']),
        obtener_categoria_o3(row['O3']),
        obtener_categoria_so2(row['SO2'])
    ]
    orden = ['Muy buena', 'Buena', 'Mala', 'Muy mala']
    return max(categorias, key=lambda x: orden.index(x))

# Aplicamos y vemos la distribución resultante
df['ICA_CATEGORIA'] = df.apply(calcular_ica, axis=1)
print(df['ICA_CATEGORIA'].value_counts())

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
# 3. Separar variables y labels
# ---------------------------
# Features: los contaminantes usados más algunos extra
features = ['PM10', 'NO2', 'O3', 'SO2', 'CO', 'NO', 'NOx']
features = [f for f in features if f in df.columns]

X = df[features]
y = df['ICA_CATEGORIA']

# Aseguramos que no haya nulos
data = pd.concat([X, y], axis=1).dropna()
X = data[features]
y = data['ICA_CATEGORIA']

# ---------------------------
# 4. Cross-Validation Temporal (TimeSeriesSplit)
# ---------------------------
tscv = TimeSeriesSplit(n_splits=5)  # 5 splits para evaluación

model = RandomForestClassifier(random_state=42)

# Realizamos cross-validation temporal
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Entrenamos el modelo
    model.fit(X_train, y_train)

    # Predicciones
    y_pred = model.predict(X_test)

    # Evaluación
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

In [None]:
print(df['ICA_CATEGORIA'].value_counts())