In [407]:
import pandas as pd
from functools import reduce

In [408]:
# 1. Archivos de contaminantes
df_dioxido_nitrogeno = pd.read_csv("Dioxido_de_Nitrogeno.csv", sep=';', decimal=',')
df_oxido_nitrogeno = pd.read_csv("Oxido_de_Nitrogeno.csv", sep=';', decimal=',')
df_mp25 = pd.read_csv("MP2_5.csv", sep=';', decimal=',')
df_mp10 = pd.read_csv("MP10.csv", sep=';', decimal=',')

In [409]:
# Modifica estas líneas en tu celda de carga
df_temp = pd.read_csv("temp.csv", sep=';', decimal=',')
df_hum = pd.read_csv("hum.csv", sep=';', decimal=',')
df_precip = pd.read_csv("precip.csv", sep=';', decimal=',')
df_vel_viento = pd.read_csv("vel_viento.csv", sep=';', decimal=',')
df_dir_viento = pd.read_csv("dir_viento.csv", sep=';', decimal=',')

In [410]:
import pandas as pd

# 1. Definimos una función para reutilizar la lógica
def procesar_fecha_hora(df):
    if 'FECHA (YYMMDD)' in df.columns and 'HORA (HHMM)' in df.columns:
        
        fecha_str = df['FECHA (YYMMDD)'].astype(str).str.zfill(6)
        hora_str = df['HORA (HHMM)'].astype(str).str.zfill(4)
        
        # Convertimos a datetime
        df['fecha_hora'] = pd.to_datetime(
            fecha_str + ' ' + hora_str, 
            format='%y%m%d %H%M'
        )

    return df

# 2. Lista de tus DataFrames
lista_dfs = [
    df_dioxido_nitrogeno, df_oxido_nitrogeno, df_mp25, df_mp10,  # Contaminantes
    df_temp, df_hum, df_precip,                                  # Meteorológicos
    df_vel_viento, df_dir_viento                                 # Viento
]

for df in lista_dfs:
    procesar_fecha_hora(df)
    df.drop(columns=['FECHA (YYMMDD)', 'HORA (HHMM)'], inplace=True)
    #imprime el nombre de la primera columna para verificar
    print(f"Primera columna después de procesar: {df.columns[0]}")



Primera columna después de procesar: Registros validados
Primera columna después de procesar: Registros validados
Primera columna después de procesar: Registros validados
Primera columna después de procesar: Registros validados
Primera columna después de procesar: Temperatura (°C)
Primera columna después de procesar: Humedad Relativa (%)
Primera columna después de procesar: Precipitacion (mm)
Primera columna después de procesar: Velocidad Viento (m/s)
Primera columna después de procesar: Direccion Viento (grados)


In [411]:
df_meteorologicos = df_hum["fecha_hora"].to_frame()
df_meteorologicos.head()

Unnamed: 0,fecha_hora
0,2024-12-11 01:00:00
1,2024-12-11 02:00:00
2,2024-12-11 03:00:00
3,2024-12-11 04:00:00
4,2024-12-11 05:00:00


In [412]:
df_meteorologicos["temperatura"] = df_temp["Temperatura (°C)"]
df_meteorologicos["temperatura"].isna().sum()

np.int64(113)

In [413]:
df_meteorologicos["temperatura"] = df_temp["Temperatura (°C)"]
df_meteorologicos["humedad"] = df_hum["Humedad Relativa (%)"]
df_meteorologicos["precipitacion"] = df_precip["Precipitacion (mm)"]
df_meteorologicos["velocidad_viento"] = df_vel_viento["Velocidad Viento (m/s)"]
df_meteorologicos["direccion_viento"] = df_dir_viento["Direccion Viento (grados)"]
datos_meteorologicos = df_meteorologicos.copy()


In [414]:
datos_meteorologicos.to_csv("datos_meteorologicos.csv", index=False)

In [415]:
df_meteorologicos.dtypes

fecha_hora          datetime64[ns]
temperatura                float64
humedad                    float64
precipitacion              float64
velocidad_viento           float64
direccion_viento           float64
dtype: object

In [416]:
columnas_numericas = ['temperatura', 'humedad', 'precipitacion', 'velocidad_viento', 'direccion_viento']

for col in columnas_numericas:
    datos_meteorologicos[col] = pd.to_numeric(datos_meteorologicos[col], errors='coerce')

datos_meteorologicos['fecha'] = datos_meteorologicos['fecha_hora'].dt.date

datos_diarios = datos_meteorologicos.groupby('fecha').agg({
    'temperatura': ['mean', 'max', 'min'],
    'humedad': ['mean', 'max', 'min'],
    'precipitacion': ['mean', 'max', 'min'],
    'velocidad_viento': ['mean', 'max', 'min'],
    'direccion_viento': ['mean', 'max', 'min']
})

datos_diarios.columns = ['_'.join(col).strip() for col in datos_diarios.columns.values]
datos_diarios = datos_diarios.reset_index()


print(f"\n{len(datos_diarios)} días × {len(datos_diarios.columns)} columnas")



366 días × 16 columnas


In [417]:
pd.DataFrame({
    'columna': datos_diarios.columns,
    'dias_con_NaN': datos_diarios.isna().sum().values,
    'porcentaje_NaN': (datos_diarios.isna().sum() / len(datos_diarios) * 100).values
})



Unnamed: 0,columna,dias_con_NaN,porcentaje_NaN
0,fecha,0,0.0
1,temperatura_mean,1,0.273224
2,temperatura_max,1,0.273224
3,temperatura_min,1,0.273224
4,humedad_mean,1,0.273224
5,humedad_max,1,0.273224
6,humedad_min,1,0.273224
7,precipitacion_mean,1,0.273224
8,precipitacion_max,1,0.273224
9,precipitacion_min,1,0.273224


In [418]:
filas_con_nan = datos_diarios[datos_diarios.isna().any(axis=1)]
filas_con_nan


Unnamed: 0,fecha,temperatura_mean,temperatura_max,temperatura_min,humedad_mean,humedad_max,humedad_min,precipitacion_mean,precipitacion_max,precipitacion_min,velocidad_viento_mean,velocidad_viento_max,velocidad_viento_min,direccion_viento_mean,direccion_viento_max,direccion_viento_min
326,2025-11-02,,,,,,,,,,,,,,,


In [419]:
datos_met_dia = datos_diarios.dropna()

print("Cantidad de NaNs restantes:", datos_meteorologicos.isna().sum().sum())
print("Dimensiones finales:", datos_meteorologicos.shape)
datos_met_dia

Cantidad de NaNs restantes: 555
Dimensiones finales: (8783, 7)


Unnamed: 0,fecha,temperatura_mean,temperatura_max,temperatura_min,humedad_mean,humedad_max,humedad_min,precipitacion_mean,precipitacion_max,precipitacion_min,velocidad_viento_mean,velocidad_viento_max,velocidad_viento_min,direccion_viento_mean,direccion_viento_max,direccion_viento_min
0,2024-12-11,17.381087,24.9725,9.47749,57.920761,91.9250,29.7150,0.000000,0.00,0.0,1.900991,4.11595,0.243455,149.745113,218.011,13.09990
1,2024-12-12,17.191979,23.9325,10.72500,63.143242,89.9750,34.1450,0.000000,0.00,0.0,1.972444,5.32260,0.296152,179.717158,284.377,59.84490
2,2024-12-13,18.682500,25.2050,12.17750,63.852192,100.0000,39.8025,0.000000,0.00,0.0,1.622534,2.79213,0.518944,166.069883,234.286,85.27920
3,2024-12-14,20.614062,28.7750,13.86750,49.542387,73.9675,28.0275,0.000000,0.00,0.0,1.563239,2.08691,0.115194,174.777842,317.185,29.63020
4,2024-12-15,16.587117,20.5150,12.96330,73.146362,91.4000,54.9975,0.000000,0.00,0.0,2.501182,4.54567,0.074868,281.835240,346.968,0.04225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2025-12-07,17.747083,22.2000,13.60500,75.487292,100.0000,56.9675,0.000000,0.00,0.0,2.345541,4.22102,0.407846,265.277469,348.451,9.16525
362,2025-12-08,17.176979,23.5175,14.38250,81.359379,95.4500,54.5475,0.000000,0.00,0.0,2.549238,4.28088,0.224354,313.715458,352.394,193.91000
363,2025-12-09,16.344387,18.1750,14.89500,83.389387,100.0000,65.9133,0.014706,0.25,0.0,2.143824,5.33063,0.771174,305.171333,359.431,246.44100
364,2025-12-10,17.034479,20.0625,14.66000,84.020837,100.0000,67.8075,0.000000,0.00,0.0,1.926764,4.15968,0.119706,313.728417,353.370,248.70900


Datos de contaminates 

In [420]:
df_dioxido_nitrogeno 

Unnamed: 0,Registros validados,Registros preliminares,Registros no validados,Unnamed: 5,fecha_hora
0,7.32,,,,2024-12-10
1,4.26,,,,2024-12-11
2,2.46,,,,2024-12-12
3,2.60,,,,2024-12-13
4,2.89,,,,2024-12-14
...,...,...,...,...,...
360,,2.01,,,2025-12-05
361,,2.53,,,2025-12-06
362,,1.68,,,2025-12-07
363,,1.68,,,2025-12-08


In [421]:
datos_contaminantes = df_dioxido_nitrogeno["fecha_hora"].to_frame()
datos_contaminantes.rename(columns={"fecha_hora": "fecha"}, inplace=True)
datos_contaminantes

Unnamed: 0,fecha
0,2024-12-10
1,2024-12-11
2,2024-12-12
3,2024-12-13
4,2024-12-14
...,...
360,2025-12-05
361,2025-12-06
362,2025-12-07
363,2025-12-08


In [422]:
contaminantes = [df_dioxido_nitrogeno, df_mp10, df_mp25, df_oxido_nitrogeno]

cols_a_promediar = ['Registros preliminares', 'Registros validados', 'Registros no validados']

for df in contaminantes:
    cols_existentes = [col for col in cols_a_promediar if col in df.columns]
    df["Registros"] = df[cols_existentes].mean(axis=1)

In [423]:
datos_contaminantes["N02"] = df_dioxido_nitrogeno["Registros"]
datos_contaminantes["MP10"] = df_mp10["Registros"]
datos_contaminantes["MP2.5"] = df_mp25["Registros"]
datos_contaminantes["NO"] = df_oxido_nitrogeno["Registros"]
datos_contaminantes

Unnamed: 0,fecha,N02,MP10,MP2.5,NO
0,2024-12-10,7.32,25.0,6.0,9.84380
1,2024-12-11,4.26,22.0,3.0,5.34077
2,2024-12-12,2.46,20.0,4.0,3.75766
3,2024-12-13,2.60,23.0,4.0,3.66602
4,2024-12-14,2.89,28.0,8.0,3.91333
...,...,...,...,...,...
360,2025-12-05,2.01,28.0,6.0,3.01881
361,2025-12-06,2.53,33.0,8.0,3.54896
362,2025-12-07,1.68,36.0,12.0,2.68929
363,2025-12-08,1.68,34.0,10.0,2.71201


In [424]:
datos_contaminantes

Unnamed: 0,fecha,N02,MP10,MP2.5,NO
0,2024-12-10,7.32,25.0,6.0,9.84380
1,2024-12-11,4.26,22.0,3.0,5.34077
2,2024-12-12,2.46,20.0,4.0,3.75766
3,2024-12-13,2.60,23.0,4.0,3.66602
4,2024-12-14,2.89,28.0,8.0,3.91333
...,...,...,...,...,...
360,2025-12-05,2.01,28.0,6.0,3.01881
361,2025-12-06,2.53,33.0,8.0,3.54896
362,2025-12-07,1.68,36.0,12.0,2.68929
363,2025-12-08,1.68,34.0,10.0,2.71201


In [425]:
datos_met_dia

Unnamed: 0,fecha,temperatura_mean,temperatura_max,temperatura_min,humedad_mean,humedad_max,humedad_min,precipitacion_mean,precipitacion_max,precipitacion_min,velocidad_viento_mean,velocidad_viento_max,velocidad_viento_min,direccion_viento_mean,direccion_viento_max,direccion_viento_min
0,2024-12-11,17.381087,24.9725,9.47749,57.920761,91.9250,29.7150,0.000000,0.00,0.0,1.900991,4.11595,0.243455,149.745113,218.011,13.09990
1,2024-12-12,17.191979,23.9325,10.72500,63.143242,89.9750,34.1450,0.000000,0.00,0.0,1.972444,5.32260,0.296152,179.717158,284.377,59.84490
2,2024-12-13,18.682500,25.2050,12.17750,63.852192,100.0000,39.8025,0.000000,0.00,0.0,1.622534,2.79213,0.518944,166.069883,234.286,85.27920
3,2024-12-14,20.614062,28.7750,13.86750,49.542387,73.9675,28.0275,0.000000,0.00,0.0,1.563239,2.08691,0.115194,174.777842,317.185,29.63020
4,2024-12-15,16.587117,20.5150,12.96330,73.146362,91.4000,54.9975,0.000000,0.00,0.0,2.501182,4.54567,0.074868,281.835240,346.968,0.04225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2025-12-07,17.747083,22.2000,13.60500,75.487292,100.0000,56.9675,0.000000,0.00,0.0,2.345541,4.22102,0.407846,265.277469,348.451,9.16525
362,2025-12-08,17.176979,23.5175,14.38250,81.359379,95.4500,54.5475,0.000000,0.00,0.0,2.549238,4.28088,0.224354,313.715458,352.394,193.91000
363,2025-12-09,16.344387,18.1750,14.89500,83.389387,100.0000,65.9133,0.014706,0.25,0.0,2.143824,5.33063,0.771174,305.171333,359.431,246.44100
364,2025-12-10,17.034479,20.0625,14.66000,84.020837,100.0000,67.8075,0.000000,0.00,0.0,1.926764,4.15968,0.119706,313.728417,353.370,248.70900


In [426]:

datos_met_dia['fecha'] = pd.to_datetime(datos_met_dia['fecha'])
datos_contaminantes['fecha'] = pd.to_datetime(datos_contaminantes['fecha'])

df_final = pd.merge(
    datos_met_dia,
    datos_contaminantes,
    on='fecha',
    how='outer'
)

df_final = df_final.sort_values('fecha').reset_index(drop=True)
df_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datos_met_dia['fecha'] = pd.to_datetime(datos_met_dia['fecha'])


Unnamed: 0,fecha,temperatura_mean,temperatura_max,temperatura_min,humedad_mean,humedad_max,humedad_min,precipitacion_mean,precipitacion_max,precipitacion_min,velocidad_viento_mean,velocidad_viento_max,velocidad_viento_min,direccion_viento_mean,direccion_viento_max,direccion_viento_min,N02,MP10,MP2.5,NO
0,2024-12-10,,,,,,,,,,,,,,,,7.32,25.0,6.0,9.8438
1,2024-12-11,17.381087,24.9725,9.47749,57.920761,91.925,29.715,0.0,0.0,0.0,1.900991,4.11595,0.243455,149.745113,218.011,13.0999,4.26,22.0,3.0,5.34077
2,2024-12-12,17.191979,23.9325,10.725,63.143242,89.975,34.145,0.0,0.0,0.0,1.972444,5.3226,0.296152,179.717158,284.377,59.8449,2.46,20.0,4.0,3.75766
3,2024-12-13,18.6825,25.205,12.1775,63.852192,100.0,39.8025,0.0,0.0,0.0,1.622534,2.79213,0.518944,166.069883,234.286,85.2792,2.6,23.0,4.0,3.66602
4,2024-12-14,20.614062,28.775,13.8675,49.542387,73.9675,28.0275,0.0,0.0,0.0,1.563239,2.08691,0.115194,174.777842,317.185,29.6302,2.89,28.0,8.0,3.91333


In [427]:
filas_con_nan = df_final[df_final.isna().any(axis=1)]
filas_con_nan

Unnamed: 0,fecha,temperatura_mean,temperatura_max,temperatura_min,humedad_mean,humedad_max,humedad_min,precipitacion_mean,precipitacion_max,precipitacion_min,velocidad_viento_mean,velocidad_viento_max,velocidad_viento_min,direccion_viento_mean,direccion_viento_max,direccion_viento_min,N02,MP10,MP2.5,NO
0,2024-12-10,,,,,,,,,,,,,,,,7.32,25.0,6.0,9.8438
77,2025-02-25,19.716555,28.9967,12.8825,56.053109,89.05,25.645,0.0,0.0,0.0,1.159419,2.00408,0.066891,164.941359,235.268,97.4139,,,5.0,
183,2025-06-11,12.154479,14.38,10.4,78.790417,100.0,63.305,0.739583,6.5,0.0,3.519465,4.61714,2.75702,238.473642,354.271,13.9116,2.92,,14.0,4.65534
326,2025-11-01,9.622667,12.2925,7.9915,93.004722,100.0,76.0175,0.0,0.0,0.0,1.401916,2.13945,0.900119,137.635333,184.178,114.691,,,,
327,2025-11-02,,,,,,,,,,,,,,,,,,,
328,2025-11-03,13.245687,14.4425,11.328,79.710338,88.325,69.4301,0.0,0.0,0.0,1.827728,2.644,1.13809,294.189778,347.166,2.24208e-44,,,,
334,2025-11-09,10.741671,11.4925,9.92,89.140471,92.9,85.6,0.0,0.0,0.0,0.373094,0.55094,0.09309,98.496671,298.964,22.7062,,,,
335,2025-11-10,15.916317,18.3375,13.44,65.696875,80.775,50.9925,0.0,0.0,0.0,3.290768,6.12683,0.943521,262.066308,334.714,210.555,,,,
364,2025-12-09,16.344387,18.175,14.895,83.389387,100.0,65.9133,0.014706,0.25,0.0,2.143824,5.33063,0.771174,305.171333,359.431,246.441,,,,
365,2025-12-10,17.034479,20.0625,14.66,84.020837,100.0,67.8075,0.0,0.0,0.0,1.926764,4.15968,0.119706,313.728417,353.37,248.709,,,,


In [428]:
# Te quedas desde la fila 1 hasta el final
df_final = df_final.iloc[1:]

# Es buena práctica reiniciar el índice después para que vuelva a empezar en 0
df_final = df_final.reset_index(drop=True)


In [429]:
promedios = df_final.mean(numeric_only=True)
df_final = df_final.fillna(promedios)

In [430]:
print(df_final.isna().sum())

pd.DataFrame({
    'columna': df_final.columns,
    'dias_con_NaN': df_final.isna().sum().values
})

# Opción 3: Porcentaje de NaN por columna
pd.DataFrame({
    'columna': df_final.columns,
    'dias_con_NaN': df_final.isna().sum().values,
    'porcentaje_NaN': (df_final.isna().sum() / len(df_final) * 100).values
})

fecha                    0
temperatura_mean         0
temperatura_max          0
temperatura_min          0
humedad_mean             0
humedad_max              0
humedad_min              0
precipitacion_mean       0
precipitacion_max        0
precipitacion_min        0
velocidad_viento_mean    0
velocidad_viento_max     0
velocidad_viento_min     0
direccion_viento_mean    0
direccion_viento_max     0
direccion_viento_min     0
N02                      0
MP10                     0
MP2.5                    0
NO                       0
dtype: int64


Unnamed: 0,columna,dias_con_NaN,porcentaje_NaN
0,fecha,0,0.0
1,temperatura_mean,0,0.0
2,temperatura_max,0,0.0
3,temperatura_min,0,0.0
4,humedad_mean,0,0.0
5,humedad_max,0,0.0
6,humedad_min,0,0.0
7,precipitacion_mean,0,0.0
8,precipitacion_max,0,0.0
9,precipitacion_min,0,0.0


In [431]:
df_final.to_csv("datos_completos_estacion_punteras.csv", index=False)