```
Index(['Measurement date', 'Station code', 'Latitude', 'Longitude', 'SO2',
       'NO2', 'O3', 'CO', 'PM10', 'PM2.5'],
      dtype='object') dataset: data
Index(['Measurement date', 'Station code', 'Latitude', 'Longitude', 'SO2',
       'NO2', 'O3', 'CO', 'PM10', 'PM2.5'],
      dtype='object') dataset: instrument
Index(['Measurement date', 'Station code', 'Latitude', 'Longitude', 'SO2',
       'NO2', 'O3', 'CO', 'PM10', 'PM2.5'],
      dtype='object') dataset: pollutant```

# 92/550 puntos

In [25]:
import pandas as pd
import json
import pickle
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# Cargar los datos
data = pd.read_csv("data/raw/measurement_data.csv")
instrument_data = pd.read_csv("data/raw/instrument_data.csv", parse_dates=['Measurement date'])

print(instrument_data["Instrument status"].value_counts(normalize=True) * 100)

# Convertir fechas a tipo datetime
data['Measurement date'] = pd.to_datetime(data['Measurement date'])
instrument_data['Measurement date'] = pd.to_datetime(instrument_data['Measurement date'])

# Realizar el merge entre data e instrument_data
merged_data = pd.merge(data, instrument_data, on=['Station code', 'Measurement date'], how='inner')

# Extraer características temporales
merged_data["hour"] = merged_data["Measurement date"].dt.hour
merged_data["month"] = merged_data["Measurement date"].dt.month
merged_data["weekday"] = merged_data["Measurement date"].dt.weekday
merged_data["is_weekend"] = merged_data["weekday"].isin([5, 6]).astype(int)

# Crear nuevas características
# Promedio móvil de 3 horas para los contaminantes
for pollutant in ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']:
    merged_data[f'{pollutant}_rolling_mean'] = merged_data[pollutant].rolling(window=3).mean()

# Relación entre contaminantes
merged_data['SO2_NO2'] = merged_data['SO2'] / (merged_data['NO2'] + 1e-5)  # Agregar un pequeño valor para evitar división por 0
merged_data['O3_PM10'] = merged_data['O3'] / (merged_data['PM10'] + 1e-5)
merged_data['SO2_O3'] = merged_data['SO2'] / (merged_data['O3'] + 1e-5)

# Clasificar el `Instrument status` en binario (0 -> Normal, 1 -> Anómalo)
merged_data['instrument_status_binary'] = np.where(merged_data['Instrument status'] == 0, 0, 1)


# Definir las estaciones y contaminantes con sus períodos
stations = {
    "205": ("SO2", '2023-11-01 00:00:00', '2023-11-30 23:00:00'),
    "209": ("NO2", '2023-09-01 00:00:00', '2023-09-30 23:00:00'),
    "223": ("O3", '2023-07-01 00:00:00', '2023-07-31 23:00:00'),
    "224": ("CO", '2023-10-01 00:00:00', '2023-10-31 23:00:00'),
    "226": ("PM10", '2023-08-01 00:00:00', '2023-08-31 23:00:00'),
    "227": ("PM2.5", '2023-12-01 00:00:00', '2023-12-31 23:00:00')
}

# Inicializar el diccionario de resultados
output = {"target": {}}

for station_code, (pollutant, start_date, end_date) in stations.items():
    print(f"\nProcesando estación {station_code} con contaminante {pollutant}")
    station_data = merged_data[(merged_data['Station code'] == int(station_code)) & (merged_data[pollutant] >= 0)].copy()
    
    if station_data.empty:
        continue
    
    # Aplicar Isolation Forest para detectar anomalías
    features = ['hour', 'weekday', 'month', pollutant]
    iso_forest = IsolationForest(contamination=0.03, random_state=42)
    station_data['anomaly'] = iso_forest.fit_predict(station_data[features])
    station_data['anomaly'] = np.where(station_data['anomaly'] == -1, 1, 0)
    
    # Filtrar anomalías detectadas
    anomalies = station_data[station_data['anomaly'] == 1].copy()
    if anomalies.empty:
        continue
    
    # Entrenar modelo multiclase para clasificar anomalías
    X_anomalies = anomalies[features]
    y_anomalies = anomalies['Instrument status']
    if y_anomalies.isnull().all():
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X_anomalies, y_anomalies, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Clasificación de anomalías - F1 Score: {f1_score(y_test, y_pred, average='macro'):.4f}")
    
    # Predecir anomalías en el período solicitado
    date_range = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date), freq='h')
    forecast_data = pd.DataFrame({'Measurement date': date_range})
    forecast_data['hour'] = forecast_data['Measurement date'].dt.hour
    forecast_data['weekday'] = forecast_data['Measurement date'].dt.weekday
    forecast_data['month'] = forecast_data['Measurement date'].dt.month
    forecast_data[pollutant] = 0  # Se podría usar un valor estimado o promedio
    
    # Aplicar Isolation Forest en los datos futuros
    forecast_data['anomaly'] = iso_forest.predict(forecast_data[features])
    forecast_data['anomaly'] = np.where(forecast_data['anomaly'] == -1, 1, 0)
    
    anomalous_data = forecast_data[forecast_data['anomaly'] == 1].copy()
    if not anomalous_data.empty:
        anomalous_data['anomaly_type'] = clf.predict(anomalous_data[features])
    
    final_results = forecast_data.merge(anomalous_data[['Measurement date', 'anomaly_type']], on='Measurement date', how='left')
    output["target"][station_code] = {str(date): int(pred) if not np.isnan(pred) else 0 for date, pred in zip(final_results['Measurement date'], final_results['anomaly_type'].fillna(0))}

# Guardar predicciones
output_filename = "predictions/predictions_task_3.json"
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
with open(output_filename, 'w') as f:
    json.dump(output, f, indent=4)

print(f"Predicciones guardadas en {output_filename}")
print(instrument_data["Instrument status"].value_counts(normalize=True) * 100)

Instrument status
0    97.381889
8     0.764730
1     0.732977
9     0.531042
4     0.484925
2     0.104437
Name: proportion, dtype: float64

Procesando estación 205 con contaminante SO2
Clasificación de anomalías - F1 Score: 0.6665

Procesando estación 209 con contaminante NO2
Clasificación de anomalías - F1 Score: 0.6869

Procesando estación 223 con contaminante O3
Clasificación de anomalías - F1 Score: 0.4489

Procesando estación 224 con contaminante CO
Clasificación de anomalías - F1 Score: 0.4498

Procesando estación 226 con contaminante PM10
Clasificación de anomalías - F1 Score: 0.5407

Procesando estación 227 con contaminante PM2.5
Clasificación de anomalías - F1 Score: 0.5620
Predicciones guardadas en predictions/predictions_task_3.json
Instrument status
0    97.381889
8     0.764730
1     0.732977
9     0.531042
4     0.484925
2     0.104437
Name: proportion, dtype: float64


In [18]:
print(station_data.describe())

       Station code      Latitude     Longitude           SO2           NO2  \
count       24453.0  2.445300e+04  2.445300e+04  24453.000000  24453.000000   
mean          205.0  3.756426e+01  1.269747e+02      0.003272      0.031749   
std             0.0  7.105573e-15  1.421115e-14      0.018240      0.025728   
min           205.0  3.756426e+01  1.269747e+02     -1.000000     -1.000000   
25%           205.0  3.756426e+01  1.269747e+02      0.003000      0.019000   
50%           205.0  3.756426e+01  1.269747e+02      0.003000      0.029000   
75%           205.0  3.756426e+01  1.269747e+02      0.004000      0.043000   
max           205.0  3.756426e+01  1.269747e+02      0.082000      0.248000   

                 O3            CO          PM10         PM2.5     Item code  \
count  24453.000000  24453.000000  24453.000000  24453.000000  24453.000000   
mean       0.024997      0.496716     37.973459     22.708502      1.518219   
std        0.042638      0.256208     25.970254    

In [15]:
print(merged_data.describe())

                    Measurement date  Station code      Latitude  \
count                        3703662  3.703662e+06  3.703662e+06   
mean   2022-06-04 01:49:42.133468160  2.159581e+02  3.755426e+01   
min              2021-01-01 00:00:00  2.040000e+02  3.745236e+01   
25%              2021-09-15 04:00:00  2.100000e+02  3.751753e+01   
50%              2022-05-30 09:00:00  2.160000e+02  3.754496e+01   
75%              2023-02-11 15:00:00  2.220000e+02  3.758485e+01   
max              2023-12-31 23:00:00  2.280000e+02  3.765877e+01   
std                              NaN  7.176528e+00  5.342886e-02   

          Longitude           SO2           NO2            O3            CO  \
count  3.703662e+06  3.703662e+06  3.703662e+06  3.703662e+06  3.703662e+06   
mean   1.269889e+02 -7.829759e-04  2.350889e-02  1.933672e-02  5.095291e-01   
min    1.268352e+02 -1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00   
25%    1.269271e+02  3.000000e-03  1.600000e-02  9.000000e-03  3.000000

# Otros

In [21]:
import pandas as pd
import json
import pickle
import os
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# Cargar los datos
data = pd.read_csv("data/raw/measurement_data.csv")
instrument_data = pd.read_csv("data/raw/instrument_data.csv", parse_dates=['Measurement date'])

print(instrument_data["Instrument status"].value_counts(normalize=True) * 100)

# Convertir fechas a tipo datetime
data['Measurement date'] = pd.to_datetime(data['Measurement date'])
instrument_data['Measurement date'] = pd.to_datetime(instrument_data['Measurement date'])

# Realizar el merge entre data e instrument_data
merged_data = pd.merge(data, instrument_data, on=['Station code', 'Measurement date'], how='inner')

# Extraer características temporales
merged_data["hour"] = merged_data["Measurement date"].dt.hour
merged_data["month"] = merged_data["Measurement date"].dt.month
merged_data["weekday"] = merged_data["Measurement date"].dt.weekday
merged_data["is_weekend"] = merged_data["weekday"].isin([5, 6]).astype(int)

# Crear nuevas características
# Promedio móvil de 3 horas para los contaminantes
for pollutant in ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']:
    merged_data[f'{pollutant}_rolling_mean'] = merged_data[pollutant].rolling(window=3).mean()

# Relación entre contaminantes
merged_data['SO2_NO2'] = merged_data['SO2'] / (merged_data['NO2'] + 1e-5)  # Agregar un pequeño valor para evitar división por 0
merged_data['O3_PM10'] = merged_data['O3'] / (merged_data['PM10'] + 1e-5)
merged_data['SO2_O3'] = merged_data['SO2'] / (merged_data['O3'] + 1e-5)

# Eliminar valores negativos para los contaminantes de todo el conjunto de datos
contaminants = ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5', 
                'SO2_rolling_mean', 'NO2_rolling_mean', 'O3_rolling_mean', 
                'CO_rolling_mean', 'PM10_rolling_mean', 'PM2.5_rolling_mean', 
                'SO2_NO2', 'O3_PM10', 'SO2_O3']
merged_data = merged_data[(merged_data[contaminants] >= 0).all(axis=1)]

# Clasificar el `Instrument status` en binario (0 -> Normal, 1 -> Anómalo)
merged_data['instrument_status_binary'] = np.where(merged_data['Instrument status'] == 0, 0, 1)

# Definir las estaciones y contaminantes con sus períodos
stations = {
    "205": ("SO2", '2023-11-01 00:00:00', '2023-11-30 23:00:00'),
    "209": ("NO2", '2023-09-01 00:00:00', '2023-09-30 23:00:00'),
    "223": ("O3", '2023-07-01 00:00:00', '2023-07-31 23:00:00'),
    "224": ("CO", '2023-10-01 00:00:00', '2023-10-31 23:00:00'),
    "226": ("PM10", '2023-08-01 00:00:00', '2023-08-31 23:00:00'),
    "227": ("PM2.5", '2023-12-01 00:00:00', '2023-12-31 23:00:00')
}

# Inicializar el diccionario de resultados
output = {"target": {}}

for station_code, (pollutant, start_date, end_date) in stations.items():
    print(f"\nProcesando estación {station_code} con contaminante {pollutant}")
    station_data = merged_data[(merged_data['Station code'] == int(station_code)) & (merged_data[pollutant] >= 0)].copy()
    
    if station_data.empty:
        continue
    
    # Aplicar Isolation Forest para detectar anomalías
    features = ['hour', 'weekday', 'month', pollutant]
    iso_forest = IsolationForest(contamination=0.04, random_state=42)
    station_data['anomaly'] = iso_forest.fit_predict(station_data[features])
    station_data['anomaly'] = np.where(station_data['anomaly'] == -1, 1, 0)
    
    # Filtrar anomalías detectadas
    anomalies = station_data[station_data['anomaly'] == 1].copy()
    if anomalies.empty:
        continue
    
    # Entrenar modelo multiclase para clasificar anomalías
    X_anomalies = anomalies[features]
    y_anomalies = anomalies['Instrument status']
    if y_anomalies.isnull().all():
        continue
    
    X_train, X_test, y_train, y_test = train_test_split(X_anomalies, y_anomalies, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(f"Clasificación de anomalías - F1 Score: {f1_score(y_test, y_pred, average='macro'):.4f}")
    
    # Predecir anomalías en el período solicitado
    date_range = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date), freq='h')
    forecast_data = pd.DataFrame({'Measurement date': date_range})
    forecast_data['hour'] = forecast_data['Measurement date'].dt.hour
    forecast_data['weekday'] = forecast_data['Measurement date'].dt.weekday
    forecast_data['month'] = forecast_data['Measurement date'].dt.month
    forecast_data[pollutant] = 0  # Se podría usar un valor estimado o promedio
    
    # Aplicar Isolation Forest en los datos futuros
    forecast_data['anomaly'] = iso_forest.predict(forecast_data[features])
    forecast_data['anomaly'] = np.where(forecast_data['anomaly'] == -1, 1, 0)
    
    anomalous_data = forecast_data[forecast_data['anomaly'] == 1].copy()
    if not anomalous_data.empty:
        anomalous_data['anomaly_type'] = clf.predict(anomalous_data[features])
    
    final_results = forecast_data.merge(anomalous_data[['Measurement date', 'anomaly_type']], on='Measurement date', how='left')
    output["target"][station_code] = {str(date): int(pred) if not np.isnan(pred) else 0 for date, pred in zip(final_results['Measurement date'], final_results['anomaly_type'].fillna(0))}


# Guardar predicciones
output_filename = "predictions/predictions_task_3.json"
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
with open(output_filename, 'w') as f:
    json.dump(output, f, indent=4)

print(f"Predicciones guardadas en {output_filename}")
print(instrument_data["Instrument status"].value_counts(normalize=True) * 100)



Instrument status
0    97.381889
8     0.764730
1     0.732977
9     0.531042
4     0.484925
2     0.104437
Name: proportion, dtype: float64

Procesando estación 205 con contaminante SO2
Clasificación de anomalías - F1 Score: 0.8180

Procesando estación 209 con contaminante NO2
Clasificación de anomalías - F1 Score: 0.7108

Procesando estación 223 con contaminante O3
Clasificación de anomalías - F1 Score: 0.6293

Procesando estación 224 con contaminante CO
Clasificación de anomalías - F1 Score: 0.5358

Procesando estación 226 con contaminante PM10
Clasificación de anomalías - F1 Score: 0.5519

Procesando estación 227 con contaminante PM2.5
Clasificación de anomalías - F1 Score: 0.5855
Predicciones guardadas en predictions/predictions_task_3.json
Instrument status
0    97.381889
8     0.764730
1     0.732977
9     0.531042
4     0.484925
2     0.104437
Name: proportion, dtype: float64


In [24]:
station_data.describe()

Unnamed: 0,Measurement date,Station code,Latitude,Longitude,SO2,NO2,O3,CO,PM10,PM2.5,...,NO2_rolling_mean,O3_rolling_mean,CO_rolling_mean,PM10_rolling_mean,PM2.5_rolling_mean,SO2_NO2,O3_PM10,SO2_O3,instrument_status_binary,anomaly
count,150665,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,...,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0,150665.0
mean,2022-06-12 16:07:59.816812032,227.0,37.50269,127.0925,0.004033,0.029552,0.02386,0.564486,46.153446,24.097083,...,0.029552,0.02386,0.564472,46.15103,24.091019,0.509061,11.521873,0.858277,0.029542,0.039969
min,2021-01-01 00:00:00,227.0,37.50269,127.0925,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2021-09-20 21:00:00,227.0,37.50269,127.0925,0.003,0.017,0.006,0.4,22.0,11.0,...,0.017,0.006,0.4,22.0,10.666667,0.09802,0.000156,0.09995,0.0,0.0
50%,2022-06-10 01:00:00,227.0,37.50269,127.0925,0.004,0.026,0.02,0.5,35.0,18.0,...,0.026333,0.02,0.5,35.333333,18.333333,0.139502,0.000583,0.181736,0.0,0.0
75%,2023-02-26 17:00:00,227.0,37.50269,127.0925,0.005,0.04,0.035,0.7,54.0,30.0,...,0.04,0.035,0.7,54.0,30.0,0.214133,0.001263,0.62422,0.0,0.0
max,2023-11-30 23:00:00,227.0,37.50269,127.0925,0.124,0.114,0.17,12.4,985.0,985.0,...,0.114,0.17,12.4,985.0,985.0,1400.0,4300.0,500.0,1.0,1.0
std,,0.0,2.131635e-14,2.84218e-14,0.002701,0.015931,0.020423,0.374957,71.314216,40.327664,...,0.015855,0.020339,0.362727,68.823761,38.42274,13.694722,151.656342,11.399797,0.169322,0.195888


# redes

In [8]:
import pandas as pd
import json
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.ensemble import IsolationForest

# Cargar los datos
data = pd.read_csv("data/raw/measurement_data.csv")
instrument_data = pd.read_csv("data/raw/instrument_data.csv", parse_dates=['Measurement date'])

print(instrument_data["Instrument status"].value_counts(normalize=True) * 100)

# Convertir fechas a tipo datetime
data['Measurement date'] = pd.to_datetime(data['Measurement date'])
instrument_data['Measurement date'] = pd.to_datetime(instrument_data['Measurement date'])

# Realizar el merge entre data e instrument_data
merged_data = pd.merge(data, instrument_data, on=['Station code', 'Measurement date'], how='inner')

# Extraer características temporales
merged_data["hour"] = merged_data["Measurement date"].dt.hour
merged_data["month"] = merged_data["Measurement date"].dt.month
merged_data["weekday"] = merged_data["Measurement date"].dt.weekday
merged_data["is_weekend"] = merged_data["weekday"].isin([5, 6]).astype(int)

# Crear nuevas características
# Promedio móvil de 3 horas para los contaminantes
for pollutant in ['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']:
    merged_data[f'{pollutant}_rolling_mean'] = merged_data[pollutant].rolling(window=3).mean()

# Relación entre contaminantes
merged_data['SO2_NO2'] = merged_data['SO2'] / (merged_data['NO2'] + 1e-5)  # Agregar un pequeño valor para evitar división por 0
merged_data['O3_PM10'] = merged_data['O3'] / (merged_data['PM10'] + 1e-5)
merged_data['SO2_O3'] = merged_data['SO2'] / (merged_data['O3'] + 1e-5)

# Clasificar el `Instrument status` en las categorías proporcionadas
# 0 -> Normal, 1 -> Need for calibration, 2 -> Abnormal, 4 -> Power cut off, 8 -> Under repair, 9 -> Abnormal data
valid_statuses = [0, 1, 2, 4, 8, 9]
merged_data = merged_data[merged_data['Instrument status'].isin(valid_statuses)]

# Mapeo de Instrument status a clases 0-5
status_map = {0: 0, 1: 1, 2: 2, 4: 3, 8: 4, 9: 5}
merged_data['instrument_status_mapped'] = merged_data['Instrument status'].map(status_map)

# Definir las estaciones y contaminantes con sus períodos
stations = {
    "205": ("SO2", '2023-11-01 00:00:00', '2023-11-30 23:00:00'),
    "209": ("NO2", '2023-09-01 00:00:00', '2023-09-30 23:00:00'),
    "223": ("O3", '2023-07-01 00:00:00', '2023-07-31 23:00:00'),
    "224": ("CO", '2023-10-01 00:00:00', '2023-10-31 23:00:00'),
    "226": ("PM10", '2023-08-01 00:00:00', '2023-08-31 23:00:00'),
    "227": ("PM2.5", '2023-12-01 00:00:00', '2023-12-31 23:00:00')
}

# Inicializar el diccionario de resultados
output = {"target": {}}

# Mapeo para las predicciones de clase (0-5) a los valores originales (0, 1, 2, 4, 8, 9)
reverse_status_map = {0: 0, 1: 1, 2: 2, 3: 4, 4: 8, 5: 9}

for station_code, (pollutant, start_date, end_date) in stations.items():
    print(f"\nProcesando estación {station_code} con contaminante {pollutant}")
    station_data = merged_data[(merged_data['Station code'] == int(station_code)) & (merged_data[pollutant] >= 0)].copy()

    if station_data.empty:
        continue

    # Aplicar Isolation Forest para detectar anomalías
    features = ['hour', 'weekday', 'month', pollutant]
    iso_forest = IsolationForest(contamination=0.03, random_state=42)
    station_data['anomaly'] = iso_forest.fit_predict(station_data[features])
    station_data['anomaly'] = np.where(station_data['anomaly'] == -1, 1, 0)

    # Filtrar anomalías detectadas
    anomalies = station_data[station_data['anomaly'] == 1].copy()
    if anomalies.empty:
        continue

    # Entrenar la red neuronal para clasificar anomalías
    X_anomalies = anomalies[features]
    y_anomalies = anomalies['instrument_status_mapped']
    if y_anomalies.isnull().all():
        continue

    X_train, X_test, y_train, y_test = train_test_split(X_anomalies, y_anomalies, test_size=0.2, random_state=42)

    # Red neuronal más compleja
    model = Sequential([
        Dense(256, input_dim=X_train.shape[1], activation='relu'),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(16, activation='relu'),
        Dense(6, activation='softmax')  # Salida multiclase con 6 clases posibles (0, 1, 2, 4, 8, 9)
    ])

    model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # EarlyStopping para evitar el overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stopping])

    y_pred = np.argmax(model.predict(X_test), axis=1)
    print(f"Clasificación de anomalías - F1 Score: {f1_score(y_test, y_pred, average='macro'):.4f}")

    # Predecir anomalías en el período solicitado
    date_range = pd.date_range(start=pd.to_datetime(start_date), end=pd.to_datetime(end_date), freq='h')
    forecast_data = pd.DataFrame({'Measurement date': date_range})
    forecast_data['hour'] = forecast_data['Measurement date'].dt.hour
    forecast_data['weekday'] = forecast_data['Measurement date'].dt.weekday
    forecast_data['month'] = forecast_data['Measurement date'].dt.month
    forecast_data[pollutant] = 0  # Se podría usar un valor estimado o promedio

    # Aplicar Isolation Forest en los datos futuros
    forecast_data['anomaly'] = iso_forest.predict(forecast_data[features])
    forecast_data['anomaly'] = np.where(forecast_data['anomaly'] == -1, 1, 0)

    anomalous_data = forecast_data[forecast_data['anomaly'] == 1].copy()
    if not anomalous_data.empty:
        # Convertir la predicción de clases (0-5) a los valores originales (0, 1, 2, 4, 8, 9)
        anomalous_data['anomaly_type'] = [reverse_status_map[label] for label in np.argmax(model.predict(anomalous_data[features]), axis=1)]

    # Merge de las predicciones de anomalías con los datos de forecast
    final_results = forecast_data.merge(anomalous_data[['Measurement date', 'anomaly_type']], on='Measurement date', how='left')
    output["target"][station_code] = {str(date): int(pred) if not np.isnan(pred) else 0 for date, pred in zip(final_results['Measurement date'], final_results['anomaly_type'].fillna(0))}

# Guardar predicciones
output_filename = "predictions/predictions_task_3.json"
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
with open(output_filename, 'w') as f:
    json.dump(output, f, indent=4)

print(f"Predicciones guardadas en {output_filename}")
print(instrument_data["Instrument status"].value_counts(normalize=True) * 100)



Instrument status
0    97.381889
8     0.764730
1     0.732977
9     0.531042
4     0.484925
2     0.104437
Name: proportion, dtype: float64

Procesando estación 205 con contaminante SO2


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8971 - loss: 0.5006 - val_accuracy: 0.9830 - val_loss: 0.1061
Epoch 2/100
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9697 - loss: 0.1578 - val_accuracy: 0.9830 - val_loss: 0.0888
Epoch 3/100
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9773 - loss: 0.1137 - val_accuracy: 0.9830 - val_loss: 0.1115
Epoch 4/100
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9738 - loss: 0.1181 - val_accuracy: 0.9830 - val_loss: 0.0902
Epoch 5/100
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9741 - loss: 0.1156 - val_accuracy: 0.9830 - val_loss: 0.0819
Epoch 6/100
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9725 - loss: 0.1223 - val_accuracy: 0.9830 - val_loss: 0.0787
Epoch 7/100
[1m110/11

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.9229 - loss: 0.4534 - val_accuracy: 0.9879 - val_loss: 0.0936
Epoch 2/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9893 - loss: 0.1184 - val_accuracy: 0.9879 - val_loss: 0.1034
Epoch 3/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9887 - loss: 0.0952 - val_accuracy: 0.9879 - val_loss: 0.0649
Epoch 4/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9900 - loss: 0.0714 - val_accuracy: 0.9879 - val_loss: 0.0659
Epoch 5/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9881 - loss: 0.0763 - val_accuracy: 0.9879 - val_loss: 0.0747
Epoch 6/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9874 - loss: 0.0768 - val_accuracy: 0.9879 - val_loss: 0.0696
[1m26/26[0m [32m━━━━━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9882 - loss: 0.2837 - val_accuracy: 0.9794 - val_loss: 0.1254
Epoch 2/100
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9904 - loss: 0.0642 - val_accuracy: 0.9794 - val_loss: 0.1067
Epoch 3/100
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9860 - loss: 0.0868 - val_accuracy: 0.9794 - val_loss: 0.1079
Epoch 4/100
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9901 - loss: 0.0522 - val_accuracy: 0.9794 - val_loss: 0.1092
Epoch 5/100
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9872 - loss: 0.0732 - val_accuracy: 0.9794 - val_loss: 0.1091
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
Clasificación de anomalías - F1 Score: 0.2474
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step

Procesando est

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9046 - loss: 0.5750 - val_accuracy: 0.9641 - val_loss: 0.2249
Epoch 2/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9615 - loss: 0.2251 - val_accuracy: 0.9653 - val_loss: 0.1930
Epoch 3/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9570 - loss: 0.2050 - val_accuracy: 0.9641 - val_loss: 0.1718
Epoch 4/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9611 - loss: 0.1699 - val_accuracy: 0.9677 - val_loss: 0.1723
Epoch 5/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9628 - loss: 0.1594 - val_accuracy: 0.9653 - val_loss: 0.1611
Epoch 6/100
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9672 - loss: 0.1431 - val_accuracy: 0.9653 - val_loss: 0.1383
Epoch 7/100
[1m105/105[0m [32m━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8935 - loss: 0.5275 - val_accuracy: 0.9340 - val_loss: 0.4150
Epoch 2/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9400 - loss: 0.3372 - val_accuracy: 0.7597 - val_loss: 0.4858
Epoch 3/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9267 - loss: 0.3040 - val_accuracy: 0.8780 - val_loss: 0.3703
Epoch 4/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9359 - loss: 0.2573 - val_accuracy: 0.9352 - val_loss: 0.2122
Epoch 5/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9505 - loss: 0.2058 - val_accuracy: 0.9066 - val_loss: 0.2405
Epoch 6/100
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9358 - loss: 0.2230 - val_accuracy: 0.9390 - val_loss: 0.1970
Epoch 7/100
[1m101/101[0m [32m━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9055 - loss: 0.8518 - val_accuracy: 0.9480 - val_loss: 0.7321
Epoch 2/100
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9688 - loss: 0.2959 - val_accuracy: 0.9546 - val_loss: 0.2765
Epoch 3/100
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9617 - loss: 0.2260 - val_accuracy: 0.9546 - val_loss: 0.2283
Epoch 4/100
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9629 - loss: 0.1915 - val_accuracy: 0.9291 - val_loss: 0.2280
Epoch 5/100
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9611 - loss: 0.1940 - val_accuracy: 0.9480 - val_loss: 0.2097
Epoch 6/100
[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9633 - loss: 0.1621 - val_accuracy: 0.9247 - val_loss: 0.2505
Epoch 7/100
[1m113/113[0m [32m━