In [None]:
# Imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import os
from datetime import datetime, timedelta
import sys

# Configuration: location of the collected data and pandas display limits
DATA_FILE = "../data/pollution_data.json"
pd.options.display.max_columns = None  # None = never truncate columns
pd.options.display.max_rows = 100

## 1. Cargar Datos

In [None]:
# Load the collected pollution records into `df`.
# Fail fast when the file is missing: every later cell depends on `df`, so
# merely printing a message would only resurface as a NameError further down.
if not os.path.exists(DATA_FILE):
    print(f"Error: {DATA_FILE} no encontrado")
    print("Primero debes ejecutar el dashboard para recolectar datos")
    raise FileNotFoundError(DATA_FILE)

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which differs between operating systems.
with open(DATA_FILE, 'r', encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Quick overview of what was loaded
print(f"Datos cargados: {len(df)} registros")
print(f"\nRango temporal: {df['timestamp'].min()} a {df['timestamp'].max()}")
print(f"\nColumnas: {df.columns.tolist()}")
print(f"\nTipo de datos:\n{df.dtypes}")

In [None]:
# Parse the timestamp column into pandas datetimes and store it back
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = parsed_timestamps

# Preview the first rows; left as the last expression so the notebook renders it
print("Primeros 5 registros:")
df.head(5)

## 2. Estadísticas Descriptivas

In [None]:
# Descriptive statistics per pollutant
gases = ['co', 'no', 'no2', 'o3', 'so2', 'nh3', 'pm25', 'pm10']
# Restrict to the pollutants that are actually present in the loaded data
available_gases = [g for g in gases if g in df.columns]

# Summary statistics for the pollutants plus the AQI column
stats = df[available_gases + ['aqi']].describe()
# Header text repaired: the units were stored as mis-decoded characters
print("Estadísticas Descriptivas (μg/m³):")
print(stats.round(2))

In [None]:
# Pairwise correlation between the available pollutants and the AQI
correlation = df[available_gases + ['aqi']].corr()

# Heatmap on a fixed [-1, 1] scale so colours are comparable between runs.
# `px` is already imported in the notebook's import cell; the duplicate
# import that used to live here was removed.
fig = px.imshow(
    correlation,
    labels=dict(color="Correlación"),
    title="Matriz de Correlación - Gases y AQI",
    color_continuous_scale="RdBu_r",
    zmin=-1,
    zmax=1,
)
fig.show()

## 3. Visualizaciones

In [None]:
# AQI over time.
# The line is drawn in row order, so sort by timestamp first; otherwise
# out-of-order records would make the trace double back on itself.
fig = px.line(
    df.sort_values('timestamp'),
    x='timestamp',
    y='aqi',
    title='Evolución del Índice de Calidad del Aire (AQI)',
    labels={'aqi': 'AQI Level (1-5)', 'timestamp': 'Tiempo'},
    markers=True,
)
fig.show()

In [None]:
# Distribution of AQI levels: number of records per level, in ascending order
aqi_counts = df['aqi'].value_counts().sort_index()
aqi_labels = {1: 'Good', 2: 'Fair', 3: 'Moderate', 4: 'Poor', 5: 'Very Poor'}

fig = px.bar(
    # Fall back to the raw value for any level without a label
    x=[aqi_labels.get(i, str(i)) for i in aqi_counts.index],
    y=aqi_counts.values,
    title='Distribución de Niveles de AQI',
    labels={'x': 'Nivel de AQI', 'y': 'Frecuencia', 'color': 'AQI'},
    color=aqi_counts.index,
    color_continuous_scale='RdYlGn_r',
    # Pin the colour scale to the full 1-5 range; without this the scale
    # stretches over only the observed levels, so the same AQI level would be
    # coloured differently depending on which levels happen to be present.
    range_color=[1, 5],
)
fig.show()

In [None]:
# Box plot of pollutant concentrations.
# Reshape wide -> long in one vectorised step (one row per gas/value pair)
# instead of appending a dict per value in a Python loop. Column labels are
# upper-cased first so they become the category names on the x axis.
df_box = (
    df[available_gases]
    .rename(columns=str.upper)
    .melt(var_name='Gas', value_name='Concentración')
)

fig = px.box(
    df_box,
    x='Gas',
    y='Concentración',
    title='Distribución de Concentraciones de Gases (μg/m³)',
    points='outliers',
)
fig.show()

In [None]:
# Time series of the main pollutants on a 2x2 grid.
# Each entry: (dataframe column, display label, grid row, grid column),
# listed in row-major order so the titles line up with the panels.
pollutant_panels = [
    ('co', 'CO', 1, 1),
    ('no2', 'NO₂', 1, 2),
    ('pm25', 'PM2.5', 2, 1),
    ('pm10', 'PM10', 2, 2),
]

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[panel_label for _, panel_label, _, _ in pollutant_panels],
)

for panel_column, panel_label, panel_row, panel_col in pollutant_panels:
    # Skip pollutants missing from the data; their panel stays empty
    if panel_column not in df.columns:
        continue
    fig.add_trace(
        go.Scatter(x=df['timestamp'], y=df[panel_column], name=panel_label, mode='lines'),
        row=panel_row, col=panel_col,
    )

fig.update_layout(title_text="Serie Temporal de Contaminantes Principales",
                  height=600,
                  showlegend=False)
fig.show()

## 4. Análisis por Hora del Día

In [None]:
# Derive the hour of day and the calendar date from each timestamp
df['hour'] = df['timestamp'].dt.hour
df['date'] = df['timestamp'].dt.date

# Per-hour AQI summary: mean, extremes and number of records
hourly_aqi = df.groupby('hour')['aqi'].agg(['mean', 'min', 'max', 'count'])

fig = px.bar(
    x=hourly_aqi.index,
    y=hourly_aqi['mean'],
    # Asymmetric error bars spanning min..max. Passing only `error_y` draws a
    # symmetric bar, so the upper whisker previously showed mean + (mean - min)
    # rather than the observed maximum.
    error_y=hourly_aqi['max'] - hourly_aqi['mean'],
    error_y_minus=hourly_aqi['mean'] - hourly_aqi['min'],
    title='AQI Promedio por Hora del Día',
    labels={'x': 'Hora', 'y': 'AQI Promedio'},
)
fig.show()

## 5. Calidad de Datos

In [None]:
# Count of missing values in each column
missing_per_column = df.isnull().sum()
print("Valores faltantes por columna:")
print(missing_per_column)

# Density: share of non-null cells over all cells (rows x columns)
total_cells = len(df) * len(df.columns)
missing_fraction = missing_per_column.sum() / total_cells
print(f"\nDensidad de datos: {(1 - missing_fraction) * 100:.2f}%")

In [None]:
# Outlier detection using the interquartile range (IQR)
def detect_outliers_iqr(series: pd.Series, factor: float = 1.5) -> pd.Series:
    """Flag values lying outside the IQR fences of ``series``.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        series: Numeric values to inspect.
        factor: Fence width as a multiple of the IQR. The default of 1.5
            reproduces the previously hard-coded behaviour.

    Returns:
        Boolean Series aligned with ``series``; True where the value is
        strictly below the lower fence or strictly above the upper fence.
    """
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    return (series < lower_fence) | (series > upper_fence)

# Report, for each available pollutant, how many values the IQR rule flags
# and what share of all rows that represents
print("Outliers detectados por gas:")
total_rows = len(df)
for gas in available_gases:
    flagged_count = detect_outliers_iqr(df[gas]).sum()
    flagged_share = (flagged_count / total_rows) * 100
    print(f"  {gas.upper()}: {flagged_count} ({flagged_share:.2f}%)")

## 6. Resumen Ejecutivo

In [None]:
# Executive summary: dataset extent, AQI level breakdown, mean concentrations
# and data completeness. User-facing text below was repaired from
# mis-decoded characters.
print("="*60)
print("RESUMEN EJECUTIVO - ANÁLISIS DE DATOS")
print("="*60)

# Dataset size and time span
first_timestamp = df['timestamp'].min()
last_timestamp = df['timestamp'].max()
print("\n📊 Dataset:")
print(f"  - Total de registros: {len(df):,}")
print(f"  - Rango temporal: {first_timestamp.strftime('%Y-%m-%d %H:%M')} a {last_timestamp.strftime('%Y-%m-%d %H:%M')}")
print(f"  - Duración: {(last_timestamp - first_timestamp).total_seconds() / 3600:.1f} horas")

# Share of records at each AQI level; the label table is built once instead
# of being recreated on every loop iteration
labels = {1: 'Good 😊', 2: 'Fair 😐', 3: 'Moderate ⚠️', 4: 'Poor 😷', 5: 'Very Poor 💀'}
print("\n🌬️ Niveles de AQI:")
for i in range(1, 6):
    count = (df['aqi'] == i).sum()
    pct = (count / len(df)) * 100
    print(f"  - {labels[i]}: {count} ({pct:.1f}%)")

# Mean concentration of each available pollutant
print("\n📈 Concentraciones Promedio (μg/m³):")
for gas in available_gases:
    mean = df[gas].mean()
    print(f"  - {gas.upper()}: {mean:.2f}")

# Completeness: share of non-null cells, and number of fully populated rows
print("\n✅ Calidad de Datos:")
print(f"  - Completitud: {(1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100:.2f}%")
print(f"  - Registro sin valores faltantes: {len(df.dropna())} / {len(df)}")

print("\n" + "="*60)