In [2]:
# ----------------------------
# IMPORTS
# ----------------------------
from pathlib import Path
import pandas as pd
import os

# ----------------------------
# BASE DIRECTORY
# ----------------------------
BASE_DIR = Path.cwd().parent  # Project root (one level up from notebooks)

# ----------------------------
# DATA DIRECTORIES
# ----------------------------
DATA_RAW_DIR = BASE_DIR / "data" / "raw"
DATA_PROCESSED_DIR = BASE_DIR / "data" / "processed"

# RESULTS, MODELS, REPORTS DIRECTORIES
RESULTS_DIR = BASE_DIR / "results"
FORECASTS_DIR = RESULTS_DIR / "forecasts"
MODELS_DIR = BASE_DIR / "models"
REPORTS_DIR = BASE_DIR / "reports"

# Ensure directories exist
RESULTS_DIR.mkdir(exist_ok=True)
FORECASTS_DIR.mkdir(exist_ok=True)
REPORTS_DIR.mkdir(exist_ok=True)

print(f"✅ Base directory: {BASE_DIR}")
print(f"✅ Raw data path: {DATA_RAW_DIR}")
print(f"✅ Processed data path: {DATA_PROCESSED_DIR}")

# ----------------------------
# HELPER FUNCTION: SAFE CSV LOAD
# ----------------------------
def safe_read_csv(path, fix_unnamed=False):
    """Read CSV safely; return None if file not found. Optionally fix Unnamed columns."""
    try:
        df = pd.read_csv(path, engine='python')
        if fix_unnamed:
            df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        print(f"✅ Loaded: {path.name} | Shape: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"⚠️ File not found: {path}")
        return None

# ----------------------------
# LOAD RAW CSV FILES
# ----------------------------
city_day_df = safe_read_csv(DATA_RAW_DIR / "city_day.csv")
city_hour_df = safe_read_csv(DATA_RAW_DIR / "city_hour.csv")
station_day_df = safe_read_csv(DATA_RAW_DIR / "station_day.csv")
station_hour_df = safe_read_csv(DATA_RAW_DIR / "station_hour.csv")

# ----------------------------
# LOAD PROCESSED CSV FILES
# ----------------------------
city_day_processed_df = safe_read_csv(DATA_PROCESSED_DIR / "city_day_processed.csv")
city_day_sample_df = safe_read_csv(DATA_PROCESSED_DIR / "city_day_sample.csv")

city_hour_processed_df = safe_read_csv(DATA_PROCESSED_DIR / "city_hour_processed.csv", fix_unnamed=True)
city_hour_sample_df = safe_read_csv(DATA_PROCESSED_DIR / "city_hour_sample.csv", fix_unnamed=True)

station_day_processed_df = safe_read_csv(DATA_PROCESSED_DIR / "station_day.csv")
station_day_sample_df = safe_read_csv(DATA_PROCESSED_DIR / "station_day_sample.csv")

station_hour_processed_df = safe_read_csv(DATA_PROCESSED_DIR / "station_hour.csv")
station_hour_sample_df = safe_read_csv(DATA_PROCESSED_DIR / "station_hour_sample.txt", fix_unnamed=True)

# ----------------------------
# PRINT SUMMARY
# ----------------------------
print("\n✅ Data loading complete.")


✅ Base directory: c:\Users\LENOVO\Desktop\project by info
✅ Raw data path: c:\Users\LENOVO\Desktop\project by info\data\raw
✅ Processed data path: c:\Users\LENOVO\Desktop\project by info\data\processed
✅ Loaded: city_day.csv | Shape: (29531, 16)
✅ Loaded: city_hour.csv | Shape: (707875, 16)
✅ Loaded: station_day.csv | Shape: (108035, 16)
✅ Loaded: station_hour.csv | Shape: (2589083, 16)
✅ Loaded: city_day_processed.csv | Shape: (29531, 17)
✅ Loaded: city_day_sample.csv | Shape: (100, 17)
✅ Loaded: city_hour_processed.csv | Shape: (707875, 37)
✅ Loaded: city_hour_sample.csv | Shape: (100, 37)
⚠️ File not found: c:\Users\LENOVO\Desktop\project by info\data\processed\station_day.csv
✅ Loaded: station_day_sample.csv | Shape: (100, 17)
⚠️ File not found: c:\Users\LENOVO\Desktop\project by info\data\processed\station_hour.csv
⚠️ File not found: c:\Users\LENOVO\Desktop\project by info\data\processed\station_hour_sample.txt

✅ Data loading complete.


In [3]:
# Load Forecasts
forecast_files = ["forecast_aqi.csv", "forecast.csv"]
forecast_data = None
for f in forecast_files:
    file = RESULTS_DIR / f
    if file.exists():
        forecast_data = pd.read_csv(file)
        print(f"Loaded {f} with {len(forecast_data)} rows")
        break

# Load Alerts
alerts = None
for f in ["alerts.csv", "alerts_summary.csv"]:
    file = RESULTS_DIR / f
    if file.exists():
        alerts = pd.read_csv(file)
        print(f"Loaded {f}")
        break

# Load Performance
performance = None
for f in ["model_performance.csv", "forecast_metrics_summary.csv"]:
    file = RESULTS_DIR / f
    if file.exists():
        performance = pd.read_csv(file)
        print(f"Loaded {f}")
        break

# Load Best Models
best_models_file = RESULTS_DIR / "best_models.csv"
best_models = pd.read_csv(best_models_file) if best_models_file.exists() else None

print("✅ Data loaded")


Loaded alerts.csv
Loaded model_performance.csv
✅ Data loaded


In [7]:
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go

# ----------------------------
# BASE & RESULTS DIRECTORIES
# ----------------------------
BASE_DIR = Path.cwd().parent
RESULTS_DIR = BASE_DIR / "results"
FORECASTS_DIR = RESULTS_DIR / "forecasts"

# ----------------------------
# HELPER FUNCTION TO LOAD CSV
# ----------------------------
def safe_read_csv(path):
    try:
        df = pd.read_csv(path, engine='python')
        return df
    except FileNotFoundError:
        print(f"⚠️ File not found: {path}")
        return None

# ----------------------------
# LOAD ALL FORECAST CSVs INTO A DICTIONARY
# ----------------------------
forecast_files = list(FORECASTS_DIR.glob("*.csv"))
forecast_dfs = {}
for file in forecast_files:
    df = safe_read_csv(file)
    if df is not None:
        forecast_dfs[file.stem] = df  # key = filename without extension

print(f"✅ Forecast files loaded: {list(forecast_dfs.keys())}")

# ----------------------------
# GAUGE PLOT FOR A CITY/POLLUTANT
# ----------------------------
forecast_key = "forecast_Ahmedabad_PM2.5"  # change to the desired forecast file

forecast_data = forecast_dfs.get(forecast_key, None)

if forecast_data is not None and not forecast_data.empty:
    latest = forecast_data.iloc[0]  # take the first row
    
    city = latest.get("City", "Unknown")
    aqi_val = latest.get("AQI", 0)
    category = latest.get("Category", "Unknown")

    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=aqi_val,
        title={"text": f"{city} – {category}"},
        gauge={
            "axis": {"range": [0, 500]},
            "bar": {"color": "orange"},
            "steps": [
                {"range": [0, 50], "color": "green"},
                {"range": [51, 100], "color": "yellow"},
                {"range": [101, 150], "color": "orange"},
                {"range": [151, 200], "color": "red"},
                {"range": [201, 300], "color": "purple"},
                {"range": [301, 500], "color": "maroon"}
            ]
        }
    ))

    fig.show()
else:
    print("⚠️ No forecast data available for", forecast_key)


✅ Forecast files loaded: ['forecast_accuracy_metrics', 'forecast_Ahmedabad_CO', 'forecast_Ahmedabad_NO', 'forecast_Ahmedabad_NO2', 'forecast_Ahmedabad_NOx', 'forecast_Ahmedabad_O3', 'forecast_Ahmedabad_PM10', 'forecast_Ahmedabad_PM2.5', 'forecast_Ahmedabad_SO2', 'forecast_Aizawl_CO', 'forecast_Aizawl_NH3', 'forecast_Aizawl_NO', 'forecast_Aizawl_NO2', 'forecast_Aizawl_NOx', 'forecast_Aizawl_O3', 'forecast_Aizawl_PM10', 'forecast_Aizawl_PM2.5', 'forecast_Aizawl_SO2', 'forecast_Amaravati_CO', 'forecast_Amaravati_NH3', 'forecast_Amaravati_NO', 'forecast_Amaravati_NO2', 'forecast_Amaravati_NOx', 'forecast_Amaravati_O3', 'forecast_Amaravati_PM10', 'forecast_Amaravati_PM2.5', 'forecast_Amaravati_SO2', 'forecast_Amritsar_CO', 'forecast_Amritsar_NH3', 'forecast_Amritsar_NO', 'forecast_Amritsar_NO2', 'forecast_Amritsar_NOx', 'forecast_Amritsar_O3', 'forecast_Amritsar_PM10', 'forecast_Amritsar_PM2.5', 'forecast_Amritsar_SO2', 'forecast_Bengaluru_CO', 'forecast_Bengaluru_NH3', 'forecast_Bengaluru_

In [10]:
import pandas as pd
import plotly.express as px
from pathlib import Path

# ----------------------------
BASE_DIR = Path.cwd().parent
FORECASTS_DIR = BASE_DIR / "results" / "forecasts"

# ----------------------------
# Load one forecast CSV
forecast_file = FORECASTS_DIR / "forecast_Delhi_PM2.5.csv"
forecast_data = pd.read_csv(forecast_file, engine="python")

# ----------------------------
# Extract city and pollutant from filename
filename_parts = forecast_file.stem.split("_")  # ['forecast', 'Delhi', 'PM2.5']
city = filename_parts[1]
pollutant = filename_parts[2]

# ----------------------------
# Ensure date column exists and convert if needed
if "Date" in forecast_data.columns or "date" in forecast_data.columns:
    date_col = "Date" if "Date" in forecast_data.columns else "date"
    forecast_data[date_col] = pd.to_datetime(forecast_data[date_col])
else:
    # If no date column, create a simple 7-day index
    forecast_data["Date"] = pd.date_range(start=pd.Timestamp.today(), periods=len(forecast_data))

# ----------------------------
# Take first 7 days
forecast_7days = forecast_data.head(7)

# ----------------------------
# Plot AQI line (or pollutant values)
y_col = forecast_data.columns[1]  # second column is usually the pollutant values
fig = px.line(forecast_7days, x="Date", y=y_col,
              title=f"7-Day Forecast for {city} ({pollutant})",
              markers=True)
fig.show()


In [19]:
from pathlib import Path

BASE_DIR = Path.cwd().parent
ALERTS_FILE = BASE_DIR / "results" / "alerts.csv"

print("Alerts file exists:", ALERTS_FILE.exists())


Alerts file exists: True


In [20]:
import pandas as pd

if ALERTS_FILE.exists():
    alerts = pd.read_csv(ALERTS_FILE, engine="python")
    print("Alerts file loaded. Shape:", alerts.shape)
    print("Columns:", alerts.columns)
else:
    alerts = None
    print("⚠️ Alerts file not found")


Alerts file loaded. Shape: (42, 7)
Columns: Index(['Date', 'City', 'AQI', 'AQI_Bucket', 'Bucket_Color', 'AQI_Dominant',
       'Alert_Level'],
      dtype='object')


In [21]:
import matplotlib.pyplot as plt
import seaborn as sns

if alerts is not None and not alerts.empty:
    print("Active Alerts:")
    display(alerts.head(10))

    if "Category" in alerts.columns:
        plt.figure(figsize=(8,5))
        sns.countplot(data=alerts, x="Category", palette="Set2")
        plt.title("Alert Categories")
        plt.xlabel("Category")
        plt.ylabel("Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("⚠️ 'Category' column not found in alerts data")
else:
    print("⚠️ No alerts data available")


Active Alerts:


Unnamed: 0,Date,City,AQI,AQI_Bucket,Bucket_Color,AQI_Dominant,Alert_Level
0,2025-09-21,Ahmedabad,163,Moderate,Yellow,PM2.5,Moderate
1,2025-09-22,Ahmedabad,65,Satisfactory,Light Green,O3,Low
2,2025-09-23,Ahmedabad,105,Moderate,Yellow,O3,Moderate
3,2025-09-24,Ahmedabad,314,Very Poor,Red,NO2,Very High
4,2025-09-25,Ahmedabad,244,Poor,Orange,SO2,High
5,2025-09-26,Ahmedabad,162,Moderate,Yellow,SO2,Moderate
6,2025-09-27,Ahmedabad,255,Poor,Orange,NO2,High
7,2025-09-21,Bengaluru,324,Very Poor,Red,PM2.5,Very High
8,2025-09-22,Bengaluru,121,Moderate,Yellow,PM2.5,Moderate
9,2025-09-23,Bengaluru,449,Severe,Dark Red,NO2,Severe


⚠️ 'Category' column not found in alerts data


In [22]:
if performance is not None:
    print("Model Performance:")
    display(performance.head(10))

    fig = px.bar(performance, x="Model", y="RMSE",
                 title="Model RMSE Comparison", color="Model")
    fig.show()
else:
    print("⚠️ No performance data available")


Model Performance:


Unnamed: 0,Model,RMSE,MAE,R2
0,Prophet,15.2,10.5,0.88
1,LSTM,12.7,9.3,0.91


In [23]:
if best_models is not None:
    print("Best Models by Pollutant/City:")
    display(best_models.head(10))
else:
    print("⚠️ Best models file missing")


Best Models by Pollutant/City:


Unnamed: 0,index,Model,RMSE,MAE,R2
0,City,LSTM,12.7,9.3,0.91
1,Pollutant,Prophet,15.2,10.5,0.88
