In [1]:
#Extracting PM2.5 levels from the CPCB data
import pandas as pd

# Load the CSV
pm_df = pd.read_csv("CPCB Data/City_wise_raw_data_1Hr_2024_Faridabad_1Hr.csv")

# Parse Timestamp column
pm_df['Timestamp'] = pd.to_datetime(pm_df['Timestamp'], errors='coerce')

# Create a date range for 1st to 18th June
date_range = pd.date_range(start='2024-01-01 05:00:00', end='2024-06-18 05:00:00', freq='D')

# Filter for only 05:00 data in that date range
pm_filtered = pm_df[pm_df['Timestamp'].isin(date_range)]

# Keep only Timestamp and PM2.5 columns
pm_filtered = pm_filtered[['Timestamp', 'PM2.5 (µg/m³)']]

# Optional: Sort by date
pm_filtered = pm_filtered.sort_values('Timestamp')

# Print the results
print(pm_filtered)

# Save for training
pm_filtered.to_csv("pm25_cpcb_05AM_01janto18june.csv", index=False)


               Timestamp  PM2.5 (µg/m³)
5    2024-01-01 05:00:00         124.76
29   2024-01-02 05:00:00         133.32
53   2024-01-03 05:00:00         119.05
77   2024-01-04 05:00:00         133.41
101  2024-01-05 05:00:00          91.91
...                  ...            ...
3965 2024-06-14 05:00:00          48.43
3989 2024-06-15 05:00:00          49.21
4013 2024-06-16 05:00:00          93.25
4037 2024-06-17 05:00:00          55.74
4061 2024-06-18 05:00:00          58.61

[170 rows x 2 columns]


In [9]:
#Trying to identify the shape of AOD, Latitude, and Longitude
import h5py

file_path = "AOD Data/3DIMG_15JUN2024_0530_L2G_AOD_V02R00.h5"

with h5py.File(file_path, 'r') as f:
    print("Full structure of the HDF5 file:")
    def print_structure(name, obj):
        if isinstance(obj, h5py.Dataset):
            print(f"Dataset: {name} — shape: {obj.shape}")
    f.visititems(print_structure)


Full structure of the HDF5 file:
Dataset: AOD — shape: (1, 551, 551)
Dataset: latitude — shape: (551,)
Dataset: longitude — shape: (551,)
Dataset: time — shape: (1,)


In [10]:
import h5py
import numpy as np
import os
import pandas as pd

# Folder where your HDF5 files are stored
folder_path = "AOD Data/"

# Define Faridabad region bounds
lat_min, lat_max = 28.2, 28.5
lon_min, lon_max = 77.2, 77.5

# Initialize list to store results
results = []

# List all .h5 files in the folder
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".h5"):
        file_path = os.path.join(folder_path, filename)
        
        try:
            with h5py.File(file_path, 'r') as f:
                # Load datasets
                aod = f['AOD'][:].squeeze()
                lat = f['latitude'][:]
                lon = f['longitude'][:]
                
                # Flip for correct orientation
                lat_flipped = lat[::-1]
                aod_flipped = aod[::-1, :]
                lon2d, lat2d = np.meshgrid(lon, lat_flipped)

                # Apply mask for Faridabad region
                mask = (lat2d >= lat_min) & (lat2d <= lat_max) & (lon2d >= lon_min) & (lon2d <= lon_max)
                aod_faridabad_values = aod_flipped[mask]
                mean_aod = np.nanmean(aod_faridabad_values)

                # Extract date from filename (assumes format: 3DIMG_15JUN2024_0530_....h5)
                date_str = filename.split("_")[1]
                date = pd.to_datetime(date_str, format='%d%b%Y')

                # Append to results
                results.append({'Date': date, 'Mean_AOD': mean_aod})

        except Exception as e:
            print(f"Failed to process {filename}: {e}")

# Convert results to DataFrame
df = pd.DataFrame(results)
df.sort_values('Date', inplace=True)

# Save to CSV for training
df.to_csv("mean_aod_faridabad.csv", index=False)

print(df)


         Date    Mean_AOD
0  2024-06-01 -999.000000
1  2024-06-02 -776.836670
2  2024-06-03 -776.832703
3  2024-06-04 -999.000000
4  2024-06-05 -776.539917
5  2024-06-06 -776.747070
6  2024-06-07 -776.795715
7  2024-06-08 -999.000000
8  2024-06-09 -776.817932
9  2024-06-10 -776.841675
10 2024-06-11 -887.868042
11 2024-06-12 -776.887695
12 2024-06-13 -776.807800
13 2024-06-14 -776.628784
14 2024-06-15 -776.783203
15 2024-06-16 -776.845398
16 2024-06-17 -776.820801
17 2024-06-18 -776.784424


In [11]:
#Combine AOD and PM2.5 into One Dataframe
import pandas as pd

# Load both datasets
aod_df = pd.read_csv("mean_aod_faridabad.csv")
pm_df = pd.read_csv("pm25_cpcb_05AM_1to18june.csv")

# Convert date columns to datetime
aod_df['Date'] = pd.to_datetime(aod_df['Date'])
pm_df['Timestamp'] = pd.to_datetime(pm_df['Timestamp'])

# Extract just the date part but KEEP datetime64[ns] type
pm_df['Date'] = pm_df['Timestamp'].dt.normalize()

# Merge on Date
combined_df = pd.merge(aod_df, pm_df, left_on='Date', right_on='Date')

# Rename for simplicity
combined_df.rename(columns={"PM2.5 (µg/m³)": "PM2.5"}, inplace=True)

print(combined_df.head())


        Date  Mean_AOD           Timestamp  PM2.5
0 2024-06-01 -999.0000 2024-06-01 05:00:00  76.40
1 2024-06-02 -776.8367 2024-06-02 05:00:00  74.98
2 2024-06-03 -776.8327 2024-06-03 05:00:00  77.86
3 2024-06-04 -999.0000 2024-06-04 05:00:00  55.40
4 2024-06-05 -776.5399 2024-06-05 05:00:00  88.99


In [12]:
#Training a simple linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Features and target
X = combined_df[['Mean_AOD']]  # Feature
y = combined_df['PM2.5']       # Target

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Mean Absolute Error on test set:", mean_absolute_error(y_test, y_pred))

#Predicting PM2.5 for 19th June
aod_19_june = 0.97  # Replace this with your actual computed value
predicted_pm = model.predict([[aod_19_june]])
print("Predicted PM2.5 for 2024-06-19 at 05:00:", predicted_pm[0])


Mean Absolute Error on test set: 16.329789183384506
Predicted PM2.5 for 2024-06-19 at 05:00: 59.31425022570035




In [38]:
#Finding PM2.5 concentration for a single day
import pandas as pd
df = pd.read_csv("CPCB Data/City_wise_raw_data_1Hr_2024_Faridabad_1Hr.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors = "coerce")
target_time = pd.Timestamp("2024-05-11 05:00:00")
pm_at_time = df[df["Timestamp"] == target_time]
print(pm_at_time[["Timestamp", "PM2.5 (µg/m³)"]])

               Timestamp  PM2.5 (µg/m³)
3149 2024-05-11 05:00:00          88.95


In [14]:
#Prediction by Random Forest
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load and merge the datasets
aod_df = pd.read_csv("mean_aod_faridabad.csv")
pm_df = pd.read_csv("pm25_cpcb_05AM_1to18june.csv")

# Convert to datetime
aod_df['Date'] = pd.to_datetime(aod_df['Date'])
pm_df['Timestamp'] = pd.to_datetime(pm_df['Timestamp'])
pm_df['Date'] = pm_df['Timestamp'].dt.normalize()

# Merge on 'Date'
combined_df = pd.merge(aod_df, pm_df, on='Date')
combined_df.rename(columns={"PM2.5 (µg/m³)": "PM2.5"}, inplace=True)

# Select features and target
X = combined_df[['Mean_AOD']]
y = combined_df['PM2.5']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate model
y_pred = rf.predict(X_test)

print("✅ Model Evaluation:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))

#Prediction for 19th June
aod_19_june = 0.97

predicted_pm25 = rf.predict([[aod_19_june]])
print("\n🔮 Predicted PM2.5 for 19 June 2024 at 05:00:", predicted_pm25[0])


✅ Model Evaluation:
MAE: 7.778910714285711
RMSE: 8.834053260522328
R² Score: 0.5988373888613501

🔮 Predicted PM2.5 for 19 June 2024 at 05:00: 77.30559999999991




In [None]:
#Combining MOSDAC and CPCB data with MERRA and predicting the PM2.5 Level
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from netCDF4 import Dataset, num2date
from datetime import datetime, timedelta
import os
import h5py

# ---------- Step 1: Load INSAT AOD from CSV ----------
aod_df = pd.read_csv("aod_data.csv")
aod_df['Date'] = pd.to_datetime(aod_df['Date'])  # 🔄 Convert to datetime for merging
# print(aod_df.head())

# ---------- Step 2: Load CPCB PM2.5 ----------
pm_df = pd.read_csv("pm25_cpcb_05AM_01janto18june.csv")
pm_df['Timestamp'] = pd.to_datetime(pm_df['Timestamp'])
pm_df['Date'] = pm_df['Timestamp'].dt.normalize()

# ---------- Step 3: Load MERRA .nc4 files ----------
def extract_merra_features(nc_folder):
    records = []
    for file in os.listdir(nc_folder):
        if file.endswith(".nc"):
            ds = Dataset(os.path.join(nc_folder, file), 'r')

            time_var = ds.variables['time']
            times = num2date(time_var[:], units=time_var.units, only_use_cftime_datetimes=False)

            ps = ds.variables['PS'][:, 0, 0]
            qv2m = ds.variables['QV2M'][:, 0, 0]
            t2m = ds.variables['T2M'][:, 0, 0]
            ts = ds.variables['TS'][:, 0, 0]
            u10m = ds.variables['U10M'][:, 0, 0]
            # v10m = ds.variables['V10M'][:, 0, 0]
            qv10m = ds.variables['QV10M'][:, 0, 0]
            slp = ds.variables['SLP'][:, 0, 0]
            t10m = ds.variables['T10M'][:, 0, 0]
            t2mdew = ds.variables['T2MDEW'][:, 0, 0]
            tqi = ds.variables['TQI'][:, 0, 0]
            tql = ds.variables['TQL'][:, 0, 0]

            for i in range(len(times)):
                # Ensure times[i] is a native datetime object
                date_val = times[i]
                if hasattr(date_val, 'year'):
                    date_val = datetime(date_val.year, date_val.month, date_val.day)

                records.append({
                    "Date": date_val,
                    "PS": ps[i],
                    "QV2M": qv2m[i],
                    "T2M": t2m[i],
                    "TS": ts[i],
                    "U10M": u10m[i],
                    "QV10M": qv10m[i],
                    "SLP": slp[i],
                    "T10M": t10m[i],
                    "T2MDEW": t2mdew[i],
                    "TQI": tqi[i],
                    "TQL": tql[i],
                    # "U2M": u2m[i]
                })

    return pd.DataFrame(records)

merra_df = extract_merra_features("merra_downloads")  # 📁 Folder containing .nc4 files
merra_df = merra_df.groupby("Date").mean().reset_index()

aod_df['Date'] = pd.to_datetime(aod_df['Date'])
pm_df['Date'] = pd.to_datetime(pm_df['Date'])
merra_df['Date'] = pd.to_datetime(merra_df['Date'])  # ✅ This fixes the issue

# print(aod_df.dtypes)
# print(pm_df.dtypes)
# print(merra_df.dtypes)

# print(f"AOD records: {len(aod_df)}")
# print(f"CPCB records: {len(pm_df)}")
# print(f"MERRA records: {len(merra_df)}")

# ---------- Step 4: Merge All ----------
combined_df = pd.merge(aod_df, pm_df, on="Date")
# print(f"After merging AOD + CPCB: {len(combined_df)} records")

combined_df = pd.merge(combined_df, merra_df, on="Date")
# print(f"After merging with MERRA: {len(combined_df)} records")

# print("🔍 AOD dates:", aod_df['Date'].unique())
# print("🔍 CPCB dates:", pm_df['Date'].unique())
# print("🔍 MERRA dates:", merra_df['Date'].unique())


# ---------- Step 5: ML Model ----------
features = ['Mean_AOD', 'PS', 'QV2M', 'T2M', 'TS', 'U10M', 'QV10M', 'SLP', 'T10M', 'T2MDEW', 'TQI', 'TQL']
clean_df = combined_df.dropna(subset=features + ['PM2.5 (µg/m³)'])

# Create feature matrix and target vector
X = clean_df[features].copy()
y = clean_df['PM2.5 (µg/m³)']

# Optional feature engineering
X['Temp_Diff'] = X['TS'] - X['T2M']
X['Humidity_Ratio'] = X['QV2M'] / (X['T2M'] + 1e-3)  # prevent division by zero

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print("✅ Evaluation:")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R²:", r2_score(y_test, y_pred))

# # ---------- Step 6: Predict for 20 June ----------
# # Replace with actual June 20 values from your MERRA and AOD
may_11 = {
    "Mean_AOD": [0.97],
    "PS": [98119.390],
    "QV2M": [0.007],
    "T2M": [313.68],
    "TS": [316.82],
    "U10M": [2.75],
    "QV10M": [0.007],
    "SLP": [100396.60],
    "T10M": [312.85],
    "T2MDEW": [283.28],
    "TQI": [0.0],
    "TQL": [0.0],
}
may_11_df = pd.DataFrame(may_11)

# Recreate engineered features
may_11_df['Temp_Diff'] = may_11_df['TS'] - may_11_df['T2M']
may_11_df['Humidity_Ratio'] = may_11_df['QV2M'] / (may_11_df['T2M'] + 1e-3)

# Ensure columns match training
X_input = may_11_df[X_train.columns]

# Predict
pred_pm = rf.predict(X_input)
print("\n🔮 Predicted PM2.5 for 11 May 2024 at 05:30 IST:", pred_pm[0])


✅ Evaluation:
MAE: 28.026991176470588
RMSE: 36.08490578960144
R²: 0.0041481018634580424

🔮 Predicted PM2.5 for 11 May 2024 at 05:30 IST: 96.84259999999999


In [None]:
#To read AOD value from a single .h5 file
import h5py
import numpy as np
from datetime import datetime
import os

# 📍 Location of interest: Faridabad (approx.)
target_lat = 28.4
target_lon = 77.3

# 📄 Path to one .h5 file (example)
file_path = "aod_folder/3DIMG_01MAY2024_0530_L2G_AOD_V02R00.h5"

# ✅ Read and extract AOD value
with h5py.File(file_path, 'r') as f:
    print("🔍 Keys in file:", list(f.keys()))  # Show root-level keys

    # Check if required datasets are present
    if 'AOD' in f and 'latitude' in f and 'longitude' in f:
        aod_data = f['AOD'][:]           # 2D array of AOD
        latitudes = f['latitude'][:]     # 1D array
        longitudes = f['longitude'][:]   # 1D array

        print("📐 AOD shape:", aod_data.shape)
        print("🧭 Latitude shape:", latitudes.shape)
        print("🧭 Longitude shape:", longitudes.shape)

        # Find nearest grid index to target lat/lon
        lat_idx = (np.abs(latitudes - target_lat)).argmin()
        lon_idx = (np.abs(longitudes - target_lon)).argmin()
        print("🔎 Nearest index — lat:", lat_idx, "lon:", lon_idx)

        # Handle possible shape issues
        try:
            aod_value = aod_data[0, lat_idx, lon_idx]
            print("📌 AOD value at Faridabad:", aod_value)
        except IndexError as e:
            print("❌ IndexError while accessing AOD value:", e)
            aod_value = np.nan
    else:
        print("❌ One or more required datasets not found in file.")
        aod_value = np.nan

    # Parse date from filename
    filename = os.path.basename(file_path)
    try:
        date_str = filename.split("_")[1]  # '01JUN2024'
        date_obj = datetime.strptime(date_str, "%d%b%Y").date()
        print("📆 Date parsed from filename:", date_obj)
    except Exception as e:
        print("❌ Error parsing date from filename:", e)
        date_obj = None

    # Store result
    record = {
        "Date": date_obj,
        "Mean_AOD": aod_value
    }

print("✅ Final Record:", record)


🔍 Keys in file: ['AOD', 'latitude', 'longitude', 'time']
📐 AOD shape: (1, 551, 551)
🧭 Latitude shape: (551,)
🧭 Longitude shape: (551,)
🔎 Nearest index — lat: 166 lon: 322
📌 AOD value at Faridabad: 0.779472
📆 Date parsed from filename: 2024-05-01
✅ Final Record: {'Date': datetime.date(2024, 5, 1), 'Mean_AOD': 0.779472}


In [2]:
#Merging all the .h5 into a single csv file
import h5py
import numpy as np
import pandas as pd
from datetime import datetime
import os

def extract_aod_from_folder(folder_path):
    records = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".h5"):
            file_path = os.path.join(folder_path, filename)
            try:
                with h5py.File(file_path, 'r') as f:
                    # Ensure keys exist (some files might be corrupt)
                    if 'AOD' not in f or 'latitude' not in f or 'longitude' not in f:
                        continue

                    aod_data = f['AOD'][:]
                    latitudes = f['latitude'][:]
                    longitudes = f['longitude'][:]

                    # Find nearest point to Faridabad
                    lat_idx = (np.abs(latitudes - 28.4)).argmin()
                    lon_idx = (np.abs(longitudes - 77.3)).argmin()

                    # Fix for shape mismatch
                    if aod_data.ndim == 3 and lat_idx < aod_data.shape[1] and lon_idx < aod_data.shape[2]:
                        aod_val = aod_data[0, lat_idx, lon_idx]
                    else:
                        aod_val = np.nan  # skip if shape mismatch

                    # Extract date from filename
                    date_str = filename.split("_")[1]  # '01JUN2024'
                    date_obj = datetime.strptime(date_str, "%d%b%Y").date()

                    records.append({
                        "Date": date_obj,
                        "Mean_AOD": aod_val
                    })

            except Exception as e:
                print(f"❌ Failed for {filename}: {e}")
    
    return pd.DataFrame(records)

# 🔁 Extract from all files
aod_df = extract_aod_from_folder("aod_folder")

# 💾 Save to CSV
aod_df.to_csv("aod_data.csv", index=False)
print("✅ Saved all AOD values to aod_data.csv")


✅ Saved all AOD values to aod_data.csv


In [39]:
#Reading the MERRA Varible values from the file
import xarray as xr

# Load NetCDF file
file_path = "merra_downloads\MERRA2_400.tavg1_2d_slv_Nx.20240511.SUB.nc"
ds = xr.open_dataset(file_path)

# 🔍 Step 1: Define coordinates of Faridabad (approx)
target_lat = 28.41
target_lon = 77.31

# 🔍 Step 2: Select the nearest location
nearest_lat = ds.sel(lat=target_lat, method="nearest").lat.values
nearest_lon = ds.sel(lon=target_lon, method="nearest").lon.values

# 🔍 Step 3: Select the first time step (if only one day is present)
time_step = ds.time.values[0]

# 🔍 Step 4: Extract values for each variable at that location and time
may_11 = {
    "PS":     [float(ds["PS"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "QV2M":   [float(ds["QV2M"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "T2M":    [float(ds["T2M"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "TS":     [float(ds["TS"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "U10M":   [float(ds["U10M"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "QV10M":   [float(ds["QV10M"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "SLP":   [float(ds["SLP"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "T10M":   [float(ds["T10M"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "T2MDEW":   [float(ds["T2MDEW"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "TQI":   [float(ds["TQI"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    "TQL":   [float(ds["TQL"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    # "U2M":   [float(ds["U2M"].sel(time=time_step, lat=nearest_lat, lon=nearest_lon).values)],
    # "Mean_AOD": [0.97]  # Add manually or fetch from INSAT/CPCB if needed
}

print(may_11)

{'PS': [98119.390625], 'QV2M': [0.007875049486756325], 'T2M': [313.6807861328125], 'TS': [316.8280334472656], 'U10M': [2.7565226554870605], 'QV10M': [0.007863740436732769], 'SLP': [100396.609375], 'T10M': [312.8543701171875], 'T2MDEW': [283.28582763671875], 'TQI': [0.0], 'TQL': [0.0]}
