In [17]:
print("Available AOD dates:", aod_df['Date'].min(), "to", aod_df['Date'].max())
print(aod_df['Date'].dt.strftime("%Y-%m-%d").tolist())


Available AOD dates: 2024-01-01 00:00:00 to 2024-06-18 00:00:00
['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12', '2024-01-13', '2024-01-14', '2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18', '2024-01-19', '2024-01-20', '2024-01-21', '2024-01-22', '2024-01-23', '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-27', '2024-01-28', '2024-01-29', '2024-01-30', '2024-01-31', '2024-02-01', '2024-02-02', '2024-02-03', '2024-02-04', '2024-02-05', '2024-02-06', '2024-02-07', '2024-02-08', '2024-02-09', '2024-02-10', '2024-02-11', '2024-02-12', '2024-02-13', '2024-02-14', '2024-02-15', '2024-02-16', '2024-02-17', '2024-02-18', '2024-02-19', '2024-02-20', '2024-02-21', '2024-02-22', '2024-02-23', '2024-02-24', '2024-02-25', '2024-02-26', '2024-02-27', '2024-02-28', '2024-02-29', '2024-03-01', '2024-03-02', '2024-03-03', '2024-03-04', '2024-03-05', '2024-03-06', '2024-03-07

In [19]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import datetime

# Load datasets
aod_df = pd.read_csv("CPCB Data/mean_aod_faridabad.csv")
pm_df = pd.read_csv("CPCB Data/City_wise_raw_data_1Hr_2024_Faridabad_1Hr.csv")

print("AOD columns:", aod_df.columns)
print("PM columns:", pm_df.columns)

# Format date
aod_df['Date'] = pd.to_datetime(aod_df['Date'])
pm_df['Timestamp'] = pd.to_datetime(pm_df['Timestamp'])
pm_df['Date'] = pm_df['Timestamp'].dt.normalize()

# Merge both
df = pd.merge(aod_df, pm_df, on='Date')
df = df.rename(columns={"PM2.5 (µg/m³)": "PM2.5"})

# Clean nulls
df = df.dropna(subset=["Mean_AOD", "PM2.5"])

# Feature and target
X = df[["Mean_AOD"]]
y = df["PM2.5"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=None)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Eval
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

r2 = r2_score(y_test, y_pred)

print(f"✅ Model Evaluation")
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

# 🔮 Predict for any realistic date
target_date = pd.to_datetime("2024-02-03")  # you can change it
aod_row = aod_df[aod_df["Date"] == target_date]

if not aod_row.empty:
    future_aod = aod_row["Mean_AOD"].values[0]
    predicted_pm = model.predict([[future_aod]])[0]
    print(f"🔮 Predicted PM2.5 for {target_date.date()}: {predicted_pm}")
else:
    print(f"❌ No AOD data found for {target_date.date()}")


AOD columns: Index(['Date', 'Mean_AOD'], dtype='object')
PM columns: Index(['Timestamp', 'PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)',
       'NO2 (µg/m³)', 'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
       'Ozone (µg/m³)', 'Benzene (µg/m³)', 'Toluene (µg/m³)', 'Xylene (µg/m³)',
       'O Xylene (µg/m³)', 'Eth-Benzene (µg/m³)', 'MP-Xylene (µg/m³)',
       'AT (°C)', 'RH (%)', 'WS (m/s)', 'WD (deg)', 'RF (mm)', 'TOT-RF (mm)',
       'SR (W/mt2)', 'BP (mmHg)', 'VWS (m/s)'],
      dtype='object')
✅ Model Evaluation
MAE: 36.92021756625384
RMSE: 50.74436283707568
R2: 0.08365825654128001
🔮 Predicted PM2.5 for 2024-02-03: 117.66050684177806




In [20]:
import datetime

# Make sure date column exists in date-only form
pm_df['Date'] = pd.to_datetime(pm_df['Timestamp']).dt.date

# Target date
target_date = datetime.date(2024, 2, 3)

# Filter PM2.5 values
pm_values = pm_df[pm_df['Date'] == target_date]['PM2.5 (µg/m³)']

# Show mean or all values
if not pm_values.empty:
    real_pm_mean = pm_values.mean()
    print(f"✅ Real PM2.5 on {target_date}: {real_pm_mean:.2f}")
    print(pm_values.describe())  # optional: shows count, mean, min, max, etc.
else:
    print(f"❌ No PM2.5 data available for {target_date}")


✅ Real PM2.5 on 2024-02-03: 101.29
count     24.000000
mean     101.294167
std       59.017038
min       61.840000
25%       70.892500
50%       82.155000
75%       91.570000
max      323.380000
Name: PM2.5 (µg/m³), dtype: float64


In [21]:
print("Dates in AOD:", aod_df['Date'].dt.strftime("%Y-%m-%d").tolist())
print("Dates in PM:", pm_df['Timestamp'].dt.date.unique())


Dates in AOD: ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12', '2024-01-13', '2024-01-14', '2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18', '2024-01-19', '2024-01-20', '2024-01-21', '2024-01-22', '2024-01-23', '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-27', '2024-01-28', '2024-01-29', '2024-01-30', '2024-01-31', '2024-02-01', '2024-02-02', '2024-02-03', '2024-02-04', '2024-02-05', '2024-02-06', '2024-02-07', '2024-02-08', '2024-02-09', '2024-02-10', '2024-02-11', '2024-02-12', '2024-02-13', '2024-02-14', '2024-02-15', '2024-02-16', '2024-02-17', '2024-02-18', '2024-02-19', '2024-02-20', '2024-02-21', '2024-02-22', '2024-02-23', '2024-02-24', '2024-02-25', '2024-02-26', '2024-02-27', '2024-02-28', '2024-02-29', '2024-03-01', '2024-03-02', '2024-03-03', '2024-03-04', '2024-03-05', '2024-03-06', '2024-03-07', '2024-03-08', '2024-03-09', '2024-03-10', '2024