In [4]:
import os
import glob
import pandas as pd

def load_pm_pollutants_fixed(dir_path="data/stations_csvs", out_file="data/stations_combined.csv"):
    files = glob.glob(os.path.join(dir_path, "*.csv"))
    print(f"Found {len(files)} CSV files.")
    os.makedirs(os.path.dirname(out_file), exist_ok=True)

    first = True
    for i, f in enumerate(files, 1):
        try:
            # Read column headers first
            cols = [c.strip() for c in pd.read_csv(f, nrows=0).columns]

            # Detect possible matching column names
            mapping = {}
            for c in cols:
                c_lower = c.lower()
                if "date" in c_lower or "time" in c_lower:
                    mapping[c] = "Timestamp"
                elif "pm2" in c_lower:
                    mapping[c] = "PM2.5"
                elif "pm10" in c_lower:
                    mapping[c] = "PM10"
                elif "o3" in c_lower or "ozone" in c_lower:
                    mapping[c] = "O3"
                elif "co" in c_lower and not "co2" in c_lower:
                    mapping[c] = "CO"

            if not mapping:
                print(f"⚠️ Skipped {f} (no relevant columns found)")
                continue

            df = pd.read_csv(f, usecols=mapping.keys(), encoding_errors="ignore")
            df.rename(columns=mapping, inplace=True)
            df["StationFile"] = os.path.basename(f)

            # Make sure all columns exist
            for col in ["Timestamp", "PM2.5", "PM10", "O3", "CO"]:
                if col not in df.columns:
                    df[col] = None

            if first:
                df.to_csv(out_file, index=False)
                first = False
            else:
                df.to_csv(out_file, mode="a", index=False, header=False)

            print(f"[{i}/{len(files)}] Processed {os.path.basename(f)} ({len(df)} rows)")
            del df

        except Exception as e:
            print(f"❌ Skipped {f}: {e}")

    print(f"\n✅ Done! Fixed merged file saved to: {out_file}")

# Run this to rebuild Kaggle merged dataset
load_pm_pollutants_fixed("data/stations_csvs", "data/stations_combined.csv")

Found 11 CSV files.
[1/11] Processed bengaluru_combined.csv (1827 rows)
[2/11] Processed chennai_combined.csv (1827 rows)
[3/11] Processed delhi_combined.csv (1827 rows)
[4/11] Processed gwalior_combined.csv (1827 rows)
[5/11] Processed hyderabad_combined.csv (1827 rows)
[6/11] Processed jaipur_combined.csv (1827 rows)
[7/11] Processed kolkata_combined.csv (1827 rows)
[8/11] Processed lucknow_combined.csv (1827 rows)
[9/11] Processed mumbai_combined.csv (1827 rows)
⚠️ Skipped data/stations_csvs\stations_info.csv (no relevant columns found)
[11/11] Processed visakhapatnam_combined.csv (1827 rows)

✅ Done! Fixed merged file saved to: data/stations_combined.csv


In [5]:
merged_kaggle = pd.read_csv("data/stations_combined.csv", on_bad_lines="skip", low_memory=False)
merged_kaggle.head(10)

Unnamed: 0,Timestamp,PM2.5,PM10,CO,O3,StationFile
0,01-01-2020,,,,,bengaluru_combined.csv
1,02-01-2020,43.67,134.0,0.91,21.82,bengaluru_combined.csv
2,03-01-2020,30.58,74.42,0.96,23.31,bengaluru_combined.csv
3,04-01-2020,66.35,155.68,2.54,29.7,bengaluru_combined.csv
4,05-01-2020,48.0,99.13,1.14,31.01,bengaluru_combined.csv
5,06-01-2020,23.75,63.34,1.08,25.82,bengaluru_combined.csv
6,07-01-2020,24.67,72.0,0.98,30.37,bengaluru_combined.csv
7,08-01-2020,34.18,79.06,0.99,29.61,bengaluru_combined.csv
8,09-01-2020,41.61,98.0,1.11,30.97,bengaluru_combined.csv
9,10-01-2020,38.95,97.7,0.97,31.89,bengaluru_combined.csv


In [6]:
city_df = pd.read_csv("data/cities_combined.csv")

# Make sure Timestamp format matches
for df in [merged_kaggle, city_df]:
    for col in df.columns:
        if "time" in col.lower() or "date" in col.lower():
            df.rename(columns={col: "Timestamp"}, inplace=True)
            break
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

merged_kaggle["Source"] = "KaggleStation"
city_df["Source"] = "CityCombined"

keep_cols = ["Timestamp", "PM2.5", "PM10", "O3", "CO", "Source"]
for col in keep_cols:
    if col not in city_df.columns:
        city_df[col] = None

combined_df = pd.concat([merged_kaggle[keep_cols], city_df[keep_cols]], ignore_index=True)
print("✅ Combined dataset shape:", combined_df.shape)
combined_df.head()


  city_df = pd.read_csv("data/cities_combined.csv")


✅ Combined dataset shape: (14313933, 6)


Unnamed: 0,Timestamp,PM2.5,PM10,O3,CO,Source
0,2020-01-01,,,,,KaggleStation
1,2020-02-01,43.67,134.0,21.82,0.91,KaggleStation
2,2020-03-01,30.58,74.42,23.31,0.96,KaggleStation
3,2020-04-01,66.35,155.68,29.7,2.54,KaggleStation
4,2020-05-01,48.0,99.13,31.01,1.14,KaggleStation


In [7]:
import pandas as pd

# Load the new Kaggle dataset (the fixed one)
kaggle_df = pd.read_csv("data/stations_combined.csv", on_bad_lines="skip", low_memory=False)

# Load your Combined City Pollution dataset
city_df = pd.read_csv("data/cities_combined.csv")

print("Kaggle dataset shape:", kaggle_df.shape)
print("City dataset shape:", city_df.shape)

# Show first few rows from both
display(kaggle_df.head(3))
display(city_df.head(3))


  city_df = pd.read_csv("data/cities_combined.csv")


Kaggle dataset shape: (18270, 6)
City dataset shape: (14295663, 7)


Unnamed: 0,Timestamp,PM2.5,PM10,CO,O3,StationFile
0,01-01-2020,,,,,bengaluru_combined.csv
1,02-01-2020,43.67,134.0,0.91,21.82,bengaluru_combined.csv
2,03-01-2020,30.58,74.42,0.96,23.31,bengaluru_combined.csv


Unnamed: 0,Timestamp,Timestamp.1,PM2.5,PM10,CO,O3,StationFile
0,2016-07-01 10:00:00,2016-07-01 11:00:00,10.67,39.0,0.48,14.5,AP001.csv
1,2016-07-01 11:00:00,2016-07-01 12:00:00,2.0,39.0,0.49,15.0,AP001.csv
2,2016-07-01 12:00:00,2016-07-01 13:00:00,,,,,AP001.csv


In [8]:
# Normalize timestamp names
for df in [kaggle_df, city_df]:
    for col in df.columns:
        if "time" in col.lower() or "date" in col.lower():
            df.rename(columns={col: "Timestamp"}, inplace=True)
            break

# Force datetime conversion + DROP invalid timestamps
for df in [kaggle_df, city_df]:
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    df.dropna(subset=["Timestamp"], inplace=True)

# Add source column
kaggle_df["Source"] = "KaggleStation"
city_df["Source"] = "CityCombined"

# Keep only the matching pollutant columns
keep_cols = ["Timestamp", "PM2.5", "PM10", "O3", "CO", "Source"]

# Add missing columns
for col in keep_cols:
    if col not in city_df.columns:
        city_df[col] = None

# Merge both datasets
combined_df = pd.concat(
    [kaggle_df[keep_cols], city_df[keep_cols]],
    ignore_index=True
)

print("✅ Combined dataset shape:", combined_df.shape)
combined_df.head()


✅ Combined dataset shape: (14302863, 6)


Unnamed: 0,Timestamp,PM2.5,PM10,O3,CO,Source
0,2020-01-01,,,,,KaggleStation
1,2020-02-01,43.67,134.0,21.82,0.91,KaggleStation
2,2020-03-01,30.58,74.42,23.31,0.96,KaggleStation
3,2020-04-01,66.35,155.68,29.7,2.54,KaggleStation
4,2020-05-01,48.0,99.13,31.01,1.14,KaggleStation


In [9]:
# Check missing values
print(combined_df.isna().sum())

# Drop rows with completely empty pollutant data
combined_df = combined_df.dropna(subset=["PM2.5", "PM10", "O3", "CO"], how="all")

# Fill remaining missing values with median (optional)
combined_df[["PM2.5", "PM10", "O3", "CO"]] = combined_df[["PM2.5", "PM10", "O3", "CO"]].fillna(
    combined_df.median(numeric_only=True)
)

print("✅ After cleaning:", combined_df.shape)
combined_df.head()

Timestamp          0
PM2.5        3829690
PM10         4422984
O3           3294466
CO           3267519
Source             0
dtype: int64
✅ After cleaning: (12134002, 6)


Unnamed: 0,Timestamp,PM2.5,PM10,O3,CO,Source
1,2020-02-01,43.67,134.0,21.82,0.91,KaggleStation
2,2020-03-01,30.58,74.42,23.31,0.96,KaggleStation
3,2020-04-01,66.35,155.68,29.7,2.54,KaggleStation
4,2020-05-01,48.0,99.13,31.01,1.14,KaggleStation
5,2020-06-01,23.75,63.34,25.82,1.08,KaggleStation


In [10]:
combined_df.to_csv("data/master_airquality_clean.csv", index=False)
print("✅ Saved clean dataset: data/master_airquality_clean.csv")

✅ Saved clean dataset: data/master_airquality_clean.csv


#IMPORTANT PROJECT INSTALLATION DETAILS AFTER THIS

In [11]:
# pip install (run in a notebook cell with `!` or in terminal)
!pip install --upgrade pip # python -m pip install --upgrade pip
!pip install pandas numpy scikit-learn xgboost joblib matplotlib seaborn
!pip install aif360 metaflow wandb bentoml whylabs-sdk fastparquet # pip install whylogs whylabs-client whylabs-toolkit


Usage:   
  pip install [options] <requirement specifier> [package-index-options] ...
  pip install [options] -r <requirements file> [package-index-options] ...
  pip install [options] [-e] <vcs project url> ...
  pip install [options] [-e] <local project path> ...
  pip install [options] <archive url/path> ...

no such option: -m




ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [None]:
# Fix Timestamp -> datetime, create time features, and show diagnostics
import pandas as pd
import numpy as np

# load dataset if not in memory
try:
    df  # if df exists, we use it
except NameError:
    df = pd.read_csv("data/master_airquality_clean.csv", low_memory=False)

# Find likely timestamp column(s)
possible_ts = [c for c in df.columns if any(k in c.lower() for k in ["time","date","timestamp"])]
print("Timestamp-like columns found:", possible_ts)

# If 'Timestamp' already present but not datetime, try to convert it.
if 'Timestamp' in df.columns:
    ts_col = 'Timestamp'
else:
    ts_col = possible_ts[0] if possible_ts else None

if ts_col is None:
    raise RuntimeError("No timestamp-like column found. Please tell me the column names: " + ", ".join(df.columns))

# Convert robustly (try several formats)
df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce', dayfirst=True, infer_datetime_format=True)

# If too many NaT, try alternate parsing (common alternative formats)
nat_frac = df[ts_col].isna().mean()
print(f"After first parse, NaT fraction = {nat_frac:.3f}")

if nat_frac > 0.25:
    # try parsing with no dayfirst
    df[ts_col] = pd.to_datetime(df[ts_col].astype(str), errors='coerce', dayfirst=False, infer_datetime_format=True)
    nat_frac2 = df[ts_col].isna().mean()
    print(f"After second parse (dayfirst=False), NaT fraction = {nat_frac2:.3f}")

# Rename unified column to 'Timestamp'
df.rename(columns={ts_col: 'Timestamp'}, inplace=True)

# Drop rows with missing Timestamp (can't use them for time features)
n_before = len(df)
df = df[~df['Timestamp'].isna()].copy()
n_after = len(df)
print(f"Dropped {n_before - n_after} rows with unparseable Timestamp")

# Create time features
df['hour'] = df['Timestamp'].dt.hour
df['dayofweek'] = df['Timestamp'].dt.dayofweek
df['month'] = df['Timestamp'].dt.month

# Ensure numeric pollutant columns exist
for col in ['PM2.5','PM10','O3','CO']:
    if col not in df.columns:
        df[col] = np.nan
    else:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Quick diagnostics
print("Dataset now has shape:", df.shape)
print("Timestamp min/max:", df['Timestamp'].min(), "/", df['Timestamp'].max())
display(df.head(5))

  df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce', dayfirst=True, infer_datetime_format=True)


Timestamp-like columns found: ['Timestamp', 'Timestamp.1']
After first parse, NaT fraction = 0.000


In [None]:
# Make sure features list exists
features = ['PM10', 'O3', 'CO', 'hour', 'dayofweek', 'month']

In [None]:
from sklearn.model_selection import train_test_split

n = len(df)
test_size = int(0.2 * n)
train_df = df.iloc[:n - test_size].copy()
test_df  = df.iloc[n - test_size:].copy()

X_train = train_df[features]
y_train = train_df['PM2.5']
X_test  = test_df[features]
y_test  = test_df['PM2.5']

print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (3817005, 6) Test: (954251, 6)


In [None]:
from sklearn.impute import SimpleImputer
import numpy as np

# Impute (fill) missing values in features
imputer = SimpleImputer(strategy="median")
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=features)
X_test = pd.DataFrame(imputer.transform(X_test), columns=features)

# Just to confirm:
print("✅ Missing values after imputation:")
print("Train:", np.isnan(X_train.values).sum(), "Test:", np.isnan(X_test.values).sum())

✅ Missing values after imputation:
Train: 0 Test: 0


In [None]:
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
import os

os.makedirs("models", exist_ok=True)

from sklearn.metrics import mean_squared_error

def eval_model(name, model, X_test, y_test):
    preds = model.predict(X_test)
    mse = root_mean_squared_error(y_test, preds) # squared=False Parameter removed
    rmse = mse**0.5  # Calculate RMSE manually
    r2 = r2_score(y_test, preds)
    print(f"{name} -> RMSE: {rmse:.4f}, R2: {r2:.4f}")
    return {"rmse": rmse, "r2": r2}

results = {}

# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
results['linear'] = eval_model("LinearRegression", lr, X_test, y_test)
joblib.dump(lr, "models/linear_reg.joblib")

# Random Forest
from sklearn.ensemble import RandomForestRegressor
import gc

rf = RandomForestRegressor(
    n_estimators=100,       # fewer trees (cut memory ~½)
    max_depth=20,           # limit depth to control tree size
    n_jobs=-1,              # still use all cores
    random_state=42
)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_test)
results['rf'] = eval_model("RandomForest", rf, X_test, y_test)
joblib.dump(rf, "models/rf_reg.joblib")

gc.collect()   # free memory

results

LinearRegression -> RMSE: 7.7607, R2: 0.4424
RandomForest -> RMSE: 7.5511, R2: 0.5002


{'linear': {'rmse': 7.760700774543638, 'r2': 0.44238797498292237},
 'rf': {'rmse': 7.551129424744999, 'r2': 0.5002232696278106}}

In [None]:
import xgboost as xgb
# XGBoost
xgr = xgb.XGBRegressor(n_estimators=300, tree_method='hist', random_state=42, verbosity=0)
xgr.fit(X_train, y_train)
results['xgb'] = eval_model("XGBoost", xgr, X_test, y_test)
xgr.save_model("models/xgb_reg.json")

results

XGBoost -> RMSE: 7.4999, R2: 0.5137


{'linear': {'rmse': 7.760700774543638, 'r2': 0.44238797498292237},
 'rf': {'rmse': 7.551129424744999, 'r2': 0.5002232696278106},
 'xgb': {'rmse': 7.499889417578389, 'r2': 0.5136512374929307}}

In [None]:
!pip install protobuf==3.20.*

Collecting protobuf==3.20.*
  Using cached protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Using cached protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.3
    Uninstalling protobuf-4.25.3:
      Successfully uninstalled protobuf-4.25.3
Successfully installed protobuf-3.20.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.18.0 requires opt-einsum>=2.3.2, which is not installed.


In [None]:
# governance_check.py (run in notebook cell or save as file and run)
import pandas as pd
import numpy as np, json
from sklearn.metrics import mean_absolute_error
import joblib

# load data and model
df_all = df.copy()   # your cleaned dataframe in memory
model = joblib.load("models/rf_reg.joblib")  # or use best model

# Regression fairness: MAE by Source and (if available) by City
df_all['pred'] = model.predict(df_all[features].fillna(df_all[features].median()))
reg_mae_source = df_all.groupby('Source').apply(lambda g: mean_absolute_error(g['PM2.5'], g['pred'])).to_dict()
reg_mae_city = df_all.groupby('City').apply(lambda g: mean_absolute_error(g['PM2.5'], g['pred'])) if 'City' in df_all.columns else None

report = {
    "regression_mae_by_source": reg_mae_source,
    "sample_top_cities_mae": (reg_mae_city.sort_values(ascending=False).head(10).to_dict() if reg_mae_city is not None else {})
}

# AIF360 classification proxy (High vs NotHigh) + reweighing
try:
    from aif360.datasets import BinaryLabelDataset
    from aif360.algorithms.preprocessing import Reweighing

    df_clf = df_all.copy()
    df_clf['label_high'] = (df_clf['PM2.5'] >= 60).astype(int)
    df_clf['protected'] = (df_clf['Source'] == 'CityCombined').astype(int)
    X = df_clf[features].fillna(df_clf[features].median())
    data_for_aif = pd.DataFrame(np.hstack([X.values, df_clf['label_high'].values.reshape(-1,1), df_clf['protected'].values.reshape(-1,1)]),
                                columns = [*features,'label','protected'])
    dataset = BinaryLabelDataset(df=data_for_aif, label_names=['label'], protected_attribute_names=['protected'])
    rw = Reweighing(unprivileged_groups=[{'protected':0}], privileged_groups=[{'protected':1}])
    dataset_transf = rw.fit_transform(dataset)
    # show weight summary
    unique_weights = np.unique(dataset_transf.instance_weights)[:10].tolist()
    report['aif360_weights_sample'] = unique_weights
except Exception as e:
    report['aif360_error'] = str(e)

open("governance_report.json","w").write(json.dumps(report, indent=2))
print("Saved governance_report.json")
print(report)


NameError: name 'df' is not defined