<a href="https://colab.research.google.com/github/Koushikl0l/AQI/blob/main/AQI_Demo_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

# # Load the dataset
# df = pd.read_csv("IND_and_Nep_AQI_Dataset.csv")

# Select and rename relevant columns
columns_to_keep = ['Filename', 'AQI', 'AQI_Class', 'CO', 'SO2', 'NO2', 'O3', 'PM2.5', 'PM10']
df_selected = df[columns_to_keep].copy()
df_selected.columns = ['filename', 'aqi', 'aqi_class', 'co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']

# Clean the data: drop NaNs and infinities
df_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df_selected.dropna()

# Map AQI to class
def map_aqi_class(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"

df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)

# 🔢 SET YOUR DESIRED SAMPLE COUNT PER CLASS HERE
sample_per_class = 750  # 👈 Change this value to 500, 1000, etc.

# Get minimum available count per class
available_counts = df_cleaned['aqi_class_mapped'].value_counts()
min_available = available_counts.min()

# Warn if asking for more samples than available in any class
if sample_per_class > min_available:
    print(f"⚠️ WARNING: Requested {sample_per_class} samples per class, but smallest class only has {min_available} samples.")
    sample_per_class = min_available
    print(f"Using {sample_per_class} samples per class instead.")

# Create the balanced dataset
balanced_df = (
    df_cleaned.groupby('aqi_class_mapped', group_keys=False)
    .apply(lambda x: x.sample(n=sample_per_class, random_state=42))
    .reset_index(drop=True)
)

# Display results
print("Balanced class distribution:")
print(balanced_df['aqi_class_mapped'].value_counts())


NameError: name 'df' is not defined

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
from sklearn.preprocessing import StandardScaler

# ----------------- Custom Dataset -----------------
class AQIRegressionDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        # Use AQI as regression target
        self.targets = self.data['aqi'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])

        # Load image and convert to tensor
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        target = torch.tensor(self.targets[idx], dtype=torch.float32)

        return image, target


In [4]:
# Set seed for reproducibility
SEED = 42
torch.manual_seed(SEED)

# Path to your 224x224x3 image directory
image_dir = "/path/to/images"  # 👈 update this

# Initialize dataset
full_dataset = AQIRegressionDataset(balanced_df, image_dir=image_dir, transform=None)

# Split: 70/15/15
total_size = len(full_dataset)
train_size = int(0.7 * total_size)
val_size = int(0.15 * total_size)
test_size = total_size - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SEED)
)

# DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


NameError: name 'balanced_df' is not defined

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms
from PIL import Image
import os

# ================== Your exact dataset preparation code ===================

# Assuming df is already loaded somewhere above as your CSV data
# Uncomment and update below if needed:
# df = pd.read_csv("/content/drive/MyDrive/AQI/IND_and_Nep_AQI_Dataset.csv")

columns_to_keep = ['Filename', 'AQI', 'AQI_Class', 'CO', 'SO2', 'NO2', 'O3', 'PM2.5', 'PM10']
df_selected = df[columns_to_keep].copy()
df_selected.columns = ['filename', 'aqi', 'aqi_class', 'co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']

df_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df_selected.dropna()

def map_aqi_class(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"

df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)

sample_per_class = 750

available_counts = df_cleaned['aqi_class_mapped'].value_counts()
min_available = available_counts.min()
if sample_per_class > min_available:
    print(f"⚠️ WARNING: Requested {sample_per_class} samples per class, but smallest class only has {min_available} samples.")
    sample_per_class = min_available
    print(f"Using {sample_per_class} samples per class instead.")

balanced_df = (
    df_cleaned.groupby('aqi_class_mapped', group_keys=False)
    .apply(lambda x: x.sample(n=sample_per_class, random_state=42))
    .reset_index(drop=True)
)

print("Balanced class distribution:")
print(balanced_df['aqi_class_mapped'].value_counts())

# ================== Dataset Class ===================

class AQIRegressionDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        # Sensor features used in the model
        self.sensor_cols = ['co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']
        self.sensor_data = self.data[self.sensor_cols].values.astype(np.float32)
        self.aqi = self.data['aqi'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        sensor = torch.tensor(self.sensor_data[idx], dtype=torch.float32)
        target = torch.tensor(self.aqi[idx], dtype=torch.float32)
        return image, sensor, target

# ================== Model Definition ===================

class MultimodalAQIPredictor(nn.Module):
    def __init__(self, sensor_dim=6, embed_dim=128):
        super().__init__()
        mobilenet = models.mobilenet_v2(pretrained=True)
        self.image_encoder = mobilenet.features
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.img_fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1280, embed_dim),
            nn.ReLU()
        )
        # Predict sensor features from image embedding (used in val/test)
        self.sensor_predictor = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, sensor_dim)
        )
        # Regress AQI from concatenated [image emb + sensor]
        self.aqi_regressor = nn.Sequential(
            nn.Linear(embed_dim + sensor_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, image, sensor=None, mode='train'):
        img_feat = self.image_encoder(image)
        img_feat = self.avgpool(img_feat)
        img_feat = self.img_fc(img_feat)

        pred_sensor = self.sensor_predictor(img_feat)

        if mode == 'train':
            # During training: use true sensor values
            assert sensor is not None, "Sensor data required in training"
            fused = torch.cat([img_feat, sensor], dim=1)
        else:
            # During val/test: use predicted sensor features
            fused = torch.cat([img_feat, pred_sensor], dim=1)

        aqi_pred = self.aqi_regressor(fused).squeeze(1)
        return aqi_pred, pred_sensor

# ================== Loss Function ===================

def custom_loss(aqi_pred, aqi_true, pred_sensor, true_sensor, alpha=0.4):
    loss_aqi = F.mse_loss(aqi_pred, aqi_true)
    loss_sensor = F.mse_loss(pred_sensor, true_sensor)
    return (1 - alpha) * loss_aqi + alpha * loss_sensor

# ================== Training and Validation Functions ===================

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for images, sensors, targets in dataloader:
        images = images.to(device)
        sensors = sensors.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        aqi_pred, pred_sensor = model(images, sensors, mode='train')
        loss = custom_loss(aqi_pred, targets, pred_sensor, sensors)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def validate_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for images, sensors, targets in dataloader:
            images = images.to(device)
            targets = targets.to(device)

            aqi_pred, _ = model(images, sensor=None, mode='eval')
            loss = F.mse_loss(aqi_pred, targets)  # only AQI loss
            total_loss += loss.item()
    return total_loss / len(dataloader)

# ================== Main ===================

if __name__ == "__main__":
    import torch.optim as optim
    from torchvision import transforms

    SEED = 42
    torch.manual_seed(SEED)

    image_dir = "/content/drive/MyDrive/AQI/All_img"  # <---- set your image folder path here

    transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = AQIRegressionDataset(balanced_df, image_dir, transform=transform)

    total_len = len(dataset)
    train_len = int(0.7 * total_len)
    val_len = int(0.15 * total_len)
    test_len = total_len - train_len - val_len

    train_ds, val_ds, test_ds = random_split(dataset, [train_len, val_len, test_len],
                                             generator=torch.Generator().manual_seed(SEED))

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalAQIPredictor(sensor_dim=6, embed_dim=128).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    epochs = 20
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, device)
        val_loss = validate_epoch(model, val_loader, device)
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

    torch.save(model.state_dict(), "multimodal_aqi_model.pth")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)
  .apply(lambda x: x.sample(n=sample_per_class, random_state=42))


Balanced class distribution:
aqi_class_mapped
Good                              750
Hazardous                         750
Moderate                          750
Unhealthy                         750
Unhealthy for Sensitive Groups    750
Very Unhealthy                    750
Name: count, dtype: int64
Epoch 1/20 - Train Loss: 27628.1442 - Val Loss: 26651.0302


KeyboardInterrupt: 

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms
from PIL import Image
import os
from tqdm import tqdm

# ================== Data Cleaning & Normalization ===================

# Assuming df is already loaded
# df = pd.read_csv("/content/drive/MyDrive/AQI/IND_and_Nep_AQI_Dataset.csv")

columns_to_keep = ['Filename', 'AQI', 'AQI_Class', 'CO', 'SO2', 'NO2', 'O3', 'PM2.5', 'PM10']
df_selected = df[columns_to_keep].copy()
df_selected.columns = ['filename', 'aqi', 'aqi_class', 'co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']

df_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df_selected.dropna()

def map_aqi_class(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"

df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)

# Normalize AQI
aqi_mean = df_cleaned['aqi'].mean()
aq_i_std = df_cleaned['aqi'].std()
df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std

# Balance classes
sample_per_class = 750
available_counts = df_cleaned['aqi_class_mapped'].value_counts()
sample_per_class = min(sample_per_class, available_counts.min())

balanced_df = (
    df_cleaned.groupby('aqi_class_mapped', group_keys=False)
    .apply(lambda x: x.sample(n=sample_per_class, random_state=42))
    .reset_index(drop=True)
)

print("Balanced class distribution:")
print(balanced_df['aqi_class_mapped'].value_counts())

# ================== Dataset Class ===================

class AQIRegressionDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        self.sensor_cols = ['co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']
        self.sensor_data = self.data[self.sensor_cols].values.astype(np.float32)
        self.aqi = self.data['aqi_norm'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        sensor = torch.tensor(self.sensor_data[idx], dtype=torch.float32)
        target = torch.tensor(self.aqi[idx], dtype=torch.float32)
        return image, sensor, target

# ================== Model Definition ===================

class MultimodalAQIPredictor(nn.Module):
    def __init__(self, sensor_dim=6, embed_dim=128):
        super().__init__()
        mobilenet = models.mobilenet_v2(pretrained=True)
        self.image_encoder = mobilenet.features
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.img_fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1280, embed_dim),
            nn.ReLU()
        )
        self.sensor_predictor = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, sensor_dim)
        )
        self.aqi_regressor = nn.Sequential(
            nn.Linear(embed_dim + sensor_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, image, sensor=None, mode='train'):
        img_feat = self.image_encoder(image)
        img_feat = self.avgpool(img_feat)
        img_feat = self.img_fc(img_feat)

        pred_sensor = self.sensor_predictor(img_feat)

        if mode == 'train':
            assert sensor is not None
            fused = torch.cat([img_feat, sensor], dim=1)
        else:
            fused = torch.cat([img_feat, pred_sensor], dim=1)

        aqi_pred = self.aqi_regressor(fused).squeeze(1)
        return aqi_pred, pred_sensor

# ================== Loss Function ===================

def custom_loss(aqi_pred, aqi_true, pred_sensor, true_sensor, alpha=0.4):
    loss_aqi = F.mse_loss(aqi_pred, aqi_true)
    loss_sensor = F.mse_loss(pred_sensor, true_sensor)
    return (1 - alpha) * loss_aqi + alpha * loss_sensor

# ================== Training and Validation ===================

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    loop = tqdm(dataloader, desc="Training", leave=False)
    for images, sensors, targets in loop:
        images = images.to(device)
        sensors = sensors.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        aqi_pred, pred_sensor = model(images, sensors, mode='train')
        loss = custom_loss(aqi_pred, targets, pred_sensor, sensors)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    return total_loss / len(dataloader)

def validate_epoch(model, dataloader, device):
    model.eval()
    total_loss = 0
    loop = tqdm(dataloader, desc="Validating", leave=False)
    with torch.no_grad():
        for images, sensors, targets in loop:
            images = images.to(device)
            targets = targets.to(device)

            aqi_pred, _ = model(images, sensor=None, mode='eval')
            loss = F.mse_loss(aqi_pred, targets)
            total_loss += loss.item()
            loop.set_postfix(val_loss=loss.item())
    return total_loss / len(dataloader)

# ================== Main Training Loop ===================

if __name__ == "__main__":
    import torch.optim as optim

    SEED = 42
    torch.manual_seed(SEED)

    image_dir = "/content/drive/MyDrive/AQI/All_img"
    transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = AQIRegressionDataset(balanced_df, image_dir, transform=transform)

    total_len = len(dataset)
    train_len = int(0.7 * total_len)
    val_len = int(0.15 * total_len)
    test_len = total_len - train_len - val_len

    train_ds, val_ds, test_ds = random_split(dataset, [train_len, val_len, test_len],
                                             generator=torch.Generator().manual_seed(SEED))

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalAQIPredictor(sensor_dim=6, embed_dim=128).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    epochs = 20
    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, device)
        val_loss = validate_epoch(model, val_loader, device)
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

    torch.save(model.state_dict(), "multimodal_aqi_model.pth")
    print("✅ Training complete.")
    print(f"AQI Mean: {aqi_mean:.2f} | AQI Std: {aq_i_std:.2f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std
  .apply(lambda x: x.sample(n=sample_per_class, random_state=42))


Balanced class distribution:
aqi_class_mapped
Good                              750
Hazardous                         750
Moderate                          750
Unhealthy                         750
Unhealthy for Sensitive Groups    750
Very Unhealthy                    750
Name: count, dtype: int64




Epoch 1/20 - Train Loss: 6269.8061 - Val Loss: 1.7277




KeyboardInterrupt: 

In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms
from PIL import Image
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import joblib

# ================== Data Cleaning & Normalization ===================

# Assuming df is already loaded
df = pd.read_csv("/content/drive/MyDrive/AQI/IND_and_Nep_AQI_Dataset.csv")

columns_to_keep = ['Filename', 'AQI', 'AQI_Class', 'CO', 'SO2', 'NO2', 'O3', 'PM2.5', 'PM10']
df_selected = df[columns_to_keep].copy()
df_selected.columns = ['filename', 'aqi', 'aqi_class', 'co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']

df_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df_selected.dropna()

def map_aqi_class(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"

df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)

# Normalize AQI target
aqi_mean = df_cleaned['aqi'].mean()
aq_i_std = df_cleaned['aqi'].std()
df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std

# Normalize sensor features BEFORE balancing and dataset creation
sensor_cols = ['co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']
scaler = StandardScaler()
df_cleaned[sensor_cols] = scaler.fit_transform(df_cleaned[sensor_cols])

# Save the scaler for inference later
joblib.dump(scaler, "sensor_scaler.pkl")

# Balance classes
sample_per_class = 350
available_counts = df_cleaned['aqi_class_mapped'].value_counts()
sample_per_class = min(sample_per_class, available_counts.min())

balanced_df = (
    df_cleaned.groupby('aqi_class_mapped', group_keys=False)
    .apply(lambda x: x.sample(n=sample_per_class, random_state=42))
    .reset_index(drop=True)
)

print("Balanced class distribution:")
print(balanced_df['aqi_class_mapped'].value_counts())

# ================== Dataset Class ===================

class AQIRegressionDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        self.sensor_cols = sensor_cols
        self.sensor_data = self.data[self.sensor_cols].values.astype(np.float32)
        self.aqi = self.data['aqi_norm'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        sensor = torch.tensor(self.sensor_data[idx], dtype=torch.float32)
        target = torch.tensor(self.aqi[idx], dtype=torch.float32)
        return image, sensor, target

# ================== Model Definition ===================

class MultimodalAQIPredictor(nn.Module):
    def __init__(self, sensor_dim=6, embed_dim=128):
        super().__init__()
        mobilenet = models.mobilenet_v2(pretrained=True)
        self.image_encoder = mobilenet.features
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.img_fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1280, embed_dim),
            nn.ReLU()
        )
        self.sensor_predictor = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, sensor_dim)
        )
        self.aqi_regressor = nn.Sequential(
            nn.Linear(embed_dim + sensor_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, image, sensor=None, mode='train'):
        img_feat = self.image_encoder(image)
        img_feat = self.avgpool(img_feat)
        img_feat = self.img_fc(img_feat)

        pred_sensor = self.sensor_predictor(img_feat)

        if mode == 'train':
            assert sensor is not None
            fused = torch.cat([img_feat, sensor], dim=1)
        else:
            fused = torch.cat([img_feat, pred_sensor], dim=1)

        aqi_pred = self.aqi_regressor(fused).squeeze(1)
        return aqi_pred, pred_sensor

# ================== Loss Function ===================

def custom_loss(aqi_pred, aqi_true, pred_sensor, true_sensor, alpha=0.4):
    loss_aqi = F.mse_loss(aqi_pred, aqi_true)
    loss_sensor = F.mse_loss(pred_sensor, true_sensor)
    return (1 - alpha) * loss_aqi + alpha * loss_sensor

# ================== Training and Validation ===================

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    total_mse = 0
    loop = tqdm(dataloader, desc="Training", leave=False)
    for images, sensors, targets in loop:
        images = images.to(device)
        sensors = sensors.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        aqi_pred, pred_sensor = model(images, sensors, mode='train')
        loss = custom_loss(aqi_pred, targets, pred_sensor, sensors)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        # Also track pure MSE on AQI for RMSE metric
        mse = F.mse_loss(aqi_pred, targets).item()
        total_mse += mse

        loop.set_postfix(loss=loss.item(), mse=mse)
    avg_loss = total_loss / len(dataloader)
    avg_mse = total_mse / len(dataloader)
    return avg_loss, avg_mse

def validate_epoch(model, dataloader, device):
    model.eval()
    total_mse = 0
    loop = tqdm(dataloader, desc="Validating", leave=False)
    with torch.no_grad():
        for images, sensors, targets in loop:
            images = images.to(device)
            targets = targets.to(device)

            aqi_pred, _ = model(images, sensor=None, mode='eval')
            mse = F.mse_loss(aqi_pred, targets).item()
            total_mse += mse
            loop.set_postfix(val_mse=mse)
    avg_mse = total_mse / len(dataloader)
    return avg_mse

# ================== Main Training Loop ===================

if __name__ == "__main__":
    import torch.optim as optim

    SEED = 42
    torch.manual_seed(SEED)

    image_dir = "/content/drive/MyDrive/AQI/All_img"
    transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = AQIRegressionDataset(balanced_df, image_dir, transform=transform)

    total_len = len(dataset)
    train_len = int(0.7 * total_len)
    val_len = int(0.15 * total_len)
    test_len = total_len - train_len - val_len

    train_ds, val_ds, test_ds = random_split(dataset, [train_len, val_len, test_len],
                                             generator=torch.Generator().manual_seed(SEED))

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalAQIPredictor(sensor_dim=6, embed_dim=128).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    epochs = 20
    for epoch in range(epochs):
        train_loss, train_mse = train_epoch(model, train_loader, optimizer, device)
        val_mse = validate_epoch(model, val_loader, device)

        # Convert normalized MSE to RMSE in original AQI scale
        train_rmse = (train_mse ** 0.5) * aq_i_std
        val_rmse = (val_mse ** 0.5) * aq_i_std

        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f} - Train RMSE: {train_rmse:.2f} AQI - "
              f"Val RMSE: {val_rmse:.2f} AQI")

    torch.save(model.state_dict(), "multimodal_aqi_model.pth")
    print("✅ Training complete.")
    print(f"AQI Mean: {aqi_mean:.2f} | AQI Std: {aq_i_std:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[sensor_cols] = scaler.fit_transform(df_cleaned[sensor_c

Balanced class distribution:
aqi_class_mapped
Good                              350
Hazardous                         350
Moderate                          350
Unhealthy                         350
Unhealthy for Sensitive Groups    350
Very Unhealthy                    350
Name: count, dtype: int64


Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 35.5MB/s]


Epoch 1/20 - Train Loss: 0.6287 - Train RMSE: 73.92 AQI - Val RMSE: 40.52 AQI




Epoch 2/20 - Train Loss: 0.2751 - Train RMSE: 37.11 AQI - Val RMSE: 33.68 AQI




Epoch 3/20 - Train Loss: 0.1739 - Train RMSE: 29.09 AQI - Val RMSE: 36.92 AQI




KeyboardInterrupt: 

In [9]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms
from PIL import Image
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import joblib

# ================== Data Cleaning & Normalization ===================

df = pd.read_csv("/content/drive/MyDrive/AQI/IND_and_Nep_AQI_Dataset.csv")

columns_to_keep = ['Filename', 'AQI', 'AQI_Class', 'CO', 'SO2', 'NO2', 'O3', 'PM2.5', 'PM10']
df_selected = df[columns_to_keep].copy()
df_selected.columns = ['filename', 'aqi', 'aqi_class', 'co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']

df_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df_selected.dropna()

def map_aqi_class(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"

df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)

# Mean and std for AQI
aqi_mean = df_cleaned['aqi'].mean()
aq_i_std = df_cleaned['aqi'].std()

# Normalize AQI target
df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std

# Normalize sensor features BEFORE balancing and dataset creation
sensor_cols = ['co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']
scaler = StandardScaler()
df_cleaned[sensor_cols] = scaler.fit_transform(df_cleaned[sensor_cols])

# Save scaler for inference
joblib.dump(scaler, "sensor_scaler.pkl")

# Balance classes
sample_per_class = 350
available_counts = df_cleaned['aqi_class_mapped'].value_counts()
sample_per_class = min(sample_per_class, available_counts.min())

balanced_df = (
    df_cleaned.groupby('aqi_class_mapped', group_keys=False)
    .apply(lambda x: x.sample(n=sample_per_class, random_state=42))
    .reset_index(drop=True)
)

print("Balanced class distribution:")
print(balanced_df['aqi_class_mapped'].value_counts())

# ================== Dataset Class ===================

class AQIRegressionDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        self.sensor_cols = sensor_cols
        self.sensor_data = self.data[self.sensor_cols].values.astype(np.float32)
        self.aqi = self.data['aqi_norm'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        sensor = torch.tensor(self.sensor_data[idx], dtype=torch.float32)
        target = torch.tensor(self.aqi[idx], dtype=torch.float32)
        return image, sensor, target

# ================== Model Definition ===================

class MultimodalAQIPredictor(nn.Module):
    def __init__(self, sensor_dim=6, embed_dim=128):
        super().__init__()
        mobilenet = models.mobilenet_v2(pretrained=True)
        self.image_encoder = mobilenet.features
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.img_fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1280, embed_dim),
            nn.ReLU()
        )
        self.sensor_predictor = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, sensor_dim)
        )
        self.aqi_regressor = nn.Sequential(
            nn.Linear(embed_dim + sensor_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, image, sensor=None, mode='train'):
        img_feat = self.image_encoder(image)
        img_feat = self.avgpool(img_feat)
        img_feat = self.img_fc(img_feat)

        pred_sensor = self.sensor_predictor(img_feat)

        if mode == 'train':
            assert sensor is not None
            fused = torch.cat([img_feat, sensor], dim=1)
        else:
            fused = torch.cat([img_feat, pred_sensor], dim=1)

        aqi_pred = self.aqi_regressor(fused).squeeze(1)
        return aqi_pred, pred_sensor

# ================== Loss Function ===================

def custom_loss(aqi_pred, aqi_true, pred_sensor, true_sensor, alpha=0.4):
    loss_aqi = F.mse_loss(aqi_pred, aqi_true)
    loss_sensor = F.mse_loss(pred_sensor, true_sensor)
    return (1 - alpha) * loss_aqi + alpha * loss_sensor

# ================== AQI Class Mapping for batches ===================

def map_aqi_class_tensor(aqi_values):
    classes = []
    for aqi in aqi_values:
        if aqi <= 50:
            classes.append("Good")
        elif aqi <= 100:
            classes.append("Moderate")
        elif aqi <= 150:
            classes.append("Unhealthy for Sensitive Groups")
        elif aqi <= 200:
            classes.append("Unhealthy")
        elif aqi <= 300:
            classes.append("Very Unhealthy")
        else:
            classes.append("Hazardous")
    return classes

# ================== Train and Validate ===================

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    total_mse = 0
    total_correct = 0
    total_samples = 0

    loop = tqdm(dataloader, desc="Training", leave=False)
    for images, sensors, targets in loop:
        images = images.to(device)
        sensors = sensors.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        aqi_pred_norm, pred_sensor = model(images, sensors, mode='train')
        loss = custom_loss(aqi_pred_norm, targets, pred_sensor, sensors)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_mse += F.mse_loss(aqi_pred_norm, targets).item()

        # Convert normalized to original AQI scale for class accuracy
        aqi_pred = aqi_pred_norm.detach().cpu().numpy() * aq_i_std + aqi_mean
        aqi_true = targets.detach().cpu().numpy() * aq_i_std + aqi_mean

        pred_classes = map_aqi_class_tensor(aqi_pred)
        true_classes = map_aqi_class_tensor(aqi_true)

        correct = sum(p == t for p, t in zip(pred_classes, true_classes))
        total_correct += correct
        total_samples += len(aqi_true)

        loop.set_postfix(loss=loss.item(),
                         mse=total_mse/(loop.n+1),
                         acc=100*total_correct/total_samples)

    avg_loss = total_loss / len(dataloader)
    avg_mse = total_mse / len(dataloader)
    accuracy = 100 * total_correct / total_samples
    return avg_loss, avg_mse, accuracy

def validate_epoch(model, dataloader, device):
    model.eval()
    total_mse = 0
    total_correct = 0
    total_samples = 0

    loop = tqdm(dataloader, desc="Validating", leave=False)
    with torch.no_grad():
        for images, sensors, targets in loop:
            images = images.to(device)
            sensors = sensors.to(device)
            targets = targets.to(device)

            aqi_pred_norm, _ = model(images, sensors, mode='train')

            total_mse += F.mse_loss(aqi_pred_norm, targets).item()

            aqi_pred = aqi_pred_norm.cpu().numpy() * aq_i_std + aqi_mean
            aqi_true = targets.cpu().numpy() * aq_i_std + aqi_mean

            pred_classes = map_aqi_class_tensor(aqi_pred)
            true_classes = map_aqi_class_tensor(aqi_true)

            correct = sum(p == t for p, t in zip(pred_classes, true_classes))
            total_correct += correct
            total_samples += len(aqi_true)

            loop.set_postfix(val_mse=total_mse/(loop.n+1),
                             val_acc=100*total_correct/total_samples)

    avg_mse = total_mse / len(dataloader)
    accuracy = 100 * total_correct / total_samples
    return avg_mse, accuracy

# ================== Main Training Loop ===================

if __name__ == "__main__":
    import torch.optim as optim

    SEED = 42
    torch.manual_seed(SEED)

    image_dir = "/content/drive/MyDrive/AQI/All_img"
    transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = AQIRegressionDataset(balanced_df, image_dir, transform=transform)

    total_len = len(dataset)
    train_len = int(0.7 * total_len)
    val_len = int(0.15 * total_len)
    test_len = total_len - train_len - val_len

    train_ds, val_ds, test_ds = random_split(dataset, [train_len, val_len, test_len],
                                             generator=torch.Generator().manual_seed(SEED))

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalAQIPredictor(sensor_dim=6, embed_dim=128).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    epochs = 20
    for epoch in range(epochs):
        train_loss, train_mse, train_acc = train_epoch(model, train_loader, optimizer, device)
        val_mse, val_acc = validate_epoch(model, val_loader, device)

        train_rmse = (train_mse ** 0.5) * aq_i_std
        val_rmse = (val_mse ** 0.5) * aq_i_std

        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f} - Train RMSE: {train_rmse:.2f} AQI - Train Acc: {train_acc:.2f}% - "
              f"Val RMSE: {val_rmse:.2f} AQI - Val Acc: {val_acc:.2f}%")

    torch.save(model.state_dict(), "multimodal_aqi_model.pth")
    print("✅ Training complete.")
    print(f"AQI Mean: {aqi_mean:.2f} | AQI Std: {aq_i_std:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[sensor_cols] = scaler.fit_transform(df_cleaned[sensor_c

Balanced class distribution:
aqi_class_mapped
Good                              350
Hazardous                         350
Moderate                          350
Unhealthy                         350
Unhealthy for Sensitive Groups    350
Very Unhealthy                    350
Name: count, dtype: int64




Epoch 1/20 - Train Loss: 0.6287 - Train RMSE: 73.92 AQI - Train Acc: 40.68% - Val RMSE: 36.97 AQI - Val Acc: 53.97%




Epoch 2/20 - Train Loss: 0.2751 - Train RMSE: 37.11 AQI - Train Acc: 59.18% - Val RMSE: 30.72 AQI - Val Acc: 65.40%




Epoch 3/20 - Train Loss: 0.1739 - Train RMSE: 29.09 AQI - Train Acc: 63.81% - Val RMSE: 33.88 AQI - Val Acc: 61.90%




Epoch 4/20 - Train Loss: 0.1302 - Train RMSE: 25.57 AQI - Train Acc: 68.44% - Val RMSE: 26.21 AQI - Val Acc: 66.03%




Epoch 5/20 - Train Loss: 0.1038 - Train RMSE: 24.27 AQI - Train Acc: 70.20% - Val RMSE: 23.59 AQI - Val Acc: 72.38%




Epoch 6/20 - Train Loss: 0.0779 - Train RMSE: 21.25 AQI - Train Acc: 71.84% - Val RMSE: 26.20 AQI - Val Acc: 71.11%




Epoch 7/20 - Train Loss: 0.0557 - Train RMSE: 17.97 AQI - Train Acc: 76.80% - Val RMSE: 20.57 AQI - Val Acc: 77.46%




Epoch 8/20 - Train Loss: 0.0498 - Train RMSE: 17.62 AQI - Train Acc: 78.23% - Val RMSE: 20.94 AQI - Val Acc: 77.14%




Epoch 9/20 - Train Loss: 0.0484 - Train RMSE: 17.64 AQI - Train Acc: 76.12% - Val RMSE: 22.94 AQI - Val Acc: 73.65%




Epoch 10/20 - Train Loss: 0.0477 - Train RMSE: 19.59 AQI - Train Acc: 74.83% - Val RMSE: 24.89 AQI - Val Acc: 72.70%




Epoch 11/20 - Train Loss: 0.0386 - Train RMSE: 15.84 AQI - Train Acc: 81.22% - Val RMSE: 18.97 AQI - Val Acc: 78.41%




Epoch 12/20 - Train Loss: 0.0406 - Train RMSE: 16.15 AQI - Train Acc: 79.25% - Val RMSE: 22.18 AQI - Val Acc: 73.02%




KeyboardInterrupt: 

In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import models, transforms
from PIL import Image
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import joblib

# ================== Data Cleaning & Normalization ===================

df = pd.read_csv("/content/drive/MyDrive/AQI/IND_and_Nep_AQI_Dataset.csv")

columns_to_keep = ['Filename', 'AQI', 'AQI_Class', 'CO', 'SO2', 'NO2', 'O3', 'PM2.5', 'PM10']
df_selected = df[columns_to_keep].copy()
df_selected.columns = ['filename', 'aqi', 'aqi_class', 'co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']

df_selected.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df_selected.dropna()

def map_aqi_class(aqi):
    if aqi <= 50:
        return "Good"
    elif aqi <= 100:
        return "Moderate"
    elif aqi <= 150:
        return "Unhealthy for Sensitive Groups"
    elif aqi <= 200:
        return "Unhealthy"
    elif aqi <= 300:
        return "Very Unhealthy"
    else:
        return "Hazardous"

df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)

# Mean and std for AQI
aqi_mean = df_cleaned['aqi'].mean()
aq_i_std = df_cleaned['aqi'].std()

# Normalize AQI target
df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std

# Normalize sensor features BEFORE balancing and dataset creation
sensor_cols = ['co', 'so2', 'no2', 'o3', 'pm2_5', 'pm10']
scaler = StandardScaler()
df_cleaned[sensor_cols] = scaler.fit_transform(df_cleaned[sensor_cols])

# Save scaler for inference
joblib.dump(scaler, "sensor_scaler.pkl")

# Balance classes
sample_per_class = 1300
available_counts = df_cleaned['aqi_class_mapped'].value_counts()
sample_per_class = min(sample_per_class, available_counts.min())

balanced_df = (
    df_cleaned.groupby('aqi_class_mapped', group_keys=False)
    .apply(lambda x: x.sample(n=sample_per_class, random_state=42))
    .reset_index(drop=True)
)

print("Balanced class distribution:")
print(balanced_df['aqi_class_mapped'].value_counts())

# ================== Dataset Class ===================

class AQIRegressionDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform

        self.sensor_cols = sensor_cols
        self.sensor_data = self.data[self.sensor_cols].values.astype(np.float32)
        self.aqi = self.data['aqi_norm'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['filename'])
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        sensor = torch.tensor(self.sensor_data[idx], dtype=torch.float32)
        target = torch.tensor(self.aqi[idx], dtype=torch.float32)
        return image, sensor, target

# ================== Model Definition with Dropout ===================

class MultimodalAQIPredictor(nn.Module):
    def __init__(self, sensor_dim=6, embed_dim=128):
        super().__init__()
        mobilenet = models.mobilenet_v2(pretrained=True)
        self.image_encoder = mobilenet.features
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.img_fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1280, embed_dim),
            nn.ReLU(),
            nn.Dropout(0.1)  # Dropout added here
        )
        self.sensor_predictor = nn.Sequential(
            nn.Linear(embed_dim, 64),
            nn.ReLU(),
            nn.Linear(64, sensor_dim)
        )
        self.aqi_regressor = nn.Sequential(
            nn.Linear(embed_dim + sensor_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.1),  # Dropout added here
            nn.Linear(128, 1)
        )

    def forward(self, image, sensor=None, mode='train'):
        img_feat = self.image_encoder(image)
        img_feat = self.avgpool(img_feat)
        img_feat = self.img_fc(img_feat)

        pred_sensor = self.sensor_predictor(img_feat)

        if mode == 'train':
            assert sensor is not None
            fused = torch.cat([img_feat, sensor], dim=1)
        else:
            fused = torch.cat([img_feat, pred_sensor], dim=1)

        aqi_pred = self.aqi_regressor(fused).squeeze(1)
        return aqi_pred, pred_sensor

# ================== Loss Function ===================

def custom_loss(aqi_pred, aqi_true, pred_sensor, true_sensor, alpha=0.4):
    loss_aqi = F.mse_loss(aqi_pred, aqi_true)
    loss_sensor = F.mse_loss(pred_sensor, true_sensor)
    return (1 - alpha) * loss_aqi + alpha * loss_sensor

# ================== AQI Class Mapping for batches ===================

def map_aqi_class_tensor(aqi_values):
    classes = []
    for aqi in aqi_values:
        if aqi <= 50:
            classes.append("Good")
        elif aqi <= 100:
            classes.append("Moderate")
        elif aqi <= 150:
            classes.append("Unhealthy for Sensitive Groups")
        elif aqi <= 200:
            classes.append("Unhealthy")
        elif aqi <= 300:
            classes.append("Very Unhealthy")
        else:
            classes.append("Hazardous")
    return classes

# ================== Train and Validate ===================

def train_epoch(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    total_mse = 0
    total_correct = 0
    total_samples = 0

    loop = tqdm(dataloader, desc="Training", leave=False)
    for images, sensors, targets in loop:
        images = images.to(device)
        sensors = sensors.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        aqi_pred_norm, pred_sensor = model(images, sensors, mode='train')
        loss = custom_loss(aqi_pred_norm, targets, pred_sensor, sensors)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_mse += F.mse_loss(aqi_pred_norm, targets).item()

        # Convert normalized to original AQI scale for class accuracy
        aqi_pred = aqi_pred_norm.detach().cpu().numpy() * aq_i_std + aqi_mean
        aqi_true = targets.detach().cpu().numpy() * aq_i_std + aqi_mean

        pred_classes = map_aqi_class_tensor(aqi_pred)
        true_classes = map_aqi_class_tensor(aqi_true)

        correct = sum(p == t for p, t in zip(pred_classes, true_classes))
        total_correct += correct
        total_samples += len(aqi_true)

        loop.set_postfix(loss=loss.item(),
                         mse=total_mse/(loop.n+1),
                         acc=100*total_correct/total_samples)

    avg_loss = total_loss / len(dataloader)
    avg_mse = total_mse / len(dataloader)
    accuracy = 100 * total_correct / total_samples
    return avg_loss, avg_mse, accuracy

def validate_epoch(model, dataloader, device):
    model.eval()
    total_mse = 0
    total_correct = 0
    total_samples = 0
    sensors = None
    loop = tqdm(dataloader, desc="Validating", leave=False)
    with torch.no_grad():
        for images, sensors, targets in loop:
            images = images.to(device)
            sensors = sensors.to(device)
            targets = targets.to(device)

            aqi_pred_norm, _ = model(images, sensors, mode='eval')

            total_mse += F.mse_loss(aqi_pred_norm, targets).item()

            aqi_pred = aqi_pred_norm.cpu().numpy() * aq_i_std + aqi_mean
            aqi_true = targets.cpu().numpy() * aq_i_std + aqi_mean

            pred_classes = map_aqi_class_tensor(aqi_pred)
            true_classes = map_aqi_class_tensor(aqi_true)

            correct = sum(p == t for p, t in zip(pred_classes, true_classes))
            total_correct += correct
            total_samples += len(aqi_true)

            loop.set_postfix(val_mse=total_mse/(loop.n+1),
                             val_acc=100*total_correct/total_samples)

    avg_mse = total_mse / len(dataloader)
    accuracy = 100 * total_correct / total_samples
    return avg_mse, accuracy

# ================== Main Training Loop with Early Stopping ===================

if __name__ == "__main__":
    import torch.optim as optim

    SEED = 42
    torch.manual_seed(SEED)

    image_dir = "/content/drive/MyDrive/AQI/All_img"
    transform = transforms.Compose([
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    dataset = AQIRegressionDataset(balanced_df, image_dir, transform=transform)

    total_len = len(dataset)
    train_len = int(0.7 * total_len)
    val_len = int(0.15 * total_len)
    test_len = total_len - train_len - val_len

    train_ds, val_ds, test_ds = random_split(dataset, [train_len, val_len, test_len],
                                             generator=torch.Generator().manual_seed(SEED))

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=32, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultimodalAQIPredictor(sensor_dim=6, embed_dim=128).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    epochs = 12
    best_val_rmse = float('inf')
    patience = 3
    wait = 0
    best_model_path = "best_multimodal_aqi_model.pth"

    for epoch in range(epochs):
        train_loss, train_mse, train_acc = train_epoch(model, train_loader, optimizer, device)
        val_mse, val_acc = validate_epoch(model, val_loader, device)

        train_rmse = (train_mse ** 0.5) * aq_i_std
        val_rmse = (val_mse ** 0.5) * aq_i_std

        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {train_loss:.4f} - Train RMSE: {train_rmse:.2f} AQI - Train Acc: {train_acc:.2f}% - "
              f"Val RMSE: {val_rmse:.2f} AQI - Val Acc: {val_acc:.2f}%")

        # Early stopping logic
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            wait = 0
            torch.save(model.state_dict(), best_model_path)
            print(f"✅ New best model saved (Val RMSE: {val_rmse:.2f})")
        else:
            wait += 1
            print(f"⏳ No improvement. Early stopping patience: {wait}/{patience}")
            if wait >= patience:
                print("⛔ Early stopping triggered.")
                break

    print("✅ Training complete.")
    print(f"AQI Mean: {aqi_mean:.2f} | AQI Std: {aq_i_std:.2f}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_class_mapped'] = df_cleaned['aqi'].apply(map_aqi_class)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['aqi_norm'] = (df_cleaned['aqi'] - aqi_mean) / aq_i_std
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[sensor_cols] = scaler.fit_transform(df_cleaned[sensor_c

Balanced class distribution:
aqi_class_mapped
Good                              1300
Hazardous                         1300
Moderate                          1300
Unhealthy                         1300
Unhealthy for Sensitive Groups    1300
Very Unhealthy                    1300
Name: count, dtype: int64




Epoch 1/12 - Train Loss: 0.3708 - Train RMSE: 52.77 AQI - Train Acc: 51.72% - Val RMSE: 24.29 AQI - Val Acc: 62.56%
✅ New best model saved (Val RMSE: 24.29)




Epoch 2/12 - Train Loss: 0.1310 - Train RMSE: 29.72 AQI - Train Acc: 65.48% - Val RMSE: 19.59 AQI - Val Acc: 72.91%
✅ New best model saved (Val RMSE: 19.59)




KeyboardInterrupt: 

In [19]:
# Test phase
model.load_state_dict(torch.load('/content/best_multimodal_aqi_model.pth'))
test_mse, test_acc = validate_epoch(model, test_loader, device)
test_rmse = (test_mse ** 0.5) * aqi_std
print(f"\n🧪 Final Test Results → RMSE: {test_rmse:.2f} AQI | Accuracy: {test_acc:.2f}%")

                                                                                         


🧪 Final Test Results → RMSE: 20.27 AQI | Accuracy: 70.60%




In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Set seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Dataset
class AQIDataset(Dataset):
    def __init__(self, dataframe, image_folder, transform=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.image_folder = image_folder
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = os.path.join(self.image_folder, row['filename'])
        image = Image.open(image_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        sensors = torch.tensor(row[['CO', 'SO2', 'NO2', 'O3', 'PM25', 'PM10']].values, dtype=torch.float32)
        aqi = torch.tensor(row['AQI'], dtype=torch.float32)
        aqi_class = torch.tensor(row['aqi_class'], dtype=torch.long)

        return image, sensors, aqi, aqi_class

# Model
class MultiModalAQINet(nn.Module):
    def __init__(self):
        super(MultiModalAQINet, self).__init__()
        # Image branch
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        self.image_fc = nn.Linear(64, 32)

        # Sensor branch
        self.sensor_fc = nn.Sequential(
            nn.Linear(6, 32),
            nn.ReLU(),
            nn.Linear(32, 32)
        )

        # Fusion
        self.fusion_fc = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # For AQI regression
        )

        # Sensor prediction (for eval mode)
        self.sensor_decoder = nn.Sequential(
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 6)
        )

    def forward(self, image, sensor=None, mode='train'):
        image_feat = self.cnn(image)
        image_feat = image_feat.view(image_feat.size(0), -1)
        image_feat = self.image_fc(image_feat)

        if mode == 'train':
            sensor_feat = self.sensor_fc(sensor)
            combined = torch.cat((image_feat, sensor_feat), dim=1)
            aqi_pred = self.fusion_fc(combined)
            return aqi_pred.squeeze(1), None
        else:
            pred_sensor = self.sensor_decoder(image_feat)
            sensor_feat = self.sensor_fc(pred_sensor)
            combined = torch.cat((image_feat, sensor_feat), dim=1)
            aqi_pred = self.fusion_fc(combined)
            return aqi_pred.squeeze(1), pred_sensor

# Evaluation

def validate_epoch(model, dataloader, device):
    model.eval()
    total_mse = 0.0
    total_correct = 0
    total = 0
    with torch.no_grad():
        for images, sensors, aqi, aqi_class in dataloader:
            images, sensors, aqi, aqi_class = images.to(device), sensors.to(device), aqi.to(device), aqi_class.to(device)
            aqi_pred_norm, _ = model(images, sensors, mode='eval')
            mse = F.mse_loss(aqi_pred_norm, aqi)
            total_mse += mse.item() * images.size(0)

            pred_class = torch.round(aqi_pred_norm).long()
            correct = (pred_class == aqi_class).sum().item()
            total_correct += correct
            total += images.size(0)

    avg_mse = total_mse / total
    accuracy = (total_correct / total) * 100
    return avg_mse, accuracy

# Main
if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True

    df = pd.read_csv("cleaned_data.csv")

    # Normalize AQI
    aqi_mean = df['aqi'].mean()
    aqi_std = df['aqi'].std()

    df['aqi'] = (df['aqi'] - aqi_mean) / aqi_std

    # Train/val/test split
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor()
    ])

    train_ds = AQIDataset(train_df, image_folder="data/images", transform=transform)
    val_ds = AQIDataset(val_df, image_folder="data/images", transform=transform)
    test_ds = AQIDataset(test_df, image_folder="data/images", transform=transform)

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_ds, batch_size=32, num_workers=2)
    test_loader = DataLoader(test_ds, batch_size=32, num_workers=2)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiModalAQINet().to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    best_val_mse = float('inf')
    best_model_path = "best_model.pth"

    for epoch in range(1, 101):
        model.train()
        total_loss = 0.0
        for images, sensors, aqi, _ in train_loader:
            images, sensors, aqi = images.to(device), sensors.to(device), aqi.to(device)
            optimizer.zero_grad()
            pred, _ = model(images, sensors, mode='train')
            loss = F.mse_loss(pred, aqi)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * images.size(0)

        train_mse = total_loss / len(train_loader.dataset)
        val_mse, val_acc = validate_epoch(model, val_loader, device)

        if val_mse < best_val_mse:
            best_val_mse = val_mse
            torch.save(model.state_dict(), best_model_path)

        train_rmse = (train_mse ** 0.5) * aqi_std
        val_rmse = (val_mse ** 0.5) * aqi_std
        print(f"Epoch {epoch} | Train RMSE: {train_rmse:.2f} | Val RMSE: {val_rmse:.2f} | Val Acc: {val_acc:.2f}%")

    torch.cuda.empty_cache()

    # Test phase
    model.load_state_dict(torch.load(best_model_path))
    test_mse, test_acc = validate_epoch(model, test_loader, device)
    test_rmse = (test_mse ** 0.5) * aqi_std
    print(f"\n🧪 Final Test Results → RMSE: {test_rmse:.2f} AQI | Accuracy: {test_acc:.2f}%")
