In [1]:
import os
import joblib
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

from scipy.spatial import cKDTree

import gdown
import zipfile


# Import Data

**Important notes**:
- Both the training and the submission datasets are loaded: the training data is needed to compute the mean and standard deviation used to standardize the features.
- Reusing the training statistics ensures that the same scaling is applied to both datasets.

In [2]:
file_id = "1qtKxonXd7Cqj-GFgLozXEsz2iPKxWZL0"
file_url = f"https://drive.google.com/uc?id={file_id}"

output_dir = "/content/"
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "data.zip")

# The archive is large (about 2 GB), so only fetch it when it is not on disk yet;
# this keeps "Restart & Run All" from downloading it again on every run.
# Delete the file manually to force a fresh download.
if not os.path.exists(output_path):
    gdown.download(url=file_url, output=output_path, quiet=False, use_cookies=False)

# Last expression of the cell: display where the archive lives.
output_path



Downloading...
From (original): https://drive.google.com/uc?id=1qtKxonXd7Cqj-GFgLozXEsz2iPKxWZL0
From (redirected): https://drive.google.com/uc?id=1qtKxonXd7Cqj-GFgLozXEsz2iPKxWZL0&confirm=t&uuid=ebef360a-2168-4115-bcc7-fd069cbc8f28
To: /content/data.zip
100%|██████████| 2.01G/2.01G [00:11<00:00, 176MB/s]


'/content/data.zip'

In [3]:
zip_path = os.path.join(output_dir, "data.zip")
extract_dir = os.path.join(output_dir, "data")

# The destination folder must exist before the archive is unpacked into it.
os.makedirs(extract_dir, exist_ok=True)

# Unpack every member of the downloaded archive into extract_dir.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)



In [4]:
# ===========================
# Training split: load every data source
# ===========================
# NOTE(review): pd.read_pickle and joblib.load unpickle their input, which can
# execute arbitrary code. Only run this cell on an archive you trust.
_train_processed_root = os.path.join("data", "data_processed")

# Main table (passed to the dataset as `main_df`).
train_df = pd.read_csv(os.path.join('data', 'Training_data_uhi_index_2025-02-18.csv'))

# 01_data_satellite: one pickled time-series table per sensor.
_train_satellite_dir = os.path.join(_train_processed_root, "01_data_satellite", "train")
s1_train_df = pd.read_pickle(os.path.join(_train_satellite_dir, "sentinel1_timeseries_train.pkl"))
s2_train_df = pd.read_pickle(os.path.join(_train_satellite_dir, "sentinel2_timeseries_train.pkl"))
ls_train_df = pd.read_pickle(os.path.join(_train_satellite_dir, "landsat8_timeseries_train.pkl"))

satellite_data = dict(s1=s1_train_df, s2=s2_train_df, ls=ls_train_df)

# 02_data_weather: weather features, stored with joblib.
weather_train_dict = joblib.load(
    os.path.join(_train_processed_root, "02_data_weather", "weather_features_train.pkl")
)

# 03_data_footprint: only the image folder is recorded here; images are opened lazily.
root_footprint_train_images = os.path.join(_train_processed_root, "03_data_footprint", "train")

# 04_data_reference_points: similarity features.
similarity_train_df = pd.read_pickle(
    os.path.join(_train_processed_root, "04_data_reference_points", "similarity_features_train.pkl")
)

# 05_data_human_feature_eng: hand-engineered features.
feature_engineering_train_df = pd.read_pickle(
    os.path.join(_train_processed_root, "05_data_human_feature_eng", "human_feat_eng_train.pkl")
)

In [5]:
# ===========================
# Submission split: load every data source
# ===========================
# NOTE(review): pd.read_pickle and joblib.load unpickle their input, which can
# execute arbitrary code. Only run this cell on an archive you trust.
_sub_processed_root = os.path.join("data", "data_processed")

# Template listing the locations to predict.
submission_template = pd.read_csv(os.path.join('data', 'Submission_template.csv'))

# 01_data_satellite: one pickled time-series table per sensor.
_sub_satellite_dir = os.path.join(_sub_processed_root, "01_data_satellite", "submission")
s1_sub_df = pd.read_pickle(os.path.join(_sub_satellite_dir, "sentinel1_timeseries_submission.pkl"))
s2_sub_df = pd.read_pickle(os.path.join(_sub_satellite_dir, "sentinel2_timeseries_submission.pkl"))
ls_sub_df = pd.read_pickle(os.path.join(_sub_satellite_dir, "landsat8_timeseries_submission.pkl"))

satellite_sub_data = dict(s1=s1_sub_df, s2=s2_sub_df, ls=ls_sub_df)

# 02_data_weather: weather features, stored with joblib.
weather_sub_dict = joblib.load(
    os.path.join(_sub_processed_root, "02_data_weather", "weather_features_sub.pkl")
)

# 03_data_footprint: only the image folder is recorded here; images are opened lazily.
root_footprint_sub_images = os.path.join(_sub_processed_root, "03_data_footprint", "submission")

# 04_data_reference_points: similarity features.
similarity_sub_df = pd.read_pickle(
    os.path.join(_sub_processed_root, "04_data_reference_points", "similarity_features_submission.pkl")
)

# 05_data_human_feature_eng: hand-engineered features.
feature_engineering_sub_df = pd.read_pickle(
    os.path.join(_sub_processed_root, "05_data_human_feature_eng", "human_feat_eng_sub.pkl")
)

# Model Inference

### Define Dataset Class

In [6]:
# Define the dataset : UnifiedUHIDataset
class UnifiedUHIDataset(Dataset):
    """Location-indexed dataset joining every data source used by the UHI model.

    For one row of ``main_df`` a sample bundles: three satellite time series
    ('s1', 's2', 'ls'), a weather sequence, a footprint image, a
    feature-engineering vector, a similarity vector, the target and the
    coordinates.

    Key conventions used by the lookups in this class:
      * satellite, feature-engineering and similarity data are keyed by
        ``(Longitude, Latitude)``;
      * ``weather_dict`` is keyed by ``(Latitude, Longitude)``.

    Normalization statistics are exposed as the attributes ``satellite_stats``,
    ``weather_stats``, ``feature_eng_stats`` and ``similarity_stats``. They can
    be overwritten after construction (e.g. with the statistics of a training
    dataset). In ``mode='submission'`` the feature-engineering and similarity
    statistics are NOT computed and must be assigned before samples are read.

    ``valid_locations`` holds *positional* row numbers of ``main_df`` (usable
    with ``.iloc``), so the dataset also works when ``main_df`` does not carry a
    default ``RangeIndex``.
    """

    # Columns removed from the satellite tables before they are used as features.
    _SATELLITE_NON_FEATURES = ['Longitude', 'Latitude', 'Period', 'UHI Index', 'datetime']
    # Satellite sources every valid location must provide, in output order.
    _SATELLITE_NAMES = ('s1', 's2', 'ls')

    def __init__(self, main_df, satellite_dfs, weather_dict, image_dir,
                 similarity_df, feature_engineering_df, transforms, mode='train'):
        """
        Unified Urban Heat Island Dataset for Machine Learning.

        Args:
            main_df (DataFrame/GeoDataFrame): Contains UHI targets and coordinates.
            satellite_dfs (dict): Dictionary with keys 's1', 's2', 'ls' containing satellite DataFrames.
            weather_dict (dict): Weather features dictionary keyed by (lat, lon).
            image_dir (str): Directory containing geographic images.
            similarity_df (DataFrame): DataFrame with similarity metrics between locations.
            feature_engineering_df (DataFrame): Feature engineered DataFrame.
            transforms (callable): Image transforms.
            mode (str): 'train' or 'submission' mode.
        """

        self.main_gdf = main_df
        self.image_dir = image_dir
        self.transforms = transforms
        self.mode = mode

        print("Processing Satellite Data & Statistics")
        self.satellite_data = self._process_satellite_data(satellite_dfs)
        self.satellite_stats = self._compute_satellite_stats(satellite_dfs)

        print("Processing Weather Data & Statistics")
        self.weather_dict = weather_dict
        # Feature names come from the first weather entry; an empty dict simply
        # yields no weather features instead of raising StopIteration.
        self.feature_keys = list(next(iter(self.weather_dict.values()), {}).keys())
        self.weather_stats = self._compute_weather_stats()

        print("Processing Feature Engineering Data & Statistics")
        self.feature_eng_data, self.feature_names = self._process_feature_engineering(feature_engineering_df)
        self.num_feature_eng = len(self.feature_names) if self.feature_eng_data else 0
        self.feature_eng_stats = self._compute_feature_eng_stats() if self.mode == 'train' else (None, None)

        print("Processing Similarity Data & Statistics")
        self.similarity_data, self.similarity_names = self._process_similarity_data(similarity_df)
        self.num_similarity = len(self.similarity_names) if self.similarity_data else 0
        self.similarity_stats = self._compute_similarity_stats() if self.mode == 'train' else (None, None)

        print("\nFiltering Valid Locations")
        self.valid_locations = self._filter_valid_locations()
        if self.valid_locations:
            print(f"Found {len(self.valid_locations)} valid locations")
            print("Data Processing Completed: All Data Available")
        else:
            print("Data Processing Completed: Missing Data in Sources")

    # ------------------------------------------------------------------
    # Raw data -> location-keyed lookups
    # ------------------------------------------------------------------
    def _process_satellite_data(self, satellite_dfs):
        """Process satellite data into location-keyed dictionaries.

        Returns:
            dict: ``{sat_name: {(lon, lat): float32 array}}`` where each array
            holds the rows of that location sorted by 'Period', with the
            non-feature columns removed.
        """
        processed = {}
        for sat_name, df in satellite_dfs.items():
            grouped = df.groupby(['Longitude', 'Latitude'])
            processed[sat_name] = {
                (lon, lat): group.sort_values('Period')
                .drop(columns=self._SATELLITE_NON_FEATURES, errors='ignore')
                .values.astype(np.float32)
                for (lon, lat), group in grouped
            }
        return processed

    @staticmethod
    def _table_to_location_dict(df):
        """Turn a per-location table into ``({(lon, lat): float32 vector}, names)``.

        Every column except 'Longitude' and 'Latitude' is treated as a feature.
        If a location appears several times, its last row wins. ``None`` gives
        ``({}, [])``.
        """
        if df is None:
            return {}, []

        feature_names = df.columns.drop(['Longitude', 'Latitude']).tolist()
        # One conversion for the whole table instead of one per row.
        matrix = df[feature_names].to_numpy().astype(np.float32)
        keys = zip(df['Longitude'].to_numpy(), df['Latitude'].to_numpy())
        feature_dict = {key: vector for key, vector in zip(keys, matrix)}
        return feature_dict, feature_names

    def _process_feature_engineering(self, df):
        """Process feature-engineered data and extract feature names."""
        return self._table_to_location_dict(df)

    def _process_similarity_data(self, df):
        """Process similarity dataframe into location-keyed dictionary."""
        return self._table_to_location_dict(df)

    # ------------------------------------------------------------------
    # Normalization statistics
    # ------------------------------------------------------------------
    def _compute_satellite_stats(self, satellite_dfs):
        """Compute normalization stats for each satellite feature.

        Returns:
            dict: ``{sat_name: (mean_array, std_array)}`` computed over all rows
            of the table. A standard deviation that is not strictly positive
            (constant or undefined column) is replaced by 1.0 so that
            normalization never divides by zero.
        """
        stats = {}
        for sat_name, df in satellite_dfs.items():
            features = df.drop(columns=self._SATELLITE_NON_FEATURES, errors='ignore')
            mean = features.mean().values
            std = features.std().values
            stats[sat_name] = (mean, np.where(std > 0, std, 1.0))
        return stats

    def _compute_weather_stats(self):
        """Compute a NaN-aware mean and std for every weather feature.

        Every row of the main table contributes the weather arrays stored under
        its ``(lat, lon)`` key; rows without a weather entry are skipped.

        Returns:
            tuple: ``(mean, std)`` dictionaries keyed by weather feature name.
            Features with no valid value get mean 0.0 and std 1.0.
        """
        # NOTE: the accumulation below is kept exactly as it was, because these
        # statistics have to match the ones used when the model was trained.
        sum_ = defaultdict(float)
        sum_sq = defaultdict(float)
        count = defaultdict(int)

        # Iterate over the coordinate columns positionally, so the index labels
        # of the main table (e.g. after a split or a filter) do not matter.
        latitudes = self.main_gdf['Latitude'].to_numpy()
        longitudes = self.main_gdf['Longitude'].to_numpy()
        for lat, lon in zip(latitudes, longitudes):
            weather = self.weather_dict.get((lat, lon), {})

            for k in self.feature_keys:
                if k in weather:
                    data = weather[k].astype(np.float32)
                    if data.size > 0:
                        sum_[k] += np.nansum(data)  # Use nan-safe sum
                        sum_sq[k] += np.nansum(data**2)
                        count[k] += np.count_nonzero(~np.isnan(data))  # Only count non-NaN elements

        mean = {k: sum_[k] / count[k] if count[k] > 0 else 0.0 for k in self.feature_keys}
        std = {
            k: np.sqrt(
                max(sum_sq[k] / count[k] - (sum_[k] / count[k]) ** 2, 0) + 1e-8
            ) if count[k] > 0 else 1.0  # Avoid division by zero and ensure non-negative variance
            for k in self.feature_keys
        }

        return mean, std

    @staticmethod
    def _per_feature_stats(location_vectors, names):
        """Return ``(mean_dict, std_dict)`` over all locations, one entry per name.

        Statistics are NaN-aware and computed per feature on a float32 vector;
        1e-8 is added to each std to avoid a division by zero.
        """
        if not names:
            return ({}, {})

        mean_dict, std_dict = {}, {}
        for i, name in enumerate(names):
            values = np.array([vector[i] for vector in location_vectors.values()], dtype=np.float32)
            mean_dict[name] = np.nanmean(values)
            std_dict[name] = np.nanstd(values) + 1e-8  # Avoid division by zero
        return (mean_dict, std_dict)

    def _compute_feature_eng_stats(self):
        """Compute mean and std for each feature individually."""
        return self._per_feature_stats(self.feature_eng_data, self.feature_names)

    def _compute_similarity_stats(self):
        """Compute normalization stats for similarity features."""
        return self._per_feature_stats(self.similarity_data, self.similarity_names)

    # ------------------------------------------------------------------
    # Sample selection
    # ------------------------------------------------------------------
    def _image_path(self, lon, lat):
        """Path of the footprint image expected for one location."""
        return os.path.join(self.image_dir, f"building_road_plot_{lon}_{lat}.png")

    def _filter_valid_locations(self):
        """Find locations with complete data across all sources.

        Returns:
            list[int]: positional row numbers of the main table (for ``.iloc``)
            whose location has all three satellite series, a weather entry, an
            image file, a feature-engineering vector and a similarity vector.
        """
        valid = []
        # enumerate() provides the position; the index label is ignored on
        # purpose because __getitem__ reads rows back with .iloc.
        for position, (_, row) in enumerate(self.main_gdf.iterrows()):
            lon, lat = row['Longitude'], row['Latitude']

            sat_available = all(
                (lon, lat) in self.satellite_data[sat_name]
                for sat_name in self._SATELLITE_NAMES
            )
            # Weather is the only source keyed (lat, lon).
            weather_available = (lat, lon) in self.weather_dict
            image_available = os.path.exists(self._image_path(lon, lat))
            feature_eng_available = (lon, lat) in self.feature_eng_data
            similarity_available = (lon, lat) in self.similarity_data

            if all([sat_available, weather_available, image_available,
                    feature_eng_available, similarity_available]):
                valid.append(position)

        return valid

    def __len__(self):
        """Number of locations with complete data."""
        return len(self.valid_locations)

    @staticmethod
    def _normalize_named(values, names, stats, label):
        """Standardize ``values`` feature by feature and return a float32 tensor.

        Raises:
            RuntimeError: if there are features to normalize but ``stats`` was
                never filled in (e.g. submission mode without training stats).
        """
        mean_dict, std_dict = stats
        if names and (mean_dict is None or std_dict is None):
            raise RuntimeError(
                f"{label} statistics are not set; assign them (e.g. from the "
                f"training dataset) before reading samples."
            )
        normalized = [
            (values[i] - mean_dict[name]) / std_dict[name]
            for i, name in enumerate(names)
        ]
        return torch.tensor(normalized, dtype=torch.float32)

    def __getitem__(self, idx):
        """Build the sample for the ``idx``-th valid location.

        Returns:
            tuple: ``(s1, s2, ls, weather, image, feature_eng, similarity,
            target, location)``. ``target`` is the row's 'UHI Index' in train
            mode and -1.0 otherwise (float32 in both modes); ``location`` is
            ``[lon, lat]`` as float32.
        """
        row = self.main_gdf.iloc[self.valid_locations[idx]]
        lon, lat = row['Longitude'], row['Latitude']

        # Satellite data: standardize each series with its source's statistics.
        satellite_tensors = []
        for sat_name in self._SATELLITE_NAMES:
            data = self.satellite_data[sat_name][(lon, lat)]
            mean, std = self.satellite_stats[sat_name]
            satellite_tensors.append(torch.tensor((data - mean) / std, dtype=torch.float32))

        # Weather data: normalize each feature individually and stack into a 2D array.
        weather_raw = self.weather_dict[(lat, lon)]
        mean_dict, std_dict = self.weather_stats  # dictionaries keyed by feature
        weather_list = []
        for k in self.feature_keys:
            # Features missing for this location are skipped.
            if k in weather_raw:
                data = weather_raw[k].astype(np.float32)
                weather_list.append((data - mean_dict[k]) / std_dict[k])
        # Stack along the feature axis (each column is one weather feature).
        # Assumes all weather arrays of a location have the same length.
        weather_tensor = torch.tensor(np.stack(weather_list, axis=1), dtype=torch.float32)

        # Footprint image.
        image = Image.open(self._image_path(lon, lat)).convert('RGB')
        if self.transforms:
            image = self.transforms(image)

        # Tabular vectors.
        feature_eng_tensor = self._normalize_named(
            self.feature_eng_data[(lon, lat)], self.feature_names,
            self.feature_eng_stats, "Feature-engineering",
        )
        similarity_tensor = self._normalize_named(
            self.similarity_data[(lon, lat)], self.similarity_names,
            self.similarity_stats, "Similarity",
        )

        # Target: same dtype in both modes so batches can be handled uniformly.
        if self.mode == 'train':
            target = torch.tensor(row['UHI Index'], dtype=torch.float32)
        else:
            target = torch.tensor(-1.0, dtype=torch.float32)

        s1_tensor, s2_tensor, ls_tensor = satellite_tensors
        return (
            s1_tensor,
            s2_tensor,
            ls_tensor,
            weather_tensor,
            image,
            feature_eng_tensor,
            similarity_tensor,
            target,
            torch.tensor([lon, lat], dtype=torch.float32)
        )

    @staticmethod
    def collate_fn(batch):
        """Handle variable-length sequences and ensure image size consistency.

        Sequences (satellite and weather) are zero-padded at the end up to the
        longest sequence of the batch; every other field is stacked.

        Returns:
            dict: tensors under the keys 's1', 's2', 'ls', 'weather', 'image',
            'feature_eng', 'similarity', 'target' and 'location'.
        """
        def pad_sequences(sequences):
            max_len = max(seq.shape[0] for seq in sequences)
            padded = torch.zeros(len(sequences), max_len, sequences[0].shape[1])
            for i, seq in enumerate(sequences):
                padded[i, :seq.shape[0]] = seq
            return padded

        (s1_list, s2_list, ls_list, weather_list, image_list,
         feature_eng_list, similarity_list, target_list, loc_list) = zip(*batch)

        return {
            's1': pad_sequences(s1_list),
            's2': pad_sequences(s2_list),
            'ls': pad_sequences(ls_list),
            'weather': pad_sequences(weather_list),
            'image': torch.stack(image_list),
            'feature_eng': torch.stack(feature_eng_list),
            'similarity': torch.stack(similarity_list),
            'target': torch.stack(target_list),
            'location': torch.stack(loc_list)
        }


### Define Model Class

In [7]:
# 1. Recurrent encoder building blocks: attention pooling over time
class TemporalAttention(nn.Module):
    """Collapse a sequence of hidden states into one vector with learned weights.

    A small MLP scores every time step; the scores are soft-maxed along the
    time axis (dim 1) and used for a weighted sum of the inputs.
    """

    def __init__(self, hidden_size):
        super().__init__()
        scorer_layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        ]
        # Attribute name and layer order define the state_dict keys; keep them.
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, outputs):
        """Return the attention-weighted sum of ``outputs`` over dim 1."""
        scores = self.attention(outputs).squeeze(-1)
        weights = F.softmax(scores, dim=1).unsqueeze(-1)
        weighted = outputs * weights
        return weighted.sum(1)

class EnhancedGRU(nn.Module):
    """Bidirectional GRU encoder followed by attention pooling, LayerNorm and a linear head.

    Args:
        input_size: number of features per time step.
        hidden_size: GRU hidden size per direction.
        output_size: size of the returned embedding.
        num_layers: number of stacked GRU layers (dropout 0.3 between layers).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()
        # Forward and backward states are concatenated by the bidirectional GRU.
        bidirectional_width = hidden_size * 2
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers=num_layers,
            dropout=0.3,
            batch_first=True,
            bidirectional=True,
        )
        self.attention = TemporalAttention(bidirectional_width)
        self.norm = nn.LayerNorm(bidirectional_width)
        self.fc = nn.Linear(bidirectional_width, output_size)

    def forward(self, x):
        """Encode a batch-first sequence tensor into one embedding per sample."""
        sequence_states = self.gru(x)[0]  # final hidden state is not used
        pooled = self.attention(sequence_states)
        normalized = self.norm(pooled)
        return self.fc(normalized)

# 2. Convolutional image encoder building blocks
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a residual connection.

    When the stride or the channel count changes, the shortcut is a 1x1
    convolution followed by batch norm so that both branches have the same
    shape; otherwise the shortcut is the identity.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )
        else:
            # An empty Sequential returns its input unchanged.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Apply the block: relu(main_branch(x) + shortcut(x))."""
        main = F.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))
        summed = main + self.shortcut(x)
        return F.relu(summed)

class DeepCNN(nn.Module):
    """Image encoder: conv stem, two strided residual blocks, global average pooling, linear head.

    Args:
        output_size: size of the returned image embedding.
    """

    def __init__(self, output_size):
        super().__init__()
        stem_layers = [
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Dropout2d(0.1),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.base = nn.Sequential(*stem_layers)
        self.res_blocks = nn.Sequential(
            ResidualBlock(64, 128, stride=2),
            ResidualBlock(128, 256, stride=2),
        )
        # Global average pooling makes the head independent of the input resolution.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(256, output_size)

    def forward(self, x):
        """Map a batch of 3-channel images to embeddings of size ``output_size``."""
        feature_map = self.res_blocks(self.base(x))
        pooled = self.adaptive_pool(feature_map)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(self.dropout(flat))


# 3. MLP encoder for fixed-size tabular feature vectors
class FeatureEngExtractor(nn.Module):
    """Linear(128) -> LayerNorm -> LeakyReLU -> Dropout(0.2) -> Linear(output_size).

    Args:
        input_size: number of input features.
        output_size: size of the returned embedding.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_width = 128
        mlp_layers = [
            nn.Linear(input_size, hidden_width),
            nn.LayerNorm(hidden_width),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_width, output_size),
        ]
        # Attribute name and layer order define the state_dict keys; keep them.
        self.block = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return the embedding of the feature vectors in ``x``."""
        embedding = self.block(x)
        return embedding

# 4. Fusion head: maps the concatenated embeddings to one scalar per sample
class FusionNetwork(nn.Module):
    """Linear(64) -> LayerNorm -> LeakyReLU -> Dropout(0.2) -> Linear(1).

    Args:
        input_dim: size of the concatenated embedding fed to the head.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.LayerNorm(64),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Return one prediction per sample.

        Only the trailing size-1 feature axis is removed. A bare ``squeeze()``
        would also drop the batch axis for a batch of one sample and return a
        0-d tensor, so the output shape would depend on the batch size.
        """
        return self.net(x).squeeze(-1)

# Unified predictor: one encoder per data source, concatenated and fused into one value
class UnifiedUHIPredictor(nn.Module):
    """Multi-branch regressor for the UHI index.

    Branches: bidirectional GRU encoders for the three satellite series and the
    weather sequence, a CNN for the image, and MLPs for the feature-engineering
    and similarity vectors. Their embeddings are concatenated and passed to
    ``FusionNetwork``.

    Args:
        s1_input, s2_input, ls_input, weather_input: features per time step of each sequence.
        s1_hidden, s2_hidden, ls_hidden, weather_hidden: GRU hidden size per direction.
            Each sequence embedding has twice that size.
        num_feature_eng: length of the feature-engineering vector.
        num_similarity: length of the similarity vector.
        image_output, feature_eng_output, similarity_output: embedding sizes of
            the remaining branches.
    """

    def __init__(self,
                 s1_input=8, s2_input=52, ls_input=12, weather_input=31,
                 s1_hidden=16, s2_hidden=32, ls_hidden=24, weather_hidden=32,
                 num_feature_eng=12, num_similarity=3,
                 image_output=64, feature_eng_output=64, similarity_output=32,
                 ):
        super().__init__()

        # Each sequence encoder outputs an embedding as wide as its bidirectional state.
        s1_output = s1_hidden * 2
        s2_output = s2_hidden * 2
        ls_output = ls_hidden * 2
        weather_output = weather_hidden * 2

        # Sequence and image encoders. Attribute names define the state_dict
        # keys (note that `weather_lstm` is an EnhancedGRU), so they must not
        # be renamed if existing checkpoints are to load.
        self.s1_gru = EnhancedGRU(s1_input, s1_hidden, s1_output)
        self.s2_gru = EnhancedGRU(s2_input, s2_hidden, s2_output, num_layers=3)
        self.ls_gru = EnhancedGRU(ls_input, ls_hidden, ls_output)
        self.weather_lstm = EnhancedGRU(weather_input, weather_hidden, weather_output, num_layers=3)
        self.image_cnn = DeepCNN(output_size=image_output)

        # Tabular encoders.
        self.feature_eng_extractor = FeatureEngExtractor(input_size=num_feature_eng, output_size=feature_eng_output)
        self.similarity_extractor = FeatureEngExtractor(input_size=num_similarity, output_size=similarity_output)

        # The fusion head sees all embeddings side by side.
        fusion_input_dim = (s1_output + s2_output + ls_output + weather_output
                            + image_output + feature_eng_output + similarity_output)

        self.fusion = FusionNetwork(fusion_input_dim)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear layers (Kaiming, bias 0.1) and recurrent layers
        (Xavier input weights, orthogonal recurrent weights, zero biases)."""
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0.1)
        elif isinstance(module, (nn.GRU, nn.LSTM)):
            for name, param in module.named_parameters():
                if 'weight_ih' in name:
                    nn.init.xavier_normal_(param.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(param.data)
                elif 'bias' in name:
                    nn.init.constant_(param.data, 0)

    def forward(self, batch):
        """Predict one value per sample.

        Args:
            batch (dict): tensors under the keys 's1', 's2', 'ls', 'weather',
                'image', 'feature_eng' and 'similarity'.

        Returns:
            Tensor: 1-D tensor with one prediction per sample.
        """
        s1_emb = self.s1_gru(batch['s1'])
        s2_emb = self.s2_gru(batch['s2'])
        ls_emb = self.ls_gru(batch['ls'])
        w_emb = self.weather_lstm(batch['weather'])
        img_emb = self.image_cnn(batch['image'])
        feature_eng_emb = self.feature_eng_extractor(batch['feature_eng'])
        similarity_emb = self.similarity_extractor(batch['similarity'])

        # Concatenate all features
        combined = torch.cat([s1_emb, s2_emb, ls_emb, w_emb, img_emb, feature_eng_emb, similarity_emb], dim=1)

        # reshape(-1) keeps the batch axis even for a single-sample batch,
        # where a bare squeeze() would return a 0-d tensor.
        return self.fusion(combined).reshape(-1)


### Inference on Submission

In [8]:
# Build the training dataset only to extract its normalization statistics.

# Image preprocessing shared by the training and submission datasets.
image_transforms = transforms.Compose([
    transforms.Resize(256),          # shorter side -> 256
    transforms.CenterCrop(256),      # square 256x256 crop
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

training_dataset = UnifiedUHIDataset(
    main_df=train_df,
    satellite_dfs=satellite_data,
    weather_dict=weather_train_dict,
    image_dir=root_footprint_train_images,
    similarity_df=similarity_train_df,
    feature_engineering_df=feature_engineering_train_df,
    transforms=image_transforms,
)

# Keep the training statistics around so they can be reused for the submission data.
satellite_stats, weather_stats = training_dataset.satellite_stats, training_dataset.weather_stats
feature_eng_stats, similarity_stats = training_dataset.feature_eng_stats, training_dataset.similarity_stats

Processing Satellite Data & Statistics
Processing Weather Data & Statistics
Processing Feature Engineering Data & Statistics
Processing Similarity Data & Statistics

Filtering Valid Locations
Found 11229 valid locations
Data Processing Completed: All Data Available


In [9]:
# Create the submission dataset
submission_dataset = UnifiedUHIDataset(
    submission_template, satellite_sub_data, weather_sub_dict,
    root_footprint_sub_images,
    similarity_sub_df,
    feature_engineering_sub_df,
    transforms=image_transforms,
    mode='submission'
    )

# Normalize the submission data with the statistics of the training data,
# replacing whatever the submission dataset computed or left unset.
submission_dataset.satellite_stats = satellite_stats
submission_dataset.weather_stats = weather_stats
submission_dataset.feature_eng_stats = feature_eng_stats
submission_dataset.similarity_stats = similarity_stats

num_feature_eng = training_dataset.num_feature_eng
num_similarity = training_dataset.num_similarity

# Never ask for more worker processes than the machine has CPUs (at most 8).
loader_workers = min(8, os.cpu_count() or 1)

# Create DataLoader (shuffle=False keeps predictions in dataset order)
submission_loader = DataLoader(
    submission_dataset,
    batch_size=128,
    shuffle=False,
    collate_fn=submission_dataset.collate_fn,
    num_workers=loader_workers
)

Processing Satellite Data & Statistics
Processing Weather Data & Statistics
Processing Feature Engineering Data & Statistics
Processing Similarity Data & Statistics

Filtering Valid Locations
Found 1040 valid locations
Data Processing Completed: All Data Available




In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


model = UnifiedUHIPredictor(
    num_feature_eng=num_feature_eng, num_similarity=num_similarity
    ).to(device)

best_model_path = os.path.join('data', 'best_model.pth')
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
state_dict = torch.load(best_model_path, map_location=device)
model.load_state_dict(state_dict)
# Inference mode: disables dropout and uses the batch-norm running statistics.
model.eval()

UnifiedUHIPredictor(
  (s1_gru): EnhancedGRU(
    (gru): GRU(8, 16, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (attention): TemporalAttention(
      (attention): Sequential(
        (0): Linear(in_features=32, out_features=32, bias=True)
        (1): Tanh()
        (2): Linear(in_features=32, out_features=1, bias=True)
      )
    )
    (norm): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (fc): Linear(in_features=32, out_features=32, bias=True)
  )
  (s2_gru): EnhancedGRU(
    (gru): GRU(52, 32, num_layers=3, batch_first=True, dropout=0.3, bidirectional=True)
    (attention): TemporalAttention(
      (attention): Sequential(
        (0): Linear(in_features=64, out_features=64, bias=True)
        (1): Tanh()
        (2): Linear(in_features=64, out_features=1, bias=True)
      )
    )
    (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (fc): Linear(in_features=64, out_features=64, bias=True)
  )
  (ls_gru): EnhancedGRU(
    (gru): GR

In [11]:
predictions = []

with torch.no_grad():
    for _batch in submission_loader:
        inputs = {k: v.to(device) for k, v in _batch.items() if k != 'target'}
        outputs = model(inputs).cpu().numpy().flatten()
        predictions.extend(outputs.tolist())

# Pair each prediction with the exact coordinates of its template row.
# The loader is not shuffled, so predictions follow the order of
# submission_dataset.valid_locations, which the dataset itself reads with .iloc.
# The coordinates are deliberately NOT taken from the batch 'location' tensor:
# it is float32, and rounding can make distinct nearby points identical.
results_df = (
    submission_template
    .iloc[submission_dataset.valid_locations][['Longitude', 'Latitude']]
    .reset_index(drop=True)
)
if len(results_df) != len(predictions):
    raise RuntimeError(
        f"Got {len(predictions)} predictions for {len(results_df)} valid locations"
    )
results_df['UHI_Predicted'] = predictions
# Keep a single prediction per location (the last one) if the template repeats a point.
results_df = results_df.groupby(['Longitude', 'Latitude']).last().reset_index()

# Build a k-D tree from the results_df coordinates for fast nearest neighbor search.
tree = cKDTree(results_df[['Longitude', 'Latitude']].values)

# Query the tree for the nearest neighbor for each point in the submission template.
# Rows that were predicted match themselves; rows without a prediction receive
# the value of the closest predicted location.
# dists: Euclidean distances in degrees (not used here, but useful for quality checks)
# indices: indices of the nearest neighbors in results_df
dists, indices = tree.query(submission_template[['Longitude', 'Latitude']].values, k=1)

# Use the indices to assign the corresponding predicted UHI values to the submission template.
submission_template['UHI Index'] = results_df.iloc[indices]['UHI_Predicted'].values

In [12]:
# Export the final submission as a CSV file.
# A dedicated name is used for the destination folder: rebinding `output_dir`
# (set to "/content/" by the download cell) would break re-running the
# download / unzip cells afterwards.
submission_dir = 'data'
os.makedirs(submission_dir, exist_ok=True)

save_submission_path = os.path.join(submission_dir, 'submission.csv')
submission_template.to_csv(save_submission_path, index=False)

# Optional: Display the final DataFrame to verify the matching.
print(submission_template)


      Longitude   Latitude  UHI Index
0    -73.971665  40.788763   0.959750
1    -73.971928  40.788875   0.960044
2    -73.967080  40.789080   0.960680
3    -73.972550  40.789082   0.960373
4    -73.969697  40.787953   0.958746
...         ...        ...        ...
1035 -73.919388  40.813803   1.042457
1036 -73.931033  40.833178   1.043393
1037 -73.934647  40.854542   1.040872
1038 -73.917223  40.815413   1.037010
1039 -73.911645  40.804402   1.036858

[1040 rows x 3 columns]
