In [2]:
# !pip install -q kaggle
# !mkdir -p ~/.kaggle
# !mv kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

In [None]:
# !pip install pandas numpy torch torchvision transformers tqdm scikit-learn pillow

In [None]:
!hf auth login --token <InsertHuggingfaceToken>

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `DinoV3` has been saved to /home/jef9921/.cache/huggingface/stored_tokens
Your token has been saved to /home/jef9921/.cache/huggingface/token
Login successful.
The current active token is: `DinoV3`


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms.functional as F
from transformers import CLIPVisionModel, AutoModel
from PIL import Image
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
import warnings

# Suppress minor warnings from transformers
warnings.filterwarnings("ignore")

In [3]:
BASE_DIR = Path("kaggle_dataset")
TRAIN_DIR = BASE_DIR / "train_images"
TEST_DIR   = BASE_DIR / "test_images"
# Download data
if not BASE_DIR.exists() or not any(BASE_DIR.iterdir()):
    print("Downloading and unzipping data...")
    os.makedirs(BASE_DIR, exist_ok=True)
    print("Downloading...")
    !kaggle competitions download -c geo-guessr-street-view-cs-gy-6643
    print("Unzipping...")
    !unzip -q geo-guessr-street-view-cs-gy-6643.zip
else:
    print("Geoguessr Data folder already exists.")

Geoguessr Data folder already exists.


In [None]:
# 1. CONFIGURATION
# ==========================================
class Config:
    # Paths
    TRAIN_CSV = BASE_DIR / "train_ground_truth.csv"
    TEST_CSV = BASE_DIR / "sample_submission.csv"
    TRAIN_IMG_DIR = BASE_DIR / "train_images/"
    TEST_IMG_DIR = BASE_DIR / "test_images/"
    
    SUBMISSION_FILE = "test_submission_35_epochs_knn_tta.csv" 

    RESUME_PATH = "best_model_full.pth"  # Might need to change to "best_model_full.pth"
    
    SAVE_DIR = "checkpoints"
    os.makedirs(SAVE_DIR, exist_ok=True)
    
    # Model Hyperparameters
    MODEL_NAME_CLIP = "geolocal/StreetCLIP"
    MODEL_NAME_DINO = "facebook/dinov3-vitl16-pretrain-lvd1689m"
    IMG_SIZE = 336
    BATCH_SIZE = 4        
    NUM_WORKERS = 4
    
    # KEY CHANGES FOR 35 EPOCHS
    EPOCHS = 35           # Changed from 10
               
    LR = 1e-7             
    
    # Increase Weight Decay to prevent overfitting
    WD = 1e-2             # Increased from 1e-4
    
    # Loss Weights
    W_CLS = 1.0
    W_GPS = 15.0          # Slightly increased to emphasize GPS precision in late training
    
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

For this part, we define utilities that are directly tied to how the model is trained and evaluated, along with the dataset class that structures the input data.

We first implement the Haversine distance function, which computes the true geographic distance between two latitude longitude pairs in kilometers. We use this instead of simple Euclidean distance because the Earth is spherical, and this metric matches the GPS evaluation used in the competition. During training, it provides an interpretable measure of how far predictions are from the ground truth in real-world units.

Next, we define a GPS normalization helper. Raw latitude and longitude values span different numeric ranges and scales, which makes direct regression unstable. We normalize both coordinates using fixed means and scales so that the regression head learns in a more balanced and well-conditioned space. At inference time, the predictions are converted back to real GPS coordinates using the inverse transformation.

Finally, we implement a custom dataset class for the Street View data. Each sample consists of four images captured at the same location, facing north, east, south, and west. We load all four images, apply the same transformations, and stack them into a single tensor so the model can reason jointly over multiple directions.

If an image fails to load, we substitute a black image to avoid breaking the training loop while keeping tensor shapes consistent. The dataset supports both training and test modes. In training mode, it returns the stacked images along with state and GPS labels. In test mode, it returns only the images and the sample identifier, which is required for submission generation.

In [None]:
# 2. UTILS & DATASET
# ==========================================
def haversine_distance(pred_lat, pred_lon, true_lat, true_lon):
    """Calculates Haversine distance in km."""
    R = 6371
    phi1, phi2 = torch.deg2rad(pred_lat), torch.deg2rad(true_lat)
    dphi = torch.deg2rad(true_lat - pred_lat)
    dlambda = torch.deg2rad(true_lon - pred_lon)
    a = torch.sin(dphi/2)**2 + torch.cos(phi1)*torch.cos(phi2)*torch.sin(dlambda/2)**2
    a = torch.clamp(a, 0, 1)
    c = 2 * torch.atan2(torch.sqrt(a), torch.sqrt(1-a))
    return R * c

class GPSNormalizer:
    def __init__(self):
        self.lat_mean, self.lat_scale = 37.0, 15.0
        self.lon_mean, self.lon_scale = -95.0, 30.0

    def normalize(self, lat, lon):
        return (lat - self.lat_mean)/self.lat_scale, (lon - self.lon_mean)/self.lon_scale

    def denormalize(self, n_lat, n_lon):
        return (n_lat * self.lat_scale) + self.lat_mean, (n_lon * self.lon_scale) + self.lon_mean

class StreetViewDataset(Dataset):
    def __init__(self, csv_path, images_dir, transform=None, is_test=False):
        self.data = pd.read_csv(csv_path)
        self.images_dir = images_dir
        self.transform = transform
        self.is_test = is_test
        self.directions = ['image_north', 'image_east', 'image_south', 'image_west']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        images = []
        for d in self.directions:
            path = os.path.join(self.images_dir, row[d])
            try:
                img = Image.open(path).convert('RGB')
            except:
                img = Image.new('RGB', (224, 224), (0, 0, 0))
            if self.transform:
                img = self.transform(img)
            images.append(img)
        
        img_tensor = torch.stack(images) # [4, 3, 224, 224]
        
        if self.is_test:
            return img_tensor, row['sample_id']
        
        return img_tensor, \
               torch.tensor(row['state_idx'], dtype=torch.long), \
               torch.tensor(row['latitude'], dtype=torch.float32), \
               torch.tensor(row['longitude'], dtype=torch.float32)


For this part, we design a single model that performs both state classification and GPS prediction by fusing information from multiple views and multiple visual backbones.

We use two pretrained vision encoders. StreetCLIP is chosen because it is trained on street-level imagery and captures location-specific visual cues such as road structure, signage, and environment. DINOv3 is added to provide strong general-purpose visual features that complement StreetCLIP and improve robustness across diverse scenes.

Each directional image is passed through both backbones independently. The output features from StreetCLIP and DINOv3 are projected into a common dimensionality and concatenated. We apply layer normalization after concatenation to stabilize training and align the combined feature scale.

The four directional feature vectors are then reshaped into a sequence and passed through a Transformer encoder. We use a Transformer because different directions contribute differently depending on the scene, and the model should learn how to weigh and relate these views instead of treating them equally.

After fusion, the features from all directions are flattened into a single global representation for the location. This representation is shared by two prediction heads. The state head outputs logits over all states for classification, while the GPS head predicts normalized latitude and longitude values for regression.

The forward method also supports returning the fused feature representation directly. This is used later for nearest neighbor based GPS refinement, where the model acts as a learned embedding extractor rather than only a predictor.

In [None]:
# 3. MODEL ARCHITECTURE
# ==========================================
class GeoFusionModel(nn.Module):
    def __init__(self, num_states=50, fusion_dim=512):
        super().__init__()
        # Backbones
        self.clip = CLIPVisionModel.from_pretrained(Config.MODEL_NAME_CLIP)
        self.dino = AutoModel.from_pretrained(Config.MODEL_NAME_DINO)
        
        # Projections
        self.clip_proj = nn.Linear(self.clip.config.hidden_size, fusion_dim)
        self.dino_proj = nn.Linear(self.dino.config.hidden_size, fusion_dim)
        self.norm = nn.LayerNorm(fusion_dim * 2)
        
        # Fusion
        encoder_layer = nn.TransformerEncoderLayer(d_model=fusion_dim*2, nhead=8, batch_first=True)
        self.fusion = nn.TransformerEncoder(encoder_layer, num_layers=2)
        
        # Heads
        self.state_head = nn.Linear(fusion_dim*2*4, num_states)
        self.gps_head = nn.Linear(fusion_dim*2*4, 2)

    def forward(self, x, return_feats = False):
        B, N, C, H, W = x.shape
        x_flat = x.view(B*N, C, H, W)
        
        # Extract
        clip_feat = self.clip_proj(self.clip(x_flat).pooler_output)
        dino_feat = self.dino_proj(self.dino(x_flat).last_hidden_state[:, 0, :])
        
        # Fuse
        feat = torch.cat([clip_feat, dino_feat], dim=1) # [B*4, fusion_dim*2]
        feat = self.norm(feat)
        feat = feat.view(B, N, -1) # [B, 4, dim]
        
        # Transformer
        feat = self.fusion(feat)
        
        # Flatten and Predict
        global_feat = feat.reshape(B, -1)

        if return_feats:
            # Return logits, dummy_gps, features
            return self.state_head(global_feat), self.gps_head(global_feat), global_feat
        
        return self.state_head(global_feat), self.gps_head(global_feat)

For this part, we define the training logic for a single epoch.

The model is set to training mode so that layers such as dropout and normalization behave correctly. We track three metrics during training: total loss, state classification accuracy, and average Haversine distance in kilometers. These give both optimization and task-level feedback.

We use cross-entropy loss for state classification and mean squared error loss for GPS regression. The two losses are combined using configurable weights. This allows us to balance coarse location recognition through state prediction with fine-grained geographic precision through GPS regression.

Before computing the GPS loss, latitude and longitude targets are normalized. This keeps the regression targets on a similar scale and prevents one coordinate from dominating the loss.

Mixed precision training is used during the forward pass to improve training efficiency on GPU. Gradients are scaled to avoid numerical underflow, and gradient clipping is applied to stabilize updates when fine-tuning large pretrained backbones.

After each update, we compute classification accuracy and convert the predicted GPS values back to real coordinates. The Haversine distance between predictions and ground truth is then calculated to measure real-world localization error.

All metrics are accumulated across the epoch and averaged at the end, providing a concise summary of training performance.

In [None]:
# 4. TRAINING & VALIDATION
# ==========================================
def train_epoch(model, loader, optimizer, scaler, normalizer):
    model.train()
    meters = {'loss': 0, 'acc': 0, 'dist': 0}
    
    criterion_cls = nn.CrossEntropyLoss(ignore_index=-1)
    criterion_gps = nn.MSELoss()
    
    pbar = tqdm(loader, desc="Training")
    for imgs, states, lats, lons in pbar:
        imgs, states = imgs.to(Config.DEVICE), states.to(Config.DEVICE)
        lats, lons = lats.to(Config.DEVICE), lons.to(Config.DEVICE)
        
        # Normalize targets
        n_lat, n_lon = normalizer.normalize(lats, lons)
        gps_targets = torch.stack([n_lat, n_lon], dim=1)
        
        # Mixed Precision Forward
        with torch.cuda.amp.autocast():
            logits, gps_preds = model(imgs)
            loss = (Config.W_CLS * criterion_cls(logits, states)) + \
                   (Config.W_GPS * criterion_gps(gps_preds, gps_targets))
        
        # Backward
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        
        # Metrics
        acc = (logits.argmax(1) == states).float().mean()
        p_lat, p_lon = normalizer.denormalize(gps_preds[:,0], gps_preds[:,1])
        dist = haversine_distance(p_lat, p_lon, lats, lons).mean()
        
        meters['loss'] += loss.item()
        meters['acc'] += acc.item()
        meters['dist'] += dist.item()
        pbar.set_postfix({'Loss': loss.item(), 'Acc': acc.item(), 'km': dist.item()})
        
    return {k: v/len(loader) for k, v in meters.items()}

For this part, we put together all components required to train the model end to end.

We first initialize the GPS normalizer and define the image preprocessing pipeline. All images are resized to a fixed resolution and normalized using standard ImageNet statistics. This ensures compatibility with the pretrained vision backbones and keeps the input distribution stable.

The full training dataset is then split into training and validation subsets. We use a 90 to 10 split to retain most of the data for learning while still having a small holdout set for monitoring training behavior.

Next, we initialize the GeoFusion model and move it to the appropriate device. We use the AdamW optimizer because it works well for fine-tuning large pretrained models and handles weight decay in a stable manner. Mixed precision training is enabled through a gradient scaler to improve efficiency.

Before starting training, we check whether a saved checkpoint is available. If so, we restore the model and optimizer states and continue training from the last saved epoch. This allows long training runs to be resumed without losing progress.

The training loop then runs for the specified number of epochs. In each epoch, we call the training function to update model parameters and compute training metrics. After every epoch, a full checkpoint is saved as a safety measure.

If the current epoch achieves a lower training loss than any previous epoch, the model is saved as the best version. Both a full checkpoint and a lightweight weights-only file are stored to support later inference and submission generation.

In [None]:
# 5. MAIN TRAINING
# ==========================================

# Setup
normalizer = GPSNormalizer()
tfm = transforms.Compose([
    transforms.Resize((Config.IMG_SIZE, Config.IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load Data (Assuming CSVs exist - Split Train for Val)
full_ds = StreetViewDataset(Config.TRAIN_CSV, Config.TRAIN_IMG_DIR, transform=tfm)
train_size = int(0.9 * len(full_ds))
val_size = len(full_ds) - train_size
train_ds, val_ds = torch.utils.data.random_split(full_ds, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=Config.NUM_WORKERS)
val_loader = DataLoader(val_ds, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=Config.NUM_WORKERS)

# Init Model
print(f"Loading Models: {Config.MODEL_NAME_CLIP} & {Config.MODEL_NAME_DINO}...")
model = GeoFusionModel().to(Config.DEVICE)

optimizer = optim.AdamW(model.parameters(), lr=Config.LR, weight_decay=Config.WD)
scaler = torch.cuda.amp.GradScaler() # For Mixed Precision
 
# Resume Logic
start_epoch = 0
best_loss = float('inf')

if Config.RESUME_PATH and os.path.exists(Config.RESUME_PATH):
    print(f"Resuming training from {Config.RESUME_PATH}...")
    checkpoint = torch.load(Config.RESUME_PATH, map_location=Config.DEVICE)
    
    # Scenario A: You saved a Full Checkpoint (Model + Optimizer + Epoch)
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scaler.load_state_dict(checkpoint['scaler_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        best_loss = checkpoint.get('best_loss', float('inf'))
        print(f"Full checkpoint loaded! Resuming from Epoch {start_epoch}")

    # FORCE LEARNING RATE OVERRIDE (Optional)
        # Explicitly overwrite the loaded LR with the one in the current Config
        # for param_group in optimizer.param_groups:
        #     param_group['lr'] = Config.LR
            
        # print(f"Optimizer learning rate successfully updated to: {Config.LR}")
        
    # Scenario B: You only saved Weights
    else:
        # If checkpoint is just the state_dict
        model.load_state_dict(checkpoint)
        print("Weights loaded successfully! (Optimizer reset because file contained weights only)")


# Training Loop (Adjusted Range)
print(f"Starting Training from Epoch {start_epoch+1} to {Config.EPOCHS}...")

for epoch in range(start_epoch, Config.EPOCHS):
    train_metrics = train_epoch(model, train_loader, optimizer, scaler, normalizer)
    print(f"Epoch {epoch+1} Train: {train_metrics}")
    
    # IMPROVED SAVING LOGIC 
    # Save a Full Checkpoint, so you can resume perfectly next time
    checkpoint_state = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scaler_state_dict': scaler.state_dict(),
        'best_loss': best_loss,
    }
    
    # Save latest checkpoint every epoch (safety net)
    torch.save(checkpoint_state, os.path.join(Config.SAVE_DIR, "last_checkpoint.pth"))
    
    # Save best model if loss improved
    if train_metrics['loss'] < best_loss:
        best_loss = train_metrics['loss']
        checkpoint_state['best_loss'] = best_loss # Update best loss in dict
        torch.save(checkpoint_state, "best_model_full.pth")
        # Also save just weights for inference script compatibility
        torch.save(model.state_dict(), "best_model.pth") 
        print("Saved Best Model!")

Loading Models: geolocal/StreetCLIP & facebook/dinov3-vitl16-pretrain-lvd1689m...
Resuming training from best_model_full.pth...
Full checkpoint loaded! Resuming from Epoch 30
Starting Training from Epoch 31 to 35...


Training: 100%|██████████| 14846/14846 [4:34:05<00:00,  1.11s/it, Loss=0.00581, Acc=1, km=55.1]  


Epoch 31 Train: {'loss': 0.010913128783051975, 'acc': 0.9999831604472585, 'dist': 67.92319890962334}
Saved Best Model!


Training: 100%|██████████| 14846/14846 [4:34:34<00:00,  1.11s/it, Loss=0.00313, Acc=1, km=37.2]   


Epoch 32 Train: {'loss': 0.010695938372213675, 'acc': 0.9999831604472585, 'dist': 67.37325995292494}
Saved Best Model!


Training: 100%|██████████| 14846/14846 [4:34:15<00:00,  1.11s/it, Loss=0.0164, Acc=1, km=97.3]    


Epoch 33 Train: {'loss': 0.010437070277856427, 'acc': 0.9999831604472585, 'dist': 66.83835264487551}
Saved Best Model!


Training: 100%|██████████| 14846/14846 [4:34:08<00:00,  1.11s/it, Loss=0.0103, Acc=1, km=62.7]    


Epoch 34 Train: {'loss': 0.010300078411381464, 'acc': 0.9999831604472585, 'dist': 66.2753403393965}
Saved Best Model!


Training: 100%|██████████| 14846/14846 [4:34:13<00:00,  1.11s/it, Loss=0.00826, Acc=1, km=76.9]  


Epoch 35 Train: {'loss': 0.0100834660223512, 'acc': 0.9999831604472585, 'dist': 65.7511261384731}
Saved Best Model!


For this part, we switch the model from training to inference mode and prepare the components needed for GPS refinement using nearest neighbors.

We load the best-performing model weights and set the model to evaluation mode. From this point onward, the model is used only for forward passes and feature extraction, with gradients disabled to reduce memory usage and improve speed.

Before running inference on the test set, we construct a knowledge base of embeddings from the training data. Each training sample is passed through the model to extract the fused feature representation learned during training. These embeddings act as reference points for nearest neighbor based GPS estimation.

To improve robustness, we apply five-crop test-time augmentation when extracting training embeddings. The original image and five cropped versions are all passed through the model, and the resulting feature vectors are averaged. This reduces sensitivity to framing and local viewpoint changes while keeping the representation stable.

All averaged embeddings are L2-normalized so that cosine similarity can be used reliably during nearest neighbor search. Along with the embeddings, we store the corresponding ground truth GPS coordinates.

Because this embedding extraction step is computationally expensive, the results are cached to disk. If cached embeddings already exist, they are loaded directly, avoiding redundant computation when rerunning inference or tuning K-NN parameters.

This embedding database forms the foundation for the subsequent K-NN based GPS refinement step used during test-time prediction.

In [None]:
# 6. INFERENCE & SUBMISSION (Integrated K-NN)
# ==========================================
print("Starting Inference...")

# Load Model
model.load_state_dict(torch.load("best_model.pth"))
model.to(Config.DEVICE)
model.eval()

# STEP A: PREPARE THE "KNOWLEDGE BASE" (Training Embeddings)
# ---------------------------------------------------------
TRAIN_EMB_FILE = "train_embeddings.npy"
TRAIN_GPS_FILE = "train_gps.npy"

if os.path.exists(TRAIN_EMB_FILE) and os.path.exists(TRAIN_GPS_FILE):
    print("Loading cached training embeddings...")
    train_emb = np.load(TRAIN_EMB_FILE)
    train_gps = np.load(TRAIN_GPS_FILE)
else:
    print("Cache missing! Generating training embeddings first (Required for K-NN)...")
    # Quick loop to generate training embeddings if they don't exist
    train_ds = StreetViewDataset(Config.TRAIN_CSV, Config.TRAIN_IMG_DIR, transform=tfm)
    train_loader = DataLoader(train_ds, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=Config.NUM_WORKERS)
    
    emb_list, gps_list = [], []
    with torch.no_grad():
        for imgs, _, lat, lon in tqdm(train_loader, desc="Indexing Train Set"):
            imgs = imgs.to(Config.DEVICE)
            # Use return_feats=True to get the vector
            _, _, feats = model(imgs, return_feats=True)

            # Five-Crop TTA
            B, N, C, H, W = imgs.shape
        
            # We treat the 4 sub-images (N/E/S/W) independently for cropping
            # Reshape to [B*4, C, H, W] to crop all directions at once
            flat_imgs = imgs.view(B*N, C, H, W)
            
            # Crop size: 80% of image (approx 268px)
            crop_size = int(H * 0.8) 
            
            # Get 5 crops: TL, TR, BL, BR, Center
            crops = transforms.FiveCrop(crop_size)(flat_imgs) 
            # crops is a tuple of 5 tensors, each [B*N, C, crop_h, crop_w]
            
            tta_feats_sum = feats.clone()
            
            for crop in crops:
                # Resize back to 336x336 so model accepts it
                crop_resized = F.resize(crop, [H, W])
                # Reshape back to [B, 4, C, H, W]
                crop_reshaped = crop_resized.view(B, N, C, H, W)
                
                _, _, feats_crop = model(crop_reshaped, return_feats=True)
                tta_feats_sum += feats_crop
            
            # Average all 6 views (Original + 5 Crops)
            feats_avg = tta_feats_sum / 6.0
            
            # Renormalize! (Crucial for Cosine Similarity)
            feats_avg = torch.nn.functional.normalize(feats_avg, p=2, dim=1)

            emb_list.append(feats_avg.cpu().numpy())
            gps_list.append(torch.stack([lat, lon], dim=1).numpy())
            
    train_emb = np.concatenate(emb_list)
    train_gps = np.concatenate(gps_list)
    
    # Save for next time
    np.save(TRAIN_EMB_FILE, train_emb)
    np.save(TRAIN_GPS_FILE, train_gps)

Starting Inference...
Cache missing! Generating training embeddings first (Required for K-NN)...


Indexing Train Set:   5%|▌         | 829/16495 [1:34:01<29:38:11,  6.81s/it]

In [None]:
# STEP B: INFERENCE LOOP (Collect Data)
# ---------------------------------------------------------
test_ds = StreetViewDataset(Config.TEST_CSV, Config.TEST_IMG_DIR, transform=tfm, is_test=True)
test_loader = DataLoader(test_ds, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=Config.NUM_WORKERS)

# We will store results in lists first, then process K-NN in one fast batch
all_sample_ids = []
all_test_embeddings = []
all_top5_states = []

with torch.no_grad():
    for imgs, sample_ids in tqdm(test_loader, desc="Predicting Test Set"):
        imgs = imgs.to(Config.DEVICE)
        
        # 1. Run Model
        # We need both logits (for state) and feats (for K-NN GPS)
        # We ignore the model's raw 'gps_preds' because we are using K-NN instead
        logits, _, feats = model.forward(imgs, return_feats=True)

        # Five-Crop TTA
        B, N, C, H, W = imgs.shape
        
        # We treat the 4 sub-images (N/E/S/W) independently for cropping
        # Reshape to [B*4, C, H, W] to crop all directions at once
        flat_imgs = imgs.view(B*N, C, H, W)
        
        # Crop size: 80% of image (approx 268px)
        crop_size = int(H * 0.8) 
        
        # Get 5 crops: TL, TR, BL, BR, Center
        crops = transforms.FiveCrop(crop_size)(flat_imgs) 
        # crops is a tuple of 5 tensors, each [B*N, C, crop_h, crop_w]
        
        tta_feats_sum = feats.clone()
        
        for crop in crops:
            # Resize back to 336x336 so model accepts it
            crop_resized = F.resize(crop, [H, W])
            # Reshape back to [B, 4, C, H, W]
            crop_reshaped = crop_resized.view(B, N, C, H, W)
            
            _, _, feats_crop = model(crop_reshaped, return_feats=True)
            tta_feats_sum += feats_crop
        
        # Average all 6 views (Original + 5 Crops)
        feats_avg = tta_feats_sum / 6.0
        
        # Renormalize! (Crucial for Cosine Similarity)
        feats_avg = torch.nn.functional.normalize(feats_avg, p=2, dim=1)
        
        # 2. Process States (Top 5)
        _, top5_indices = torch.topk(logits, 5, dim=1)
        
        # 3. Store Data
        all_test_embeddings.append(feats_avg.cpu().numpy())
        all_top5_states.append(top5_indices.cpu().numpy())
        all_sample_ids.extend(sample_ids.tolist())

# Concatenate all test embeddings into one big matrix
all_test_embeddings = np.concatenate(all_test_embeddings) # Shape: [N_test, 1024]
all_top5_states = np.concatenate(all_top5_states)         # Shape: [N_test, 5]

In [None]:
from sklearn.model_selection import train_test_split

# GRID SEARCH TO FIND OPTIMAL K AND TEMP FOR K-NN

# CONFIGURATION
# ==========================================
# Paths to your cached embedding/gps files
EMB_FILE = "train_embeddings.npy"
GPS_FILE = "train_gps.npy"

# Hyperparameter Grid to Search
K_VALUES = [5, 10, 15, 20, 25, 30, 40, 50, 75, 100]
TEMP_VALUES = [0.5, 1, 2, 5, 10, 15, 20, 30, 50]

# UTILS
# ==========================================
def haversine_np(lat1, lon1, lat2, lon2):
    """
    Vectorized Haversine Distance (Numpy)
    """
    R = 6371
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    
    a = np.sin(dphi/2)**2 + np.cos(phi1)*np.cos(phi2) * np.sin(dlambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    print(f"Loading cached embeddings from {EMB_FILE}...")
    try:
        X = np.load(EMB_FILE)
        y = np.load(GPS_FILE)
    except FileNotFoundError:
        print("Error: train_embeddings.npy not found. Run main.py first to generate them!")
        exit()

    print(f"Data Loaded. Shape: {X.shape}")
    
    # Create a Validation Split
    # We pretend the 'val' set is our test set to measure performance
    print("Splitting data (80% Train / 20% Validation)...")
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=25)
    
    # Pre-compute Neighbors
    # We calculate the max K once to save time, then slice it for smaller K values
    max_k = max(K_VALUES)
    print(f"Fitting NN model (finding top {max_k} neighbors)...")
    
    knn = NearestNeighbors(n_neighbors=max_k, metric='cosine', n_jobs=-1)
    knn.fit(X_train)
    
    # Get distances and indices for the validation set
    # This is the heavy lifting; done only once
    dists_all, indices_all = knn.kneighbors(X_val)
    
    # Grid Search Loop
    print("\nStarting Grid Search...")
    print(f"{'K':<5} | {'Temp':<5} | {'Avg Error (km)':<15}")
    print("-" * 35)
    
    best_score = float('inf')
    best_params = {}
    
    # We iterate through K and Temperature to find the "Goldilocks" zone
    for k in K_VALUES:
        # Slice the pre-computed arrays to simulate using only 'k' neighbors
        dists_k = dists_all[:, :k]
        indices_k = indices_all[:, :k]
        
        for temp in TEMP_VALUES:
            # WEIGHTING LOGIC 
            # Similarity = 1 - Cosine Distance
            # High Temp = Sharp peaks (Only closest neighbor matters)
            # Low Temp = Flat peaks (All k neighbors matter equally)
            similarities = 1 - dists_k
            weights = np.exp(similarities * temp)
            
            # Normalize weights to sum to 1 (Add epsilon to avoid division by zero)
            weights_sum = np.sum(weights, axis=1, keepdims=True) + 1e-10
            weights_norm = weights / weights_sum
            
            # PREDICTION
            # Gather neighbor GPS coords
            # shape: [N_val, k, 2]
            neighbor_gps = y_train[indices_k]
            
            # Weighted Average
            # Multiply weights (N, k, 1) * gps (N, k, 2) -> sum over k -> (N, 2)
            weights_norm_exp = np.expand_dims(weights_norm, axis=2)
            pred_gps = np.sum(weights_norm_exp * neighbor_gps, axis=1)
            
            # SCORING
            errors = haversine_np(pred_gps[:,0], pred_gps[:,1], y_val[:,0], y_val[:,1])
            mean_error = np.mean(errors)
            
            print(f"{k:<5} | {temp:<5} | {mean_error:.4f} km")
            
            if mean_error < best_score:
                best_score = mean_error
                best_params = {'k': k, 'temp': temp}

    print("\n" + "="*35)
    print("GRID SEARCH RESULT")
    print("="*35)
    print(f"Best K: {best_params['k']}")
    print(f"Best Temperature: {best_params['temp']}")
    print(f"Validation Error: {best_score:.4f} km")
    print("="*35)
    print("\nUpdate your main.py with these values!")

In [None]:
# STEP C: BATCH K-NN REFINEMENT
# ---------------------------------------------------------

K_OPTIMAL = 5
TEMP_OPTIMAL = 50

print(f"Refining GPS with K-NN (K={K_OPTIMAL}, Temp={TEMP_OPTIMAL})...")

# Fit K-NN on the training data
print("Fitting K-NN model...")
# knn = NearestNeighbors(n_neighbors=5, metric='cosine', n_jobs=-1)
knn = NearestNeighbors(n_neighbors=K_OPTIMAL, metric='cosine', n_jobs=-1)
knn.fit(train_emb)

distances, indices = knn.kneighbors(all_test_embeddings)

refined_results = []

for i in range(len(all_sample_ids)):
    # Calculate Weighted GPS
    neighbor_indices = indices[i]
    neighbor_dists = distances[i]
    neighbor_gps = train_gps[neighbor_indices]
    
    # Similarity = 1 - Cosine Distance
    # Exponentiate to sharpen the weights (gives more power to very close matches)
    weights = np.exp((1 - neighbor_dists) * TEMP_OPTIMAL) 
    weights = weights / np.sum(weights)
    
    w_lat = np.sum(neighbor_gps[:, 0] * weights)
    w_lon = np.sum(neighbor_gps[:, 1] * weights)
    
    # Get State Predictions (Already computed)
    states = all_top5_states[i]
    
    # Create Row
    row = {
        'sample_id': all_sample_ids[i],
        'predicted_state_idx_1': states[0],
        'predicted_state_idx_2': states[1],
        'predicted_state_idx_3': states[2],
        'predicted_state_idx_4': states[3],
        'predicted_state_idx_5': states[4],
        'predicted_latitude': w_lat,   # <--- K-NN Refined
        'predicted_longitude': w_lon   # <--- K-NN Refined
    }
    refined_results.append(row)

# STEP D: FORMATTING & SAVING
# ---------------------------------------------------------
df_sub = pd.DataFrame(refined_results)

# Template Merge (Best Practice for Kaggle to ensure order/columns)
template = pd.read_csv(Config.TEST_CSV)
final_df = template.merge(df_sub, on='sample_id', how='left', suffixes=('', '_pred'))

# Overwrite columns
final_df['predicted_state_idx_1'] = final_df['predicted_state_idx_1_pred']
final_df['predicted_latitude'] = final_df['predicted_latitude_pred']
final_df['predicted_longitude'] = final_df['predicted_longitude_pred']

# Optional columns
for k in range(2, 6):
    col_name = f'predicted_state_idx_{k}'
    if f'{col_name}_pred' in final_df.columns:
        final_df[col_name] = final_df[f'{col_name}_pred']

# Save
final_df.to_csv(Config.SUBMISSION_FILE, index=False)
print(f"Submission saved to {Config.SUBMISSION_FILE}")