In [2]:
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn as nn
from sklearn.metrics import accuracy_score
import warnings
import math
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models.segmentation import deeplabv3_resnet50
from PIL import Image
from timm.models.vision_transformer import VisionTransformer
from sklearn.preprocessing import LabelEncoder
import joblib
from tqdm import tqdm
warnings.filterwarnings("ignore", message="numerical errors at iteration 0")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Semantic Segmentation Model
class SemanticSegmentor(nn.Module):
    def __init__(self):
        super().__init__()
        self.segmentation_model = deeplabv3_resnet50(pretrained=True)
        # self.segmentation_model.eval()  # Freeze weights

    def forward(self, x):
        with torch.no_grad():
            seg_map = self.segmentation_model(x)['out']
        return seg_map

In [4]:
#Multimodal Feature Fusion (MFF)
class MultimodalFusion(nn.Module):
    def __init__(self, embed_dim=768):
        super().__init__()
        self.projection = nn.Linear(embed_dim, embed_dim)  # f(.) projection
        self.back_projection = nn.Linear(embed_dim, embed_dim)  # g(.) back-projection

        # Attention module
        self.attention_mlp = nn.Sequential(
            nn.Linear(2 * embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, 2),
            nn.Softmax(dim=1)  # Generates [w_rgb, w_seg]
        )

    def forward(self, rgb_cls, seg_cls):
        # Compute modality attention
        att_input = torch.cat([rgb_cls, seg_cls], dim=1)
        weights = self.attention_mlp(att_input)  # [batch, 2]
        w_rgb, w_seg = weights[:, 0].unsqueeze(1), weights[:, 1].unsqueeze(1)

        # Weighted CLS token fusion (final layer)
        rgb_final = (1 + w_rgb) * rgb_cls
        seg_final = (1 + w_seg) * seg_cls
        fmm = torch.cat([rgb_final, seg_final], dim=1)  # Final fused feature

        return fmm
    

In [5]:
class PositionalEncoder(nn.Module):
    def __init__(self, dim_model: int, dropout_p: float = 0.1, max_len: int=1000):
        """Initializes the positional embedding layer to enrich data fed into transformers
           with positional information.
        Args:
            dim_model (int): model dimension
            dropout_p (float, optional): dropout for all embeddings. Defaults to 0.1.
            max_len (int, optional): determines how far the position can influence other tokens. Defaults to 1000.
        Note:
            This code is a modified version of: `<https://pytorch.org/tutorials/beginner/transformer_tutorial.html>`_.
        """
        super().__init__()

        # Dropout
        self.dropout = nn.Dropout(dropout_p)

        # Encoding
        pos_encoding = torch.zeros(max_len, dim_model)
        positions_list = torch.arange(0, max_len, dtype=torch.float).view(-1, 1)
        division_term = torch.exp(torch.arange(0, dim_model, 2).float() * (-math.log(10000.0)) / dim_model)

        # PE(pos, 2i) = sin(pos/1000^(2i/dim_model))
        pos_encoding[:, 0::2] = torch.sin(positions_list * division_term)

        # PE(pos, 2i + 1) = cos(pos/1000^(2i/dim_model))
        pos_encoding[:, 1::2] = torch.cos(positions_list * division_term)

        # Saving buffer (same as parameter without gradients needed)
        pos_encoding = pos_encoding.unsqueeze(0).transpose(0, 1)
        self.register_parameter('pos_encoding', nn.Parameter(pos_encoding, requires_grad=False))

    def forward(self, token_embedding: torch.tensor) -> torch.tensor:
        """Generates positional embeddings.
        Args:
            token_embedding (torch.tensor): original embeddings
        Returns:
            torch.tensor: transformed embeddings
        """
        # Residual connection + positional encoding
        return self.dropout(token_embedding + self.pos_encoding[:token_embedding.size(0), :])
    

In [6]:
class MultimodalTransformer(nn.Module):
    def __init__(self, img_size, patch_size = 8, vit_model: str = "vit_base_patch16_224", embed_dim: int = 768, num_layers: int = 3):
        super(MultimodalTransformer, self).__init__()

        # Load pre-trained ViT models for RGB and Semantic maps
        self.rgb_transformer = VisionTransformer(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, depth=num_layers, num_heads=6, mlp_ratio=4)
        self.seg_transformer = VisionTransformer(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim, in_chans=1, depth=num_layers, num_heads=6, mlp_ratio=4)

        # Projection layers to align CLS token dimensions
        self.cls_projection = nn.Linear(embed_dim, embed_dim)  # f(.) projection
        self.back_projection = nn.Linear(embed_dim, embed_dim)  # g(.) back-projection

        # Attention fusion of CLS token from the last layer
        self.fusion_module = MultimodalFusion()

        #Positional Encoder
        self.pos_encoder = PositionalEncoder(dim_model=embed_dim)


    def forward(self, rgb_input, seg_input):
        # Extract embeddings from both transformers
        rgb_tokens = self.rgb_transformer.patch_embed(rgb_input)
        seg_tokens = self.seg_transformer.patch_embed(seg_input)

        cls_rgb = self.rgb_transformer.cls_token.expand(rgb_tokens.shape[0], -1, -1)
        cls_seg = self.seg_transformer.cls_token.expand(seg_tokens.shape[0], -1, -1)

        # Positional embedding
        #Concat the cls token for resective tokens(rgb or seg)
        rgb_tokens = torch.cat([cls_rgb, rgb_tokens], dim=1)
        seg_tokens = torch.cat([cls_seg, seg_tokens], dim=1)
        #add the positional embeddings
        rgb_tokens = self.pos_encoder(rgb_tokens)
        seg_tokens = self.pos_encoder(seg_tokens)

        for layer in range(len(self.rgb_transformer.blocks)):
            rgb_tokens = self.rgb_transformer.blocks[layer](rgb_tokens)
            seg_tokens = self.seg_transformer.blocks[layer](seg_tokens)

            # Extract CLS tokens after each layer
            cls_rgb = self.cls_projection(rgb_tokens[:, 0])
            cls_seg = self.cls_projection(seg_tokens[:, 0])

            # Sum CLS tokens and append back to patch tokens
            fused_cls = self.back_projection(cls_rgb + cls_seg)
            rgb_tokens = torch.cat([fused_cls.unsqueeze(1), rgb_tokens[:, 1:]], dim=1)
            seg_tokens = torch.cat([fused_cls.unsqueeze(1), seg_tokens[:, 1:]], dim=1)

        # Final CLS tokens from last layer
        cls_rgb = rgb_tokens[:, 0]
        cls_seg = seg_tokens[:, 0]

        # Attention fusion of the cls tokens from the last layer
        fmm = self.fusion_module(cls_rgb, cls_seg)

        return fmm


In [7]:
#Define a mlp layer that predict a class based on the input
class mlp(nn.Module):
  def __init__(self, input_dim = 1536, output_dim = 10):
    super(mlp, self).__init__()
    self.fc1 = nn.Linear(input_dim, 1024)
    self.fc2 = nn.Linear(1024, 2048)
    self.fc3 = nn.Linear(2048, output_dim)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(0.5)

  def forward(self, x):
    x = self.fc1(x)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.fc2(x)
    x = self.relu(x)
    x = self.dropout(x)
    x = self.fc3(x)
    return x

In [8]:
#Define a classifier for based on the Semantic Segmentor and multimodal model.
class MultiModalClassifier(nn.Module):
  def __init__(self, input_dim = 256,feature_size = 1536,  output_dim = 2781):
    super(MultiModalClassifier, self).__init__()
    self.sematic_segmantic = SemanticSegmentor()
    self.multimodal = MultimodalTransformer(input_dim)
    self.mlp = mlp(feature_size, output_dim)

  # x is the rgb image of the shape (batch_size, channels, height, width)
  def forward(self, x):
    #Generate the semantic maps for the input images

    semantic_output = self.sematic_segmantic(x)
    semantic_map = torch.argmax(semantic_output.squeeze(), dim=1).unsqueeze(1).float()

    # Pass the semantic map and the rgb images through the MultimodalTransformer
    multimodal_output = self.multimodal(x, semantic_map)# (batch, 1536)

    # pass throught the mlp to get the classes
    output = self.mlp(multimodal_output)
    return output

## Working with the Data
Prepare the data for the training


In [9]:
def denormalize(img_tensor):
    """Reverse normalization using ImageNet stats"""
    mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
    return img_tensor * std + mean

In [10]:
class ImageLabelDataset(Dataset):
    def __init__(self, image_dir, dataframe, transform=None):
        self.image_dir = image_dir
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_filename = row['filename']
        label = row['polygon_label']
        img_path = os.path.join(self.image_dir, img_filename)
        
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
        
        label = torch.tensor(label, dtype=torch.long)
        
        return image, label

In [11]:
def getDataset():
    img_path = '/home/godwinkhalko/DLCV/00'
    label_path = '/home/godwinkhalko/DLCV/labelled_points.xlsx'
    
    df = pd.read_excel(label_path, dtype={'id': str})
    
    image_files = os.listdir(img_path)
    id_to_filename = {}
    for f in image_files:
        id_ = os.path.splitext(f)[0] 
        id_to_filename[id_] = f

    label_encoder = LabelEncoder()

    df['polygon_label'] = label_encoder.fit_transform(df['polygon_label'])

    joblib.dump(label_encoder, 'label_encoder.pkl')

    filtered_df = df[df['id'].isin(id_to_filename.keys())]


    filtered_df['filename'] = filtered_df['id'].map(id_to_filename)

    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    dataset = ImageLabelDataset(image_dir=img_path, dataframe=filtered_df, transform=transform)
    return dataset


In [12]:
def showImages(dataloader):
    images, labels = next(iter(dataloader))
    
    fig, axs = plt.subplots(1, 5, figsize=(15, 3))

    for i in range(5):
        image = denormalize(images[i]).cpu().numpy()
        image = np.transpose(image, (1, 2, 0))
        image = np.clip(image, 0, 1)
        
        axs[i].imshow(image)
        axs[i].set_title(f"Label: {labels[i].item()}")
        axs[i].axis('off')

    plt.tight_layout()
    plt.show()

## Training the model

In [13]:
def train(model, dataset, epochs, batch_size, optimizer, criterion, save_point = 500):
    device = next(model.parameters()).device

    # Define the Dataloader
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4)

    train_loss = []
    train_accuracy = []
    checkpoint_path = "/home/godwinkhalko/DLCV/checkpoint.pth"
    modified_checkpoint_path = "/home/godwinkhalko/DLCV/mod_checkpoint.pth"
    # print(f"Checking if a checkpoint exists")
    # if os.path.exists(checkpoint_path):

    #     print(f"Loading checkpoint from {checkpoint_path}...")
    #     checkpoint = torch.load(checkpoint_path, map_location=device)
    #     model.load_state_dict(checkpoint['model_state_dict'])
        
    #     if 'optimizer_state_dict' in checkpoint:
    #         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    #     start_epoch = checkpoint.get('epoch', 0) + 1
    #     print(f"Checkpoint loaded, resuming from epoch {start_epoch}")
    # else:
    #     print("No checkpoint found, starting from scratch.")
    #     start_epoch = 0

    print("Started Training")
    for epoch in range(epochs):
        model.train()  # ✅ Ensure model is in training mode
        optimizer.zero_grad()  # ✅ Reset gradients before batch loop

        training_loss_batch = []
        training_accuracies_batch = []

        print(f"Started training for {epoch + 1}")
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()  # ✅ Reset gradients for every batch

            # Forward pass
            output= model(data)

            loss = criterion(output, target) 

            training_loss_batch.append(loss.item())

            # Backpropagation
            loss.backward()

            optimizer.step()  # ✅ Update weights

            # Calculate accuracy
            predicted_classes = torch.argmax(output, dim=1)
            accuracy = accuracy_score(target.cpu().numpy(), predicted_classes.cpu().numpy())
            training_accuracies_batch.append(accuracy)

            print(f"Epoch {epoch+1}/{epochs}, Batch {batch_idx+1}/{len(train_loader)}, Loss: {loss.item()}, Accuracy: {accuracy}", end="\r")
            
            if batch_idx % save_point == 0:
                torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                    }, modified_checkpoint_path)
        # Average training loss and accuracy
        train_loss.append(np.mean(training_loss_batch))
        train_accuracy.append(np.mean(training_accuracies_batch))
        print(f"\n Epoch {epoch+1}/{epochs}, Training Loss: {np.mean(training_loss_batch)}, Training Accuracy: {np.mean(training_accuracies_batch)}")
    
    return model

In [None]:
#Initialize the training parameters
epochs = 100
batch_size = 16
learning_rate = 1e-4
weight_decay = 1e-5

#Initlaize the models and stuff
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model = MultiModalClassifier()
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss()
train_dataset = getDataset()

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [None]:
# #Run the training
# trained_model = train(model=model,
#                         dataset=train_dataset,
#                         epochs=100,
#                         batch_size=batch_size,
#                         optimizer=optimizer,
#                         criterion=criterion,
#                         save_point=50)

In [None]:
del model


In [None]:
import gc
import torch
gc.collect()
torch.cuda.empty_cache()


In [18]:
label_path = '/home/godwinkhalko/DLCV/test.csv'
img_path = '/home/godwinkhalko/DLCV/00'

df = pd.read_csv(label_path, dtype={'id': str})

image_files = os.listdir(img_path)
id_to_filename = {}
for f in image_files:
    id_ = os.path.splitext(f)[0] 
    id_to_filename[id_] = f

filtered_df = df[df['id'].isin(id_to_filename.keys())]
filtered_df['filename'] = filtered_df['id'].map(id_to_filename)
filtered_df_mod = filtered_df[["id", "latitude", "longitude", "filename"]]

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# Model
model = MultiModalClassifier()

checkpoint = torch.load("/home/godwinkhalko/DLCV/Trained_Model.pth", map_location=device)
model.load_state_dict(checkpoint)
model.to(device)

MultiModalClassifier(
  (sematic_segmantic): SemanticSegmentor(
    (segmentation_model): DeepLabV3(
      (backbone): IntermediateLayerGetter(
        (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        (layer1): Sequential(
          (0): Bottleneck(
            (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn3)

In [19]:
filtered_df_mod = filtered_df[["id", "latitude", "longitude", "filename"]]
filtered_df_mod.head()

Unnamed: 0,id,latitude,longitude,filename
0,547473234108938,-16.336027,45.62828,547473234108938.jpg
1,826109781317024,50.855687,56.147997,826109781317024.jpg
2,1006398440000844,37.956651,14.954485,1006398440000844.jpg
3,2943891539215481,12.373333,-8.909906,2943891539215481.jpg
4,122945119799579,7.510295,99.061884,122945119799579.jpg


In [20]:
class TestImageLabelDataset(Dataset):
    def __init__(self, image_dir, dataframe, transform=None):
        self.image_dir = image_dir
        self.dataframe = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_filename = row['filename']
        img_path = os.path.join(self.image_dir, img_filename)
        
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

        
        return image, row['id'], row['latitude'], row['longitude']

In [28]:
import torch
import pandas as pd
import torch.nn.functional as F

model.eval()
results = []
transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

dataset = TestImageLabelDataset(image_dir="/home/godwinkhalko/DLCV/00", dataframe=filtered_df_mod, transform=transform)
# subset =Subset(dataset, list(range(10)))

test_loader = DataLoader(dataset=dataset, batch_size=32)



In [30]:
with torch.no_grad():
    for batch_idx, (ipnuts, ids, lats, lons) in enumerate(test_loader):
        # Assume each batch has the following
        # batch = (inputs, (id, lat, lon))
        print(f"\nBatch: {batch_idx} / {len(test_loader)}")
        inputs =ipnuts.to(device)
        
        outputs = model(inputs)
        probs = F.softmax(outputs, dim=1).cpu().numpy()  # shape: [batch_size, num_classes]

        for i in range(len(ids)):
            results.append({
                "id": ids[i],
                "latitude": lats[i],
                "longitude": lons[i],
                "softmax_probs": probs[i]  # This will be a NumPy array
            })

# Create DataFrame
df = pd.DataFrame(results)


Batch: 0 / 6567

Batch: 1 / 6567

Batch: 2 / 6567

Batch: 3 / 6567

Batch: 4 / 6567

Batch: 5 / 6567

Batch: 6 / 6567

Batch: 7 / 6567

Batch: 8 / 6567

Batch: 9 / 6567

Batch: 10 / 6567

Batch: 11 / 6567

Batch: 12 / 6567

Batch: 13 / 6567

Batch: 14 / 6567

Batch: 15 / 6567

Batch: 16 / 6567

Batch: 17 / 6567

Batch: 18 / 6567

Batch: 19 / 6567

Batch: 20 / 6567

Batch: 21 / 6567

Batch: 22 / 6567

Batch: 23 / 6567

Batch: 24 / 6567

Batch: 25 / 6567

Batch: 26 / 6567

Batch: 27 / 6567

Batch: 28 / 6567

Batch: 29 / 6567

Batch: 30 / 6567

Batch: 31 / 6567

Batch: 32 / 6567

Batch: 33 / 6567

Batch: 34 / 6567

Batch: 35 / 6567

Batch: 36 / 6567

Batch: 37 / 6567

Batch: 38 / 6567

Batch: 39 / 6567

Batch: 40 / 6567

Batch: 41 / 6567

Batch: 42 / 6567

Batch: 43 / 6567

Batch: 44 / 6567

Batch: 45 / 6567

Batch: 46 / 6567

Batch: 47 / 6567

Batch: 48 / 6567

Batch: 49 / 6567

Batch: 50 / 6567

Batch: 51 / 6567

Batch: 52 / 6567

Batch: 53 / 6567

Batch: 54 / 6567

Batch: 55 / 6567

B

KeyboardInterrupt: 

In [29]:
df


Unnamed: 0,id,latitude,longitude,softmax_probs
0,547473234108938,"tensor(-16.3360, dtype=torch.float64)","tensor(45.6283, dtype=torch.float64)","[0.009686001, 0.00047221873, 0.00042852398, 0...."
1,826109781317024,"tensor(50.8557, dtype=torch.float64)","tensor(56.1480, dtype=torch.float64)","[0.0069134636, 0.00048147028, 0.0003961817, 0...."
2,1006398440000844,"tensor(37.9567, dtype=torch.float64)","tensor(14.9545, dtype=torch.float64)","[0.00867515, 0.0005090673, 0.00042162492, 0.00..."
3,2943891539215481,"tensor(12.3733, dtype=torch.float64)","tensor(-8.9099, dtype=torch.float64)","[0.009685573, 0.0004990716, 0.000428763, 0.000..."
4,122945119799579,"tensor(7.5103, dtype=torch.float64)","tensor(99.0619, dtype=torch.float64)","[0.012565012, 0.00049636094, 0.00042835105, 0...."
5,523784905697835,"tensor(9.0405, dtype=torch.float64)","tensor(-11.7210, dtype=torch.float64)","[0.0069704424, 0.00049952534, 0.000438092, 0.0..."
6,938982170230321,"tensor(44.5749, dtype=torch.float64)","tensor(-0.8657, dtype=torch.float64)","[0.0058071865, 0.00052079814, 0.0004370921, 0...."
7,333119144910637,"tensor(23.7630, dtype=torch.float64)","tensor(-99.0120, dtype=torch.float64)","[0.0022568484, 0.00044630008, 0.00040570268, 0..."
8,2530858327060639,"tensor(10.3749, dtype=torch.float64)","tensor(-6.9190, dtype=torch.float64)","[0.0059636086, 0.0005331156, 0.00044378013, 0...."
9,639127933870005,"tensor(26.6556, dtype=torch.float64)","tensor(127.9535, dtype=torch.float64)","[0.0063706203, 0.00051135174, 0.00039089503, 0..."
