In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.models import resnet34



In [25]:


# Define custom dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, features_file, positions_file, orientations_file, transform=None):
        self.features = torch.load(features_file)
        self.positions = torch.load(positions_file)
        self.orientations = torch.load(orientations_file)
        self.transform = transform

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        features = self.features[idx]
        positions = self.positions[idx]
        orientations = self.orientations[idx]
        
        if self.transform:
            features = self.transform(features)

        return features, positions, orientations
    


In [34]:
    
class HourglassPose(nn.Module):
    def __init__(self):
        super(HourglassPose, self).__init__()
        
        # Load pre-trained ResNet34 as encoder
        resnet_model = resnet34(pretrained=True)
        self.encoder = nn.Sequential(*list(resnet_model.children())[:-2])  # Remove last avgpool and fc layers
        
        # Decoder
        self.deconv1 = nn.ConvTranspose2d(512, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.deconv2 = nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.deconv3 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.deconv4 = nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv_final = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        
        # Skip connections
        self.skip1 = nn.Conv2d(512, 32, kernel_size=1)
        self.skip2 = nn.Conv2d(256, 32, kernel_size=1)
        self.skip3 = nn.Conv2d(128, 32, kernel_size=1)
        self.skip4 = nn.Conv2d(64, 32, kernel_size=1)
        
        # Regressor
        self.fc_loc = nn.Linear(32 * 56 * 56, 3)
        self.fc_ori = nn.Linear(32 * 56 * 56, 4)
        self.fc_trans = nn.Linear(32 * 56 * 56, 3)
        
    def forward(self, x):
        # Encoder
        x = self.encoder(x)
        skip1 = self.skip1(x)
        
        # Decoder
        x = self.deconv1(x)
        skip2 = self.skip2(x[:, :256, :, :])  # Ensure correct number of input channels
        x = torch.cat((x, skip2), dim=1)
        
        x = self.deconv2(x)
        skip3 = self.skip3(x[:, :128, :, :])  # Ensure correct number of input channels
        x = torch.cat((x, skip3), dim=1)
        
        x = self.deconv3(x)
        skip4 = self.skip4(x[:, :64, :, :])  # Ensure correct number of input channels
        x = torch.cat((x, skip4, skip1), dim=1)
        
        x = self.deconv4(x)
        x = self.conv_final(x)
        
        # Regressor
        x = x.view(x.size(0), -1)
        loc = self.fc_loc(x)
        ori = self.fc_ori(x)
        trans = self.fc_trans(x)
        
        return loc, ori, trans



In [39]:
class HourglassPose(nn.Module):
    def __init__(self):
        super(HourglassPose, self).__init__()
        
        # Load pre-trained ResNet34 as encoder
        resnet_model = resnet34(pretrained=True)
        self.encoder = nn.Sequential(*list(resnet_model.children())[:-2])  # Remove last avgpool and fc layers
        
        # Decoder
        self.deconv1 = nn.ConvTranspose2d(512, 256, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.deconv2 = nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.deconv3 = nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.deconv4 = nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1)
        self.conv_final = nn.Conv2d(32, 32, kernel_size=3, padding=1)
        
        # Skip connections
        self.skip1 = nn.Conv2d(512, 32, kernel_size=1)
        self.skip2 = nn.Conv2d(256, 128, kernel_size=1)  # Adjusted number of output channels
        self.skip3 = nn.Conv2d(128, 64, kernel_size=1)   # Adjusted number of output channels
        self.skip4 = nn.Conv2d(64, 32, kernel_size=1)
        
        # Regressor
        self.fc_loc = nn.Linear(32 * 56 * 56, 3)
        self.fc_ori = nn.Linear(32 * 56 * 56, 4)
        self.fc_trans = nn.Linear(32 * 56 * 56, 3)
        
    def forward(self, x):
        # Encoder
        x = self.encoder(x)
        skip1 = self.skip1(x)
        
        # Decoder
        x = self.deconv1(x)
        skip2 = self.skip2(x)  # Adjusted skip connection
        x = torch.cat((x, skip2), dim=1)
        
        x = self.deconv2(x)
        skip3 = self.skip3(x)  # Adjusted skip connection
        x = torch.cat((x, skip3), dim=1)
        
        x = self.deconv3(x)
        skip4 = self.skip4(x)
        x = torch.cat((x, skip4, skip1), dim=1)
        
        x = self.deconv4(x)
        x = self.conv_final(x)
        
        # Regressor
        x = x.view(x.size(0), -1)
        loc = self.fc_loc(x)
        ori = self.fc_ori(x)
        trans = self.fc_trans(x)
        
        return loc, ori, trans


In [40]:

# Define training parameters
batch_size = 32
learning_rate = 0.001
num_epochs = 10
#ft="D:/slam/VSLAM/Extracted/Features.pt"
#ps="D:/slam/VSLAM/Extracted/Positions.pt"
#ot="D:/slam/VSLAM/Extracted/Orientations.pt"

# Instantiate dataset and dataloader
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to match ResNet input size
    #transforms.ToTensor()
])
dataset = CustomDataset('D:/slam/VSLAM/Extracted/Features.pt', 'D:/slam/VSLAM/Extracted/Positions.pt', 'D:/slam/VSLAM/Extracted/Orientations.pt', transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate model, loss function, and optimizer
model = HourglassPose()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features, positions, orientations in dataloader:
        optimizer.zero_grad()


In [41]:
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for features, positions, orientations in dataloader:
        # Forward pass
        loc_pred, ori_pred, trans_pred = model(features)
        
        # Compute loss
        loc_loss = criterion(loc_pred, positions)
        ori_loss = criterion(ori_pred, orientations)
        total_loss = loc_loss + ori_loss
        
        # Backpropagation
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Track loss
        epoch_loss += total_loss.item() * features.size(0)

    # Print epoch loss
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / len(dataset):.4f}')


RuntimeError: Given transposed=1, weight of size [256, 128, 3, 3], expected input[32, 384, 14, 14] to have 256 channels, but got 384 channels instead