In [29]:
%matplotlib inline

In [30]:
from transformers import AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation, AutoProcessor, TrainingArguments, Trainer
from datasets import Dataset, load_dataset, DatasetDict
import torch
from torch.utils.data import DataLoader
import labelme2coco
import albumentations as A
from albumentations.pytorch import ToTensorV2
import evaluate
import os
from PIL import Image
import numpy as np
import json
from torch.optim import Adam
from tqdm.notebook import tqdm

In [31]:
import torch
from torch.utils.data import Dataset
from pycocotools.coco import COCO
from PIL import Image, ImageDraw
import os
from transformers import AutoProcessor  # Replace AutoProcessor with the specific processor if needed


In [None]:
class SemanticSegmentationDataset(Dataset):
    def __init__(self, root_dir, processor, transform=None):
        """
        Args:
            root_dir (str): Directory with all the images.
            processor: A Hugging Face processor to handle image and mask processing.
            transform (callable, optional): Optional transform to be applied on the image and mask.
        """
        self.root_dir = root_dir
        self.processor = processor
        self.transform = transform

        self.img_dir = os.path.join(self.root_dir, "images")
        self.ann_dir = os.path.join(self.root_dir, "labels")

        # read images
        image_file_names = []
        for root, dirs, files in os.walk(self.img_dir):
          image_file_names.extend(files)
        self.images = sorted(image_file_names)

        # read annotations
        annotation_file_names = []
        for root, dirs, files in os.walk(self.ann_dir):
          annotation_file_names.extend(files)
        self.annotations = sorted(annotation_file_names)

        assert len(self.images) == len(self.annotations), "There must be as many images as there are segmentation maps"

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = Image.open(os.path.join(self.img_dir, self.images[idx]))
        segmentation_map = Image.open(os.path.join(self.ann_dir, self.annotations[idx]))
        image=np.array(image)
        segmentation_map = np.array(segmentation_map)
        # Process images and masks using the processor
        encoded_inputs = self.processor(
            images=image,
            segmentation_maps=segmentation_map,
            task_inputs=["semantic"],  # Specify task if using multi-task processors
            return_tensors="pt",
            size=(512,512)
        )

        # encoded_inputs = {k: v.squeeze() if isinstance(v, torch.Tensor) else v[0] for k,v in encoded_inputs.items()}
        #for k,v in encoded_inputs.items():
        #  encoded_inputs[k].squeeze_() # remove batch dimension
        encoded_inputs = {k: v.squeeze() if isinstance(v, torch.Tensor) else v[0] for k,v in encoded_inputs.items()}
        return encoded_inputs

train_transform = A.Compose(
    [
        #A.Resize(256, 256),
        A.ShiftScaleRotate(shift_limit=0.25, scale_limit=0.2, rotate_limit=20, p=0.5),
        #A.RGBShift(r_shift_limit=25, g_shift_limit=25, b_shift_limit=25, p=0.5),
        #A.CLAHE (clip_limit=4.0, tile_grid_size=(8, 8), always_apply=False, p=0.5),
        A.HueSaturationValue (hue_shift_limit=5, sat_shift_limit=20, val_shift_limit=20, p=0.5),
        A.ColorJitter (brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2, p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.3),
        #A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        #ToTensorV2(),
    ]
)

In [33]:
# class COCOSegmentationDataset(Dataset):
#     def __init__(self, root_dir, annotation_file, processor, transform=None):
#         """
#         Args:
#             root_dir (str): Directory with all the images.
#             annotation_file (str): Path to the COCO annotation file.
#             processor: A Hugging Face processor to handle image and mask processing.
#             transform (callable, optional): Optional transform to be applied on the image and mask.
#         """
#         self.root_dir = root_dir
#         self.coco = COCO(annotation_file)
#         self.ids = list(self.coco.imgs.keys())
#         self.processor = processor
#         self.transform = transform

#     def __len__(self):
#         return len(self.ids)

#     def __getitem__(self, index):
#         # Load image and annotations
#         img_id = self.ids[index]
#         ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
#         anns = self.coco.loadAnns(ann_ids)
#         img_info = self.coco.loadImgs(img_id)[0]
        
#         # Load image
#         img_path = os.path.join(self.root_dir, img_info['file_name'])
#         print(img_info['file_name'])
#         image = Image.open(img_info['file_name']).convert("RGB")

#         # Initialize an empty mask
#         mask = Image.new('L', (img_info['width'], img_info['height']), 0)
        

#         # Draw each polygon annotation onto the mask
#         for ann in anns:
#             if 'segmentation' in ann:
#                 category_id = ann["category_id"]
#                 for segmentation in ann['segmentation']:
#                     polygon = [tuple(segmentation[i:i + 2]) for i in range(0, len(segmentation), 2)]
#                     ImageDraw.Draw(mask).polygon(polygon, outline=category_id, fill=category_id)
        
#         # Convert mask to a tensor
#         mask = torch.as_tensor(np.array(mask), dtype=torch.long)

#         # Process images and masks using the processor
#         encoded_inputs = self.processor(
#             images=image,
#             segmentation_maps=mask,
#             task_inputs=["semantic"],  # Specify task if using multi-task processors
#             return_tensors="pt",
#             size=(512,512)
#         )

#         # Apply additional transformations if specified
#         if self.transform:
#             encoded_inputs["pixel_values"] = self.transform(encoded_inputs["pixel_values"])
#             encoded_inputs["labels"] = self.transform(encoded_inputs["labels"])

#         encoded_inputs = {k: v.squeeze() if isinstance(v, torch.Tensor) else v[0] for k,v in encoded_inputs.items()}

#         return encoded_inputs


## Load pre-trained model and processror

In [50]:
id2label = {0:"_background_", 1:"foundation", 2:"front door", 3:"garage door", 4:"stairs"}
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}

In [56]:
model_name = "shi-labs/oneformer_coco_swin_large"
model = AutoModelForUniversalSegmentation.from_pretrained(
    model_name,is_training=True,num_labels=len(id2label),
    id2label=id2label,label2id=label2id,
    ignore_mismatched_sizes=True)
# move model to GPU
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model.to(device)
image_processor = AutoProcessor.from_pretrained(model_name)  # Replace with model ID
image_processor.image_processor.num_text = model.config.num_queries - model.config.text_encoder_n_ctx

Some weights of OneFormerForUniversalSegmentation were not initialized from the model checkpoint at shi-labs/oneformer_coco_swin_large and are newly initialized: ['model.text_mapper.prompt_ctx.weight', 'model.text_mapper.text_encoder.ln_final.bias', 'model.text_mapper.text_encoder.ln_final.weight', 'model.text_mapper.text_encoder.positional_embedding', 'model.text_mapper.text_encoder.token_embedding.weight', 'model.text_mapper.text_encoder.transformer.layers.0.layer_norm1.bias', 'model.text_mapper.text_encoder.transformer.layers.0.layer_norm1.weight', 'model.text_mapper.text_encoder.transformer.layers.0.layer_norm2.bias', 'model.text_mapper.text_encoder.transformer.layers.0.layer_norm2.weight', 'model.text_mapper.text_encoder.transformer.layers.0.mlp.fc1.bias', 'model.text_mapper.text_encoder.transformer.layers.0.mlp.fc1.weight', 'model.text_mapper.text_encoder.transformer.layers.0.mlp.fc2.bias', 'model.text_mapper.text_encoder.transformer.layers.0.mlp.fc2.weight', 'model.text_mapper.t

In [36]:
# # Define the processor and model for OneFormer
# model_name = "shi-labs/oneformer_coco_swin_large"
# # model = AutoModelForSemanticSegmentation.from_pretrained(model_name,is_training=True)
# model = AutoModelForUniversalSegmentation.from_pretrained(model_name,is_training=True,num_labels=4,ignore_mismatched_sizes=True)
# processor = AutoProcessor.from_pretrained(model_name)  # Replace with model ID
# processor.image_processor.num_text = model.config.num_queries - model.config.text_encoder_n_ctx

In [37]:
# # Initialize dataset
# root_dir = r'C:\Users\lliu\FrontierSI\Projects - 127 Residential Dwelling Floor Height\4 Executing\Data Exploration\GSV\Wagga\Panos_clipped'
# annotation_file = r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\Wagga\Annotations_COCO\dataset.json"
# train_dataset = COCOSegmentationDataset(root_dir, annotation_file, processor)
# encoded_inputs = train_dataset[0]
# encoded_inputs["pixel_values"].shape


In [38]:
# import matplotlib.pyplot as plt
# plt.imshow(np.array(encoded_inputs['mask_labels'][1,:,:]))

In [39]:
# encoded_inputs["class_labels"].shape
# encoded_inputs["class_labels"]
# encoded_inputs["class_labels"].squeeze().unique()

## Define dataloader

In [57]:
# Define a data collator
def collate_fn(batch):
    pixel_values = torch.stack([example["pixel_values"] for example in batch])
    pixel_mask = torch.stack([example["pixel_mask"] for example in batch])
    class_labels = [example["class_labels"] for example in batch]
    mask_labels = [example["mask_labels"] for example in batch]
    #text_inputs = [example["text_inputs"] for example in batch]
    text_inputs = torch.stack([example["text_inputs"] for example in batch])
    #task_inputs = [example["task_inputs"] for example in batch]
    task_inputs = torch.stack([example["task_inputs"] for example in batch])
    return {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "class_labels": class_labels, "mask_labels": mask_labels,'text_inputs':text_inputs,'task_inputs':task_inputs}

In [59]:
# root_dir=r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\Wagga\GSV_annotations_converted_merged\train"
root_dir=r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\Wagga\GSV_annotations_converted_merged\train"
train_dataset = SemanticSegmentationDataset(root_dir=root_dir, processor=image_processor, transform=train_transform)
print("Number of training examples:", len(train_dataset))
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True,collate_fn=collate_fn)


Number of training examples: 245


In [60]:
train_dataset[0]

{'pixel_values': tensor([[[-0.2856, -0.2856, -0.2856,  ..., -0.4739, -0.4568, -0.4568],
          [-0.2856, -0.2856, -0.2856,  ..., -0.4739, -0.4568, -0.4568],
          [-0.2856, -0.2856, -0.2856,  ..., -0.4739, -0.4568, -0.4568],
          ...,
          [-0.4911, -0.5253, -0.4226,  ..., -0.8507, -0.6623, -0.9020],
          [-0.7308, -0.5253, -0.4739,  ..., -0.9363, -0.8164, -0.6794],
          [-0.4568, -0.4739, -0.4911,  ..., -0.8335, -0.7650, -0.6452]],
 
         [[ 0.8880,  0.8880,  0.8880,  ...,  0.6604,  0.6779,  0.6779],
          [ 0.8880,  0.8880,  0.8880,  ...,  0.6604,  0.6779,  0.6779],
          [ 0.8880,  0.8880,  0.8880,  ...,  0.6604,  0.6779,  0.6779],
          ...,
          [-0.0749, -0.1099, -0.0049,  ..., -0.4076, -0.1975, -0.4251],
          [-0.2500, -0.0749, -0.0049,  ..., -0.4601, -0.3725, -0.2500],
          [ 0.0651,  0.0826,  0.0301,  ..., -0.4076, -0.3200, -0.1975]],
 
         [[ 2.6051,  2.6051,  2.6051,  ...,  2.3088,  2.3263,  2.3263],
          [ 

In [63]:
batch = next(iter(train_loader))
for k,v in batch.items():
  print(k,  v)

pixel_values tensor([[[[-0.3883, -0.1828, -0.3712,  ..., -0.2684, -0.3027, -0.5253],
          [ 0.3652, -0.1999, -0.0458,  ...,  0.0912, -0.0629, -0.2684],
          [ 1.0673, -0.4226, -0.1143,  ...,  0.7591,  0.7077,  0.1426],
          ...,
          [ 0.5022,  0.5022,  0.4851,  ...,  0.1254,  0.2282,  0.2967],
          [ 0.5022,  0.4851,  0.4851,  ..., -0.1143, -0.0458, -0.0116],
          [ 0.5022,  0.4851,  0.4851,  ..., -0.2513, -0.2171, -0.2171]],

         [[-0.2325,  0.0126, -0.1625,  ...,  0.0826,  0.1352, -0.1450],
          [ 0.5553,  0.0301,  0.2752,  ...,  0.4503,  0.3277,  0.0651],
          [ 1.3431, -0.1975,  0.2227,  ...,  1.1681,  1.0980,  0.4678],
          ...,
          [ 0.9755,  0.9755,  0.9755,  ...,  0.2402,  0.3627,  0.4503],
          [ 0.9755,  0.9580,  0.9755,  ..., -0.0049,  0.0826,  0.1352],
          [ 0.9755,  0.9755,  0.9755,  ..., -0.1450, -0.0924, -0.0749]],

         [[ 0.0082,  0.0256, -0.0964,  ...,  1.0017,  0.9668,  0.3916],
          [ 1.298

## Fine-tune

In [66]:
# import torchmetrics
metric = evaluate.load("mean_iou")
# define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006)

model.train()
max_iou=0
min_loss=10000
for epoch in range(100):
    print("Epoch:", epoch)
    running_loss = 0.0
    num_samples = 0
    #for idx, batch in enumerate(tqdm(train_dataloader)):
    for idx, batch in enumerate(tqdm(train_loader)):
        # get the inputs;
        #pixel_values = batch["pixel_values"].to(device)
        #labels = batch["labels"].to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        outputs = model( pixel_values=batch["pixel_values"].to(device),
            pixel_mask =batch["pixel_mask"].to(device),
            mask_labels=[labels.to(device) for labels in batch["mask_labels"]],
            class_labels=[labels.to(device) for labels in batch["class_labels"]],
            #text_inputs=[labels.to(device) for labels in batch["text_inputs"]],
            text_inputs=batch["text_inputs"].to(device),
            #task_inputs=[labels.to(device) for labels in batch["task_inputs"]],
            task_inputs=batch["task_inputs"].to(device),
            )
        loss = outputs.loss
        batch_size = batch["pixel_values"].size(0)
        running_loss += loss.item()
        num_samples += batch_size
        loss.backward()
        optimizer.step()

    # evaluate
    print("Loss:", running_loss/num_samples)
    if loss.item()<min_loss:
        min_loss=loss.item()
        model.save_pretrained(r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\segformer\class_merged\V3")
        image_processor.save_pretrained(r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\segformer\class_merged\V3")

Epoch: 0


  0%|          | 0/245 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  with autocast(enabled=False):


KeyboardInterrupt: 

In [None]:
# metric = evaluate.load("mean_iou")
# # import torchmetrics
# # define optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=0.00006)

# # jaccard = torchmetrics.JaccardIndex(task="multiclass", num_classes=4)
# model.train()
# max_iou=0
# for epoch in range(100):  
#     print("Epoch:", epoch)
#     running_loss = 0.0
#     num_samples = 0
#     #for idx, batch in enumerate(tqdm(train_dataloader)):
#     for idx, batch in enumerate(train_loader):
#         # get the inputs;
#         # zero the parameter gradients
#         optimizer.zero_grad()
#         #batch = {k:v.to(device) for k,v in batch.items()}
#         # forward pass
#         #outputs = model(**batch)
#         # forward + backward + optimize
#         outputs = model( pixel_values=batch["pixel_values"].to(device),
#             pixel_mask =batch["pixel_mask"].to(device),
#             mask_labels=[labels.to(device) for labels in batch["mask_labels"]],
#             class_labels=[labels.to(device) for labels in batch["class_labels"]],
#             #text_inputs=[labels.to(device) for labels in batch["text_inputs"]],
#             text_inputs=batch["text_inputs"].to(device),
#             #task_inputs=[labels.to(device) for labels in batch["task_inputs"]],
#             task_inputs=batch["task_inputs"].to(device),

#             )
#         loss = outputs.loss
#         batch_size = batch["pixel_values"].size(0)
#         running_loss += loss.item()
#         num_samples += batch_size
#         loss.backward()
#         optimizer.step()

#     # mean_iou_epoch=jaccard.compute()
#     print("Loss:", running_loss/num_samples)
#     # print("Mean_IOU:", mean_iou_epoch)
#     # jaccard.reset()
#     # if mean_iou_epoch>max_iou:
#     #     max_iou=mean_iou_epoch
#     # if epoch%10==0:
#     #torch.save(model.state_dict(), r'X:\50243-23WestgoldCue\06_Output/'+str(epoch)+".pth")
#     model.save_pretrained(r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\oneformer\GPU")
#     processor.save_pretrained(r"C:\Users\lliu\Desktop\FrontierSI\projects\GA_floor_height\GA-floor-height\output\oneformer\GPU")