In [11]:
!pip install torch torchvision transformers opencv-python pycocotools roboflow matplotlib tqdm



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
from roboflow import Roboflow
rf = Roboflow(api_key="vQsG0k1QDxbi74ZD03Wp")
project = rf.workspace("fire-detection-4mpgq").project("root-cannel")
version = project.version(5)
dataset = version.download("yolov5")


loading Roboflow workspace...
loading Roboflow project...


In [13]:
image_dir = f"/content/Root-cannel-5/train/images"
annotation_dir = f"/content/Root-cannel-5/train/labels"
print(image_dir)
print(annotation_dir)


/content/Root-cannel-5/train/images
/content/Root-cannel-5/train/labels


In [14]:
import torch
import torchvision
import os
import cv2
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, image_dir, annotation_dir):
        self.image_dir = image_dir
        self.annotation_dir = annotation_dir
        self.image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg') or f.endswith('.png')]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)
        ann_path = os.path.join(self.annotation_dir, img_name.replace('.jpg', '.txt'))

        # Load image
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = torch.tensor(image).permute(2, 0, 1).float() / 255.0

        # Load annotations (YOLO format)
        boxes, labels = [], []
        with open(ann_path, "r") as file:
            for line in file.readlines():
                data = list(map(float, line.strip().split()))
                label, x_center, y_center, width, height = data
                x1 = (x_center - width / 2) * image.shape[2]
                y1 = (y_center - height / 2) * image.shape[1]
                x2 = (x_center + width / 2) * image.shape[2]
                y2 = (y_center + height / 2) * image.shape[1]
                boxes.append([x1, y1, x2, y2])
                labels.append(int(label))

        target = {"boxes": torch.tensor(boxes, dtype=torch.float32), "labels": torch.tensor(labels, dtype=torch.int64)}
        return image, target


In [15]:
from transformers import SwinModel

swin_backbone = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
backbone = swin_backbone
backbone.out_channels = 768


In [16]:
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator

anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),), aspect_ratios=((0.5, 1.0, 2.0),) * 5)

model = FasterRCNN(backbone, num_classes=10, rpn_anchor_generator=anchor_generator)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): SwinModel(
    (embeddings): SwinEmbeddings(
      (patch_embeddings): SwinPatchEmbeddings(
        (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): SwinEncoder(
      (layers): ModuleList(
        (0): SwinStage(
          (blocks): ModuleList(
            (0): SwinLayer(
              (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
              (attention): SwinAttention(
                (self): SwinSelfAttention(
                  (query): Linear(in_features=96, out_features=96, bias=True)
                  (key): Linear(in_features=96, out_features=96, bias=True)
                  (value): Linear(in_fe

In [17]:
from transformers import SamModel, SamProcessor

sam_model = SamModel.from_pretrained("facebook/sam-vit-huge")
sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
sam_model.to(device)


SamModel(
  (shared_image_embedding): SamPositionalEmbedding()
  (vision_encoder): SamVisionEncoder(
    (patch_embed): SamPatchEmbeddings(
      (projection): Conv2d(3, 1280, kernel_size=(16, 16), stride=(16, 16))
    )
    (layers): ModuleList(
      (0-31): 32 x SamVisionLayer(
        (layer_norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): SamVisionAttention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (layer_norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): SamMLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (neck): SamVisionNeck(
      (conv1): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (layer_norm1): SamLayerNor

In [18]:
dataset = CustomDataset(image_dir, annotation_dir)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=lambda x: tuple(zip(*x)))

In [19]:
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model.parameters(), lr=0.0001)
num_epochs = 5

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for images, targets in tqdm(data_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        optimizer.zero_grad()
        loss_dict = model(images, targets)
        loss = sum(loss for loss in loss_dict.values())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


Epoch 1/5:   0%|          | 0/113 [00:01<?, ?it/s]


RuntimeError: Given groups=1, weight of size [768, 768, 3, 3], expected input[1, 4, 625, 768] to have 768 channels, but got 4 channels instead

In [None]:
torch.save(model.state_dict(), "model.pth")


In [None]:
import matplotlib.pyplot as plt

def detect_and_segment(image_path):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_tensor = torch.tensor(image).permute(2, 0, 1).unsqueeze(0).float().to(device)

    model.eval()
    with torch.no_grad():
        detections = model(image_tensor)

    boxes = detections[0]['boxes'].cpu().numpy()

    # Run SAM on the detected image
    inputs = sam_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        masks = sam_model(**inputs).last_hidden_state

    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

    plt.imshow(image)
    plt.title("Object Detection + SAM Segmentation")
    plt.axis("off")
    plt.show()

# Test on a sample image
test_image = f"./{project.name}-{project.version}/test/images/example.jpg"
detect_and_segment(test_image)
