In [None]:
import torch
from torchvision import transforms
from PIL import Image
import os
from torch.utils.data import DataLoader
import numpy as np


In [None]:
# FUNCTIONS TO IMPORT

def get_bounding_box(file):
    """Return the axis-aligned bounding box encoded in an annotated file name.

    The name is split on ``-`` and the fourth field is read as a list of
    vertices, each written ``x&y`` and joined by ``_`` (for example
    ``386&473_177&454_154&383_363&402``).  This looks like the CCPD
    license-plate naming scheme -- TODO confirm against the dataset docs.
    Any number of vertices is accepted, not only four.

    Args:
        file: File name (not a full path) carrying the annotation.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as floats: the smallest upright
        rectangle that contains every encoded vertex.

    Raises:
        ValueError: If the name has fewer than four ``-`` separated fields,
            or a vertex is not of the form ``<int>&<int>``.
    """
    fields = file.split("-")
    if len(fields) < 4:
        raise ValueError(f"no vertex field in file name: {file!r}")

    xs, ys = [], []
    for vertex in fields[3].split("_"):
        coords = vertex.split("&")
        if len(coords) != 2:
            raise ValueError(f"malformed vertex {vertex!r} in file name: {file!r}")
        xs.append(int(coords[0]))
        ys.append(int(coords[1]))

    return [float(min(xs)), float(min(ys)), float(max(xs)), float(max(ys))]

def crop_image_with_ground_truth(full_path):
    """Load an image and crop it to the box encoded in its own file name.

    Args:
        full_path: Path to the image file; only its base name is parsed
            for the bounding box.

    Returns:
        The image converted to RGB and cropped to that bounding box.
    """
    # The annotation lives in the file name, so drop the directory part first.
    box = get_bounding_box(os.path.basename(full_path))
    rgb_image = Image.open(full_path).convert("RGB")
    return rgb_image.crop(box)

def crop_folder(folder_path):
    """Crop every entry of a folder to its ground-truth bounding box.

    Entries are processed in sorted name order.  ``os.listdir`` makes no
    ordering promise, so without sorting the position of a crop in the
    returned list could change between runs or machines.

    Args:
        folder_path: Directory whose entries are each passed to
            ``crop_image_with_ground_truth``.

    Returns:
        List with one cropped image per directory entry, ordered by entry name.
    """
    return [
        crop_image_with_ground_truth(os.path.join(folder_path, name))
        for name in sorted(os.listdir(folder_path))
    ]

# Folder holding the training images.  Set the TRAIN_IMAGES_DIR environment
# variable to point somewhere else instead of editing the machine-specific
# default below.
TRAIN_IMAGES_DIR = os.environ.get(
    "TRAIN_IMAGES_DIR",
    "/home/filippo/Documents/Visual Studio Code/Computer_Vision/Prove/train2",
)
cropped_folder = crop_folder(TRAIN_IMAGES_DIR)

In [None]:
# FEATURE EXTRACTION MODEL --> [BATCH,512,6,18]

from torchvision.models import mobilenet_v2
import torch.nn as nn

# Backbone and projection are created on the first call and reused afterwards.
_MOBILENET_V2_RESHAPED_MODULES = {}


def Mobilenet_V2_reshaped(image):
    """Extract a ``[batch, 512, 6, 18]`` feature map from an image batch.

    The MobileNetV2 convolutional trunk (pretrained weights) is followed by a
    1x1 convolution down to 512 channels and an adaptive average pool to a
    6x18 grid.

    Both modules are built once and cached.  Building them inside every call
    re-created the 1x1 convolution with new random weights each time, so
    features from different calls were not comparable, and it reloaded the
    backbone for every image.  The backbone is switched to eval mode so its
    BatchNorm layers use the pretrained running statistics rather than the
    statistics of the current (possibly single-image) batch.

    The 1x1 projection is still randomly initialised and untrained.
    Gradient tracking is left untouched; wrap the call in ``torch.no_grad()``
    when only the features are needed.

    Args:
        image: Input tensor for the MobileNetV2 feature trunk -- presumably
            ``[batch, 3, H, W]`` and already normalised; confirm with callers.

    Returns:
        Tensor of shape ``[batch, 512, 6, 18]``.
    """
    modules = _MOBILENET_V2_RESHAPED_MODULES
    if not modules:
        # NOTE: `pretrained=` is deprecated in torchvision >= 0.13 in favour of
        # `weights=`; it is kept so older installations keep working.
        backbone = nn.Sequential(*mobilenet_v2(pretrained=True).features.children())
        projection = nn.Sequential(
            nn.Conv2d(1280, 512, kernel_size=1),
            nn.AdaptiveAvgPool2d((6, 18)),
        )
        modules["backbone"] = backbone.eval()
        modules["projection"] = projection.eval()

    features = modules["backbone"](image)
    return modules["projection"](features)

In [None]:
# CLASS FOR THE FOLDER

class cropped_images(torch.utils.data.Dataset):
    """Dataset over a sequence of already-cropped images that yields features.

    Each item is the stored image, optionally transformed, given a leading
    dimension of size 1 and passed through ``Mobilenet_V2_reshaped``.

    Args:
        folder: Indexable collection of cropped images.
        transformations: Callable applied to each image before feature
            extraction; skipped when falsy.
    """

    def __init__(self, folder, transformations):
        self.folder = folder
        self.transformations = transformations

    def __len__(self):
        """Number of stored images."""
        return len(self.folder)

    def __getitem__(self, idx):
        """Return the backbone features of the image at position ``idx``."""
        sample = self.folder[idx]
        prepared = self.transformations(sample) if self.transformations else sample
        # The feature extractor is fed a batch of one, so the result keeps
        # that extra leading dimension.
        single_batch = prepared.unsqueeze(0)
        return Mobilenet_V2_reshaped(single_batch)

# Per-channel normalisation statistics (the usual ImageNet values).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# NOTE(review): Resize(256) followed by CenterCrop(224) keeps only a central
# square, so the left and right ends of wide crops are discarded -- confirm
# this is intended.
preprocessing_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
]
trans = transforms.Compose(preprocessing_steps)

# Wrap the cropped images in a dataset and serve them two at a time.
input_folder = cropped_images(cropped_folder, transformations=trans)
input_dataloader = DataLoader(input_folder, batch_size=2)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Even feature indices receive ``sin(position * freq)`` and odd indices
    ``cos(position * freq)``.

    Args:
        d_model: Size of the last (feature) dimension of the input; may be odd.
        max_len: Longest sequence length that can be encoded.  Inputs whose
            second dimension exceeds it cannot be encoded.
    """

    def __init__(self, d_model, max_len=200):
        super().__init__()
        pe = torch.zeros(max_len, d_model)  # [max_len, d_model]
        position = torch.arange(0, max_len).unsqueeze(1)  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a buffer so .to()/.cuda() move it with the module;
        # non-persistent so the state_dict keys stay exactly as before.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.
        """
        # The device cast is kept for callers that never moved the module.
        return x + self.pe[:, : x.size(1)].to(x.device)

pos_encoder = PositionalEncoding(d_model=512)

# Walk over the batches.  Each one arrives as [A, B, C, D, E] and is flattened
# to [A, D*E, C] (per the original note, [batch, 108, 512]) before the
# positional encoding is added.  The view only succeeds when B == 1.
# Only the last processed batch remains bound to `image` after the loop.
for image in input_dataloader:
    A, B, C, D, E = image.shape
    flattened = image.view(A, C, D * E)
    image = pos_encoder(flattened.transpose(1, 2))

In [None]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer

encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048)
transformer_encoder = TransformerEncoder(encoder_layer, num_layers=4)

# The layer is built without batch_first, so it reads its input as
# [seq, batch, d_model].  `image` is batch-first ([B, 108, 512]); feeding it
# directly would make attention run across the batch axis.  Swap the first
# two axes on the way in and swap them back on the way out.
encoded = transformer_encoder(image.transpose(0, 1)).transpose(0, 1)  # [B, 108, 512]


In [None]:
# CHAT VERSION

import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.models import mobilenet_v2
from torch.utils.data import Dataset, DataLoader

# Dataset
class CroppedImages(Dataset):
    """Dataset over a sequence of already-cropped images.

    Args:
        folder: Indexable collection of images.
        transformations: Optional callable applied to every image on access.
            When ``None`` the stored image is returned unchanged.
    """

    def __init__(self, folder, transformations=None):
        self.folder = folder
        self.transformations = transformations

    def __len__(self):
        """Number of stored images."""
        return len(self.folder)

    def __getitem__(self, idx):
        """Return the (optionally transformed) image at position ``idx``."""
        image = self.folder[idx]
        if self.transformations is None:
            return image
        return self.transformations(image)

# Feature extractor
class MobilenetV2FeatureExtractor(nn.Module):
    """MobileNetV2 trunk followed by a 512-channel, 6x18 projection head.

    ``backbone`` holds the layers of the pretrained model's ``features``
    stack; ``projection`` applies a 1x1 convolution from 1280 to 512 channels
    and an adaptive average pool to a 6x18 grid.
    """

    def __init__(self):
        super().__init__()
        pretrained_net = mobilenet_v2(pretrained=True)
        feature_layers = pretrained_net.features.children()
        self.backbone = nn.Sequential(*feature_layers)

        channel_reducer = nn.Conv2d(1280, 512, kernel_size=1)
        grid_pool = nn.AdaptiveAvgPool2d((6, 18))
        self.projection = nn.Sequential(channel_reducer, grid_pool)

    def forward(self, x):
        """Run ``x`` through the backbone, then the projection head."""
        return self.projection(self.backbone(x))

# Positional encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence.

    Even feature indices receive ``sin(position * freq)`` and odd indices
    ``cos(position * freq)``.

    Args:
        d_model: Size of the last (feature) dimension of the input; may be odd.
        max_len: Longest sequence length that can be encoded.  Inputs whose
            second dimension exceeds it cannot be encoded.
    """

    def __init__(self, d_model, max_len=200):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim the angles to match.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Registered as a buffer so .to()/.cuda() move it with the module;
        # non-persistent so the state_dict keys stay exactly as before.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)  # [1, max_len, d_model]

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.
        """
        # The device cast is kept for callers that never moved the module.
        return x + self.pe[:, : x.size(1)].to(x.device)

# Parallel Decoder

class ParallelDecoder(nn.Module):
    """Decode all output positions at once from learned per-position queries.

    One learned query vector per output position attends to the encoder
    memory through a transformer decoder, and a linear layer maps each
    decoded vector to class logits.

    Args:
        d_model: Feature size of the memory and of the queries.
        num_chars: Number of output positions (learned queries).
        num_classes: Number of classes predicted at each position.
        num_layers: Number of stacked decoder layers.
        nhead: Number of attention heads; must divide ``d_model``.
    """

    def __init__(self, d_model=512, num_chars=8, num_classes=68, num_layers=2, nhead=8):
        super().__init__()
        self.num_chars = num_chars
        self.query_embed = nn.Parameter(torch.randn(num_chars, d_model))

        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, memory):
        """Return logits of shape ``[B, num_chars, num_classes]``.

        Args:
            memory: Batch-first encoder output, ``[B, seq_len, d_model]``.
        """
        batch_size = memory.shape[0]

        # Same learned queries for every sample: [num_chars, B, d_model].
        queries = self.query_embed.unsqueeze(1).repeat(1, batch_size, 1)

        # The decoder layer is sequence-first, so move the batch axis to
        # position 1: [seq_len, B, d_model].
        memory = memory.permute(1, 0, 2)

        decoded = self.decoder(tgt=queries, memory=memory)  # [num_chars, B, d_model]

        # Back to batch-first before classifying each position.
        decoded = decoded.permute(1, 0, 2)  # [B, num_chars, d_model]
        logits = self.classifier(decoded)   # [B, num_chars, num_classes]

        return logits

# Config: preprocessing pipeline, dataset and loader.
# NOTE(review): Resize(256) followed by CenterCrop(224) keeps only a central
# square of each image -- confirm that is intended for wide crops.
resize_step = transforms.Resize(256)
center_crop_step = transforms.CenterCrop(224)
to_tensor_step = transforms.ToTensor()
normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
trans = transforms.Compose([resize_step, center_crop_step, to_tensor_step, normalize_step])

# Batches of two preprocessed image tensors.
dataset = CroppedImages(cropped_folder, trans)
dataloader = DataLoader(dataset, batch_size=2)

# Instantiate models
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
feature_extractor = MobilenetV2FeatureExtractor().to(device)
pos_encoder = PositionalEncoding(d_model=512).to(device)

# Transformer
# batch_first is not passed, so the encoder layer uses PyTorch's default
# sequence-first layout: inputs must be [seq, batch, d_model].
# No .eval() call is made here, so modules stay in their default mode.
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4).to(device)
# 8 output positions with 68 classes each.
decoder = ParallelDecoder(d_model=512, num_chars=8, num_classes=68).to(device)

# Inference loop
for batch in dataloader:
    batch = batch.to(device)
    features = feature_extractor(batch)  # [B, 512, 6, 18]
    B, C, H, W = features.shape
    features = features.view(B, C, H * W).permute(0, 2, 1)  # [B, 108, 512]
    features = pos_encoder(features)
    # transformer_encoder is built without batch_first, so it reads its input
    # as [seq, batch, d_model].  Passing the batch-first tensor directly would
    # make attention run across the batch axis, so swap the first two axes on
    # the way in and swap them back on the way out.
    memory = transformer_encoder(features.transpose(0, 1)).transpose(0, 1)  # [B, 108, 512]
    logits = decoder(memory)  # [B, 8, 68]

In [None]:
# Suppose the ground truth has shape [B, 8] (each value is a class index from 0 to 67).
# NOTE(review): `targets` is not defined anywhere in the visible notebook; it
# must be created before this cell can run.
loss_fn = nn.CrossEntropyLoss()
# Flatten to [B * positions, num_classes] vs [B * positions]; the class count
# is read from the logits instead of being repeated as a literal.
loss = loss_fn(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
