In [1]:
import kagglehub

path = kagglehub.dataset_download("lyly99/logodet3k")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/lyly99/logodet3k?dataset_version_number=1...


100%|██████████| 2.87G/2.87G [00:16<00:00, 184MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/lyly99/logodet3k/versions/1


In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
def create_image_annotation_dataset(root_dir):
    dataset = []

    for parent_dir, _, files in os.walk(root_dir):
        images = {os.path.splitext(file)[0]: file for file in files if file.endswith('.jpg')}
        annotations = {os.path.splitext(file)[0]: file for file in files if file.endswith('.xml')}

        for base_name, image_file in images.items():
            annotation_file = annotations.get(base_name)
            dataset.append({
                "ImagePath": os.path.join(parent_dir, image_file),
                "AnnotationPath": os.path.join(parent_dir, annotation_file) if annotation_file else None
            })

    return pd.DataFrame(dataset)

root_directory = "/LogoDet-3K"
image_annotation_df = create_image_annotation_dataset(path+root_directory)

image_annotation_df

Unnamed: 0,ImagePath,AnnotationPath
0,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
1,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
2,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
3,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
4,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
...,...,...
158649,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
158650,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
158651,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...
158652,/root/.cache/kagglehub/datasets/lyly99/logodet...,/root/.cache/kagglehub/datasets/lyly99/logodet...


In [4]:
image_annotation_df.iloc[0, 1]

'/root/.cache/kagglehub/datasets/lyly99/logodet3k/versions/1/LogoDet-3K/Others/BT Conferencing/33.xml'

In [5]:
first_annotation_path = image_annotation_df['AnnotationPath'].iloc[0]

if first_annotation_path:
    try:
        with open(first_annotation_path, 'r') as f:
            file_contents = f.read()
            print(file_contents)
    except FileNotFoundError:
        print(f"Error: File not found at {first_annotation_path}")
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("Error: No annotation path found for the first image.")


<?xml version="1.0" ?><annotation verified="no">
	<folder>BT Conferencing</folder>
	<filename>33.jpg</filename>
	<source>
		<database>Unknown</database>
	</source>
	<size>
		<width>500</width>
		<height>372</height>
		<depth>3</depth>
	</size>
	<segmented>0</segmented>
	<object>
		<name>BT Conferencing</name>
		<pose>Unspecified</pose>
		<truncated>0</truncated>
		<difficult>0</difficult>
		<bndbox>
			<xmin>8</xmin>
			<ymin>44</ymin>
			<xmax>492</xmax>
			<ymax>316</ymax>
		</bndbox>
	</object>
</annotation>


In [7]:
import pandas as pd
import xml.etree.ElementTree as ET
from collections import Counter

def parse_annotation_labels(annotation_path):
    """
    Парсинг XML для извлечения всех меток (labels) объектов.
    Args:
        annotation_path (str): Путь к XML-файлу.
    Returns:
        list: Список меток объектов.
    """
    tree = ET.parse(annotation_path)
    root = tree.getroot()
    labels = []
    for obj in root.findall("object"):
        label = obj.find("name").text
        labels.append(label)
    return labels

all_labels = []
for ann_path in image_annotation_df["AnnotationPath"]:
    labels = parse_annotation_labels(ann_path)
    all_labels.extend(labels)

label_distribution = Counter(all_labels)

label_distribution_df = pd.DataFrame.from_dict(label_distribution, orient='index', columns=['Count']).reset_index()
label_distribution_df.rename(columns={'index': 'Label'}, inplace=True)

label_distribution_df.sort_values(by="Count", ascending=False, inplace=True)

print(label_distribution_df)


                 Label  Count
143   avery dennison-2    532
1574     alpinestars-2    508
1964           lexus-1    466
698     violet crumble    414
1461     new balance-1    328
...                ...    ...
372                vpl      4
2769      royal albert      4
233          brm buggy      4
150        r. e. dietz      4
1815              pony      4

[2993 rows x 2 columns]


In [8]:
import pandas as pd
import xml.etree.ElementTree as ET
import os
from PIL import Image
from tqdm import tqdm

output_dir = "cropped_images"
os.makedirs(output_dir, exist_ok=True)

cropped_samples = []

for _, row in tqdm(image_annotation_df.iterrows()):
    image_path = row["ImagePath"]
    ann_path = row["AnnotationPath"]

    try:
        image = Image.open(image_path).convert("RGB")

        tree = ET.parse(ann_path)
        root = tree.getroot()

        for obj_idx, obj in enumerate(root.findall("object")):
            label = obj.find("name").text.strip()
            bndbox = obj.find("bndbox")
            xmin = int(float(bndbox.find("xmin").text))
            ymin = int(float(bndbox.find("ymin").text))
            xmax = int(float(bndbox.find("xmax").text))
            ymax = int(float(bndbox.find("ymax").text))

            cropped_image = image.crop((xmin, ymin, xmax, ymax))

            cropped_filename = f"{os.path.splitext(os.path.basename(image_path))[0]}_{obj_idx}.jpg"
            cropped_path = os.path.join(output_dir, cropped_filename)

            cropped_image.save(cropped_path)

            cropped_samples.append({
                "CroppedImagePath": cropped_path,
                "Label": label
            })

    except Exception as e:
        print(f"Ошибка при обработке {ann_path}: {e}")

cropped_dataset_df = pd.DataFrame(cropped_samples)


26474it [00:44, 595.72it/s]

Ошибка при обработке /root/.cache/kagglehub/datasets/lyly99/logodet3k/versions/1/LogoDet-3K/Food/Magners Irish/36.xml: cannot write empty image as JPEG


116741it [03:16, 603.18it/s]

Ошибка при обработке /root/.cache/kagglehub/datasets/lyly99/logodet3k/versions/1/LogoDet-3K/Electronic/airmate/78.xml: cannot write empty image as JPEG


158654it [04:23, 601.01it/s]


In [9]:
cropped_dataset_df

Unnamed: 0,CroppedImagePath,Label
0,cropped_images/33_0.jpg,BT Conferencing
1,cropped_images/17_0.jpg,BT Conferencing
2,cropped_images/1_0.jpg,BT Conferencing
3,cropped_images/30_0.jpg,BT Conferencing
4,cropped_images/56_0.jpg,BT Conferencing
...,...,...
194257,cropped_images/5_0.jpg,sea-doo
194258,cropped_images/67_0.jpg,sea-doo
194259,cropped_images/86_0.jpg,sea-doo
194260,cropped_images/32_0.jpg,sea-doo


In [10]:
cropped_dataset_df['Label'].nunique()

2993

In [11]:
unique_labels = sorted(cropped_dataset_df["Label"].unique())
label2idx = {label: idx for idx, label in enumerate(unique_labels)}

cropped_dataset_df["LabelIdx"] = cropped_dataset_df["Label"].map(label2idx)

sorted(cropped_dataset_df['LabelIdx'].unique())

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [12]:
import random
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from collections import defaultdict


def stratified_split(df, min_valid_samples=2, valid_ratio=0.2):
    """
    Разделяет данные так, чтобы в валидации было хотя бы min_valid_samples образцов из каждого класса.
    Остальные идут в train.

    Args:
        df (pd.DataFrame): DataFrame с колонками "CroppedImagePath" и "LabelIdx"
        min_valid_samples (int): Минимальное число образцов каждого класса в валидации
        valid_ratio (float): Доля данных, отводимых под валидацию (ориентир)

    Returns:
        train_df, val_df
    """
    train_list, val_list = [], []

    grouped = df.groupby("LabelIdx")

    for label, group in grouped:
        num_samples = len(group)
        num_valid = max(min_valid_samples, int(num_samples * valid_ratio))

        group = group.sample(frac=1, random_state=42).reset_index(drop=True)
        val_list.append(group.iloc[:num_valid])
        train_list.append(group.iloc[num_valid:])

    val_df = pd.concat(val_list).reset_index(drop=True)
    train_df = pd.concat(train_list).reset_index(drop=True)

    return train_df, val_df


train_df, val_df = stratified_split(cropped_dataset_df, min_valid_samples=3, valid_ratio=0.2)

print("Train size:", len(train_df), "| Val size:", len(val_df))
print("Train class distribution:\n", train_df["LabelIdx"].value_counts())
print("Val class distribution:\n", val_df["LabelIdx"].value_counts())


Train size: 156031 | Val size: 38231
Train class distribution:
 LabelIdx
1389    426
1359    407
1732    373
2822    332
2001    263
       ... 
886       1
2101      1
2885      1
2297      1
2348      1
Name: count, Length: 2993, dtype: int64
Val class distribution:
 LabelIdx
1389    106
1359    101
1732     93
2822     82
1387     65
       ... 
2400      3
2380      3
2383      3
2389      3
2390      3
Name: count, Length: 2993, dtype: int64


In [13]:
class TripletLogoDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.label_groups = defaultdict(list)

        for _, row in self.df.iterrows():
            self.label_groups[row["LabelIdx"]].append(row["CroppedImagePath"])

        self.samples = self.df.to_dict('records')

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        anchor_record = self.samples[idx]
        anchor_path = anchor_record["CroppedImagePath"]
        anchor_label = anchor_record["LabelIdx"]
        anchor_img = Image.open(anchor_path).convert("RGB")

        positive_list = self.label_groups[anchor_label]
        positive_path = random.choice([p for p in positive_list if p != anchor_path]) \
            if len(positive_list) > 1 else anchor_path
        positive_img = Image.open(positive_path).convert("RGB")

        negative_label = random.choice([l for l in self.label_groups.keys() if l != anchor_label])
        negative_path = random.choice(self.label_groups[negative_label])
        negative_img = Image.open(negative_path).convert("RGB")

        if self.transform:
            anchor_img = self.transform(anchor_img)
            positive_img = self.transform(positive_img)
            negative_img = self.transform(negative_img)

        return anchor_img, positive_img, negative_img

class DeterministicTripletDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.groups = defaultdict(list)

        for _, row in self.df.iterrows():
            self.groups[row["LabelIdx"]].append(row["CroppedImagePath"])

        self.triplets = []
        sorted_labels = sorted(self.groups.keys())

        for label in sorted_labels:
            images = self.groups[label]
            if len(images) < 2:
                continue
            for i, anchor_path in enumerate(images):
                positive_path = images[(i + 1) % len(images)]
                negative_label = sorted_labels[(sorted_labels.index(label) + 1) % len(sorted_labels)]
                negative_path = self.groups[negative_label][0]
                self.triplets.append((anchor_path, positive_path, negative_path))

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        anchor_path, positive_path, negative_path = self.triplets[idx]
        anchor = Image.open(anchor_path).convert("RGB")
        positive = Image.open(positive_path).convert("RGB")
        negative = Image.open(negative_path).convert("RGB")

        if self.transform:
            anchor = self.transform(anchor)
            positive = self.transform(positive)
            negative = self.transform(negative)

        return anchor, positive, negative

transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = TripletLogoDataset(train_df, transform=transform)
val_dataset = DeterministicTripletDataset(val_df, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=12)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=8)


In [14]:
from torch.utils.data import DataLoader

dataset = TripletLogoDataset(cropped_dataset_df, transform=transform)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=12)

In [15]:
import torch.nn as nn
import torchvision.models as models

class LogoEmbeddingNet(nn.Module):
    def __init__(self, embedding_dim=128):
        super(LogoEmbeddingNet, self).__init__()
        self.backbone = models.resnet50(pretrained=True)
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, embedding_dim)

    def forward(self, x):
        return self.backbone(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = LogoEmbeddingNet(embedding_dim=128).to(device)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 191MB/s]


In [23]:
import torch
import torch.nn.functional as F

class TripletLossWithTemperature(nn.Module):
    def __init__(self, temp=0.1):
        """
        Triplet Loss с температурным коэффициентом.

        Args:
            temp (float): Температурный коэффициент (обычно 0.1 или 0.05)
        """
        super(TripletLossWithTemperature, self).__init__()
        self.temp = temp

    def forward(self, anchor, positive, negative):
        """
        Вычисляет модифицированный триплетный лосс.

        Args:
            anchor (Tensor): эмбеддинги для anchor
            positive (Tensor): эмбеддинги для positive
            negative (Tensor): эмбеддинги для negative

        Returns:
            Tensor: значение лосса
        """

        pos_dist = F.pairwise_distance(anchor, positive, p=2)
        neg_dist = F.pairwise_distance(anchor, negative, p=2)

        loss = torch.log(1 + torch.exp((pos_dist - neg_dist) / self.temp))
        return loss.mean()


In [20]:
def precision_at_k(anchor_emb, positive_emb, negative_emb, k=1):
    """
    Вычисляет Precision@K:
    - Если положительный пример входит в топ-K ближайших соседей anchor, считаем верным предсказанием.
    """
    batch_size = anchor_emb.shape[0]

    pos_sim = F.cosine_similarity(anchor_emb, positive_emb)
    neg_sim = F.cosine_similarity(anchor_emb, negative_emb)

    similarities = torch.cat([pos_sim.view(-1, 1), neg_sim.view(-1, 1)], dim=1)

    sorted_indices = torch.argsort(similarities, descending=True, dim=1)

    correct = (sorted_indices[:, :k] == 0).sum().item()

    return correct / batch_size


In [34]:
from tqdm import tqdm

optimizer = optim.Adam(embedding_model.parameters(), lr=1e-4)
criterion = TripletLossWithTemperature(temp=0.4)

num_epochs = 10

for epoch in range(num_epochs):
    embedding_model.train()
    running_loss = 0.0
    running_prec = 0.0
    total_train = 0

    for anchor, positive, negative in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)

        optimizer.zero_grad()
        anchor_emb = embedding_model(anchor)
        positive_emb = embedding_model(positive)
        negative_emb = embedding_model(negative)

        loss = criterion(anchor_emb, positive_emb, negative_emb)
        loss.backward()
        optimizer.step()

        batch_size = anchor.size(0)
        running_loss += loss.item() * batch_size

        batch_prec = precision_at_k(anchor_emb, positive_emb, negative_emb, k=1)
        running_prec += batch_prec * batch_size
        total_train += batch_size

    train_loss = running_loss / total_train
    train_prec = running_prec / total_train

    embedding_model.eval()
    val_loss = 0.0
    val_prec = 0.0
    total_val = 0

    with torch.no_grad():
        for anchor, positive, negative in tqdm(val_loader, desc=f"Val Epoch {epoch+1}"):
            anchor, positive, negative = anchor.to(device), positive.to(device), negative.to(device)

            anchor_emb = embedding_model(anchor)
            positive_emb = embedding_model(positive)
            negative_emb = embedding_model(negative)

            loss = criterion(anchor_emb, positive_emb, negative_emb)
            batch_size = anchor.size(0)
            val_loss += loss.item() * batch_size

            batch_prec = precision_at_k(anchor_emb, positive_emb, negative_emb, k=1)
            val_prec += batch_prec * batch_size
            total_val += batch_size

    val_loss /= total_val
    val_prec /= total_val

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f}, Train Prec@1: {train_prec:.4f} | "
          f"Val Loss: {val_loss:.4f}, Val Prec@1: {val_prec:.4f}")

Train Epoch 1:   0%|          | 0/1219 [00:03<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 196.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 62.88 MiB is free. Process 8115 has 39.49 GiB memory in use. Of the allocated memory 38.44 GiB is allocated by PyTorch, and 559.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [42]:
import gc
gc.collect()

314