In [10]:
import os
from collections import Counter

# Directory that calculate_label_balance scans for .txt label files.
labels_path = "/kaggle/input/dataset-soccer/train/labels"

def calculate_label_balance(labels_path):
    """Print how many ``.txt`` label files in a directory are non-empty vs. empty.

    A label file whose size is greater than zero bytes is counted as
    "positive"; a zero-byte file is counted as "negative". Directory entries
    whose name does not end in ``.txt`` are ignored.

    Args:
        labels_path: Directory containing the label files.

    Returns:
        None. Counts and percentages are printed. If ``labels_path`` is not a
        directory, a message is printed and nothing is counted.
    """
    if not os.path.isdir(labels_path):
        print(f"La directory {labels_path} non esiste.")
        return

    positive_labels = 0
    negative_labels = 0

    for label_file in os.listdir(labels_path):
        if not label_file.endswith(".txt"):
            continue
        # Check whether the file is empty (negative) or holds data (positive).
        if os.path.getsize(os.path.join(labels_path, label_file)) > 0:
            positive_labels += 1
        else:
            negative_labels += 1

    total_files = positive_labels + negative_labels

    def percentage(count):
        # A directory without any .txt file would otherwise divide by zero.
        return (count / total_files) * 100 if total_files else 0.0

    print("Bilanciamento delle etichette:")
    print(f"File con etichette positive (non vuoti): {positive_labels} ({percentage(positive_labels):.2f}%)")
    print(f"File con etichette negative (vuoti): {negative_labels} ({percentage(negative_labels):.2f}%)")
    print(f"Totale file: {total_files}")

# Print the non-empty vs. empty label-file counts for the directory configured above.
calculate_label_balance(labels_path)

Bilanciamento delle etichette:
File con etichette positive (non vuoti): 3742 (39.00%)
File con etichette negative (vuoti): 5854 (61.00%)
Totale file: 9596


In [12]:
import os
import shutil

# Source folders of the original train and val splits.
train_labels_path = "/kaggle/input/dataset-soccer/train/labels"
train_images_path = "/kaggle/input/dataset-soccer/train/images"
val_labels_path = "/kaggle/input/dataset-soccer/val/labels"
val_images_path = "/kaggle/input/dataset-soccer/val/images"
# Destination folders that receive the merged train + val files.
output_labels_path = "/kaggle/working/train_val/labels"
output_images_path = "/kaggle/working/train_val/images"

# Create the destination folders up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_labels_path, exist_ok=True)
os.makedirs(output_images_path, exist_ok=True)

def merge_datasets(train_labels_path, train_images_path, val_labels_path, val_images_path, output_labels_path, output_images_path):
    """Copy the train and val label/image pairs into one pair of output folders.

    For every ``<stem>.txt`` file in a labels folder, the label and the image
    ``<stem>.jpg`` from the matching images folder are copied to the output
    folders under the same file names. The train split is copied first, then
    the val split, so a val file that shares its name with a train file
    overwrites the train copy.

    Args:
        train_labels_path: Folder with the train ``.txt`` label files.
        train_images_path: Folder with the train ``.jpg`` images.
        val_labels_path: Folder with the val ``.txt`` label files.
        val_images_path: Folder with the val ``.jpg`` images.
        output_labels_path: Destination folder for labels (created if missing).
        output_images_path: Destination folder for images (created if missing).

    Raises:
        FileNotFoundError: If the image belonging to a label file is missing.
    """
    os.makedirs(output_labels_path, exist_ok=True)
    os.makedirs(output_images_path, exist_ok=True)

    def copy_pairs(labels_dir, images_dir):
        # Copy every label file of one split together with its image.
        for label_file in os.listdir(labels_dir):
            label_src = os.path.join(labels_dir, label_file)
            # Skip anything that is not a label file (sub-folders, stray files).
            if not label_file.endswith(".txt") or not os.path.isfile(label_src):
                continue
            # Swap only the extension; str.replace would also rewrite a
            # ".txt" that happens to appear in the middle of the name.
            image_file = os.path.splitext(label_file)[0] + ".jpg"
            shutil.copy(label_src, os.path.join(output_labels_path, label_file))
            shutil.copy(os.path.join(images_dir, image_file), os.path.join(output_images_path, image_file))

    copy_pairs(train_labels_path, train_images_path)
    copy_pairs(val_labels_path, val_images_path)

    # Report the folder actually written to instead of a hard-coded path.
    merged_root = os.path.commonpath([os.path.abspath(output_labels_path), os.path.abspath(output_images_path)])
    print(f"Merge completato: dati salvati in {merged_root}.")

# Run the merge with the source and destination folders configured above.
merge_datasets(train_labels_path, train_images_path, val_labels_path, val_images_path, output_labels_path, output_images_path)


Merge completato: dati salvati in /kaggle/working/train_val.


In [14]:
!pip install -U albumentations



In [15]:
import os
import random
import shutil
import albumentations as A
from albumentations import GaussianBlur
from albumentations.augmentations.transforms import RandomBrightnessContrast, HueSaturationValue, Sharpen, ISONoise
from albumentations.core.composition import OneOf
import cv2

# Input: the merged train + val folders.
labels_path = "/kaggle/working/train_val/labels"
images_path = "/kaggle/working/train_val/images"
# Output: folders that receive the balanced (sampled + augmented) dataset.
output_labels_path = "/kaggle/working/train_val_edit/labels"
output_images_path = "/kaggle/working/train_val_edit/images"

# Create the output folders up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(output_labels_path, exist_ok=True)
os.makedirs(output_images_path, exist_ok=True)

def balance_dataset(labels_path, images_path, output_labels_path, output_images_path, num_augmentations=2, seed=None):
    """Build a re-balanced copy of a dataset: sampled negatives + augmented positives.

    Label files (``.txt``) are split into positives (non-empty) and negatives
    (empty). A random sample of negatives, as large as the number of positives
    (or all negatives if there are fewer), is copied to the output together
    with the matching ``.jpg`` images. Every positive is copied as well, plus
    ``num_augmentations`` photometrically augmented variants saved as
    ``aug_<i>_<name>`` with an unchanged copy of the label file (the
    transforms used do not move pixels, so the annotations stay valid).

    The output therefore holds ``(num_augmentations + 1) * P`` positive
    samples and at most ``P`` negative samples, ``P`` being the number of
    positive label files.

    Args:
        labels_path: Folder with the input ``.txt`` label files.
        images_path: Folder with the input ``.jpg`` images.
        output_labels_path: Destination folder for labels (created if missing).
        output_images_path: Destination folder for images (created if missing).
        num_augmentations: Augmented variants generated per positive image.
        seed: Optional seed for the negative sampling. ``None`` keeps using
            the global ``random`` state. The augmentation randomness is not
            controlled by this value.

    Returns:
        None. Progress information is printed.
    """
    if not os.path.isdir(labels_path) or not os.path.isdir(images_path):
        print("Una delle directory specificate non esiste.")
        return

    os.makedirs(output_labels_path, exist_ok=True)
    os.makedirs(output_images_path, exist_ok=True)

    def image_name_for(label_file):
        # Swap only the extension; str.replace would also rewrite a ".txt"
        # that happens to appear in the middle of the name.
        return os.path.splitext(label_file)[0] + ".jpg"

    positive_files = []
    negative_files = []

    # Sorted so that a fixed seed yields the same sample regardless of the
    # arbitrary order in which os.listdir returns the entries.
    for label_file in sorted(os.listdir(labels_path)):
        if label_file.endswith(".txt"):
            label_file_path = os.path.join(labels_path, label_file)
            if os.path.getsize(label_file_path) > 0:
                positive_files.append(label_file)
            else:
                negative_files.append(label_file)

    # Under-sample the negatives. random.sample raises ValueError when asked
    # for more items than the population holds, so cap the sample size.
    rng = random.Random(seed) if seed is not None else random
    sample_size = min(len(positive_files), len(negative_files))
    sampled_negatives = rng.sample(negative_files, sample_size)
    print(f"sample negatives:  {len(sampled_negatives)}")

    for label_file in sampled_negatives:
        shutil.copy(os.path.join(labels_path, label_file), os.path.join(output_labels_path, label_file))
        image_file = image_name_for(label_file)
        shutil.copy(os.path.join(images_path, image_file), os.path.join(output_images_path, image_file))

    # Over-sample the positives with augmentation: exactly one of the
    # transforms below is applied per generated image.
    augmentation = A.Compose([
        OneOf([
            RandomBrightnessContrast(p=0.5),
            HueSaturationValue(p=0.5),
            ISONoise(color_shift=(0.01, 0.05), intensity=(0.1, 0.5), p=0.5),
            GaussianBlur(blur_limit=(3, 7), p=0.5),
            Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=0.5)
        ], p=1),
    ])

    for label_file in positive_files:
        label_file_path = os.path.join(labels_path, label_file)
        image_file = image_name_for(label_file)
        image_file_path = os.path.join(images_path, image_file)

        shutil.copy(label_file_path, os.path.join(output_labels_path, label_file))
        shutil.copy(image_file_path, os.path.join(output_images_path, image_file))

        # cv2.imread signals an unreadable/corrupt file by returning None.
        image = cv2.imread(image_file_path)
        if image is None:
            print(f"[AVVISO] Impossibile leggere l'immagine {image_file_path}: augmentation saltata.")
            continue

        # OpenCV loads BGR while Albumentations' colour transforms assume
        # RGB, so convert before augmenting and convert back before saving.
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        for i in range(num_augmentations):
            aug_rgb = augmentation(image=image_rgb)["image"]

            aug_image_name = f"aug_{i}_{image_file}"
            cv2.imwrite(os.path.join(output_images_path, aug_image_name), cv2.cvtColor(aug_rgb, cv2.COLOR_RGB2BGR))
            # The label content is unchanged, so a plain content copy suffices.
            shutil.copyfile(label_file_path, os.path.join(output_labels_path, f"aug_{i}_{label_file}"))

    print("Bilanciamento completato: dataset bilanciato con augmentation.")

# Build the balanced dataset from the merged folders configured above.
balance_dataset(labels_path, images_path, output_labels_path, output_images_path)


sample negatives:  4646
Bilanciamento completato: dataset bilanciato con augmentation.


In [None]:
import os
import random
import shutil

# Input: the balanced dataset folders.
input_labels_path = "/kaggle/working/train_val_edit/labels"
input_images_path = "/kaggle/working/train_val_edit/images"

# Output: train/val folders of the final dataset.
train_labels_path = "/kaggle/working/dataset-soccer-last/train/labels"
train_images_path = "/kaggle/working/dataset-soccer-last/train/images"
val_labels_path = "/kaggle/working/dataset-soccer-last/val/labels"
val_images_path = "/kaggle/working/dataset-soccer-last/val/images"

# Create the output folders up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(train_labels_path, exist_ok=True)
os.makedirs(train_images_path, exist_ok=True)
os.makedirs(val_labels_path, exist_ok=True)
os.makedirs(val_images_path, exist_ok=True)

def split_dataset(input_labels_path, input_images_path, train_labels_path, train_images_path, val_labels_path, val_images_path, split_ratio=0.8, seed=None):
    """Randomly split label/image pairs into train and validation folders.

    All ``.txt`` files in ``input_labels_path`` are shuffled; the first
    ``int(len(files) * split_ratio)`` go to the train folders and the rest to
    the validation folders. For each label ``<stem>.txt`` the image
    ``<stem>.jpg`` is copied from ``input_images_path`` alongside it.

    NOTE(review): files are split individually, so an original image and
    files derived from it (e.g. names prefixed ``aug_<i>_``) can end up on
    different sides of the split.

    Args:
        input_labels_path: Folder with the ``.txt`` label files to split.
        input_images_path: Folder with the matching ``.jpg`` images.
        train_labels_path: Destination for train labels (created if missing).
        train_images_path: Destination for train images (created if missing).
        val_labels_path: Destination for validation labels (created if missing).
        val_images_path: Destination for validation images (created if missing).
        split_ratio: Fraction of the files assigned to train, within [0, 1].
        seed: Optional seed making the split reproducible. ``None`` keeps
            using the global ``random`` state.

    Raises:
        ValueError: If ``split_ratio`` is outside [0, 1].
        FileNotFoundError: If the image belonging to a label file is missing.
    """
    # A negative ratio would otherwise be read as a negative slice index and
    # silently produce an unrelated split.
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be within [0, 1], got {split_ratio!r}")

    for folder in (train_labels_path, train_images_path, val_labels_path, val_images_path):
        os.makedirs(folder, exist_ok=True)

    # Sorted first so that a fixed seed yields the same split regardless of
    # the arbitrary order in which os.listdir returns the entries.
    all_files = sorted(f for f in os.listdir(input_labels_path) if f.endswith(".txt"))
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(all_files)

    split_index = int(len(all_files) * split_ratio)
    train_files = all_files[:split_index]
    val_files = all_files[split_index:]

    def copy_pairs(label_files, labels_dst, images_dst):
        # Copy each label file and its image into one split's folders.
        for label_file in label_files:
            shutil.copy(os.path.join(input_labels_path, label_file), os.path.join(labels_dst, label_file))
            # Swap only the extension; str.replace would also rewrite a
            # ".txt" that happens to appear in the middle of the name.
            image_file = os.path.splitext(label_file)[0] + ".jpg"
            shutil.copy(os.path.join(input_images_path, image_file), os.path.join(images_dst, image_file))

    copy_pairs(train_files, train_labels_path, train_images_path)
    copy_pairs(val_files, val_labels_path, val_images_path)

    print(f"Dataset diviso: {len(train_files)} file per il train, {len(val_files)} file per il validation.")

def create_zip(zip_path, base_dir):
    """Pack the contents of ``base_dir`` into a ZIP archive.

    Args:
        zip_path: Path of the archive to create. A trailing ``.zip`` is
            optional; the archive is always written with that extension.
        base_dir: Directory whose contents become the root of the archive.

    Returns:
        None. The path of the created archive is printed.
    """
    # shutil.make_archive appends ".zip" itself, so drop it from the end of
    # the name only. str.replace would also strip a ".zip" that appears
    # elsewhere in the path and write the archive to a different location.
    suffix = ".zip"
    base_name = zip_path[:-len(suffix)] if zip_path.endswith(suffix) else zip_path
    shutil.make_archive(base_name, 'zip', base_dir)
    print(f"File ZIP creato: {base_name + suffix}")

# Split the balanced dataset using the function's default split_ratio (0.8).
split_dataset(input_labels_path, input_images_path, train_labels_path, train_images_path, val_labels_path, val_images_path)

# Archive the whole dataset-soccer-last folder (train and val) into a single ZIP.
zip_output_path = "/kaggle/working/dataset-soccer-last.zip"
create_zip(zip_output_path, "/kaggle/working/dataset-soccer-last")


In [None]:
import os

def count_files(directory, extension=None):
    """Count the entries of a directory, optionally filtered by name suffix.

    Args:
        directory: Directory to inspect.
        extension: When truthy, entries whose name ends with this suffix are
            counted (no check that the entry is a regular file). Otherwise
            only regular files are counted.

    Returns:
        The number of matching entries, or 0 (after printing a message) when
        ``directory`` is not an existing directory.
    """
    if os.path.isdir(directory):
        entries = os.listdir(directory)
        if extension:
            # Keep only the names carrying the requested suffix.
            return sum(1 for name in entries if name.endswith(extension))
        # No suffix given: count every regular file in the directory.
        return sum(1 for name in entries if os.path.isfile(os.path.join(directory, name)))

    print(f"La directory {directory} non esiste.")
    return 0

directory = "/kaggle/working/dataset-soccer-last/train/images"  # folder to inspect
extension = None  # no suffix filter: count_files then counts every regular file

file_count = count_files(directory, extension)
print(f"Numero di file nella directory '{directory}': {file_count}")


In [None]:
import os

def count_files(directory, extension=None):
    """Return how many entries of ``directory`` match the selection rule.

    With a truthy ``extension`` the rule is "name ends with ``extension``";
    without one the rule is "entry is a regular file". A missing directory
    prints a message and yields 0.
    """
    if not os.path.isdir(directory):
        print(f"La directory {directory} non esiste.")
        return 0

    def is_counted(name):
        # Suffix match when an extension is requested, regular-file check otherwise.
        if extension:
            return name.endswith(extension)
        return os.path.isfile(os.path.join(directory, name))

    return len(list(filter(is_counted, os.listdir(directory))))

# Folder paths of the final train/val dataset that validate_dataset inspects.
train_labels_path = "/kaggle/working/dataset-soccer-last/train/labels"
train_images_path = "/kaggle/working/dataset-soccer-last/train/images"
val_labels_path = "/kaggle/working/dataset-soccer-last/val/labels"
val_images_path = "/kaggle/working/dataset-soccer-last/val/images"

def validate_dataset():
    """Print image/annotation counts for the train and validation folders.

    Reads the module-level ``train_*_path`` / ``val_*_path`` variables, counts
    ``.txt`` annotations and ``.jpg`` images in each via ``count_files``,
    prints a warning for every split whose two counts differ, and finally
    prints the total number of images across both splits.
    """
    print("Validazione del dataset:")

    n_train_labels = count_files(train_labels_path, ".txt")
    n_train_images = count_files(train_images_path, ".jpg")
    print(f"Train set: {n_train_images} immagini, {n_train_labels} annotazioni.")

    n_val_labels = count_files(val_labels_path, ".txt")
    n_val_images = count_files(val_images_path, ".jpg")
    print(f"Validation set: {n_val_images} immagini, {n_val_labels} annotazioni.")

    # Consistency check: each split should have one annotation per image.
    checks = (
        (n_train_images == n_train_labels, "[AVVISO] Il numero di immagini e annotazioni non corrisponde nel train set."),
        (n_val_images == n_val_labels, "[AVVISO] Il numero di immagini e annotazioni non corrisponde nel validation set."),
    )
    for counts_match, warning in checks:
        if not counts_match:
            print(warning)

    total_files = sum((n_train_images, n_val_images))
    print(f"Totale file (train + validation): {total_files}")

# Report the image/annotation counts of the final dataset and flag any mismatch.
validate_dataset()
