In [1]:
import os
import json
import time

-------------------------------------------------------------------

# **Data Preprocessing**

In [2]:
def SoccerNet_LoadData(root_dir, img_width=1920, img_height=1080):
    """
    Load SoccerNet game folders and collect match, image, and annotation metadata.

    Every sub-directory of ``root_dir`` is treated as one game. For each game the
    function reads ``Labels-GameState.json`` and lists the ``.jpg``/``.png`` files
    found in its ``img1`` folder. Game folders and image files are ordered by the
    integer formed from the digits in their names; names that contain no digit
    are placed last.

    The i-th entry of the label file's ``images`` list is paired with the i-th
    image file in that sorted order to build ``Image Path``.

    Arguments:
    root_dir (str): Path to the root directory containing game folders.
    img_width (int): Divisor applied to ``x_center`` and ``w`` of each bounding box.
    img_height (int): Divisor applied to ``y_center`` and ``h`` of each bounding box.

    Returns:
    match_info (list): One dict per game, holding fields copied from the label
        file's ``info`` section plus the game and image folder paths.
    img_info (list): One dict per entry of the label file's ``images`` list with
        its ID, image path, and parallel lists of bounding boxes
        ``(category_id, x_center, y_center, w, h)``, teams, roles and jersey
        numbers. Annotations whose supercategory is "pitch" are excluded.
    images (list): The raw entries of each label file's ``images`` list.
    """
    def _numeric_sort_key(name):
        # Sort by the integer made of the digits in the name. A name with no
        # digits would make int('') raise ValueError, so it is ranked last.
        digits = ''.join(filter(str.isdigit, name))
        return (0, int(digits)) if digits else (1, 0)

    match_info = []
    img_info = []
    images = []

    # Sort game folders by their numeric part
    game_folders = sorted(
        [folder for folder in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, folder))],
        key=_numeric_sort_key
    )

    for train_folder in game_folders:
        train_folder_path = os.path.join(root_dir, train_folder)
        image_folder_path = os.path.join(train_folder_path, 'img1')
        label_path = os.path.join(train_folder_path, 'Labels-GameState.json')

        # Sort image paths by their numeric part
        image_paths = sorted(
            [img for img in os.listdir(image_folder_path) if img.endswith(('.jpg', '.png'))],
            key=_numeric_sort_key
        )

        # Load the label data; the file is only needed for parsing, so it is
        # closed before the (longer) processing below starts.
        with open(label_path, 'r') as label_file:
            label_data = json.load(label_file)

        # Append match metadata
        info = label_data['info']
        match_info.append(
            {
                'Game Version': info['version'],
                'Game ID': info['game_id'],
                'Game Unique ID': info['id'],
                'Tracklets Count': info['num_tracklets'],
                'Action Position': info['action_position'],
                'Action Class': info['action_class'],
                'Game Folder Path': train_folder_path,
                'Image Folder Path': image_folder_path
            }
        )

        # Group annotations by image in a single pass so that each image can
        # fetch its own annotations directly instead of re-scanning the whole
        # annotation list. Order within each group follows the label file.
        annotations_by_image = {}
        for annotation in label_data['annotations']:
            annotations_by_image.setdefault(annotation["image_id"], []).append(annotation)

        for i, img_data in enumerate(label_data['images']):
            img_id = img_data['image_id']
            images.append(img_data)

            bounding_boxes = []
            teams = []
            roles = []
            jersey_numbers = []

            for annotation in annotations_by_image.get(img_id, ()):
                if annotation['supercategory'] == "pitch":
                    continue

                # Missing box fields default to 0 before normalization.
                bbox_image = annotation.get('bbox_image', {})
                x_center = bbox_image.get('x_center', 0) / img_width
                y_center = bbox_image.get('y_center', 0) / img_height
                w = bbox_image.get('w', 0) / img_width
                h = bbox_image.get('h', 0) / img_height

                attributes = annotation['attributes']
                bounding_boxes.append((annotation['category_id'], x_center, y_center, w, h))
                teams.append(attributes.get('team', None))
                roles.append(attributes.get('role', None))
                jersey_numbers.append(attributes.get('jersey', None))

            # Append image metadata
            img_info.append({
                "Image ID": img_id,
                "Image Path": os.path.join(image_folder_path, image_paths[i]),
                "Bounding Boxes (Image)": bounding_boxes,
                "Teams": teams,
                "Roles": roles,
                "Jersey Numbers": jersey_numbers
            })

    return match_info, img_info, images

In [3]:
# Dataset location (Kaggle input mount)
root = '/kaggle/input/gsr-soccernet-train-set'

# Wall-clock timestamps taken immediately before and after the load
epoch_start_time = time.time()
match_info, img_info, images = SoccerNet_LoadData(root)
epoch_end_time = time.time()

# Report the elapsed time in seconds
time_taken = epoch_end_time - epoch_start_time
print(f"Time taken to process: {time_taken:.2f} seconds")

Time taken to process: 137.82 seconds


In [4]:
# First record of each list returned by the loader; every section is a title
# line, the value, and a 40-dash rule.
print("Match Info (1st entry):", match_info[0], "-"*40, sep="\n")
print("Image Info (1st entry):", img_info[0], "-"*40, sep="\n")
print("Raw Image Data (1st entry):", images[0], "-"*40, sep="\n")

# Keys available in each kind of record
print("Match Info Keys:", match_info[0].keys(), "-"*40, sep="\n")
print("Image Info Keys:", img_info[0].keys(), "-"*40, sep="\n")
print("Image Data Keys:", images[0].keys(), "-"*40, sep="\n")

# "Bounding Boxes (Image)" is a list of tuples:
#   (category_id, x_center, y_center, w, h), with the box values divided by
#   the image width/height inside SoccerNet_LoadData.
print(f"This is a Tuple: {img_info[0]['Bounding Boxes (Image)'][0]}")

# "Teams", "Roles" and "Jersey Numbers" are lists aligned index-by-index with
# "Bounding Boxes (Image)"; an entry is None when the annotation has no value.

Match Info (1st entry):
{'Game Version': '1.3', 'Game ID': '4', 'Game Unique ID': '060', 'Tracklets Count': '26', 'Action Position': '895', 'Action Class': 'Kick-off', 'Game Folder Path': '/kaggle/input/gsr-soccernet-train-set/SNGS-060', 'Image Folder Path': '/kaggle/input/gsr-soccernet-train-set/SNGS-060/img1'}
----------------------------------------
Image Info (1st entry):
{'Image ID': '1060000001', 'Image Path': '/kaggle/input/gsr-soccernet-train-set/SNGS-060/img1/000001.jpg', 'Bounding Boxes (Image)': [(1, 0.4903645833333333, 0.8712962962962963, 0.028645833333333332, 0.15925925925925927), (1, 0.4859375, 0.5888888888888889, 0.016666666666666666, 0.11296296296296296), (1, 0.51171875, 0.5773148148148148, 0.027604166666666666, 0.12314814814814815), (1, 0.6661458333333333, 0.6884259259259259, 0.022916666666666665, 0.13055555555555556), (1, 0.9911458333333333, 0.41712962962962963, 0.015625, 0.09351851851851851), (1, 0.89921875, 0.3384259259259259, 0.011979166666666667, 0.075), (1, 0.804

**Note:** the loaded data does not include the label file's `categories` key, and annotations whose supercategory is `pitch` are skipped (I don't yet know what they are used for).

---------------------------------------------