In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2 as cv
import os
import json
from pathlib import Path
from collections import defaultdict, Counter
import warnings
# Suppress every Python warning for the remainder of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and runtime warnings
# raised by any library are hidden too — consider narrowing it to specific
# warning categories once the noisy source is known.
warnings.filterwarnings('ignore')


[Dataset source](https://doi.org/10.1038/s41597-024-03656-8)

In [None]:
# Global plotting defaults: seaborn grid theme and a wide default figure size
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (15, 8)})


In [None]:
# Locations of the raw YOLO-format data (images and labels, per split)
DATA_ROOT = "../raw_data/Data_YOLO"
TRAIN_IMAGES = DATA_ROOT + "/images/train"
TRAIN_LABELS = DATA_ROOT + "/labels/train"
VAL_IMAGES = DATA_ROOT + "/images/val"
VAL_LABELS = DATA_ROOT + "/labels/val"

# Class names for PCB defects taken from paper section Methods/Defect labeling and dataset partition.
# The position in this list is the YOLO class id.
CLASS_NAMES = dict(enumerate([
    "SH",      # Short
    "SP",      # Spur
    "SC",      # Spurious Copper
    "OP",      # Open
    "MB",      # Mouse Bite
    "HB",      # Hole Breakout
    "CS",      # Conductor Scratch
    "CFO",     # Conductor Foreign Object
    "BMFO",    # Base Material Foreign Object
]))


print(f"Dataset Root: {DATA_ROOT}")

# List each directory once, report its entry count, and keep the count for the total
entry_counts = {}
for heading, directory in (
    ("\nTrain Images", TRAIN_IMAGES),
    ("Train Labels", TRAIN_LABELS),
    ("Val Images", VAL_IMAGES),
    ("Val Labels", VAL_LABELS),
):
    entry_counts[directory] = len(os.listdir(directory))
    print(f"{heading}: {entry_counts[directory]} files")

print(f"\nTotal Images: {entry_counts[TRAIN_IMAGES] + entry_counts[VAL_IMAGES]}")


In [None]:
def parse_yolo_labels(labels_dir, class_names=None, image_ext='.jpg'):
    """
    Parse YOLO format labels into one row per annotation.
    YOLO format: class_id center_x center_y width height (all normalized 0-1)

    Parameters
    ----------
    labels_dir : str
        Directory containing the ``.txt`` label files. Other files are ignored.
    class_names : dict, optional
        Mapping of class id -> display name. Defaults to the notebook-level
        ``CLASS_NAMES``. Ids missing from the mapping are named ``class_<id>``.
    image_ext : str, optional
        Extension used to derive the image filename from the label filename.

    Returns
    -------
    pandas.DataFrame
        Columns: filename, class_id, class_name, center_x, center_y,
        bbox_width, bbox_height, bbox_area. The columns are present even when
        no annotation is found, so downstream column lookups keep working.

    Raises
    ------
    ValueError
        If a non-blank line does not start with an integer class id followed
        by four numeric fields; the message names the file and line number.
    """
    if class_names is None:
        class_names = CLASS_NAMES

    columns = [
        'filename', 'class_id', 'class_name', 'center_x', 'center_y',
        'bbox_width', 'bbox_height', 'bbox_area',
    ]
    annotations = []

    # os.listdir order is arbitrary; sorting keeps the row order reproducible
    for label_file in sorted(os.listdir(labels_dir)):
        if not label_file.endswith('.txt'):
            continue

        label_path = os.path.join(labels_dir, label_file)
        # Swap only the final extension; str.replace would also rewrite a
        # '.txt' that appears in the middle of the file name
        image_name = os.path.splitext(label_file)[0] + image_ext

        with open(label_path, 'r') as f:
            for line_no, line in enumerate(f, start=1):
                parts = line.split()
                if not parts:
                    continue  # blank line

                try:
                    class_id = int(parts[0])
                    # Only the first four values after the class id are used
                    center_x, center_y, width, height = map(float, parts[1:5])
                except ValueError as exc:
                    raise ValueError(
                        f"Malformed YOLO annotation in {label_path} "
                        f"(line {line_no}): {line.strip()!r}"
                    ) from exc

                annotations.append({
                    'filename': image_name,
                    'class_id': class_id,
                    'class_name': class_names.get(class_id, f"class_{class_id}"),
                    'center_x': center_x,
                    'center_y': center_y,
                    'bbox_width': width,
                    'bbox_height': height,
                    'bbox_area': width * height
                })

    return pd.DataFrame(annotations, columns=columns)

# Build one annotation table per split, tagging every row with its split name
train_df = parse_yolo_labels(TRAIN_LABELS).assign(split='train')
val_df = parse_yolo_labels(VAL_LABELS).assign(split='val')

# Stack both splits into a single frame with a fresh 0..n-1 index
all_annotations = pd.concat([train_df, val_df], ignore_index=True)

print(f"\n{len(all_annotations)} total annotations")
for split_label, split_frame in (("Train", train_df), ("Val", val_df)):
    print(f" - {split_label}: {len(split_frame)} annotations")
print(f"\nDataset shape: {all_annotations.shape}")

# Last expression of the cell: rich preview of the combined table
all_annotations.head()


## CLASS DISTRIBUTION ANALYSIS

In [None]:
# Overall class distribution (annotation counts, most frequent class first)
class_counts = all_annotations['class_name'].value_counts()
print("\n Overall Class Distribution:")
print(class_counts)

# Per-class annotation counts broken down by split: one row per class, one
# column per split. This table was previously never built, so the second panel
# raised NameError on a fresh kernel. Rows follow the same order as
# `class_counts` so both panels line up.
split_class_dist = (
    all_annotations
    .groupby(['class_name', 'split'])
    .size()
    .unstack(fill_value=0)
    .reindex(class_counts.index)
)

# Visualize class distribution
fig, axes = plt.subplots(1, 2, figsize=(18, 6))

# Overall distribution
ax1 = axes[0]
class_counts.plot(kind='bar', ax=ax1, color='skyblue', edgecolor='black')
ax1.set_title('Overall Class Distribution', fontsize=16, fontweight='bold')
ax1.set_xlabel('Class Name', fontsize=12)
ax1.set_ylabel('Count', fontsize=12)
ax1.tick_params(axis='x', rotation=45)
ax1.grid(axis='y', alpha=0.3)

# Add value labels on bars (offset is in data units, i.e. annotation counts)
for i, v in enumerate(class_counts):
    ax1.text(i, v + 50, str(v), ha='center', va='bottom', fontweight='bold')

# Distribution by split
ax2 = axes[1]
split_class_dist.plot(kind='bar', ax=ax2, edgecolor='black')
ax2.set_title('Class Distribution by Train/Val Split', fontsize=16, fontweight='bold')
ax2.set_xlabel('Class Name', fontsize=12)
ax2.set_ylabel('Count', fontsize=12)
ax2.tick_params(axis='x', rotation=45)
ax2.legend(title='Split')
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Class imbalance analysis
# value_counts() sorts descending, so the first/last index entries are the
# most/least frequent classes.
print("\n‚öñÔ∏è Class Imbalance Analysis:")
max_count = class_counts.max()
min_count = class_counts.min()
imbalance_ratio = max_count / min_count
print(f"Most common class: {class_counts.index[0]} ({class_counts.iloc[0]} samples)")
print(f"Least common class: {class_counts.index[-1]} ({class_counts.iloc[-1]} samples)")
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")


In [None]:
# BOUNDING BOX STATISTICS
banner = "=" * 60
print(banner)
print("BOUNDING BOX STATISTICS")
print(banner)

# Summary statistics for each normalized box measurement
print("\nüìê Bounding Box Dimensions (normalized):")
for heading, column in (
    ("Width", 'bbox_width'),
    ("Height", 'bbox_height'),
    ("Area", 'bbox_area'),
):
    print(f"\n{heading} statistics:")
    print(all_annotations[column].describe())

# Annotation count per (image, split) pair; only images that have at least one
# annotation row appear here
objects_per_image = (
    all_annotations
    .groupby(['filename', 'split'])
    .size()
    .reset_index(name='count')
)
print(f"\nüì¶ Objects per Image Statistics:")
print(objects_per_image['count'].describe())

# 2x2 overview figure
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Panels 1 and 2: histograms of box width and height with the mean marked
histogram_panels = (
    (axes[0, 0], 'bbox_width', 'Width', 'coral'),
    (axes[0, 1], 'bbox_height', 'Height', 'lightgreen'),
)
for panel, column, dimension, fill in histogram_panels:
    values = all_annotations[column]
    mean_value = values.mean()
    values.hist(bins=50, ax=panel, color=fill, edgecolor='black', alpha=0.7)
    panel.set_title(f'Distribution of Bounding Box {dimension}s', fontsize=14, fontweight='bold')
    panel.set_xlabel(f'{dimension} (normalized)')
    panel.set_ylabel('Frequency')
    panel.axvline(mean_value, color='red', linestyle='--', linewidth=2,
                  label=f"Mean: {mean_value:.3f}")
    panel.legend()
    panel.grid(alpha=0.3)

# Panel 3: width against height, coloured by class id, with the 1:1 diagonal
shape_panel = axes[1, 0]
scatter = shape_panel.scatter(
    all_annotations['bbox_width'],
    all_annotations['bbox_height'],
    c=all_annotations['class_id'],
    cmap='tab10',
    alpha=0.5,
    s=10,
)
shape_panel.set_title('Bounding Box Aspect Ratios', fontsize=14, fontweight='bold')
shape_panel.set_xlabel('Width (normalized)')
shape_panel.set_ylabel('Height (normalized)')
shape_panel.plot([0, 1], [0, 1], 'r--', label='Square (1:1)')
shape_panel.legend()
shape_panel.grid(alpha=0.3)
cbar = plt.colorbar(scatter, ax=shape_panel)
cbar.set_label('Class ID')

# Panel 4: how many annotated objects each image carries
count_panel = axes[1, 1]
per_image_counts = objects_per_image['count']
mean_objects = per_image_counts.mean()
per_image_counts.hist(bins=30, ax=count_panel, color='plum', edgecolor='black', alpha=0.7)
count_panel.set_title('Distribution of Objects per Image', fontsize=14, fontweight='bold')
count_panel.set_xlabel('Number of Objects')
count_panel.set_ylabel('Number of Images')
count_panel.axvline(mean_objects, color='red', linestyle='--', linewidth=2,
                    label=f"Mean: {mean_objects:.2f}")
count_panel.legend()
count_panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nüìä Objects per Image by Split:")
print(objects_per_image.groupby('split')['count'].describe())


In [None]:
# VISUALIZE SAMPLE IMAGES WITH ANNOTATIONS
section_rule = "=" * 60
print(section_rule)
print("SAMPLE IMAGES WITH ANNOTATIONS")
print(section_rule)

def draw_yolo_boxes(image_path, label_path, class_names):
    """
    Draw bounding boxes on image from YOLO format labels

    Parameters
    ----------
    image_path : str
        Path of the image to load.
    label_path : str
        Path of the matching YOLO label file. If it does not exist the image
        is returned without any boxes.
    class_names : dict
        Mapping of class id -> label text. Ids missing from the mapping are
        labelled ``Class <id>``.

    Returns
    -------
    numpy.ndarray
        The image in RGB channel order with one rectangle and one text label
        drawn per non-blank line of the label file.

    Raises
    ------
    FileNotFoundError
        If the image cannot be read.
    """
    # Read image; cv.imread signals failure by returning None, not by raising
    img = cv.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
    h, w = img.shape[:2]

    # Read labels
    if not os.path.exists(label_path):
        return img

    with open(label_path, 'r') as f:
        lines = f.readlines()

    # Define colors for each class (RGB). Ten distinct entries so that the
    # nine defect classes each get their own colour; the modulo below only
    # wraps for class ids beyond the palette.
    colors = [
        (255, 0, 0), (0, 255, 0), (0, 0, 255),
        (255, 255, 0), (255, 0, 255), (0, 255, 255),
        (255, 128, 0), (128, 0, 255), (0, 128, 128),
        (128, 128, 128)
    ]

    for line in lines:
        if line.strip():
            parts = line.strip().split()
            class_id = int(parts[0])
            # Scale normalized coordinates to pixels
            center_x = float(parts[1]) * w
            center_y = float(parts[2]) * h
            bbox_w = float(parts[3]) * w
            bbox_h = float(parts[4]) * h

            # Convert to corner coordinates
            x1 = int(center_x - bbox_w / 2)
            y1 = int(center_y - bbox_h / 2)
            x2 = int(center_x + bbox_w / 2)
            y2 = int(center_y + bbox_h / 2)

            # Draw rectangle
            color = colors[class_id % len(colors)]
            cv.rectangle(img, (x1, y1), (x2, y2), color, 2)

            # Draw label above the box; when the box touches the top edge the
            # text would fall outside the image, so place it just inside the
            # box instead
            label = class_names.get(class_id, f"Class {class_id}")
            text_y = y1 - 10 if y1 - 10 > 10 else y1 + 15
            cv.putText(img, label, (x1, text_y), cv.FONT_HERSHEY_SIMPLEX,
                      0.5, color, 2)

    return img

# Sample images from each class
samples_per_class = 2
SAMPLE_SEED = 42  # fixed seed so Restart & Run All picks the same images every time

# Image/label directories keyed by the `split` value stored on each annotation.
# Using the recorded split avoids guessing the directory by probing the
# filesystem, which picks the wrong file when a name exists in both splits.
split_dirs = {
    'train': (TRAIN_IMAGES, TRAIN_LABELS),
    'val': (VAL_IMAGES, VAL_LABELS),
}

# squeeze=False keeps `axes` two-dimensional for any number of rows/columns
fig, axes = plt.subplots(len(CLASS_NAMES), samples_per_class, figsize=(12, 20), squeeze=False)

# Hide every panel up front so that classes with fewer than
# `samples_per_class` images leave blank space instead of empty axes
for ax in axes.ravel():
    ax.axis('off')

for idx, (class_id, class_name) in enumerate(CLASS_NAMES.items()):
    # Distinct (image, split) pairs that contain at least one box of this class
    candidates = (
        all_annotations
        .loc[all_annotations['class_id'] == class_id, ['filename', 'split']]
        .drop_duplicates()
    )
    if candidates.empty:
        continue

    # Take up to samples_per_class samples, without replacement
    selected_samples = candidates.sample(
        n=min(samples_per_class, len(candidates)),
        random_state=SAMPLE_SEED,
    )

    for col, (filename, split) in enumerate(selected_samples.itertuples(index=False, name=None)):
        images_dir, labels_dir = split_dirs[split]
        img_path = os.path.join(images_dir, filename)
        lbl_path = os.path.join(labels_dir, os.path.splitext(filename)[0] + '.txt')

        # Draw boxes
        img_with_boxes = draw_yolo_boxes(img_path, lbl_path, CLASS_NAMES)

        # Display
        ax = axes[idx, col]
        ax.imshow(img_with_boxes)
        ax.set_title(f"{class_name}\n{filename}", fontsize=10)
        ax.axis('off')

plt.tight_layout()
plt.show()

print("‚úÖ Sample images displayed with bounding boxes")


In [None]:
# SUMMARY AND INSIGHTS
divider = "=" * 60
print(divider)
print("DATASET SUMMARY & INSIGHTS")
print(divider)

# Image statistics: list each image directory once and reuse the listing
train_image_files = os.listdir(TRAIN_IMAGES)
val_image_files = os.listdir(VAL_IMAGES)

print("\nüìä Image Statistics:")
print(f"Total images: {len(train_image_files) + len(val_image_files)}")
print(f"  - Train: {len(train_image_files)}")
print(f"  - Val: {len(val_image_files)}")

# Load one image (first directory entry) to report its dimensions
sample_img_path = os.path.join(TRAIN_IMAGES, train_image_files[0])
sample_img = cv.imread(sample_img_path)
img_height, img_width, img_channels = sample_img.shape[0], sample_img.shape[1], sample_img.shape[2]
print(f"\nImage dimensions (H x W): {img_height} x {img_width}")
print(f"Image channels: {img_channels}")

# Annotation statistics per class
print("\nüìà Detailed Class Statistics:")
per_class_aggregations = {
    'bbox_width': ['mean', 'std', 'min', 'max'],
    'bbox_height': ['mean', 'std', 'min', 'max'],
    'bbox_area': ['mean', 'std'],
    'filename': 'count',
}
class_stats = all_annotations.groupby('class_name').agg(per_class_aggregations).round(4)
# Flatten the two-level column index into "<column>_<statistic>" names
class_stats.columns = ['_'.join(pair).strip() for pair in class_stats.columns.to_flat_index()]
class_stats = class_stats.rename(columns={'filename_count': 'total_annotations'})
print(class_stats)

# Check for images with no annotations: directory entries whose name never
# appears in the annotation table
all_images_train = set(train_image_files)
all_images_val = set(val_image_files)
annotated_images = set(all_annotations['filename'].unique())

images_no_annotations = all_images_train.union(all_images_val).difference(annotated_images)
print(f"\n‚ö†Ô∏è Images without annotations: {len(images_no_annotations)}")

# Visualize class statistics
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
size_ax, share_ax, split_ax = axes[0], axes[1], axes[2]

# Average bbox size by class
class_bbox_sizes = all_annotations.groupby('class_name')[['bbox_width', 'bbox_height']].mean()
class_bbox_sizes.plot(kind='bar', ax=size_ax, color=['skyblue', 'lightcoral'])
size_ax.set_title('Average Bounding Box Size by Class', fontsize=14, fontweight='bold')
size_ax.set_xlabel('Class')
size_ax.set_ylabel('Size (normalized)')
size_ax.tick_params(axis='x', rotation=45)
size_ax.legend(['Width', 'Height'])
size_ax.grid(axis='y', alpha=0.3)

# Class distribution pie chart
class_counts.plot(kind='pie', ax=share_ax, autopct='%1.1f%%', startangle=90)
share_ax.set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')
share_ax.set_ylabel('')

# Train/Val split
split_counts = all_annotations.groupby('split').size()
split_counts.plot(kind='bar', ax=split_ax, color=['steelblue', 'orange'])
split_ax.set_title('Train/Val Split', fontsize=14, fontweight='bold')
split_ax.set_xlabel('Split')
split_ax.set_ylabel('Number of Annotations')
split_ax.tick_params(axis='x', rotation=0)
split_ax.grid(axis='y', alpha=0.3)

# Add value labels just above each bar
for bar_position, annotation_total in enumerate(split_counts):
    split_ax.text(bar_position, annotation_total + 100, str(annotation_total),
                  ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("\n‚úÖ EDA Complete!")
