In [1]:
# Print a fixed marker string.
print("ok")

ok


In [None]:
'''
Only first time I need that
'''
# Import library
import gdown

# Gdrive link
gdrive_url = "https://drive.google.com/file/d/1ECfl3dtYyfivY8kYPq7RHUBTjC-2vf61/view?usp=share_link"

# ID of the file you want to download: the path segment that follows "/d/".
# Anchoring on the "d" segment (instead of counting segments from the end)
# keeps the result correct for share links without a trailing "/view...";
# any query string glued to the ID is cut off as well.
url_segments = gdrive_url.split('/')
file_id = url_segments[url_segments.index('d') + 1].split('?')[0]
file_id

In [None]:
# Direct-download endpoint. The query string begins right after "?", so the
# first parameter must be written "export=download"; a "/" placed directly
# after the "?" would turn the parameter name into "/export".
prefix = 'https://drive.google.com/uc?export=download&id='

# Full download URL and the local path the archive is saved to
url = prefix + file_id
output = "../datasets/waste-material-dataset.zip"

In [None]:
def download_file(file_url, output):
    """Download a file with gdown and print whether it succeeded.

    Args:
        file_url: URL handed to ``gdown.download``.
        output: Destination path handed to ``gdown.download``.

    Returns:
        None. The outcome is only reported through printed messages; any
        exception raised by the download is caught and printed.
    """
    try:
        # Download the file
        saved_path = gdown.download(file_url, output)
    except Exception as e:
        print('An error occurred:', e)
        return

    # Some gdown releases signal a failed download by returning None instead
    # of raising, so a missing path must not be reported as success.
    if saved_path is None:
        print('An error occurred: gdown did not return a downloaded file path.')
        return

    print('File downloaded successfully.')

In [None]:
# Fetch the archive from `url` and store it at `output` (both set in an earlier cell).
download_file(url, output)

In [None]:
'''
Only first time I need that
'''
# Extract zip file
from zipfile import ZipFile

def extract_zip_files(file_name, unzip_path):
    """Unpack every member of a zip archive into a target directory.

    Args:
        file_name: Path of the zip archive to read.
        unzip_path: Directory that receives the extracted members.

    Progress is reported with printed messages. Any exception raised while
    opening or extracting the archive is caught and printed; nothing is
    returned or re-raised.
    """
    try:
        # Read mode is ZipFile's default, so it is not spelled out here
        with ZipFile(file_name) as archive:
            print('Extract all the files...')
            archive.extractall(unzip_path)
            print(f"Successfully extracted to {unzip_path}")
    except Exception as err:
        print(f"Extracting file error: {err}")

In [None]:
# Only needed the first time (one-off dataset setup).
file_name = "../datasets/waste-material-dataset.zip"  # archive to unpack
unzip_path = "../datasets"                            # extraction target

extract_zip_files(file_name=file_name, unzip_path=unzip_path)

# My plan for working with the dataset

### 1. Understanding the Dataset
### 2. EDA
### 3. Image Processing on Bounding Box
### 4. Model Training / Apply Transfer Learning
### 5. Model Validation, and Evaluation
### 6. Model Selection
### 7. Model Testing & Export
### 8. Model Monitoring and Maintaining

In [2]:
'''
Start here
'''
# Importing libraries
import numpy as np
import os
from collections import Counter, defaultdict

# Data visualization libraries
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
from tqdm import tqdm

# Disable python warnings
import warnings
warnings.filterwarnings("ignore")

# CV & image processing libraries
from PIL import Image

In [3]:
# Image and label directories of both splits, all located under one root
dataset_root = "../datasets"
train_image_dir = f"{dataset_root}/train/images"
train_label_dir = f"{dataset_root}/train/labels"
valid_image_dir = f"{dataset_root}/valid/images"
valid_label_dir = f"{dataset_root}/valid/labels"

In [4]:
# Load yaml file
import yaml

# Parse the dataset description file.
# NOTE: data.yaml comes out of a downloaded archive, so it is parsed with
# safe_load, which only builds plain Python containers and scalars instead
# of letting the file construct arbitrary objects.
with open("../datasets/data.yaml", "r") as f:
    dataset = yaml.safe_load(f)

dataset

{'train': '../datasets/train/images',
 'val': '../datasets/valid/images',
 'nc': 13,
 'names': ['banana',
  'chilli',
  'drinkcan',
  'drinkpack',
  'foodcan',
  'lettuce',
  'paperbag',
  'plasticbag',
  'plasticbottle',
  'plasticcontainer',
  'sweetpotato',
  'teabag',
  'tissueroll']}

## 1. Understanding the Dataset

In [None]:

def explain_dataset(image_dir, label_dir, class_names, num_classes):
    """Print summary statistics for one YOLO-format dataset split.

    Every file in ``image_dir`` with a .jpg/.png/.jpeg extension is paired
    with ``<label_dir>/<image stem>.txt``. Each non-blank label line is parsed
    as ``class x_center y_center width height``.

    Args:
        image_dir: Directory containing the image files.
        label_dir: Directory containing the label (.txt) files.
        class_names: Sequence of class names indexed by class id.
        num_classes: Number of class ids (0 .. num_classes - 1) to report.

    Returns:
        None. All statistics are printed.

    Raises:
        ValueError: If a non-blank label line does not hold exactly five
            numeric values.
    """
    # Storage variables
    image_count = 0
    label_count = 0
    image_not_count = 0

    images_per_class = defaultdict(int)
    instances_per_class = defaultdict(int)

    missing_label_files = []
    empty_label_files = []
    images_without_annotations = []

    total_bboxes = 0
    bbox_areas = []

    for img_file in os.listdir(image_dir):
        if not img_file.lower().endswith((".jpg", ".png", ".jpeg")):
            image_not_count += 1
            continue

        image_count += 1
        image_name = os.path.splitext(img_file)[0]
        label_file = os.path.join(label_dir, f"{image_name}.txt")

        # Check if label file exists
        if not os.path.exists(label_file):
            missing_label_files.append(img_file)
            images_without_annotations.append(img_file)
            continue
        label_count += 1

        # Read the annotations. Blank or whitespace-only lines (for example a
        # trailing newline) carry no box and would break the 5-value unpack.
        with open(label_file, "r") as f:
            lines = [line for line in f if line.strip()]

        if not lines:
            empty_label_files.append(img_file)
            images_without_annotations.append(img_file)
            continue

        # Count objects
        total_bboxes += len(lines)

        # Count instances, remembering which classes occur in this image
        classes_in_image = set()
        for line in lines:
            cls, x, y, w, h = map(float, line.split())
            cls = int(cls)

            instances_per_class[cls] += 1
            classes_in_image.add(cls)

            # Calculate bbox area
            bbox_areas.append(w * h)

        # An image counts once per class, however many boxes of it it holds
        for cls in classes_in_image:
            images_per_class[cls] += 1

    # DISPLAY STATISTICS
    print("\n====== DATASET STATISTICS ======\n")
    print("Total images:", image_count)
    print("Total label files:", label_count)
    print("Total bounding boxes:", total_bboxes)
    print("Average bounding-box size (normalized area):",
        np.mean(bbox_areas) if bbox_areas else 0)

    print("\n=== IMAGES PER CLASS ===")
    for cls in range(num_classes):
        print(f"{cls} ({class_names[cls]}): {images_per_class[cls]} images")

    print("\n=== INSTANCES PER CLASS ===")
    for cls in range(num_classes):
        print(f"{cls} ({class_names[cls]}): {instances_per_class[cls]} instances")

    print("\nImages without annotation labels:", len(images_without_annotations))
    print("Missing label files:", len(missing_label_files))
    print("Empty label files:", len(empty_label_files))
    print("Images other formats:", image_not_count)

In [None]:
# Class names come from the loaded data.yaml; the count is derived from them
class_names = dataset["names"]
num_classes = len(class_names)

# Summarize the train split
explain_dataset(
    image_dir=train_image_dir,
    label_dir=train_label_dir,
    class_names=class_names,
    num_classes=num_classes,
)

## Train Dataset Observations:

- I'm using YOLO format to understand and analyze the dataset.
- The dataset contains two splits: train and valid.
- Each split contains two folders:
    - images/ → original images
    - labels/ → YOLO annotation files
- Each annotation file contains one line per object in the format:
    - `class x_center y_center width height`, with the four box values normalized.

## Understanding Images and Annotations

- Each image may contain one or more annotations, but in this dataset most images contain exactly one object.
- Each annotation is linked to only one image through the same filename.
- Class IDs range from 0 to 12, mapped to 13 categories like "banana", "chilli", "drinkcan", etc.
- The dataset appears to have one instance per image (based on total images = 919 and total instances = 1180, with very few multi-object images).

## Parsed and Computed Statistics

- Loaded all images and their corresponding YOLO annotation files.

- Counted:
    - Total number of images
    - Total number of labels (same count → perfectly consistent)
    - Total number of bounding boxes (1180)
    - Bounding-box sizes
    - Objects per image
    - Instances per class
    - Images per class
- Verified data consistency:
    - No missing labels
    - No empty label files
    - No images without annotations
    - No files in other formats (everything is .jpg, .jpeg or .png)
- Bounding box statistics:
    - Average normalized bbox area = 0.3129
    (meaning boxes are generally large relative to image size)

## Class Distribution

- Extracted category IDs and counted occurrences of each class.
- The dataset has 13 object categories:
    - The largest class is sweetpotato (120 instances).
    - The smallest class is chilli (71 instances).
- Class distribution is relatively balanced, with no extremely rare classes.

## Counting Objects Per Image

- Computed the number of objects per image for the train split.
- Most images contain a single object, since total instances (1180) is only slightly higher than total images (919).
- The average number of objects per image ≈ 1.28.

## Conclusions:

- The dataset contains two YOLO-format splits: train and valid.
- Directory structure is consistent, containing:
    - images/
    - labels/
    - data.yaml defining the 13 class names.
- All images have corresponding label files:
    - 0 missing labels
    - 0 empty labels
    - 0 unannotated images
    - 0 files in other formats
    - This indicates a clean and well-prepared dataset.
- Each annotation follows YOLO format with class IDs and normalized bounding boxes.
- The dataset contains:
    - 919 images
    - 919 label files
    - 1180 object instances
- Class distribution is balanced overall:
    - Highest: sweetpotato (120)
    - Lowest: chilli (71)
- Bounding boxes are generally large, with an average normalized area of ~0.31.
- On average, each image contains 1.28 objects, meaning the dataset is primarily single-object per image.
- Category names match the 13 waste/food-related classes:
banana, chilli, drinkcan, drinkpack, foodcan, lettuce, paperbag, plasticbag, plasticbottle, plasticcontainer, sweetpotato, teabag, tissueroll.

In [None]:
# Summarize the valid split with the same class list
explain_dataset(
    image_dir=valid_image_dir,
    label_dir=valid_label_dir,
    class_names=class_names,
    num_classes=num_classes,
)

## Valid Dataset Observations:

- I'm using YOLO format to understand and analyze the dataset.
- The dataset contains two splits: train and valid.
- Each split contains two folders:
    - images/ → original images
    - labels/ → YOLO annotation files
- Each annotation file contains one line per object in the format:
    - `class x_center y_center width height`, with the four box values normalized.

## Understanding Images and Annotations

- Each image may contain one or more annotations, but in this dataset most images contain exactly one object.
- Each annotation is linked to only one image through the same filename.
- Class IDs range from 0 to 12, mapped to 13 categories like "banana", "chilli", "drinkcan", etc.
- The dataset appears to have one instance per image (based on total images = 459 and total instances = 702, with very few multi-object images).

## Parsed and Computed Statistics

- Loaded all images and their corresponding YOLO annotation files.

- Counted:
    - Total number of images
    - Total number of labels (same count → perfectly consistent)
    - Total number of bounding boxes (702)
    - Bounding-box sizes
    - Objects per image
    - Instances per class
    - Images per class
- Verified data consistency:
    - No missing labels
    - No empty label files
    - No images without annotations
    - No files in other formats (everything is .jpg, .jpeg or .png)
- Bounding box statistics:
    - Average normalized bbox area = 0.2643
    (meaning boxes are generally large relative to image size)

## Class Distribution

- Extracted category IDs and counted occurrences of each class.
- The dataset has 13 object categories:
    - The largest class is tissueroll (63 instances).
    - The smallest class is plasticbag (40 instances).
- Class distribution is relatively balanced, with no extremely rare classes.

## Counting Objects Per Image

- Computed the number of objects per image for the valid split.
- Most images contain a single object, since total instances (702) is only slightly higher than total images (459).
- The average number of objects per image ≈ 1.52.

## Conclusions:

- The dataset contains two YOLO-format splits: train and valid.
- Directory structure is consistent, containing:
    - images/
    - labels/
    - data.yaml defining the 13 class names.
- All images have corresponding label files:
    - 0 missing labels
    - 0 empty labels
    - 0 unannotated images
    - 0 files in other formats
    - This indicates a clean and well-prepared dataset.
- Each annotation follows YOLO format with class IDs and normalized bounding boxes.
- The dataset contains:
    - 459 images
    - 459 label files
    - 702 object instances
- Class distribution is balanced overall:
    - Highest: tissueroll (63)
    - Lowest: plasticbag (40)
- Bounding boxes are generally large, with an average normalized area of ~0.26.
- On average, each image contains 1.52 objects, meaning the dataset is primarily single-object per image.
- Category names match the 13 waste/food-related classes:
banana, chilli, drinkcan, drinkpack, foodcan, lettuce, paperbag, plasticbag, plasticbottle, plasticcontainer, sweetpotato, teabag, tissueroll.

## 2. EDA

In [None]:
def parse_dataset(image_dir, label_dir, class_names, num_classes):
    """Set up containers for parsing one dataset split (unfinished).

    NOTE(review): as written, the function only builds each image's path and
    the matching label path and then discards them. None of the containers
    below is filled, ``class_names`` and ``num_classes`` are never used, and
    nothing is returned (the call evaluates to None). Presumably this was
    meant to become a function form of the parsing loop in the EDA cell
    further down; confirm before relying on it.

    Args:
        image_dir: Directory whose files are listed.
        label_dir: Directory used to build the expected label paths.
        class_names: Unused.
        num_classes: Unused.
    """
    
    # Storage variables
    image_info = []
    annotations = []
    image_not_count = 0  # files skipped because of their extension

    objects_per_image = Counter()
    instances_per_class = Counter()
    co_occurrence = defaultdict(lambda: Counter())

    missing_labels = []
    empty_annotations = []
    bbox_areas = []
    bbox_widths = []
    bbox_heights = []
    bbox_centers = []
    
    for img_file in os.listdir(image_dir):
        # Skip (and count) anything without a .jpg/.png/.jpeg extension
        if not img_file.lower().endswith((".jpg", ".png", ".jpeg")):
            image_not_count += 1
            continue
        
        # Label path: same file name with the extension replaced by .txt
        img_path = os.path.join(image_dir, img_file)
        label_path = os.path.join(label_dir, img_file.rsplit(".",1)[0] + ".txt")

In [None]:
# ============================================================
# YOLO DATASET EXPLORATORY DATA ANALYSIS (EDA)
# ============================================================

import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from tqdm import tqdm

# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
# Root folder that holds the train/ and valid/ splits. It has to be the
# directory the archive was extracted into, which is also where the earlier
# cells read data.yaml and the split folders from.
dataset_path = "../datasets"
splits = ["train", "valid"]

# Class names indexed by class id
class_names = [
    'banana','chilli','drinkcan','drinkpack','foodcan','lettuce',
    'paperbag','plasticbag','plasticbottle','plasticcontainer',
    'sweetpotato','teabag','tissueroll'
]
num_classes = len(class_names)

# ------------------------------------------------------------
# STORAGE
# ------------------------------------------------------------
# Per-image and per-annotation records
image_info = list()       # (file_path, width, height)
annotations = list()      # (image_path, class, x_center, y_center, w, h)

# Counters keyed by image path / class id
objects_per_image, instances_per_class = Counter(), Counter()
# class id -> Counter of the other class ids seen in the same image
co_occurrence = defaultdict(Counter)

# Images with a problem on the label side
missing_labels, empty_annotations = [], []

# One entry per bounding box
bbox_areas, bbox_widths, bbox_heights, bbox_centers = [], [], [], []

# =============================================================
# PARSE DATASET
# =============================================================
print("Parsing dataset...")

for split in splits:
    img_dir = os.path.join(dataset_path, split, "images")
    label_dir = os.path.join(dataset_path, split, "labels")

    for img_file in tqdm(os.listdir(img_dir)):
        # Only files with an image extension are considered
        is_image = img_file.lower().endswith((".jpg", ".png", ".jpeg"))
        if not is_image:
            continue

        img_path = os.path.join(img_dir, img_file)
        label_stem = img_file.rsplit(".", 1)[0]
        label_path = os.path.join(label_dir, label_stem + ".txt")

        # Image resolution; files OpenCV cannot read are skipped entirely
        img = cv2.imread(img_path)
        if img is None:
            continue
        h, w = img.shape[:2]
        image_info.append((img_path, w, h))

        # Image without a label file
        if not os.path.exists(label_path):
            missing_labels.append(img_path)
            continue

        # Keep the stripped, non-blank label lines
        with open(label_path, "r") as f:
            lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

        # Label file without any annotation
        if not lines:
            empty_annotations.append(img_path)
            continue

        objects_per_image[img_path] = len(lines)

        # Classes present in this image, collected for the co-occurrence count
        present_classes = set()

        for line in lines:
            cls, xc, yc, bw, bh = (float(value) for value in line.split())
            cls = int(cls)

            annotations.append((img_path, cls, xc, yc, bw, bh))
            instances_per_class[cls] += 1
            present_classes.add(cls)

            bbox_areas.append(bw * bh)
            bbox_widths.append(bw)
            bbox_heights.append(bh)
            bbox_centers.append((xc, yc))

        # Every ordered pair of distinct classes in the image counts once
        for c1 in present_classes:
            for c2 in present_classes - {c1}:
                co_occurrence[c1][c2] += 1


# =================================================================
# 1. CLASS DISTRIBUTION AND PERCENTAGE ANALYSIS
# =================================================================
print("\n=== CLASS DISTRIBUTION ===")

# Instance count of every class id, in class-id order
class_counts = [instances_per_class[class_id] for class_id in range(num_classes)]

plt.figure(figsize=(12, 6))
sns.barplot(x=list(class_names), y=class_counts, palette="viridis")
plt.xticks(rotation=45)
plt.title("Instances per Class")
plt.show()

# Share of each class among all annotated objects, in percent
total_objects = sum(instances_per_class.values())
percentages = {
    name: count / total_objects * 100
    for name, count in zip(class_names, class_counts)
}

plt.figure(figsize=(8, 8))
plt.pie(percentages.values(), labels=percentages.keys(), autopct="%1.1f%%")
plt.title("Class Percentage Distribution")
plt.show()

print(percentages)


# =================================================================
# 2. OBJECT FREQUENCY PER IMAGE
# =================================================================
# One entry per image that has at least one annotation
obj_counts = [count for count in objects_per_image.values()]

plt.figure(figsize=(8, 5))
sns.histplot(obj_counts, kde=True, bins=10)
plt.xlabel("Number of objects")
plt.title("Objects Per Image Distribution")
plt.show()

avg_objects = np.mean(obj_counts)
print("Average objects per image:", avg_objects)


# =================================================================
# 3. BBOX SIZE & LOCATION DISTRIBUTION
# =================================================================

# Area distribution
# Area, width and height histograms share the same layout: one figure each
for values, title in (
    (bbox_areas, "Bounding Box Area Distribution (normalized)"),
    (bbox_widths, "Bounding Box Width Distribution"),
    (bbox_heights, "Bounding Box Height Distribution"),
):
    plt.figure(figsize=(8, 5))
    sns.histplot(values, bins=20, kde=True)
    plt.title(title)
    plt.show()

# Heatmap of bbox centers
xs = [center_x for center_x, _center_y in bbox_centers]
ys = [center_y for _center_x, center_y in bbox_centers]
plt.figure(figsize=(6, 6))
sns.kdeplot(x=xs, y=ys, fill=True, cmap="Reds")
plt.title("Bounding Box Center Heatmap")
plt.show()


# =================================================================
# 4. IMAGE RESOLUTION ANALYSIS
# =================================================================
# image_info rows are (path, width, height)
widths = [info[1] for info in image_info]
heights = [info[2] for info in image_info]

plt.figure(figsize=(8, 6))
plt.scatter(widths, heights, alpha=0.4)
plt.title("Image Resolution Distribution")
plt.xlabel("Width")
plt.ylabel("Height")
plt.show()

mean_resolution = (np.mean(widths), np.mean(heights))
print("Average resolution:", mean_resolution)


# =================================================================
# 5. ANNOTATION QUALITY CHECKING (RANDOM SAMPLES)
# =================================================================
def show_random_samples(n=5):
    """Draw randomly chosen annotations on their images for a visual check.

    Each sampled annotation gets its own figure showing the image with that
    single bounding box and its class name drawn in red.

    Args:
        n: Number of annotations to sample. When fewer annotations exist,
            all of them are shown instead of raising.
    """
    import random

    # random.sample raises ValueError when asked for more items than exist
    sample_size = min(n, len(annotations))
    samples = random.sample(annotations, sample_size)

    for img_path, cls, xc, yc, bw, bh in samples:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread returns None for a file it cannot read; converting
            # that would raise, so report the file and move on.
            print(f"Skipping unreadable image: {img_path}")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w = img.shape[:2]

        # Normalized center/size -> pixel corner coordinates
        x1 = int((xc - bw/2) * w)
        y1 = int((yc - bh/2) * h)
        x2 = int((xc + bw/2) * w)
        y2 = int((yc + bh/2) * h)

        cv2.rectangle(img, (x1,y1), (x2,y2), (255,0,0), 2)
        cv2.putText(img, class_names[cls], (x1,y1-5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255,0,0), 2)

        plt.figure(figsize=(6,6))
        plt.imshow(img)
        plt.axis("off")
        plt.show()

print("\nShowing random annotated images...")
show_random_samples(5)


# =================================================================
# 6. DATASET BALANCE BETWEEN SPLITS
# =================================================================
# Instances per class, counted separately for every split
split_instances = {s: Counter() for s in splits}

for img_path, cls, *_box in annotations:
    # Image paths were built as <dataset_path>/<split>/images/<file>, so the
    # first component relative to the dataset root is the split name. A
    # substring test such as `"train" in img_path` would also match "train"
    # appearing in a file or parent-directory name.
    split_name = os.path.relpath(img_path, dataset_path).split(os.sep)[0]
    split_instances[split_name][cls] += 1

plt.figure(figsize=(12,6))
x = np.arange(num_classes)
width = 0.35

# Side-by-side bars: train to the left of each tick, valid to the right
plt.bar(x - width/2, [split_instances["train"][i] for i in range(num_classes)], width, label="Train")
plt.bar(x + width/2, [split_instances["valid"][i] for i in range(num_classes)], width, label="Valid")

plt.xticks(x, class_names, rotation=45)
plt.legend()
plt.title("Class Balance: Train vs Valid")
plt.show()


# =================================================================
# 7. OUTLIER DETECTION
# =================================================================
# Bbox size outliers
areas = np.array(bbox_areas)
mean_area = np.mean(areas)
std_area = np.std(areas)

# A box is an outlier when its area lies more than 3 standard deviations
# above or below the mean area
upper_bound = mean_area + 3*std_area
lower_bound = mean_area - 3*std_area
outlier_mask = (areas > upper_bound) | (areas < lower_bound)
outliers = np.where(outlier_mask)[0]

print("\nBounding box outliers:", len(outliers))

plt.figure(figsize=(8, 5))
sns.boxplot(x=areas)
plt.title("Boxplot of Normalized BBox Area")
plt.show()


# =================================================================
# 8. CORRELATION / CO-OCCURRENCE ANALYSIS
# =================================================================
# Dense class-by-class matrix: entry [c1, c2] is the number of images in
# which both classes were annotated
matrix = np.zeros((num_classes, num_classes), dtype=int)

for c1, partners in co_occurrence.items():
    for c2, shared_images in partners.items():
        matrix[c1, c2] = shared_images

plt.figure(figsize=(12, 10))
sns.heatmap(matrix, annot=True, cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title("Class Co-occurrence Heatmap")
plt.show()
