<a href="https://colab.research.google.com/github/JericN/rice-disease-classifier/blob/main/testestest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
! pip --quiet install datasets rasterio

In [4]:
from pathlib import Path
import numpy as np
import rasterio
from datasets import Dataset, DatasetDict
from tqdm import tqdm

In [5]:
import os
from google.colab import drive
import zipfile

# Mount Google Drive
drive.mount('/content/drive')
zip_path = '/content/drive/MyDrive/School/Courses/thesis/D2.zip'

# Extract the ZIP file once per session.
# A marker file is written only after extraction has finished, so an attempt
# that failed or was interrupted (e.g. wrong zip path, session hiccup) is
# retried on the next run instead of being skipped merely because the target
# directory already exists.
extract_path = '/content/dataset'
extraction_marker = os.path.join(extract_path, '.extracted')
if not os.path.exists(extraction_marker):
    os.makedirs(extract_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    # Record success last: reaching this line means every member was written.
    with open(extraction_marker, 'w'):
        pass

Mounted at /content/drive


In [6]:
# Root folder of the dataset; create_dataset() reads the `<split>` (images)
# and `<split>_labels` (masks) sub-folders beneath it.
data_dir = Path("/content/dataset/D2")
# Names of the splits to build, one Dataset per entry.
splits = ["train", "test", "validation"]

In [7]:
def load_tif_image(image_path):
    """Read every band of a raster file and return the pixels as float32.

    Args:
        image_path: Path of the image file to open with rasterio.

    Returns:
        NumPy array laid out as (bands, height, width) with dtype float32.
    """
    with rasterio.open(image_path) as raster:
        all_bands = raster.read()  # no index given -> all bands
    return all_bands.astype(np.float32)

def load_label_mask(label_path):
    """Read the first band of a label raster and return it as uint8.

    Args:
        label_path: Path of the mask file to open with rasterio.

    Returns:
        NumPy array laid out as (height, width) with dtype uint8.
    """
    with rasterio.open(label_path) as raster:
        first_band = raster.read(1)  # band indices are 1-based
    return first_band.astype(np.uint8)

In [12]:
def create_dataset(split):
    """Assemble the image/mask pairs of one split into a `Dataset`.

    Images are read from `data_dir / split` and masks from
    `data_dir / f"{split}_labels"`. An image is paired with the mask named
    `label_patch_<id>.tif`, where `<id>` is the text after the last
    underscore of the image's stem. Images without such a mask are reported
    with a warning and left out.

    Args:
        split: Name of the split folder, e.g. "train".

    Returns:
        Dataset with an "image" column and a "label" column.

    Raises:
        FileNotFoundError: If the image folder or the mask folder is missing.
    """
    image_dir = data_dir / split
    mask_dir = data_dir / f"{split}_labels"

    # Both folders have to be present before anything is read.
    if not (image_dir.exists() and mask_dir.exists()):
        raise FileNotFoundError(f"Missing '{image_dir}' or '{mask_dir}'. Please check your dataset structure.")

    loaded_images = []
    loaded_masks = []
    for image_path in tqdm(sorted(image_dir.iterdir()), desc=f"Processing {split}"):
        # 'image_patch_3' -> '3'
        patch_id = image_path.stem.rsplit("_", 1)[-1]
        mask_path = mask_dir / f"label_patch_{patch_id}.tif"

        if not mask_path.exists():
            print(f"WARNING: No mask found for {image_path.name}")
            continue

        loaded_images.append(load_tif_image(image_path))
        loaded_masks.append(load_label_mask(mask_path))

    if len(loaded_images) == 0:
        print(f"WARNING: No images were loaded for {split}")

    columns = {
        "image": loaded_images,  # each entry: (bands, height, width)
        "label": loaded_masks,   # each entry: (height, width)
    }
    return Dataset.from_dict(columns)


In [13]:
# Build one Dataset per split, then bundle them into a single DatasetDict
datasets = dict((name, create_dataset(name)) for name in splits)
dataset_dict = DatasetDict(datasets)

Processing train: 100%|██████████| 1299/1299 [00:09<00:00, 143.20it/s]
Processing test: 100%|██████████| 281/281 [00:01<00:00, 161.96it/s]
Processing validation: 100%|██████████| 277/277 [00:01<00:00, 158.48it/s]


In [15]:
# Display the splits with their feature names and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1299
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 281
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 277
    })
})

In [20]:
import numpy as np

def print_dataset_info(dataset_dict):
    """Print a short summary of every split.

    For each split this prints the number of samples and, for the first
    sample only, the shape and dtype of the image and label plus the distinct
    label values. A split without samples is reported and skipped rather
    than failing on `dataset[0]`.

    Args:
        dataset_dict: Mapping of split name to a sized, indexable dataset
            whose items are dicts with "image" and "label" entries.

    Returns:
        None. Everything is written to stdout.
    """
    for split, dataset in dataset_dict.items():
        print(f"--- {split.upper()} SET ---")
        print(f"Number of samples: {len(dataset)}")

        # An empty split has no first sample to inspect.
        if len(dataset) == 0:
            print("No samples to inspect.")
            print("-" * 40)
            continue

        # Get first sample
        sample = dataset[0]
        # NOTE: the dtypes printed below are those produced by this NumPy
        # conversion of the returned sample, which may differ from the dtype
        # the arrays had when the dataset was built.
        image = np.array(sample["image"])
        label = np.array(sample["label"])

        # Print shape and data type
        print(f"Image shape: {image.shape} | dtype: {image.dtype}")
        print(f"Label shape: {label.shape} | dtype: {label.dtype}")

        # Distinct label values present in this one sample (not the whole split)
        unique_classes = np.unique(label)
        print(f"Unique classes in label: {unique_classes}")

        print("-" * 40)

# Print the per-split summary for the assembled DatasetDict
print_dataset_info(dataset_dict)


--- TRAIN SET ---
Number of samples: 1299
Image shape: (6, 256, 256) | dtype: float64
Label shape: (256, 256) | dtype: int64
Unique classes in label: [0 1 3]
----------------------------------------
--- TEST SET ---
Number of samples: 281
Image shape: (6, 256, 256) | dtype: float64
Label shape: (256, 256) | dtype: int64
Unique classes in label: [4]
----------------------------------------
--- VALIDATION SET ---
Number of samples: 277
Image shape: (6, 256, 256) | dtype: float64
Label shape: (256, 256) | dtype: int64
Unique classes in label: [4]
----------------------------------------


In [16]:
# Upload all splits to the "SodaXII/blb-ms-01" dataset repository on the Hugging Face Hub.
# NOTE(review): presumably requires a Hub login/token with write access in this session — confirm.
dataset_dict.push_to_hub("SodaXII/blb-ms-01")

Uploading the dataset shards:   0%|          | 0/5 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/SodaXII/blb-ms-01/commit/b8547882d1eb0087b1887cf18595f4c084ab21cc', commit_message='Upload dataset', commit_description='', oid='b8547882d1eb0087b1887cf18595f4c084ab21cc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/SodaXII/blb-ms-01', endpoint='https://huggingface.co', repo_type='dataset', repo_id='SodaXII/blb-ms-01'), pr_revision=None, pr_num=None)