In [None]:
# @title 1. Setup Environment and Mount Google Drive
# This cell sets up necessary libraries and connects to your Google Drive
# where you might store the PlantVillage dataset or clone your repo.

import os
import shutil
import random
from tqdm import tqdm # For progress bars
import zipfile

# Install split-folders library - essential for stratified splitting
!pip install split-folders tqdm

from splitfolders import ratio

# Mount Google Drive (if you store your dataset or project repo there)
from google.colab import drive
drive.mount('/content/drive')

print("Environment setup complete and Google Drive mounted.")

# ---------------------------------------------------------------------------
# Project location
# ---------------------------------------------------------------------------
# Preferred root: the 'agro-ai-copilot' repo cloned onto Google Drive.
PROJECT_ROOT_DIR = '/content/drive/MyDrive/agro-ai-copilot/module1-edge-ai'

# If that folder is missing, switch to a folder on the Colab runtime disk.
# Data kept there has to be copied or re-created manually in later sessions.
if not os.path.exists(PROJECT_ROOT_DIR):
    print(f"Warning: Project root '{PROJECT_ROOT_DIR}' not found. Using local Colab path.")
    PROJECT_ROOT_DIR = '/content/module1-edge-ai_local' # Created on the Colab runtime disk
    os.makedirs(PROJECT_ROOT_DIR, exist_ok=True)

# ---------------------------------------------------------------------------
# Where the raw PlantVillage archive can come from
# ---------------------------------------------------------------------------
# Option 1: a zip already stored on Google Drive.
PLANTVILLAGE_ZIP_PATH_DRIVE = '/content/drive/MyDrive/plantvillage dataset.zip' # <--- ADJUST THIS PATH if your zip is named differently or elsewhere!

# Option 2: a public download URL for the compressed dataset.
PLANTVILLAGE_DOWNLOAD_URL = "https://storage.googleapis.com/plantdata/PlantVillage.zip"

# ---------------------------------------------------------------------------
# Paths derived from the project root
# ---------------------------------------------------------------------------
# Data folder inside module1-edge-ai; everything below lives under it.
DATA_DIR = os.path.join(PROJECT_ROOT_DIR, 'data')

# File name given to the archive when it is downloaded (Option 2).
PLANTVILLAGE_ZIP_PATH_COLAB = os.path.join(DATA_DIR, 'plantvillage_dataset.zip')

# Raw, unzipped dataset (before splitting).
EXTRACTED_DATA_DIR = os.path.join(DATA_DIR, 'PlantVillage_Raw')

# Final train/val/test subsets.
SUBSET_OUTPUT_DIR = os.path.join(DATA_DIR, 'PlantVillage_Subset')

os.makedirs(DATA_DIR, exist_ok=True)
print(f"Project data directory set to: {DATA_DIR}")

In [None]:
# @title 2. Download and Extract PlantVillage Dataset
# This cell handles getting the raw PlantVillage dataset.
# It prioritizes checking Google Drive first, then defaults to downloading.

SOURCE_ZIP_PATH = None

# Try using dataset from Google Drive first
print(f"\n--- Attempting to use PlantVillage dataset from Google Drive ---")
if os.path.exists(PLANTVILLAGE_ZIP_PATH_DRIVE):
    print(f"Using zip from Google Drive: {PLANTVILLAGE_ZIP_PATH_DRIVE}")
    SOURCE_ZIP_PATH = PLANTVILLAGE_ZIP_PATH_DRIVE
else:
    print(f"PlantVillage zip not found at {PLANTVILLAGE_ZIP_PATH_DRIVE}.")
    print(f"Attempting to download to Colab runtime from: {PLANTVILLAGE_DOWNLOAD_URL}")
    if not os.path.exists(PLANTVILLAGE_ZIP_PATH_COLAB):
        !wget -q {PLANTVILLAGE_DOWNLOAD_URL} -O "{PLANTVILLAGE_ZIP_PATH_COLAB}"
        print("Download complete.")
    else:
        print("PlantVillage zip already exists in Colab runtime. Skipping download.")
    SOURCE_ZIP_PATH = PLANTVILLAGE_ZIP_PATH_COLAB

if SOURCE_ZIP_PATH is None or not os.path.exists(SOURCE_ZIP_PATH):
    raise FileNotFoundError("Could not find or download PlantVillage dataset ZIP. Please check paths or URLs.")


# Extract the archive at SOURCE_ZIP_PATH into a freshly created EXTRACTED_DATA_DIR.
print(f"\n--- Extracting dataset from {SOURCE_ZIP_PATH} to {EXTRACTED_DATA_DIR} ---")
if os.path.exists(EXTRACTED_DATA_DIR):
    print(f"Existing extraction found at {EXTRACTED_DATA_DIR}. Deleting for fresh extraction...")
    shutil.rmtree(EXTRACTED_DATA_DIR) # Clear previous extraction for idempotence
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True) # Recreate the directory

# Names of archive members that could not be extracted. Extraction stays
# best-effort (one bad member does not abort the run), but failures are tallied
# so the final message does not claim success for a partial dataset.
failed_members = []
with zipfile.ZipFile(SOURCE_ZIP_PATH, 'r') as zip_ref:
    # Member-by-member extraction so tqdm can show progress
    for member in tqdm(zip_ref.infolist(), desc='Extracting '):
        try:
            zip_ref.extract(member, EXTRACTED_DATA_DIR)
        except zipfile.BadZipFile as e:  # same class as the legacy alias zipfile.error
            print(f"Error extracting {member.filename}: {e}")
            failed_members.append(member.filename)

if failed_members:
    print(f"Extraction finished with {len(failed_members)} member(s) that could not be extracted; the extracted dataset is incomplete.")
else:
    print("Extraction complete.")

# Decide which folder is handed to the splitter as the dataset root.
# If the archive unpacked into a top-level 'PlantVillage' directory, that
# directory is the root; otherwise the class folders are assumed to sit
# directly under EXTRACTED_DATA_DIR.
extracted_contents = os.listdir(EXTRACTED_DATA_DIR)
nested_root = os.path.join(EXTRACTED_DATA_DIR, 'PlantVillage')
has_nested_root = 'PlantVillage' in extracted_contents and os.path.isdir(nested_root)
ORIGINAL_DATA_ROOT = nested_root if has_nested_root else EXTRACTED_DATA_DIR

print(f"Original dataset root for splitting (containing class folders): {ORIGINAL_DATA_ROOT}")

# The chosen root must exist and contain at least one entry.
if not (os.path.exists(ORIGINAL_DATA_ROOT) and os.listdir(ORIGINAL_DATA_ROOT)):
    raise FileNotFoundError(f"ERROR: Original data root '{ORIGINAL_DATA_ROOT}' not found or is empty after extraction. Please check the zip content.")

In [None]:
# @title 3. Create Train, Validation, Test Subset
# This cell creates the stratified train/validation/test split into the desired structure.

# Start from an empty output folder: remove whatever a previous run left behind,
# then recreate the directory.
if os.path.exists(SUBSET_OUTPUT_DIR):
    print(f"Removing existing subset directory: {SUBSET_OUTPUT_DIR}")
    shutil.rmtree(SUBSET_OUTPUT_DIR)
os.makedirs(SUBSET_OUTPUT_DIR, exist_ok=True)

print(f"\n--- Creating train/val/test split into {SUBSET_OUTPUT_DIR} ---")

# Options passed to split-folders' `ratio`.
split_options = {
    "output": SUBSET_OUTPUT_DIR,
    "seed": 1337,                   # fixed seed for reproducibility
    "ratio": (0.7, 0.15, 0.15),     # (train, val, test) = 70% / 15% / 15%
    "group_prefix": None,           # class names are the folder names themselves
}
ratio(ORIGINAL_DATA_ROOT, **split_options)

print("\nDataset splitting complete. Generated directory structure:")
# For each expected split folder, report how many class directories it holds
# and show the first five (alphabetically) as a sample.
for split_folder in ('train', 'val', 'test'):
    path = os.path.join(SUBSET_OUTPUT_DIR, split_folder)
    print(f"\nContents of {path}:")

    if not os.path.exists(path):
        print(f"  {split_folder} directory not found.")
        continue

    # Keep only sub-directories; any loose files in the split folder are ignored.
    class_dirs = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            class_dirs.append(entry)

    print(f"  Total classes: {len(class_dirs)}")
    print("  Sample class directories:", sorted(class_dirs)[:5], "...")

In [None]:
# @title 4. Verify Data Counts (Optional but Recommended)
# This cell helps verify the number of images in each split for sanity check.

print("\n--- Verifying Image Counts in Each Split ---")
for split in ('train', 'val', 'test'):
    split_path = os.path.join(SUBSET_OUTPUT_DIR, split)

    if not os.path.exists(split_path):
        print(f"Warning: {split.upper()} directory not found at {split_path}")
        continue

    print(f"\n{split.upper()} set located at: {split_path}")

    # Map each class directory name to the number of entries inside it.
    # Inspect `images_per_class` for class-wise counts when more detail is needed.
    images_per_class = {
        class_name: len(os.listdir(os.path.join(split_path, class_name)))
        for class_name in sorted(os.listdir(split_path))
        if os.path.isdir(os.path.join(split_path, class_name))
    }
    total_images = sum(images_per_class.values())
    num_classes = len(images_per_class)
    print(f"Total images in {split.upper()} set: {total_images} across {num_classes} classes.")

print("\nModule 1 MVP data preparation complete! The structured dataset is ready for training.")