In [None]:
# @title 1. Setup Environment and Mount Google Drive
# This cell sets up necessary libraries and connects to your Google Drive.

import os
import shutil
import random
from tqdm import tqdm # For progress bars
import zipfile
import requests # For Pythonic download alternative (not used directly here, but good to have)

# Install split-folders library - essential for stratified splitting
!pip install split-folders tqdm

from splitfolders import ratio

# Attach Google Drive: every path configured below lives under this mount point,
# which is what makes the large files persistent.
from google.colab import drive
drive.mount('/content/drive')

print("Environment setup complete and Google Drive mounted.")

# --- GOOGLE DRIVE PROJECT DATA ROOT ---
# Keep this value identical to the one configured in 'drive_setup_project_data_dirs.ipynb'.
GOOGLE_DRIVE_PROJECT_ROOT = '/content/drive/MyDrive/AgroAI_Project_Data' # <--- ENSURE THIS MATCHES YOUR SETUP!
if os.path.exists(GOOGLE_DRIVE_PROJECT_ROOT):
    print(f"Google Drive project root (for data storage): {GOOGLE_DRIVE_PROJECT_ROOT}")
else:
    # Fail early: nothing below can work without the project root.
    raise FileNotFoundError(f"Google Drive project root not found: {GOOGLE_DRIVE_PROJECT_ROOT}. Please run 'drive_setup_project_data_dirs.ipynb' first.")

# --- MODULE 1 STORAGE LOCATIONS INSIDE THE PROJECT ROOT ---
# Both folders sit side by side under '<project root>/module1_edge_ai'.
MODULE1_DRIVE_DATA_DIR, MODULE1_DRIVE_MODELS_DIR = (
    os.path.join(GOOGLE_DRIVE_PROJECT_ROOT, 'module1_edge_ai', leaf_name)
    for leaf_name in ('data', 'trained_models')
)

# Create whichever of the two folders is still missing (no-op when both already exist).
for module1_dir in (MODULE1_DRIVE_DATA_DIR, MODULE1_DRIVE_MODELS_DIR):
    os.makedirs(module1_dir, exist_ok=True)
print(f"Module 1 data will be stored persistently in Drive at: {MODULE1_DRIVE_DATA_DIR}")
print(f"Module 1 models will be stored persistently in Drive at: {MODULE1_DRIVE_MODELS_DIR}")

# --- PLANTVILLAGE DATASET LOCATIONS ---
# Remote source of the raw PlantVillage archive.
PLANTVILLAGE_DOWNLOAD_URL = "https://storage.googleapis.com/plantdata/PlantVillage.zip"

# Where the downloaded archive is kept inside the Module 1 data folder.
PLANTVILLAGE_ZIP_PATH_DRIVE = os.path.join(MODULE1_DRIVE_DATA_DIR, 'plantvillage_dataset.zip')

# Folders (also inside the Module 1 data folder) for the unpacked archive and
# for the train/val/test subset derived from it.
EXTRACTED_DATA_DIR = os.path.join(MODULE1_DRIVE_DATA_DIR, 'PlantVillage_Raw')
SUBSET_OUTPUT_DIR = os.path.join(MODULE1_DRIVE_DATA_DIR, 'PlantVillage_Subset')

In [None]:
# @title 2. Download and Extract PlantVillage Dataset Directly to Google Drive
# This cell handles getting the raw PlantVillage dataset and saving it persistently.

print(f"\n--- Attempting to download PlantVillage dataset to Google Drive: {PLANTVILLAGE_ZIP_PATH_DRIVE} ---")

if not os.path.exists(PLANTVILLAGE_ZIP_PATH_DRIVE):
    print(f"Downloading from: {PLANTVILLAGE_DOWNLOAD_URL}")
    # Using !wget for direct download to Google Drive path
    !wget -q {PLANTVILLAGE_DOWNLOAD_URL} -O "{PLANTVILLAGE_ZIP_PATH_DRIVE}"
    print("Download complete.")
else:
    print("PlantVillage zip already exists in Google Drive. Skipping download.")

# Verify that the ZIP file exists before proceeding
if not os.path.exists(PLANTVILLAGE_ZIP_PATH_DRIVE):
    raise FileNotFoundError(f"ERROR: PlantVillage dataset ZIP not found at {PLANTVILLAGE_ZIP_PATH_DRIVE} after download attempt.")

# Extract the dataset
print(f"\n--- Extracting dataset from {PLANTVILLAGE_ZIP_PATH_DRIVE} to {EXTRACTED_DATA_DIR} ---")
if os.path.exists(EXTRACTED_DATA_DIR):
    print(f"Existing extraction found at {EXTRACTED_DATA_DIR}. Deleting for fresh extraction...")
    shutil.rmtree(EXTRACTED_DATA_DIR) # Clear previous extraction for idempotence
os.makedirs(EXTRACTED_DATA_DIR, exist_ok=True) # Recreate the directory

with zipfile.ZipFile(PLANTVILLAGE_ZIP_PATH_DRIVE, 'r') as zip_ref:
    # Use tqdm for a progress bar during extraction
    for member in tqdm(zip_ref.infolist(), desc='Extracting '):
        try:
            zip_ref.extract(member, EXTRACTED_DATA_DIR)
        except zipfile.error as e:
            print(f"Error extracting {member.filename}: {e}")
            continue
print("Extraction complete.")

# Verify and set the actual root directory within the extracted data
# The zip might extract into a subdirectory named 'PlantVillage'
extracted_contents = os.listdir(EXTRACTED_DATA_DIR)
if 'PlantVillage' in extracted_contents and os.path.isdir(os.path.join(EXTRACTED_DATA_DIR, 'PlantVillage')):
    ORIGINAL_DATA_ROOT = os.path.join(EXTRACTED_DATA_DIR, 'PlantVillage')
else:
    # If not 'PlantVillage', assume the class folders are directly under EXTRACTED_DATA_DIR
    ORIGINAL_DATA_ROOT = EXTRACTED_DATA_DIR

print(f"Original dataset root for splitting (containing class folders): {ORIGINAL_DATA_ROOT}")
if not os.path.exists(ORIGINAL_DATA_ROOT) or not os.listdir(ORIGINAL_DATA_ROOT):
    raise FileNotFoundError(f"ERROR: Original data root '{ORIGINAL_DATA_ROOT}' not found or is empty after extraction. Please check the zip content.")

In [None]:
# @title 3. Create Train, Validation, Test Subset
# Splits the class folders found in ORIGINAL_DATA_ROOT into train/val/test
# folders underneath SUBSET_OUTPUT_DIR and prints a short overview of the result.

# Begin with an empty output folder so nothing from an earlier split is left behind.
if os.path.exists(SUBSET_OUTPUT_DIR):
    print(f"Removing existing subset directory: {SUBSET_OUTPUT_DIR}")
    shutil.rmtree(SUBSET_OUTPUT_DIR)
os.makedirs(SUBSET_OUTPUT_DIR, exist_ok=True)

print(f"\n--- Creating train/val/test split into {SUBSET_OUTPUT_DIR} ---")
# Options handed to split-folders' ratio():
#   seed         - fixed, so repeated runs produce the same split
#   ratio        - fractions in (train, val, test) order: 70% / 15% / 15%
#   group_prefix - None, i.e. files are not grouped before splitting
split_options = {
    'output': SUBSET_OUTPUT_DIR,
    'seed': 1337,
    'ratio': (0.7, 0.15, 0.15),
    'group_prefix': None,
}
ratio(ORIGINAL_DATA_ROOT, **split_options)

print("\nDataset splitting complete. Generated directory structure (in Google Drive):")
# Summarise each split: number of class folders plus the first five names as a sample.
for split_name in ('train', 'val', 'test'):
    split_dir = os.path.join(SUBSET_OUTPUT_DIR, split_name)
    print(f"\nContents of {split_dir}:")
    if not os.path.exists(split_dir):
        print(f"  {split_name} directory not found.")
        continue
    # Only sub-directories count as classes; stray files are ignored.
    class_names = sorted(
        entry for entry in os.listdir(split_dir)
        if os.path.isdir(os.path.join(split_dir, entry))
    )
    print(f"  Total classes: {len(class_names)}")
    print("  Sample class directories:", class_names[:5], "...")

In [None]:
# @title 4. Verify Data Counts (Optional but Recommended)
# Sanity check: reports, for every split, how many entries its class folders hold in total.

print("\n--- Verifying Image Counts in Each Split ---")
for split in ('train', 'val', 'test'):
    split_path = os.path.join(SUBSET_OUTPUT_DIR, split)
    if not os.path.exists(split_path):
        print(f"Warning: {split.upper()} directory not found at {split_path}")
        continue

    print(f"\n{split.upper()} set located at: {split_path}")
    # One count per class folder; every entry inside a class folder is counted as an image.
    # (Print the individual counts here if a class-wise breakdown is needed.)
    images_per_class = [
        len(os.listdir(os.path.join(split_path, class_name)))
        for class_name in sorted(os.listdir(split_path))
        if os.path.isdir(os.path.join(split_path, class_name))
    ]
    total_images = sum(images_per_class)
    num_classes = len(images_per_class)
    print(f"Total images in {split.upper()} set: {total_images} across {num_classes} classes.")

print("\nModule 1 MVP data preparation complete! The structured dataset is ready for training, persistently stored in Google Drive.")