In [4]:
import zipfile
import pandas as pd
import os
from collections import defaultdict

def process_zipped_images(zip_path, csv_path, output_folder='Train', n_images=208):
    """
    Process images from a zip file based on CSV labels and create a training dataset.
    
    Args:
        zip_path (str): Path to the zip file containing images
        csv_path (str): Path to the CSV file with labels
        output_folder (str): Name of the output folder (default: 'Train')
        n_images (int): Number of images to select per class (default: 208)
    """
    # Verify input files exist
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file not found: {zip_path}")
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
        
    print(f"Reading CSV file: {csv_path}")
    # Read the CSV file and show initial info
    df = pd.read_csv(csv_path)
    print(f"CSV contents:")
    print(f"Number of rows: {len(df)}")
    print(f"Columns found: {df.columns.tolist()}")
    print("\nFirst few rows of CSV:")
    print(df.head())
    
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    print(f"\nCreated output folder: {output_folder}")
    
    # Dictionary to keep track of images per label
    label_counts = defaultdict(int)
    selected_files = []
    
    print(f"\nOpening zip file: {zip_path}")
    # Open the zip file and show contents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_files = set(zip_ref.namelist())
        print(f"Number of files in zip: {len(zip_files)}")
        print("First few files in zip:")
        for filename in list(zip_files)[:5]:
            print(f"- {filename}")
        
        # Process each row in the CSV
        print("\nProcessing files...")
        for idx, row in df.iterrows():
            file_path = row['file_path']
            label = str(row['Label'])  # Using 'Label' instead of 'label'
            
            # Try different possible paths in the zip file
            possible_paths = [
                file_path,
                f"plantvillage/{file_path}",
                file_path.replace('plantvillage/', '')
            ]
            
            # Find the correct path in the zip file
            zip_path = None
            for path in possible_paths:
                if path in zip_files:
                    zip_path = path
                    break
            
            if zip_path and label_counts[label] < n_images:
                try:
                    print(f"Extracting: {zip_path} (Label: {label}, Count: {label_counts[label] + 1})")
                    # Extract the file
                    zip_ref.extract(zip_path, output_folder)
                    
                    # Get the filename without the plantvillage prefix
                    final_filename = os.path.basename(zip_path)
                    
                    # If extracted with plantvillage prefix, move the file up
                    extracted_path = os.path.join(output_folder, zip_path)
                    final_path = os.path.join(output_folder, final_filename)
                    
                    # If the file was extracted into a subdirectory, move it up
                    if os.path.dirname(extracted_path) != output_folder:
                        os.makedirs(os.path.dirname(final_path), exist_ok=True)
                        os.rename(extracted_path, final_path)
                        # Remove empty plantvillage directory if it exists
                        plantvillage_dir = os.path.join(output_folder, 'plantvillage')
                        if os.path.exists(plantvillage_dir):
                            try:
                                os.rmdir(plantvillage_dir)
                            except OSError:
                                pass  # Directory not empty, that's okay
                    
                    label_counts[label] += 1
                    selected_files.append({
                        'file_path': os.path.join(output_folder, final_filename),
                        'Label': label
                    })
                except Exception as e:
                    print(f"Error extracting {zip_path}: {str(e)}")
            elif idx < 5 and not zip_path:  # Only show first few missing files
                print(f"File not found in zip: {file_path}")
    
    # Create new DataFrame with selected files
    if selected_files:
        new_df = pd.DataFrame(selected_files)
        output_csv = f'{output_folder}_labels.csv'
        new_df.to_csv(output_csv, index=False)
        print(f"\nCreated new CSV file: {output_csv}")
    else:
        print("\nWarning: No files were selected/extracted!")
    
    # Print summary
    print("\nProcessing complete!")
    print("\nFiles selected per label:")
    if label_counts:
        for label, count in label_counts.items():
            print(f"{label}: {count} images")
    else:
        print("No files were processed!")
        
    # Additional debugging information
    print("\nDebug Information:")
    print(f"Working directory: {os.getcwd()}")
    print(f"Train folder contents: {os.listdir(output_folder) if os.path.exists(output_folder) else 'Folder empty or not found'}")

# Example usage
if __name__ == "__main__":
    # Replace these paths with your actual paths
    zip_path = "plantvillage.zip"
    csv_path = "crop.csv"
    
    process_zipped_images(zip_path, csv_path)

Reading CSV file: crop.csv
CSV contents:
Number of rows: 38994
Columns found: ['file_path', 'Label']

First few rows of CSV:
                                           file_path  Label
0  4b22a1e7-745b-4c78-a49e-14ca8cfba26a___RS_HL-8...      0
1  9ee2b5e6-46cf-400f-a82f-660a2ac05157___RS_HL-5...      0
2  181ce194-fdc9-4f0b-ae8d-0c621f723279___RS_HL-7...      0
3  3e8eb97c-b2ad-4aeb-82ff-2f1334801c9b___RS_HL-6...      0
4  4e817ac6-818a-431f-af6d-e44477f9b649___RS_HL-7...      0

Created output folder: Train

Opening zip file: plantvillage.zip
Number of files in zip: 54306
First few files in zip:
- plantvillage/d4d20c99-4cf3-4ecc-95ae-a964560cfeed___RS_Erly.B-9604.JPG
- plantvillage/8b32e824-d36e-443a-9410-5ca8ccad2492___CREC_HLB-7384.JPG
- plantvillage/76ee26db-ef55-43b9-9179-6b9ba15dc21a___RS_LB-4058.JPG
- plantvillage/63a5e362-672a-4dee-a673-d9258bb0ab6c___RS_HL-3560.JPG
- plantvillage/526f929e-e9c1-4ca5-a493-c0bcf4898b04___Crnl_L.Mold-9129.JPG

Processing files...
Extracting: plan

In [2]:
import zipfile
import pandas as pd
import os
from collections import defaultdict

def process_zipped_images(zip_path, csv_path, output_folder='Test', n_images=42):
    """
    Process images from a zip file based on CSV labels and create a training dataset.
    
    Args:
        zip_path (str): Path to the zip file containing images
        csv_path (str): Path to the CSV file with labels
        output_folder (str): Name of the output folder (default: 'Train')
        n_images (int): Number of images to select per class (default: 208)
    """
    # Verify input files exist
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file not found: {zip_path}")
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
        
    print(f"Reading CSV file: {csv_path}")
    # Read the CSV file and show initial info
    df = pd.read_csv(csv_path)
    print(f"CSV contents:")
    print(f"Number of rows: {len(df)}")
    print(f"Columns found: {df.columns.tolist()}")
    print("\nFirst few rows of CSV:")
    print(df.head())
    
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    print(f"\nCreated output folder: {output_folder}")
    
    # Dictionary to keep track of images per label
    label_counts = defaultdict(int)
    selected_files = []
    
    print(f"\nOpening zip file: {zip_path}")
    # Open the zip file and show contents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_files = set(zip_ref.namelist())
        print(f"Number of files in zip: {len(zip_files)}")
        print("First few files in zip:")
        for filename in list(zip_files)[:5]:
            print(f"- {filename}")
        
        # Process each row in the CSV
        print("\nProcessing files...")
        for idx, row in df.iterrows():
            file_path = row['file_path']
            label = str(row['Label'])  # Using 'Label' instead of 'label'
            
            # Try different possible paths in the zip file
            possible_paths = [
                file_path,
                f"plantvillage/{file_path}",
                file_path.replace('plantvillage/', '')
            ]
            
            # Find the correct path in the zip file
            zip_path = None
            for path in possible_paths:
                if path in zip_files:
                    zip_path = path
                    break
            
            if zip_path and label_counts[label] < n_images:
                try:
                    print(f"Extracting: {zip_path} (Label: {label}, Count: {label_counts[label] + 1})")
                    # Extract the file
                    zip_ref.extract(zip_path, output_folder)
                    
                    # Get the filename without the plantvillage prefix
                    final_filename = os.path.basename(zip_path)
                    
                    # If extracted with plantvillage prefix, move the file up
                    extracted_path = os.path.join(output_folder, zip_path)
                    final_path = os.path.join(output_folder, final_filename)
                    
                    # If the file was extracted into a subdirectory, move it up
                    if os.path.dirname(extracted_path) != output_folder:
                        os.makedirs(os.path.dirname(final_path), exist_ok=True)
                        os.rename(extracted_path, final_path)
                        # Remove empty plantvillage directory if it exists
                        plantvillage_dir = os.path.join(output_folder, 'plantvillage')
                        if os.path.exists(plantvillage_dir):
                            try:
                                os.rmdir(plantvillage_dir)
                            except OSError:
                                pass  # Directory not empty, that's okay
                    
                    label_counts[label] += 1
                    selected_files.append({
                        'file_path': os.path.join(output_folder, final_filename),
                        'Label': label
                    })
                except Exception as e:
                    print(f"Error extracting {zip_path}: {str(e)}")
            elif idx < 5 and not zip_path:  # Only show first few missing files
                print(f"File not found in zip: {file_path}")
    
    # Create new DataFrame with selected files
    if selected_files:
        new_df = pd.DataFrame(selected_files)
        output_csv = f'{output_folder}_labels.csv'
        new_df.to_csv(output_csv, index=False)
        print(f"\nCreated new CSV file: {output_csv}")
    else:
        print("\nWarning: No files were selected/extracted!")
    
    # Print summary
    print("\nProcessing complete!")
    print("\nFiles selected per label:")
    if label_counts:
        for label, count in label_counts.items():
            print(f"{label}: {count} images")
    else:
        print("No files were processed!")
        
    # Additional debugging information
    print("\nDebug Information:")
    print(f"Working directory: {os.getcwd()}")
    print(f"Train folder contents: {os.listdir(output_folder) if os.path.exists(output_folder) else 'Folder empty or not found'}")

# Example usage
if __name__ == "__main__":
    # Replace these paths with your actual paths
    zip_path = "plantvillage.zip"
    csv_path = "crop_test.csv"
    
    process_zipped_images(zip_path, csv_path)

Reading CSV file: crop_test.csv
CSV contents:
Number of rows: 10279
Columns found: ['file_path', 'Label']

First few rows of CSV:
                                           file_path  Label
0  db172d76-ff08-4bd5-bd0b-6231f432434a___RS_HL-5...      0
1  c2aa2c88-d07b-4df4-80c7-1ba703254844___RS_HL-5...      0
2  d57f5f33-d8e9-4c40-80ad-c2875743f781___RS_HL-5...      0
3  83636c4e-c4f1-47c4-81e8-0bcf7bac8995___RS_HL-5...      0
4  398f4455-d008-4d69-a58d-b0a1f228abaa___RS_HL-5...      0

Created output folder: Test

Opening zip file: plantvillage.zip
Number of files in zip: 54306
First few files in zip:
- plantvillage/d4d20c99-4cf3-4ecc-95ae-a964560cfeed___RS_Erly.B-9604.JPG
- plantvillage/8b32e824-d36e-443a-9410-5ca8ccad2492___CREC_HLB-7384.JPG
- plantvillage/76ee26db-ef55-43b9-9179-6b9ba15dc21a___RS_LB-4058.JPG
- plantvillage/63a5e362-672a-4dee-a673-d9258bb0ab6c___RS_HL-3560.JPG
- plantvillage/526f929e-e9c1-4ca5-a493-c0bcf4898b04___Crnl_L.Mold-9129.JPG

Processing files...
Extracting: 

In [1]:
import zipfile
import pandas as pd
import os
from collections import defaultdict
from pathlib import Path

def normalize_path(path):
    """Convert path to use forward slashes."""
    return str(Path(path)).replace(os.sep, '/')

def process_zipped_images(zip_path, csv_path, output_folder='Test', n_images=2):
    """
    Process images from a zip file based on CSV labels and create a training dataset.
    
    Args:
        zip_path (str): Path to the zip file containing images
        csv_path (str): Path to the CSV file with labels
        output_folder (str): Name of the output folder (default: 'Train')
        n_images (int): Number of images to select per class (default: 208)
    """
    # Verify input files exist
    if not os.path.exists(zip_path):
        raise FileNotFoundError(f"Zip file not found: {zip_path}")
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
        
    print(f"Reading CSV file: {csv_path}")
    # Read the CSV file and show initial info
    df = pd.read_csv(csv_path)
    print(f"CSV contents:")
    print(f"Number of rows: {len(df)}")
    print(f"Columns found: {df.columns.tolist()}")
    print("\nFirst few rows of CSV:")
    print(df.head())
    
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    print(f"\nCreated output folder: {output_folder}")
    
    # Dictionary to keep track of images per label
    label_counts = defaultdict(int)
    selected_files = []
    
    print(f"\nOpening zip file: {zip_path}")
    # Open the zip file and show contents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_files = set(zip_ref.namelist())
        print(f"Number of files in zip: {len(zip_files)}")
        print("First few files in zip:")
        for filename in list(zip_files)[:5]:
            print(f"- {filename}")
        
        # Process each row in the CSV
        print("\nProcessing files...")
        for idx, row in df.iterrows():
            file_path = row['file_path']
            label = str(row['Label'])  # Using 'Label' instead of 'label'
            
            # Try different possible paths in the zip file
            possible_paths = [
                file_path,
                f"plantvillage/{file_path}",
                file_path.replace('plantvillage/', '')
            ]
            
            # Find the correct path in the zip file
            zip_path_found = None
            for path in possible_paths:
                if path in zip_files:
                    zip_path_found = path
                    break
            
            if zip_path_found and label_counts[label] < n_images:
                try:
                    print(f"Extracting: {zip_path_found} (Label: {label}, Count: {label_counts[label] + 1})")
                    # Extract the file
                    zip_ref.extract(zip_path_found, output_folder)
                    
                    # Get the filename without the plantvillage prefix
                    final_filename = os.path.basename(zip_path_found)
                    
                    # If extracted with plantvillage prefix, move the file up
                    extracted_path = os.path.join(output_folder, zip_path_found)
                    final_path = os.path.join(output_folder, final_filename)
                    
                    # Convert paths to use forward slashes
                    extracted_path = normalize_path(extracted_path)
                    final_path = normalize_path(final_path)
                    
                    # If the file was extracted into a subdirectory, move it up
                    if os.path.dirname(extracted_path) != normalize_path(output_folder):
                        os.makedirs(os.path.dirname(final_path), exist_ok=True)
                        os.rename(extracted_path, final_path)
                        # Remove empty plantvillage directory if it exists
                        plantvillage_dir = os.path.join(output_folder, 'plantvillage')
                        if os.path.exists(plantvillage_dir):
                            try:
                                os.rmdir(plantvillage_dir)
                            except OSError:
                                pass  # Directory not empty, that's okay
                    
                    label_counts[label] += 1
                    selected_files.append({
                        'file_path': f"{output_folder}/{final_filename}",  # Use forward slash
                        'Label': label
                    })
                except Exception as e:
                    print(f"Error extracting {zip_path_found}: {str(e)}")
            elif idx < 5 and not zip_path_found:  # Only show first few missing files
                print(f"File not found in zip: {file_path}")
    
    # Create new DataFrame with selected files
    if selected_files:
        new_df = pd.DataFrame(selected_files)
        output_csv = f'{output_folder}_labels.csv'
        new_df.to_csv(output_csv, index=False)
        print(f"\nCreated new CSV file: {output_csv}")
    else:
        print("\nWarning: No files were selected/extracted!")
    
    # Print summary
    print("\nProcessing complete!")
    print("\nFiles selected per label:")
    if label_counts:
        for label, count in label_counts.items():
            print(f"{label}: {count} images")
    else:
        print("No files were processed!")
        
    # Additional debugging information
    print("\nDebug Information:")
    print(f"Working directory: {normalize_path(os.getcwd())}")
    print(f"Train folder contents: {os.listdir(output_folder) if os.path.exists(output_folder) else 'Folder empty or not found'}")

# Example usage
if __name__ == "__main__":
    # Replace these paths with your actual paths
    zip_path = "plantvillage.zip"
    csv_path = "crop_test.csv"
    
    process_zipped_images(zip_path, csv_path)

Reading CSV file: crop_test.csv
CSV contents:
Number of rows: 10279
Columns found: ['file_path', 'Label']

First few rows of CSV:
                                           file_path  Label
0  db172d76-ff08-4bd5-bd0b-6231f432434a___RS_HL-5...      0
1  c2aa2c88-d07b-4df4-80c7-1ba703254844___RS_HL-5...      0
2  d57f5f33-d8e9-4c40-80ad-c2875743f781___RS_HL-5...      0
3  83636c4e-c4f1-47c4-81e8-0bcf7bac8995___RS_HL-5...      0
4  398f4455-d008-4d69-a58d-b0a1f228abaa___RS_HL-5...      0

Created output folder: Test

Opening zip file: plantvillage.zip
Number of files in zip: 54306
First few files in zip:
- plantvillage/ddb68cbe-d031-4adc-b7d3-04cc6fcbd042___FAM_B.Rot-5076.JPG
- plantvillage/4b472308-dde6-41b2-943e-707eb5b4b050___Mt.N.V_HL-8942.JPG
- plantvillage/d2d20c95-1a0d-49cf-aa4d-0148c78354e2___RS_HL-3042.JPG
- plantvillage/7475a28c-f789-49d9-8289-17a5dc97717b___YLCV_GCREC-2895.JPG
- plantvillage/ea4a18b9-2f82-44eb-b5ca-955b3f2bf92d___JR_HL-5958.JPG

Processing files...
Extracting: 