In [None]:
# Unzip the downloaded dataset
!unzip Soil_classification_dataset.zip -d /content/Soil-classification-dataset

Streaming output truncated to the last 5000 lines.
  inflating: /content/Soil-classification-dataset/Soil_classification_dataset_5/Moderate_dry/soil_moisture_india_and_sri_lanka_2017-03-07_class_2_patch_37.png  
  inflating: /content/Soil-classification-dataset/Soil_classification_dataset_5/Moderate_dry/soil_moisture_india_and_sri_lanka_2017-03-07_class_2_patch_48.png  
  inflating: /content/Soil-classification-dataset/Soil_classification_dataset_5/Moderate_dry/soil_moisture_india_and_sri_lanka_2017-03-07_class_2_patch_49.png  
  inflating: /content/Soil-classification-dataset/Soil_classification_dataset_5/Moderate_dry/soil_moisture_india_and_sri_lanka_2017-03-07_class_2_patch_50.png  
  inflating: /content/Soil-classification-dataset/Soil_classification_dataset_5/Moderate_dry/soil_moisture_india_and_sri_lanka_2017-03-07_class_2_patch_51.png  
  inflating: /content/Soil-classification-dataset/Soil_classification_dataset_5/Moderate_dry/soil_moisture_india_and_sri_lanka_201

In [None]:
import os
import random
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
from PIL import Image

# Per-class size every class is balanced to, and where the data is read from / written to
target_count = 5000  # Target images per class
dataset_path = "/content/Soil-classification-dataset/Soil_classification_dataset_5"  # Replace with the actual path
output_path = "/content/Soil_classification_dataset/Soil_classification_dataset_processed"  # Path to save the processed dataset

# Make sure the destination folder is present before anything is written into it
os.makedirs(output_path, exist_ok=True)

# Random transform settings handed to the Keras augmentation generator
augmentation_kwargs = dict(
    fill_mode="nearest",
    horizontal_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
    height_shift_range=0.1,
    width_shift_range=0.1,
    rotation_range=30,
)
datagen = ImageDataGenerator(**augmentation_kwargs)

# Function to save augmented images
def save_augmented_images(generator, image_array, save_dir, current_count, target_count):
    """Write augmented copies of ``image_array`` to ``save_dir`` until the target is reached.

    Args:
        generator: Object exposing ``flow(array, batch_size=..., save_to_dir=...,
            save_format=...)`` that yields batches indefinitely (here a Keras
            ``ImageDataGenerator``); one JPEG is written per yielded batch.
        image_array: Batch holding the single source image to augment.
        save_dir: Directory the augmented JPEG files are written into.
        current_count: Number of images counted so far.
        target_count: Count at which generation stops.

    Returns:
        int: The count after generation. ``current_count`` is a plain integer, so the
        increments made here are invisible to the caller unless this value is used.
    """
    # Already at (or past) the target: pulling even one batch would save a surplus image.
    if current_count >= target_count:
        return current_count

    batches = generator.flow(image_array, batch_size=1, save_to_dir=save_dir, save_format="jpeg")
    for _batch in batches:
        # Each batch pulled from the iterator corresponds to one file saved to disk.
        current_count += 1
        if current_count >= target_count:
            break
    return current_count

# Iterate through classes: every class folder is copied to the output directory and
# brought to exactly `target_count` images (random subset if larger, augmentation if smaller).
for class_name in os.listdir(dataset_path):
    class_dir = os.path.join(dataset_path, class_name)
    if not os.path.isdir(class_dir):
        # Only sub-folders are classes; a stray file here would make os.listdir fail below.
        continue
    output_class_dir = os.path.join(output_path, class_name)
    os.makedirs(output_class_dir, exist_ok=True)

    # List all images in the class folder
    image_files = [f for f in os.listdir(class_dir) if os.path.isfile(os.path.join(class_dir, f))]
    current_count = len(image_files)

    if current_count > target_count:
        # Downsample: keep a random subset of exactly target_count originals
        print(f"Downsampling class '{class_name}' from {current_count} to {target_count}")
        files_to_copy = random.sample(image_files, target_count)
    elif current_count < target_count:
        # Upsample: keep every original, the gap is filled with augmented copies below
        print(f"Upsampling class '{class_name}' from {current_count} to {target_count}")
        files_to_copy = image_files
    else:
        # Exactly target_count images, just copy
        print(f"Class '{class_name}' already has {current_count} images.")
        files_to_copy = image_files

    for image_file in files_to_copy:
        shutil.copy(os.path.join(class_dir, image_file), output_class_dir)

    if current_count >= target_count:
        continue
    if not image_files:
        print(f"Class '{class_name}' has no images to augment; skipping.")
        continue

    # Augment to reach target count. The shortfall is measured from the files actually on
    # disk and spread evenly over all source images, so no single image dominates the class.
    shortfall = target_count - len(os.listdir(output_class_dir))
    while shortfall > 0:
        per_image, remainder = divmod(shortfall, len(image_files))
        # `remainder` randomly chosen images produce one extra copy so the total is exact.
        bonus_files = set(random.sample(image_files, remainder))

        for image_file in image_files:
            quota = per_image + (1 if image_file in bonus_files else 0)
            if quota == 0:
                continue
            img_path = os.path.join(class_dir, image_file)
            # The context manager closes the file handle as soon as the pixels are copied out.
            with Image.open(img_path) as img:
                # Ensure images are in RGB mode, then add the leading batch axis flow() expects
                img_array = np.expand_dims(np.array(img.convert("RGB")), axis=0)
            # Counting from 0 up to `quota` makes the helper write exactly `quota` images.
            save_augmented_images(datagen, img_array, output_class_dir, 0, quota)

        # Re-count on disk: a generated file name may collide with an existing one, in which
        # case a file was overwritten rather than added and another round tops the class up.
        remaining = target_count - len(os.listdir(output_class_dir))
        if remaining >= shortfall:
            print(f"Class '{class_name}': augmentation produced no new files; still {remaining} short.")
            break
        shortfall = remaining

print("Processing complete.")


Upsampling class 'Moderate_moisture' from 1688 to 5000
Downsampling class 'Highest_moisture' from 13852 to 5000
Upsampling class 'Higher_moisture' from 506 to 5000
Downsampling class 'Moderate_dry' from 5511 to 5000
Upsampling class 'Dry' from 4687 to 5000
Processing complete.


In [None]:
import os
import shutil

# Define the folder you want to zip and the name of the output zip file
folder_to_zip = '/content/Soil_classification_dataset/Soil_classification_dataset_processed'  # Replace with your folder path
output_zip_file = '/content/Soil_classification_dataset/Soil_classification_dataset_processed.zip'  # Replace with the desired output zip file path

# shutil.make_archive appends ".zip" itself, so it needs the path without the extension.
# Only the trailing extension is stripped; str.replace('.zip', '') would also remove a
# ".zip" that happens to appear earlier in the path.
archive_base, extension = os.path.splitext(output_zip_file)
if extension.lower() != '.zip':
    archive_base = output_zip_file

# Use shutil to create a zip file; it returns the path of the archive it actually wrote
archive_path = shutil.make_archive(archive_base, 'zip', folder_to_zip)

print(f'Folder {folder_to_zip} has been zipped into {archive_path}')

Folder /content/Soil_classification_dataset/Soil_classification_dataset_processed has been zipped into /content/Soil_classification_dataset/Soil_classification_dataset_processed.zip
