<a href="https://colab.research.google.com/github/Iftitahyr/Klasifikasi-Penyakit-Daun-Kopi-Menggunakan-Metode-Machine-Learning-Klasik-dan-CNN/blob/main/split_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Kode program untuk split dataset**

In [None]:
!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl.metadata (6.2 kB)
Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [None]:
import os
import shutil
from google.colab import drive
import splitfolders # Pastikan library ini sudah terinstal

# --------------------------------------------
# Step 1: attach Google Drive to the Colab runtime
# --------------------------------------------
print("--- Mounting Google Drive ---")
drive.mount('/content/drive')

# --------------------------------------------
# Step 2: dataset locations
# --------------------------------------------
# IMPORTANT: point this at the original dataset on your Drive. It must be the
# folder that holds one sub-folder per image class
# (e.g. coffee___healthy, coffee___red_spider_mite, coffee___rust).
original_dataset_path = '/content/drive/MyDrive/semester 7/ML Teori/Dataset/Dataset_Daun_Kopi_Balanced'

# Destination folder for the split dataset; the 'train', 'val' and 'test'
# sub-folders end up inside it.
output_split_path = '/content/drive/MyDrive/semester 7/ML Teori/Dataset/dataset_coffee_split'

# Echo both locations (after a blank line) so a wrong path is easy to spot
# in the cell output before any files are touched.
print(
    f"\nOriginal Dataset Path: {original_dataset_path}"
    f"\nOutput Split Path: {output_split_path}"
)

# ============================================
# 3. Validate paths and prepare the output folder
# ============================================
# The source must be an existing *directory*: a regular file at that path
# would pass a plain existence check and only fail later during the split.
if not os.path.isdir(original_dataset_path):
    # Raise instead of calling exit(): inside a notebook exit() asks the
    # kernel itself to stop, whereas an exception just aborts this cell and
    # shows the reason in the traceback.
    raise FileNotFoundError(
        f"Error: Original dataset path '{original_dataset_path}' not found. "
        "Please check the path and ensure your dataset is located there."
    )

# Remove a previous split so stale files cannot leak into the new one.
if os.path.exists(output_split_path):
    # Safety net: refuse to delete a folder that is, or contains, the
    # original dataset (e.g. after a copy/paste mistake in the paths above).
    _resolved_output = os.path.realpath(output_split_path)
    _resolved_original = os.path.realpath(original_dataset_path)
    if os.path.commonpath([_resolved_output, _resolved_original]) == _resolved_output:
        raise ValueError(
            f"Refusing to delete '{output_split_path}': it is the same as, or "
            f"contains, the original dataset path '{original_dataset_path}'."
        )

    print(f"Warning: Existing output directory '{output_split_path}' found.")
    print("Deleting existing directory to perform a clean split...")
    try:
        shutil.rmtree(output_split_path)
    except OSError as e:
        # Keep the original guidance, then re-raise so the cell stops here
        # instead of splitting into a half-deleted folder.
        print(f"Error deleting existing output directory: {e}")
        print("Please delete it manually if this persists.")
        raise
    else:
        print("Existing output directory deleted successfully.")

# --------------------------------------------
# Step 4: split the dataset
# --------------------------------------------
print("\n--- Starting Dataset Splitting Process (80% Train, 10% Validation, 10% Test) ---")

# Fractions in (train, validation, test) order: 0.8 / 0.1 / 0.1.
split_ratio = (0.8, 0.1, 0.1)
# Fixed seed so that re-running the cell reproduces the same split.
random_seed = 42

try:
    splitfolders.ratio(
        original_dataset_path,
        output=output_split_path,
        seed=random_seed,
        ratio=split_ratio,
        # Original note: "do not add a prefix to the file names".
        # NOTE(review): confirm the meaning of group_prefix in the split-folders docs.
        group_prefix=None,
        # Copy rather than move, so the original dataset stays intact.
        move=False,
    )
except Exception as e:
    # Best-effort: report the failure and let the verification step below
    # decide whether a usable split exists.
    print(f"\nError during dataset splitting: {e}")
    print("Please check your original_dataset_path and folder structure.")
else:
    print("\nDataset split completed successfully!")
    print(f"New split dataset is located at: {output_split_path}")
    print("You will find 'train', 'val', and 'test' subfolders inside.")

# ============================================
# 5. Confirm the split result (optional sanity check)
# ============================================
print("\n--- Verifying Split Structure and Counts ---")


def _count_files(directory):
    """Return the number of regular, non-hidden files directly in *directory*.

    A missing directory counts as 0. Sub-directories and dot-files are
    skipped on purpose so they are never reported as images; the same rule
    is applied to the original and the split folders, keeping the two
    totals comparable.
    """
    if not os.path.isdir(directory):
        return 0
    with os.scandir(directory) as entries:
        return sum(
            1 for entry in entries
            if entry.is_file() and not entry.name.startswith('.')
        )


def _list_class_dirs(directory):
    """Return the sorted names of the sub-directories directly in *directory*."""
    with os.scandir(directory) as entries:
        return sorted(entry.name for entry in entries if entry.is_dir())


def _percentage(part, whole):
    """Format part/whole as a percentage, or 'n/a' when *whole* is zero."""
    return f"{part / whole:.1%}" if whole else "n/a"


# Class names are taken from the freshly created 'train' folder.
_train_root = os.path.join(output_split_path, 'train')
if not os.path.isdir(_train_root):
    # Raise instead of exit(): inside a notebook exit() asks the kernel to
    # stop, while an exception only aborts this cell.
    raise FileNotFoundError(
        "Error: 'train' folder not found in output. Split may have failed."
    )

class_names = _list_class_dirs(_train_root)
print(f"Classes found: {class_names}")

train_count = 0
val_count = 0
test_count = 0

for class_name in class_names:
    train_files = _count_files(os.path.join(output_split_path, 'train', class_name))
    val_files = _count_files(os.path.join(output_split_path, 'val', class_name))
    test_files = _count_files(os.path.join(output_split_path, 'test', class_name))

    print(f"  {class_name}:")
    print(f"    Train: {train_files} images")
    print(f"    Validation: {val_files} images")
    print(f"    Test: {test_files} images")

    train_count += train_files
    val_count += val_files
    test_count += test_files

# Total of the original dataset, taken over *every* class folder found there
# (not only the ones that made it into 'train'), so that a class missing
# from the split shows up as a mismatch instead of being ignored.
_original_class_names = (
    _list_class_dirs(original_dataset_path)
    if os.path.isdir(original_dataset_path)
    else []
)
total_original_count = sum(
    _count_files(os.path.join(original_dataset_path, name))
    for name in _original_class_names
)

_missing_classes = sorted(set(_original_class_names) - set(class_names))
if _missing_classes:
    print(f"Warning: classes in original dataset but missing from 'train': {_missing_classes}")

total_split_count = train_count + val_count + test_count

print("\n--- Overall Split Summary ---")
print(f"Total images in Original Dataset: {total_original_count}")
print(f"Total images in Split Dataset (Train + Val + Test): {total_split_count}")
# _percentage() guards against a ZeroDivisionError when the split is empty.
print(f"Train Set: {train_count} images ({_percentage(train_count, total_split_count)})")
print(f"Validation Set: {val_count} images ({_percentage(val_count, total_split_count)})")
print(f"Test Set: {test_count} images ({_percentage(test_count, total_split_count)})")

if total_split_count == total_original_count:
    print("Verification: Total images in split dataset matches original dataset.")
else:
    print("Warning: Total images in split dataset does NOT match original dataset. Investigate!")

print("\n--- Dataset Splitting Script Finished ---")

--- Mounting Google Drive ---
Mounted at /content/drive

Original Dataset Path: /content/drive/MyDrive/semester 7/ML Teori/Dataset/Dataset_Daun_Kopi_Balanced
Output Split Path: /content/drive/MyDrive/semester 7/ML Teori/Dataset/dataset_coffee_split

--- Starting Dataset Splitting Process (80% Train, 10% Validation, 10% Test) ---


Copying files: 4500 files [03:41, 20.34 files/s]



Dataset split completed successfully!
New split dataset is located at: /content/drive/MyDrive/semester 7/ML Teori/Dataset/dataset_coffee_split
You will find 'train', 'val', and 'test' subfolders inside.

--- Verifying Split Structure and Counts ---
Classes found: ['Daun_Bercak', 'Daun_Karat', 'Daun_Sehat']
  Daun_Bercak:
    Train: 1200 images
    Validation: 150 images
    Test: 150 images
  Daun_Karat:
    Train: 1200 images
    Validation: 150 images
    Test: 150 images
  Daun_Sehat:
    Train: 1200 images
    Validation: 150 images
    Test: 150 images

--- Overall Split Summary ---
Total images in Original Dataset: 4500
Total images in Split Dataset (Train + Val + Test): 4500
Train Set: 3600 images (80.0%)
Validation Set: 450 images (10.0%)
Test Set: 450 images (10.0%)
Verification: Total images in split dataset matches original dataset.

--- Dataset Splitting Script Finished ---
