In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook
# live under that mount point. google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
import cv2
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Source dataset root (read by the indexing cell, one sub-folder per class label)
dataset_path = '/content/drive/MyDrive/bisindo_dataset'

# Destination root; each split gets its own sub-folder underneath it
output_path = '/content/drive/MyDrive/bisindo_dataset_split'
train_path, val_path, test_path = [
    os.path.join(output_path, split_name)
    for split_name in ('train', 'val', 'test')
]

In [None]:
# Make sure the three split roots exist; exist_ok=True keeps a re-run of this
# cell from failing when the folders are already there.
for folder in (train_path, val_path, test_path):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Build the (filepath, label) index by walking one sub-folder per class.
# os.listdir() returns entries in arbitrary order, so both levels are sorted:
# a seeded train_test_split only reproduces the same split when the rows
# arrive in the same order on every run.
data = []
for label in sorted(os.listdir(dataset_path)):
    class_dir = os.path.join(dataset_path, label)
    # Skip stray files sitting next to the class folders.
    if not os.path.isdir(class_dir):
        continue
    for fname in sorted(os.listdir(class_dir)):
        # Case-insensitive extension match so .JPG / .JPEG / .PNG are kept too.
        if fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            filepath = os.path.join(class_dir, fname)
            data.append((filepath, label))

In [None]:
# One row per image: its file path and its class label (the parent folder name).
df = pd.DataFrame(data, columns=['filepath', 'label'])

In [None]:
# Step 2: Stratified split
# First hold out 30% of the rows, then halve that hold-out into val and test
# (70% / 15% / 15% overall). Both splits are stratified on the class label.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=1,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=1,
)

In [None]:
# Step 3: Copy function
def copy_files(df, target_dir):
    """Copy every image listed in ``df`` into ``target_dir/<label>/``.

    Args:
        df: DataFrame with a 'filepath' column (source image path) and a
            'label' column (class name, used as the sub-folder name).
        target_dir: Root folder of one split. It is created together with
            the class sub-folders if it does not exist yet.

    Each file keeps its original base name and is copied with
    ``shutil.copy2``; a file with the same name already present in the
    destination folder is overwritten.
    """
    # Create each class folder once up front instead of once per copied file.
    for label in df['label'].unique():
        os.makedirs(os.path.join(target_dir, label), exist_ok=True)

    # normpath drops a trailing slash so the progress label is never empty.
    split_name = os.path.basename(os.path.normpath(target_dir))

    # Iterate the two columns directly; iterrows() would build a Series per row.
    pairs = zip(df['filepath'], df['label'])
    for src_path, label in tqdm(pairs, total=len(df), desc=f'Copying to {split_name}'):
        dst_path = os.path.join(target_dir, label, os.path.basename(src_path))
        shutil.copy2(src_path, dst_path)

In [None]:
# Step 4: Perform copy
# One pass per split, in train -> val -> test order.
for split_df, split_dir in zip(
    (train_df, val_df, test_df),
    (train_path, val_path, test_path),
):
    copy_files(split_df, split_dir)

print("\nStratified split complete: train/val/test saved in:", output_path)

Copying to train: 100%|██████████| 3126/3126 [35:40<00:00,  1.46it/s]
Copying to val: 100%|██████████| 670/670 [07:40<00:00,  1.46it/s]
Copying to test: 100%|██████████| 670/670 [07:53<00:00,  1.42it/s]


Stratified split complete: train/val/test saved in: /content/drive/MyDrive/bisindo_dataset_split





In [None]:
# Check the Dataset Again
def analyze_dataset(dataset_path, shape_sample_size=5, name_sample_size=3):
    """Summarise a class-per-folder image dataset as a DataFrame.

    Every sub-directory of ``dataset_path`` is treated as one class. For each
    class the function counts the image files (.jpg / .jpeg / .png, matched
    case-insensitively), lists a few file names, and reads a few images with
    ``cv2.imread`` to collect their array shapes.

    Args:
        dataset_path: Root folder containing one sub-folder per class.
        shape_sample_size: Number of images per class that are opened to
            inspect their shape (default 5).
        name_sample_size: Number of file names per class listed in the
            "Sample Filenames" column (default 3).

    Returns:
        DataFrame sorted by "Label" with the columns "Label",
        "Number of Images", "Sample Filenames" and "Detected Shapes".
        It is empty (but has those columns) when no class folder exists.

    Side effects:
        Prints the class count, the image count, a warning when the sampled
        images do not all share one shape, and a warning when any sampled
        image could not be read.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    columns = ["Label", "Number of Images", "Sample Filenames", "Detected Shapes"]

    class_data = []
    image_shapes = set()
    total_images = 0
    inaccessible_images_total = 0

    for label in sorted(os.listdir(dataset_path)):
        label_path = os.path.join(dataset_path, label)
        if not os.path.isdir(label_path):
            continue

        # Sorted so the sampled files do not depend on os.listdir() ordering.
        image_files = sorted(
            f for f in os.listdir(label_path)
            if f.lower().endswith(image_exts)
        )
        count = len(image_files)
        total_images += count

        # Get sample image names
        sample_names = image_files[:name_sample_size]
        sample_files = ", ".join(sample_names) if sample_names else "-"

        # Check shape from the first few images only; decoding all is slow.
        local_shapes = set()
        for img_file in image_files[:shape_sample_size]:
            img = cv2.imread(os.path.join(label_path, img_file))
            if img is None:
                # imread signals an unreadable/corrupt file by returning None.
                inaccessible_images_total += 1
                continue
            local_shapes.add(img.shape)
        image_shapes |= local_shapes

        class_data.append({
            "Label": label,
            "Number of Images": count,
            "Sample Filenames": sample_files,
            "Detected Shapes": sorted(local_shapes)
        })

    # Explicit columns keep sort_values from raising KeyError on an empty dataset.
    df = pd.DataFrame(class_data, columns=columns)
    df = df.sort_values("Label").reset_index(drop=True)

    # Display summary
    print(f"Total Classes: {len(df)}")
    print(f"Total Images: {total_images}")
    # More than one shape inside any class also means more than one overall.
    if len(image_shapes) > 1:
        print("Warning: Multiple image resolutions detected.")
    if inaccessible_images_total:
        print(f"Warning: {inaccessible_images_total} sampled image(s) could not be read.")

    return df

In [None]:
# Train
# NOTE(review): this rebinds train_df from the split rows ('filepath'/'label')
# to the per-class summary table, so re-running the copy cell after this one
# would no longer find a 'filepath' column. Consider a separate name.
train_df = analyze_dataset(train_path)
train_df

Total Classes: 26
Total Images: 3126


Unnamed: 0,Label,Number of Images,Sample Filenames,Detected Shapes
0,A,116,"A_Background Baju_Samsung_A55.jpg, A_Polos Ter...","[(3060, 3060, 3), (3072, 3072, 3), (3056, 3056..."
1,B,118,"B_BACKGROUNDBAJU_VIVO_V20.jpeg, B_Polos terang...","[(1280, 1280, 3), (2316, 2316, 3), (3072, 3072..."
2,C,118,"C_Polos Gelap_Samsung_A55.jpg, C_Polos Gelap_O...","[(3904, 3904, 3), (3472, 3472, 3), (3056, 3056..."
3,D,119,"D_BAJU_Apple_Iphone 12.JPG, D_Polos Gelap_Sams...","[(2448, 2448, 3), (2316, 2316, 3), (3072, 3072..."
4,E,118,"E_Putih_Samsung_M51 (2).jpg, E_Polos terang_Ap...","[(2992, 2992, 3), (2448, 2448, 3), (2316, 2316..."
5,F,121,"F_Polos Gelap_Apple_Iphone 7+.JPG, F_Baju_Real...","[(3024, 3024, 3), (3072, 3072, 3), (3056, 3056..."
6,G,121,"G_Background_Baju_POCO_F4.jpg, G_Bebas_Xiaomi_...","[(1379, 1379, 3), (3904, 3904, 3), (3056, 3056..."
7,H,121,"H_Gelap_Apple_Iphone XS.jpg, H_Polos terang_Ap...","[(2316, 2316, 3), (720, 720, 3), (4000, 4000, ..."
8,I,121,"I_Bebas_Xiaomi _Note 8 Pro.jpg, I_BEBAS_Apple_...","[(1280, 1280, 3), (2316, 2316, 3), (3024, 3024..."
9,J,118,"J_Background Baju_Iphone 14 Pro.jpg, J_Backgro...","[(2992, 2992, 3), (3024, 3024, 3), (3056, 3056..."


In [None]:
# Val
# NOTE(review): this rebinds val_df from the split rows ('filepath'/'label')
# to the per-class summary table, so re-running the copy cell after this one
# would no longer find a 'filepath' column. Consider a separate name.
val_df = analyze_dataset(val_path)
val_df

Total Classes: 26
Total Images: 670


Unnamed: 0,Label,Number of Images,Sample Filenames,Detected Shapes
0,A,25,"A_Polos Terang_Samsung_A21S.jpeg, A_Bebas_Sams...","[(2448, 2448, 3), (4224, 4224, 3), (667, 668, ..."
1,B,25,"B_Polos Terang_ROG_Phone5.jpg, B_Baju_Apple_Ip...","[(3072, 3072, 3), (2316, 2316, 3), (3056, 3056..."
2,C,25,"C_backgroundputih_vivov20.jpeg, C_BAJU PUTIH_S...","[(1280, 1280, 3), (3024, 3024, 3), (4000, 4000..."
3,D,26,"D_Polos Gelap_Samsung_Galaxy A32 5G.jpg, 20240...","[(2992, 2992, 3), (2316, 2316, 3), (3056, 3056..."
4,E,25,"E_Polos Terang_iPhone_XR.jpeg, E_Body Red_Xiao...","[(1600, 1600, 3), (2992, 2992, 3), (3472, 3472..."
5,F,26,"F_background baju_Samsung_A33.jpg, 20240509_18...","[(1388, 1388, 3), (2544, 2544, 3), (3056, 3056..."
6,G,26,"G_Polos Gelap_Vivo_V23 5G.jpg, G_backgroundhit...","[(1280, 1280, 3), (3024, 3024, 3), (3464, 3464..."
7,H,26,"H_Bebas_Samsung_Galaxy M14 5G.jpg, H_Polos Ter...","[(1280, 1280, 3), (2316, 2316, 3), (3024, 3024..."
8,I,26,"I_Batik_Sony_A7C.JPG, I_Baju_Infinix_Hot 11s N...","[(1280, 1280, 3), (1581, 1581, 3), (2944, 2944..."
9,J,25,"J_Polos Terang_Redmi_Note 12 Pro.jpg, J_Putih_...","[(2544, 2544, 3), (2944, 2944, 3), (3072, 3072..."


In [None]:
# Test
# NOTE(review): this rebinds test_df from the split rows ('filepath'/'label')
# to the per-class summary table, so re-running the copy cell after this one
# would no longer find a 'filepath' column. Consider a separate name.
test_df = analyze_dataset(test_path)
test_df

Total Classes: 26
Total Images: 670


Unnamed: 0,Label,Number of Images,Sample Filenames,Detected Shapes
0,A,24,"A_Baju_Samsung_A22.jpg, A_Bebas_Oppo_A53.jpg, ...","[(2992, 2992, 3), (3024, 3024, 3), (2944, 2944..."
1,B,26,"B_Baju_Vivo_V21.jpg, B_POLOS GELAP_SAMSUNG_S22...","[(1280, 1280, 3), (2992, 2992, 3), (3456, 3456..."
2,C,26,"C_Bebas_Apple_Iphone XS.jpg, C_Background Polo...","[(2448, 2448, 3), (720, 720, 3), (2180, 2180, ..."
3,D,25,"D_Polos Terang_Infinix_Hot 11s NFC.jpg, IMG_20...","[(1280, 1280, 3), (1282, 1284, 3), (2944, 2944..."
4,E,26,"E_Background_Polos_Terang_POCO_F4.jpg, E_Bebas...","[(1280, 1280, 3), (2992, 2992, 3), (3472, 3472..."
5,F,26,"F_Background Baju_Samsung_A55.jpg, F_Baju_Appl...","[(1280, 1280, 3), (720, 720, 3), (3056, 3056, ..."
6,G,26,"G_Bebas_Samsung_A32.jpg, G_BAJU_Apple_Iphone 1...","[(2316, 2316, 3), (3024, 3024, 3), (3456, 3456..."
7,H,26,"H_Bebas_ROG_Phone5.jpg, H_Background_Baju_POCO...","[(3904, 3904, 3), (3056, 3056, 3), (1600, 1600..."
8,I,26,"I_Background Baju_Iphone 14 Pro.jpg, I_Bebas_S...","[(2992, 2992, 3), (3024, 3024, 3), (2604, 2753..."
9,J,26,"J_Polos terang_Oppo_A53.jpg, J_Bebas_Xiaomi _N...","[(3024, 3024, 3), (1280, 1280, 3), (2992, 2992..."
