In [1]:
# --- dependencies ---
from pathlib import Path
import pandas as pd

# Root folder of the augmented dataset; every later step in this notebook
# (directory discovery, CSV export) is resolved relative to this path.
DATASET_PARENT = Path(
    r"c:/Meet/Projects/Project_8_Phoenix_Cervical Cancer Image Classification/Project-Phoenix/Dataset/Augmented Dataset - Limited Enhancement"
)
# ----------------------------------------------

# Fail fast with a clear message. is_dir() is used instead of exists() because
# a regular file at this location would pass exists() and only fail later,
# inside iterdir(), with a less helpful NotADirectoryError.
if not DATASET_PARENT.is_dir():
    raise FileNotFoundError(
        f"Dataset parent path not found or not a directory: {DATASET_PARENT}"
    )

# --- find all NLM_CLAHE directories (case-insensitive) ---
def _is_nlm_clahe_dir(candidate):
    """Return True when `candidate` is a directory whose name is NLM_CLAHE, ignoring case."""
    return candidate.is_dir() and candidate.name.lower() == "nlm_clahe"


# Resolved paths are stored in a set so the two passes below cannot
# register the same directory twice.
nlm_dirs = set()

# Pass 1 - expected layout: <DATASET_PARENT>/<class>/NLM_CLAHE.
# Only the first matching sub-folder of each class folder is taken.
for class_dir in DATASET_PARENT.iterdir():
    if class_dir.is_dir():
        first_match = next(
            (entry for entry in class_dir.iterdir() if _is_nlm_clahe_dir(entry)),
            None,
        )
        if first_match is not None:
            nlm_dirs.add(first_match.resolve())

# Pass 2 - recursive fallback for layouts that are deeper or different.
nlm_dirs.update(
    found.resolve()
    for found in DATASET_PARENT.rglob("*")
    if _is_nlm_clahe_dir(found)
)

# Nothing discovered by either pass: stop here with an actionable message.
if len(nlm_dirs) == 0:
    raise FileNotFoundError(
        "No 'NLM_CLAHE' directories found under DATASET_PARENT. "
        "Check folder names and capitalization."
    )

# --- collect BMP files from each NLM_CLAHE and map to class name (parent folder) ---
rows = []            # (absolute image path, class name) tuples
seen_paths = set()   # dedupe absolute paths

for nlm in sorted(nlm_dirs, key=lambda x: str(x)):
    class_name = nlm.parent.name    # parent folder is the class label
    # gather BMP files (case-insensitive).
    # iterdir() yields entries in arbitrary, platform-dependent order, so the
    # files are sorted here; otherwise the row order, and with it the seeded
    # shuffle and the stratified split later in the notebook, could differ
    # from one machine or run to the next.
    bmp_files = sorted(
        (p.resolve() for p in nlm.iterdir() if p.is_file() and p.suffix.lower() == ".bmp"),
        key=str,
    )
    if not bmp_files:
        # warn but continue
        print(f"Warning: no .bmp files found in: {nlm}  (class = '{class_name}')")
        continue
    for p in bmp_files:
        sp = str(p)
        if sp in seen_paths:
            # same resolved file reached a second time; keep the first entry only
            continue
        seen_paths.add(sp)
        rows.append((sp, class_name))

# --- build DataFrame ---
# One row per image: absolute path plus the class folder name.
frame_columns = ["image_path", "label_name"]
df = pd.DataFrame(rows, columns=frame_columns)
if df.empty:
    raise RuntimeError("No .bmp image files were found in any discovered NLM_CLAHE directories.")

# Alphabetically sorted class names give a stable name -> integer id mapping.
class_names = sorted(df["label_name"].unique().tolist())
label_to_id = dict(zip(class_names, range(len(class_names))))

# Attach the integer label, then shuffle with a fixed seed (helps downstream
# splitting) and renumber the index from zero.
df = (
    df.assign(label=df["label_name"].map(label_to_id))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# summary prints: where we looked, what was discovered, and how it is distributed
discovered_dirs = sorted(nlm_dirs)
images_per_class = df.groupby("label_name").size().sort_values(ascending=False)

print("Dataset parent:", DATASET_PARENT)
print("Discovered NLM_CLAHE directories (count):", len(nlm_dirs))
for nlm_path in discovered_dirs:
    print("  -", nlm_path)
print("\nFound classes (alphabetical):", class_names)
print("Total images found:", len(df))
print("Counts per class:")
print(images_per_class)

# save csv to dataset parent for convenience (index column is not written)
csv_out = DATASET_PARENT.joinpath("sipakmed_file_list.csv")
df.to_csv(csv_out, index=False)
print(f"\nSaved file list to: {csv_out}")

Dataset parent: c:\Meet\Projects\Project_8_Phoenix_Cervical Cancer Image Classification\Project-Phoenix\Dataset\Augmented Dataset - Limited Enhancement
Discovered NLM_CLAHE directories (count): 5
  - C:\Meet\Projects\Project_8_Phoenix_Cervical Cancer Image Classification\Project-Phoenix\Dataset\Augmented Dataset - Limited Enhancement\im_Dyskeratotic\NLM_CLAHE
  - C:\Meet\Projects\Project_8_Phoenix_Cervical Cancer Image Classification\Project-Phoenix\Dataset\Augmented Dataset - Limited Enhancement\im_Koilocytotic\NLM_CLAHE
  - C:\Meet\Projects\Project_8_Phoenix_Cervical Cancer Image Classification\Project-Phoenix\Dataset\Augmented Dataset - Limited Enhancement\im_Metaplastic\NLM_CLAHE
  - C:\Meet\Projects\Project_8_Phoenix_Cervical Cancer Image Classification\Project-Phoenix\Dataset\Augmented Dataset - Limited Enhancement\im_Parabasal\NLM_CLAHE
  - C:\Meet\Projects\Project_8_Phoenix_Cervical Cancer Image Classification\Project-Phoenix\Dataset\Augmented Dataset - Limited Enhancement\im_S

In [2]:
from sklearn.model_selection import train_test_split

# stratified split: hold out 20% of the rows first, then cut that hold-out in
# half to obtain the validation and test sets. Both cuts stratify on the
# integer label and reuse the same seed.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["label"],
)

# Report the size of every split.
for size_caption, split_df in (
    ("Train size:", train_df),
    ("Validation size:", val_df),
    ("Test size:", test_df),
):
    print(size_caption, len(split_df))

# Optional: check class distribution
for counts_caption, split_df in (
    ("\nTrain class counts:\n", train_df),
    ("\nValidation class counts:\n", val_df),
    ("\nTest class counts:\n", test_df),
):
    print(counts_caption, split_df["label_name"].value_counts())


Train size: 3239
Validation size: 405
Test size: 405

Train class counts:
 label_name
im_Superficial-Intermediate    665
im_Koilocytotic                660
im_Dyskeratotic                650
im_Metaplastic                 634
im_Parabasal                   630
Name: count, dtype: int64

Validation class counts:
 label_name
im_Superficial-Intermediate    83
im_Koilocytotic                83
im_Dyskeratotic                81
im_Metaplastic                 79
im_Parabasal                   79
Name: count, dtype: int64

Test class counts:
 label_name
im_Superficial-Intermediate    83
im_Koilocytotic                82
im_Dyskeratotic                82
im_Metaplastic                 80
im_Parabasal                   78
Name: count, dtype: int64


In [None]:
from datasets import Dataset, DatasetDict, Features, ClassLabel, Image

# Schema for the Hugging Face dataset: an image column and a class-label
# column whose names are the alphabetically sorted class folder names.
label_names = sorted(df["label_name"].unique())
features = Features({
    # Supplied as file paths; the original note says the image is lazy-loaded
    # (TODO confirm against the `datasets` Image feature documentation).
    "image": Image(),
    "label": ClassLabel(names=label_names),
})

def df_to_ds(dframe):
    """Convert a split DataFrame into a Hugging Face `Dataset`.

    Reads the `image_path` and `label` columns of `dframe`, exposes them as
    the `image` and `label` columns of a new dataset, and casts that dataset
    to the module-level `features` schema before returning it.
    """
    payload = {
        "image": dframe["image_path"].tolist(),
        "label": dframe["label"].tolist(),
    }
    return Dataset.from_dict(payload).cast(features)

# Assemble the three splits under their conventional names. Each frame gets a
# fresh 0..n-1 index before conversion.
split_frames = {
    "train": train_df,
    "validation": val_df,
    "test": test_df,
}
dataset = DatasetDict({
    split_name: df_to_ds(split_frame.reset_index(drop=True))
    for split_name, split_frame in split_frames.items()
})

# Quick check: overall structure, then the first training example
print(dataset)
print(dataset['train'][0])


ModuleNotFoundError: No module named 'datasets'