<div class="alert alert-block alert-info">

----------
---------
# <b> 1. Imports and Reproducibility</b> 

--------------
----------------
</div>

In [None]:
import json
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from PIL import ImageFile
from torchvision import datasets

# Allow PIL to load truncated images.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Single seed shared by every random number generator used in this notebook,
# fixed for reproducibility.
SEED = 42

random.seed(SEED)        # Python's built-in RNG
np.random.seed(SEED)     # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG


<div class="alert alert-block alert-info">

----------
---------
# <b> 2. Paths Setup</b>

--------------
----------------
</div>

In [None]:
# Dataset location and output directory.
# Both default to the original server paths, but can be redirected without
# editing the notebook by setting the CORROSION_DATA_DIR / CORROSION_SPLIT_DIR
# environment variables before starting the kernel.
DATA_DIR = Path(os.environ.get("CORROSION_DATA_DIR", "/home/shared-data/corrosion_images"))

SPLIT_DIR = Path(
    os.environ.get("CORROSION_SPLIT_DIR", "/home/javid/corrosion-detector-submission/data")
)
# Create the output directory (and any missing parents); no-op if it already exists.
SPLIT_DIR.mkdir(parents=True, exist_ok=True)

# Output files written by the split-creation cell.
NPY_PATH  = SPLIT_DIR / "fixed_split_indices.npy"   # pickled dict of index arrays
JSON_PATH = SPLIT_DIR / "fixed_split_indices.json"  # same indices as plain lists
CSV_PATH  = SPLIT_DIR / "split_summary.csv"         # per-split counts and percentages


<div class="alert alert-block alert-info">

----------
---------
# <b> 3. Loading Dataset</b>

--------------
----------------
</div>

In [4]:
# Build the dataset from DATA_DIR and record its size and class names;
# the split cell below only needs `num_samples`.
print("Loading dataset...")
dataset = datasets.ImageFolder(root=DATA_DIR)

class_names, num_samples = dataset.classes, len(dataset)

print(f"Total images: {num_samples}", f"Classes: {class_names}", sep="\n")


Loading dataset...
Total images: 3999
Classes: ['corrosion', 'no_corrosion']


<div class="alert alert-block alert-info">

----------
---------
# <b> 4. Creating Fixed 80/10/10 Split</b>

--------------
----------------
</div>

In [None]:
# Shuffle with a dedicated RNG seeded from SEED instead of the global NumPy
# state. The split then depends only on SEED and num_samples, not on which
# cells were executed (or re-executed) before this one. A legacy RandomState
# is used on purpose: it produces the same stream as np.random.seed(SEED)
# followed by np.random.shuffle, so the indices match what a clean
# top-to-bottom run produced, as long as nothing else drew from the global
# stream in between.
split_rng = np.random.RandomState(SEED)
indices = np.arange(num_samples)
split_rng.shuffle(indices)

# Split boundaries (80 / 10 / 10). Both are rounded down, so the test split
# receives whatever remains.
train_end = int(0.8 * num_samples)
val_end   = int(0.9 * num_samples)

split_indices = {
    "train": indices[:train_end],
    "val":   indices[train_end:val_end],
    "test":  indices[val_end:],
}
split_sizes = {name: len(idx) for name, idx in split_indices.items()}

# Sanity check: every sample index lands in exactly one split.
assert np.array_equal(
    np.sort(np.concatenate(list(split_indices.values()))),
    np.arange(num_samples),
), "splits must form a partition of range(num_samples)"

# NumPy copy. A dict is stored as a pickled 0-d object array, so it must be
# read back with np.load(..., allow_pickle=True).item(); only do that with
# files you trust. The JSON copy below carries the same indices without pickle.
np.save(NPY_PATH, split_indices)

# JSON copy: plain lists of ints.
with open(JSON_PATH, "w") as f:
    json.dump(
        {name: idx.tolist() for name, idx in split_indices.items()},
        f,
        indent=2,
    )

# CSV summary: one row per split with its size and share of the dataset.
summary_df = pd.DataFrame({
    "split": list(split_sizes),
    "num_samples": list(split_sizes.values()),
    "percentage": [size / num_samples * 100 for size in split_sizes.values()],
})
summary_df.to_csv(CSV_PATH, index=False)

print("Fixed split created and saved:")
print(f"- {NPY_PATH}")
print(f"- {JSON_PATH}")
print(f"- {CSV_PATH}")


Fixed split created and saved:
- /home/javid/corrosion-detector-submission/data/fixed_split_indices.npy
- /home/javid/corrosion-detector-submission/data/fixed_split_indices.json
- /home/javid/corrosion-detector-submission/data/split_summary.csv


<div class="alert alert-block alert-info">

----------
---------
# <b> 5. Verifying Saved Split</b>

--------------
----------------
</div>

In [6]:
# Read the split back from disk. The .npy file holds a pickled dict inside a
# 0-d object array, hence allow_pickle=True and .item().
# NOTE: unpickling can execute arbitrary code -- only load files written by
# this notebook; use the JSON copy for anything from an untrusted source.
loaded_split = np.load(NPY_PATH, allow_pickle=True).item()

print("Loaded split sizes:")
for split_name, split_idx in loaded_split.items():
    print(f"{split_name}: {len(split_idx)}")

# Check the contents, not just the sizes: together the splits must contain
# each index 0..N-1 exactly once (no overlap, no gaps).
loaded_all = np.concatenate([np.asarray(idx) for idx in loaded_split.values()])
assert np.array_equal(np.sort(loaded_all), np.arange(len(loaded_all))), \
    "loaded splits overlap or do not cover a contiguous index range"

# The JSON copy must describe exactly the same split as the NumPy copy.
with open(JSON_PATH) as f:
    json_split = json.load(f)
assert set(json_split) == set(loaded_split), "JSON and NPY files list different splits"
for split_name, split_idx in loaded_split.items():
    assert np.array_equal(split_idx, json_split[split_name]), \
        f"JSON and NPY indices differ for split '{split_name}'"


Loaded split sizes:
train: 3199
val: 400
test: 400
