In [1]:
import sys
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Split a labelled CSV into N_SPLITS stratified parts and write each part
# next to the input file as "<input path without extension>_<i>.csv".

# --- Configuration -----------------------------------------------------------
DEFAULT_CSV = 'reference_Raman.csv'  # used when no CSV path is given on the command line
N_SPLITS = 5                         # number of output files
SEED = 42                            # fixed seed so the partition is reproducible

# --- Resolve the input path --------------------------------------------------
# Honour a single command-line path (script usage: "<prog> <csv_file>").
# In every other case fall back to DEFAULT_CSV instead of exiting: inside a
# Jupyter kernel sys.argv typically carries the kernel's own launch arguments
# (e.g. "-f <connection file>"), which say nothing about the data to split.
if len(sys.argv) == 2 and not sys.argv[1].startswith('-'):
    csv_path = sys.argv[1]
else:
    csv_path = DEFAULT_CSV

df = pd.read_csv(csv_path)

# Stratification below needs the class column; fail early with a clear message.
if 'Label' not in df.columns:
    raise KeyError("No 'Label' column found in the input CSV.")

# --- Partition ---------------------------------------------------------------
# Carve off fractions of the *remaining* data so every part ends up with about
# 1/N_SPLITS of the rows: 1/N, then 1/(N-1) of what is left, ..., then 1/2.
# For N_SPLITS = 5 this is [1/5, 1/4, 1/3, 1/2].
fractions = [1 / (N_SPLITS - k) for k in range(N_SPLITS - 1)]

rest = df
splits = []
for frac in fractions:
    # train_test_split returns (selected part, remainder); the remainder feeds
    # the next round. Stratifying on 'Label' keeps class proportions similar
    # across parts.
    part, rest = train_test_split(
        rest,
        train_size=frac,
        stratify=rest['Label'],
        random_state=SEED,
    )
    splits.append(part)

# Whatever is left after the last round is the final part.
splits.append(rest)

# Sanity check: the parts together must account for every input row.
assert sum(len(part) for part in splits) == len(df), "splits do not cover the input rows"

# --- Save --------------------------------------------------------------------
# Strip only the file's own extension; splitting the whole string on the last
# '.' would cut inside a directory name when the file itself has no extension.
base = str(Path(csv_path).with_suffix(''))
written = []
for i, part in enumerate(splits, start=1):
    out_name = f"{base}_{i}.csv"
    part.to_csv(out_name, index=False)
    written.append(out_name)

# Report the names that were actually written rather than a hard-coded range.
print("Done. Files written:", ", ".join(written))


Done. Files written: reference_Raman_1.csv, reference_Raman_2.csv, reference_Raman_3.csv, reference_Raman_4.csv, reference_Raman_5.csv
