In [None]:
import os
import shutil
import zipfile
from datetime import datetime
from glob import glob

import pandas as pd
from tqdm import tqdm

# Mount Google Drive at /content/drive so the dataset folder configured below
# (under /content/drive/MyDrive) is readable and writable from this runtime.
# The google.colab module is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# ---------- Config ----------
# The image archive and the label CSVs all live in one Drive folder.
drive_input_dir = "/content/drive/MyDrive/AMMeBa_Dataset"
main_zip_path = os.path.join(drive_input_dir, "all_images_batches.zip")

# Label exports, listed in the order they are later read and concatenated.
label_files = [
    os.path.join(drive_input_dir, csv_name)
    for csv_name in (
        "labels_new_3.csv",
        "labels_new_3 (1).csv",
        "labels_new_3 (2).csv",
    )
]

Mounted at /content/drive


In [None]:
# ---------- Step 1: Unpack the outer archive ----------
# The outer zip is extracted into unpack_dir; step 2 then looks there for
# the per-batch zips.
unpack_dir = "/content/all_images_batches"
os.makedirs(unpack_dir, exist_ok=True)

print("📦 Unzipping all_images_batches.zip...")
with zipfile.ZipFile(main_zip_path, 'r') as outer_archive:
    outer_archive.extractall(unpack_dir)

# ---------- Step 2: Unpack every per-batch archive ----------
# All batches are extracted into the same image_dir.
image_dir = "/content/images_all"
os.makedirs(image_dir, exist_ok=True)

# Only entries directly inside unpack_dir whose name ends in ".zip" are
# treated as batches (no recursion, case-sensitive suffix match).
batch_zips = [entry for entry in os.listdir(unpack_dir) if entry.endswith(".zip")]
print(f"📦 Found {len(batch_zips)} image batch files.")
for batch_name in tqdm(batch_zips, desc="Extracting image batches"):
    batch_path = os.path.join(unpack_dir, batch_name)
    with zipfile.ZipFile(batch_path, 'r') as batch_archive:
        batch_archive.extractall(image_dir)

# ---------- Step 3: Read every label CSV and stack them ----------
df_list = [pd.read_csv(label_path) for label_path in label_files]
# ignore_index gives the merged frame a fresh 0..n-1 index.
labels_df = pd.concat(df_list, ignore_index=True)

# ---------- Step 4: Drop disqualified rows ----------
# Normalise the flag to stripped, upper-case text so values such as
# boolean True, "true" and " True " all compare equal to "TRUE".
# The normalised text is stored back in the column.
disqualified_flag = labels_df["disqualified"].astype(str).str.strip().str.upper()
labels_df["disqualified"] = disqualified_flag

is_disqualified = disqualified_flag == "TRUE"
num_disqualified = is_disqualified.sum()
labels_df = labels_df[~is_disqualified].reset_index(drop=True)

print(f"🧹 Disqualified entries removed: {num_disqualified}")
print(f"🟢 Remaining entries: {len(labels_df)}")


In [None]:

# ---------- Step 5: Sort by submission_time ----------
# An explicit format makes a malformed timestamp raise instead of being
# silently reinterpreted.
labels_df["submission_time"] = pd.to_datetime(labels_df["submission_time"], format="%Y-%m-%d %H:%M:%S")

# Use a stable sort: the default (quicksort) gives rows that share a
# timestamp an arbitrary relative order, which would let the same input
# produce different train/val/test membership across environments.
# With mergesort, tied rows keep the order they had after merging the CSVs.
labels_df = labels_df.sort_values(by="submission_time", kind="mergesort").reset_index(drop=True)

# ---------- Step 6: Time-based split 10:1:1 ----------
# Oldest 10/12 -> train, next 1/12 -> val, newest 1/12 -> test.
# Integer floor division yields the same cut points as int(n * k / 12)
# without going through floating point.
total_len = len(labels_df)
train_end = total_len * 10 // 12
val_end = total_len * 11 // 12

train_df = labels_df.iloc[:train_end].copy()
val_df = labels_df.iloc[train_end:val_end].copy()
test_df = labels_df.iloc[val_end:].copy()

# ---------- Sanity check ----------
# Strict "<": rows with the same timestamp must not straddle a split
# boundary. An empty split also fails here, because max()/min() of an
# empty datetime column is NaT and any comparison with NaT is False.
assert train_df["submission_time"].max() < val_df["submission_time"].min(), "Train must be before Val"
assert val_df["submission_time"].max() < test_df["submission_time"].min(), "Val must be before Test"

print("✅ Time-based split complete:")
print(f"Train: {len(train_df)} samples")
print(f"Val:   {len(val_df)} samples")
print(f"Test:  {len(test_df)} samples")

# ---------- Step 7: Write one CSV per split (images are not touched here) ----------
final_dataset_dir = "/content/final_dataset"
os.makedirs(final_dataset_dir, exist_ok=True)

# The file name is derived from the split name: train.csv, val.csv, test.csv.
for split_name, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_frame.to_csv(os.path.join(final_dataset_dir, f"{split_name}.csv"), index=False)

# ---------- Step 8: Archive the extracted images ----------
# Writes /content/images_all.zip from the raw extraction folder.
# No CSV-only archive of final_dataset is built at this point: it would be
# written to /content/final_dataset.zip, the same path the full archive
# below opens in 'w' mode, so it would be overwritten before anything
# could read it.
# NOTE(review): images_all.zip is not consumed by any later step in this
# cell — confirm it is still needed, since compressing every image is slow.
print("📦 Zipping images_all...")
shutil.make_archive("/content/images_all", 'zip', image_dir)

# ---------- Step 9: Bundle images with the CSVs, zip once, upload ----------
# Copy the images next to the split CSVs so the final archive is
# self-contained (final_dataset/{train,val,test}.csv + final_dataset/images/).
print("📥 Copying all images into final_dataset/images/ ...")
dst_img_dir = os.path.join(final_dataset_dir, "images")
os.makedirs(dst_img_dir, exist_ok=True)

# Top-level entries only; glob's "*" does not match dot-files.
image_paths = glob(os.path.join(image_dir, "*"))
for img_path in tqdm(image_paths, desc="Copying images"):
    shutil.copy(img_path, dst_img_dir)

# Zip the whole final_dataset folder. Archive names are made relative to
# the folder's parent so entries keep the leading "final_dataset/" prefix.
print("📦 Creating final_dataset.zip...")
final_zip_path = "/content/final_dataset.zip"
archive_root = os.path.dirname(final_dataset_dir)
with zipfile.ZipFile(final_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _, files in os.walk(final_dataset_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, start=archive_root)
            zipf.write(file_path, arcname)

# Upload: copy the archive into the mounted Drive dataset folder.
shutil.copy(final_zip_path, os.path.join(drive_input_dir, "final_dataset.zip"))
print("✅ Uploaded final_dataset.zip with folder structure to Google Drive.")


✅ Time-based split complete:
Train: 11240 samples
Val:   1124 samples
Test:  1124 samples
📦 Zipping final_dataset...
📥 Copying all images into final_dataset/images/ ...


Copying images: 100%|██████████| 10264/10264 [00:07<00:00, 1320.78it/s]


📦 Creating final_dataset.zip...
✅ Uploaded final_dataset.zip with folder structure to Google Drive.


In [None]:
# ---------- Verify that every CSV row has its image on disk ----------
# Paths
base_dir = "/content/final_dataset"
# Deliberately not named image_dir: that name already holds the raw
# extraction folder used by the archive/copy steps. Rebinding it here would
# make re-running those steps copy final_dataset/images onto itself.
final_image_dir = os.path.join(base_dir, "images")
splits = ["train", "val", "test"]

# Running total of missing images across all splits.
total_missing = 0

for split in splits:
    csv_path = os.path.join(base_dir, f"{split}.csv")
    df = pd.read_csv(csv_path)

    # One entry per CSV row whose file is absent, so a filename that
    # appears in several rows is counted once per row.
    missing_files = [
        fname
        for fname in df["filename"]
        if not os.path.exists(os.path.join(final_image_dir, fname))
    ]

    num_missing = len(missing_files)
    total = len(df)
    valid = total - num_missing

    print(f"📂 {split}.csv: {total} records")
    print(f"✅ Found: {valid}")
    print(f"❌ Missing: {num_missing}")

    if num_missing > 0:
        print("⚠️ Example missing files (first 5):", missing_files[:5])
        # Save the full list next to the CSVs, one filename per line.
        # Explicit UTF-8 so non-ASCII filenames do not depend on the locale.
        with open(os.path.join(base_dir, f"missing_in_{split}.txt"), "w", encoding="utf-8") as f:
            f.writelines(f"{mf}\n" for mf in missing_files)

    total_missing += num_missing

print("\n✅ Done checking all CSV files.")
if total_missing == 0:
    print("🎉 All images accounted for!")
else:
    print(f"⚠️ Total missing images: {total_missing}")

📂 train.csv: 11240 records
✅ Found: 11240
❌ Missing: 0
📂 val.csv: 1124 records
✅ Found: 1124
❌ Missing: 0
📂 test.csv: 1124 records
✅ Found: 1124
❌ Missing: 0

✅ Done checking all CSV files.
🎉 All images accounted for!
