In [1]:
import os
import cv2
import pandas as pd
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor

# === Helper: Find class name for image ===
def find_class_folder(base_dir, filename):
    for root, _, files in os.walk(base_dir):
        if filename in files:
            return os.path.relpath(root, base_dir)
    return None

# === Load annotations and map class names ===
def load_annotations(csv_path, base_img_dir):
    cols = ['filename', 'xmin', 'ymin', 'xmax', 'ymax', 'class_id']
    df = pd.read_csv(csv_path, names=cols)
    tqdm.pandas(desc="Mapping class names")
    df['class_name'] = df['filename'].progress_apply(lambda x: find_class_folder(base_img_dir, x))
    return df

# === Process one image: crop, save, and return record ===
def process_image(row, src_base_dir, dst_base_dir, dataset_type):
    try:
        filename = row['filename']
        class_name = row['class_name']
        class_id = row['class_id']
        xmin, ymin, xmax, ymax = row['xmin'], row['ymin'], row['xmax'], row['ymax']

        src_path = os.path.join(src_base_dir, class_name, filename)
        dst_dir = os.path.join(dst_base_dir, dataset_type, class_name)
        os.makedirs(dst_dir, exist_ok=True)
        dst_path = os.path.join(dst_dir, filename)

        img = cv2.imread(src_path)
        if img is None:
            return None

        cropped = img[ymin:ymax, xmin:xmax]
        cv2.imwrite(dst_path, cropped)

        return {
            "path": f"{dataset_type}/{class_name}/{filename}",
            "class_id": class_id,
            "class_name": class_name
        }
    except:
        return None

# === Apply multiprocessing crop ===
def save_cropped_dataset(df, src_base_dir, dst_base_dir, dataset_type):
    results = []
    with ThreadPoolExecutor() as executor:
        futures = []
        for _, row in df.iterrows():
            futures.append(executor.submit(process_image, row, src_base_dir, dst_base_dir, dataset_type))
        for f in tqdm(futures, desc=f"Cropping {dataset_type} images"):
            result = f.result()
            if result:
                results.append(result)
    return pd.DataFrame(results)

# === Paths ===
TRAIN_IMG_DIR = "/kaggle/input/stanford-car-dataset-by-classes-folder/car_data/car_data/train"
TEST_IMG_DIR = "/kaggle/input/stanford-car-dataset-by-classes-folder/car_data/car_data/test"
ANNO_TRAIN = "/kaggle/input/stanford-car-dataset-by-classes-folder/anno_train.csv"
ANNO_TEST = "/kaggle/input/stanford-car-dataset-by-classes-folder/anno_test.csv"
OUTPUT_DIR = "/kaggle/working/cropped_stanford-car-dataset"

# === Load annotations ===
train_df = load_annotations(ANNO_TRAIN, TRAIN_IMG_DIR)
test_df = load_annotations(ANNO_TEST, TEST_IMG_DIR)

# === Save cropped datasets ===
cropped_train = save_cropped_dataset(train_df, TRAIN_IMG_DIR, OUTPUT_DIR, "train")
cropped_test = save_cropped_dataset(test_df, TEST_IMG_DIR, OUTPUT_DIR, "test")

# === Save CSV files ===
cropped_train.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
cropped_test.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)

print("✅ Done! Cropped dataset and CSVs are ready.")


Mapping class names:   0%|          | 0/8144 [00:00<?, ?it/s]

Mapping class names:   0%|          | 0/8041 [00:00<?, ?it/s]

Cropping train images:   0%|          | 0/8144 [00:00<?, ?it/s]

Cropping test images:   0%|          | 0/8041 [00:00<?, ?it/s]

✅ Done! Cropped dataset and CSVs are ready.


In [2]:
import zipfile
import os

def zip_dataset(zip_name, base_dir):
    zip_path = os.path.join(base_dir, zip_name)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(base_dir):
            for file in files:
                if file == zip_name:
                    continue  # Don't include the zip file inside itself
                full_path = os.path.join(root, file)
                rel_path = os.path.relpath(full_path, base_dir)
                zipf.write(full_path, arcname=rel_path)
    print(f"✅ Dataset zipped at: {zip_path}")

# Example usage:
zip_dataset("cropped_stanford-car-dataset.zip", OUTPUT_DIR)


✅ Dataset zipped at: /kaggle/working/cropped_stanford-car-dataset/cropped_stanford-car-dataset.zip


In [17]:
import os
import json
import shutil

# Define your Kaggle username manually here
kaggle_username = "kaggle username"

# Define dataset metadata
dataset_name = "CroppedStanfordCarDataset"
dataset_title = "Cropped Stanford Car Dataset"
zip_file_path = "/kaggle/working/cropped_stanford-car-dataset/cropped_stanford-car-dataset.zip"
output_dir = "/kaggle/working/kaggle_dataset_upload"

# Create upload directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Move ZIP file to upload directory
shutil.copy(zip_file_path, os.path.join(output_dir, "cropped_stanford-car-dataset.zip"))

# Create metadata file for Kaggle dataset
metadata = {
    "title": dataset_title,
    "id": f"{kaggle_username}/{dataset_name}",
    "licenses": [{"name": "CC0-1.0"}]
}

with open(os.path.join(output_dir, "dataset-metadata.json"), "w") as f:
    json.dump(metadata, f, indent=4)


In [18]:
import json
import os

# Replace with contents of your kaggle.json
kaggle_token = {
    "username": "kaggle username",
    "key": "kaggle token"
}

# Save it to the appropriate location
os.makedirs("/root/.config/kaggle", exist_ok=True)
with open("/root/.config/kaggle/kaggle.json", "w") as f:
    json.dump(kaggle_token, f)


In [19]:
!kaggle datasets create -p /kaggle/working/kaggle_dataset_upload --dir-mode zip


Starting upload for file cropped_stanford-car-dataset.zip
100%|██████████████████████████████████████| 1.31G/1.31G [00:33<00:00, 42.4MB/s]
Upload successful: cropped_stanford-car-dataset.zip (1GB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/datasets/mahdisavoji/CroppedStanfordCarDataset
