In [2]:
import zipfile
import os
import shutil
import pandas as pd
from PIL import Image

In [3]:
# Archive to unpack; the file is uploaded directly to the session.
ZIP_FILE_PATH = 'Cell Tower Detection.v3i.tensorflow.zip'

# Directory that receives the extracted files.
EXTRACT_DIR = 'extracted_data'

# Sub-folders of the archive that are expected and will be processed.
TARGET_FOLDERS = [
    'train',
    'test',
    'valid',
]

# Expected image dimensions in pixels (used for verification and reporting).
EXPECTED_WIDTH, EXPECTED_HEIGHT = 640, 640

In [4]:
# 1. Clean up and create the extraction directory so every run starts from a fresh copy
if os.path.exists(EXTRACT_DIR):
    print(f"Cleaning up old directory: {EXTRACT_DIR}")
    shutil.rmtree(EXTRACT_DIR)
os.makedirs(EXTRACT_DIR, exist_ok=True)

try:
    # 2. Extract the zip file
    print(f"Extracting {ZIP_FILE_PATH} to {EXTRACT_DIR}...")
    with zipfile.ZipFile(ZIP_FILE_PATH, 'r') as zf:
        zf.extractall(EXTRACT_DIR)
    print("Extraction complete.")

    # File extensions that are treated as images when listing a folder
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')

    # 3. Process each target folder
    for folder_name in TARGET_FOLDERS:
        # Look for the folder name, even if nested (e.g., zip might contain 'project_name/train').
        # The first directory whose basename matches wins.
        folder_path = next(
            (root for root, _dirs, _files in os.walk(EXTRACT_DIR)
             if os.path.basename(root) == folder_name),
            None,
        )

        if not folder_path:
            print(f"\nWARNING: Folder '{folder_name}' not found within the extracted data. Skipping.")
            continue

        print(f"\nProcessing folder: {folder_name}...")

        # --- Load annotations CSV ---
        # Read it from the folder that was actually located, so nested archive layouts
        # are cross-referenced as well.
        annotations_path = os.path.join(folder_path, '_annotations.csv')
        annotated_names = None  # lower-cased filenames listed in the CSV; None = no CSV
        if os.path.exists(annotations_path):
            print("Loading _annotations.csv for cross-reference...")
            annotations_df = pd.read_csv(annotations_path)
            # Use the 'filename' column; assume the first column holds the filename otherwise
            if 'filename' in annotations_df.columns:
                name_column = 'filename'
            else:
                name_column = annotations_df.columns[0]
            # A set gives an exact, case-insensitive, O(1) lookup per image. (A regex
            # substring search would let 'a.jpg' match 'ba.jpg' and would interpret
            # '.', '(' or '+' in filenames as regex syntax.)
            annotated_names = {
                os.path.basename(str(name)).strip().lower()
                for name in annotations_df[name_column].dropna()
            }
        else:
            print("\nWARNING: '_annotations.csv' not found. Skipping annotation checks.")

        # Filter for common image file extensions; sort to ensure a consistent renaming order
        image_files = sorted(
            f for f in os.listdir(folder_path) if f.lower().endswith(image_extensions)
        )

        data_list = []
        images_without_annotation = []
        mismatched_dimensions = []

        # --- Phase 1: read metadata and park every readable image under a temporary name ---
        # Renaming straight to 'image{N}{ext}' could overwrite a not-yet-processed file that
        # already carries that name; parking all files first frees every final name.
        staged = []  # (original_filename, temp_path, new_filename, width, height)
        for i, original_filename in enumerate(image_files):
            full_original_path = os.path.join(folder_path, original_filename)

            try:
                # Use PIL to open the image and determine metadata
                with Image.open(full_original_path) as img:
                    width, height = img.size
                    # Determine the correct file extension from the image format
                    ext = '.' + img.format.lower() if img.format else '.jpg'

                # The image handle is closed here: renaming an open file fails on Windows.
                temp_path = os.path.join(folder_path, f".rename_tmp_{i + 1}")
                if os.path.exists(temp_path):
                    raise FileExistsError(f"temporary name already in use: {temp_path}")
                os.rename(full_original_path, temp_path)

            except Exception as e:
                print(f"Error processing image {original_filename}: {e}")
                continue

            # Numbering follows the position in the sorted listing (e.g., 'image1.jpeg'),
            # so an unreadable file leaves a gap rather than shifting later names.
            staged.append((original_filename, temp_path, f"image{i + 1}{ext}", width, height))

        # --- Phase 2: move every parked image to its final name and run the checks ---
        # NOTE(review): '_annotations.csv' is not rewritten, so its rows keep the original
        # filenames after this rename — confirm whether the annotations need updating too.
        for original_filename, temp_path, new_filename, width, height in staged:
            full_new_path = os.path.join(folder_path, new_filename)

            try:
                if os.path.exists(full_new_path):
                    # A file that was not staged (e.g. an unreadable image) owns the target
                    # name. Never overwrite it; put this image back where it came from.
                    restore_path = os.path.join(folder_path, original_filename)
                    if os.path.exists(restore_path):
                        raise FileExistsError(
                            f"'{new_filename}' already exists; image left at {temp_path}"
                        )
                    os.rename(temp_path, restore_path)
                    raise FileExistsError(
                        f"'{new_filename}' already exists; original name kept"
                    )
                os.rename(temp_path, full_new_path)

            except Exception as e:
                print(f"Error processing image {original_filename}: {e}")
                continue

            # Add data for the CSV
            data_list.append({
                'filename': new_filename,
                'width': width,
                'height': height,
            })

            # --- Annotation and Dimension Check ---
            # 1. Check if the original file exists in the annotations CSV
            if annotated_names is not None and original_filename.lower() not in annotated_names:
                images_without_annotation.append(new_filename)

            # 2. Check if the actual dimensions match the expected size
            if width != EXPECTED_WIDTH or height != EXPECTED_HEIGHT:
                mismatched_dimensions.append(f"{new_filename} (Actual: {width}x{height})")

        # 4. Generate the CSV file for the current folder
        if data_list:
            df = pd.DataFrame(data_list)

            csv_filename = f"{folder_name}_image_data.csv"
            # Relative path: the CSV is written to the current working directory
            df.to_csv(csv_filename, index=False)
            print(f"✅ CSV file created successfully: {csv_filename}")

        # 5. Report findings for the folder
        print(f"\n--- Validation Report for {folder_name} ---")

        # Annotation Check Report
        if annotated_names is not None:
            if images_without_annotation:
                print(f"❌ Annotation Issue: {len(images_without_annotation)} images had NO corresponding entry in annotations.csv.")
                print(f"Example: {images_without_annotation[0]}")
            else:
                print("✅ Annotation Check: All images had an entry in annotations.csv.")

        # Dimension Check Report
        if mismatched_dimensions:
            print(f"⚠️ Dimension Issue: {len(mismatched_dimensions)} images were NOT {EXPECTED_WIDTH}x{EXPECTED_HEIGHT}.")
            print(f"Example: {mismatched_dimensions[0]}")
        else:
            print(f"✅ Dimension Check: All images were confirmed to be {EXPECTED_WIDTH}x{EXPECTED_HEIGHT}.")

except FileNotFoundError:
    print(f"\nERROR: The zip file was not found at {ZIP_FILE_PATH}. Please check your path or Colab upload.")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")
finally:
    # The extracted folder is deliberately kept for inspection or future use.
    # To clean it up automatically, call shutil.rmtree(EXTRACT_DIR) here.
    print("\n--- Script Finished ---")

Extracting Cell Tower Detection.v3i.tensorflow.zip to extracted_data...
Extraction complete.
Loading _annotations.csv for cross-reference...

Processing folder: train...
✅ CSV file created successfully: train_image_data.csv

--- Validation Report for train ---
❌ Annotation Issue: 6 images had NO corresponding entry in annotations.csv.
Example: image93.jpeg
✅ Dimension Check: All images were confirmed to be 640x640.
Loading _annotations.csv for cross-reference...

Processing folder: test...
✅ CSV file created successfully: test_image_data.csv

--- Validation Report for test ---
✅ Annotation Check: All images had an entry in annotations.csv.
✅ Dimension Check: All images were confirmed to be 640x640.
Loading _annotations.csv for cross-reference...

Processing folder: valid...
✅ CSV file created successfully: valid_image_data.csv

--- Validation Report for valid ---
❌ Annotation Issue: 2 images had NO corresponding entry in annotations.csv.
Example: image22.jpeg
✅ Dimension Check: All ima

In [5]:
import shutil
import os

# --- Configuration ---
# Folder to archive, relative to the current working directory.
# NOTE(review): the extraction cell writes into EXTRACT_DIR ('extracted_data'), not into
# this folder — confirm that 'cell_tower_detection_images' is created before this cell runs.
FOLDER_TO_ZIP = 'cell_tower_detection_images'

# The name of the final zip file (make_archive appends the .zip extension itself)
OUTPUT_ZIP_NAME = 'cell_tower_detection_images'

# --- Execution ---

# Fail early with a clear message instead of a low-level error from inside make_archive.
if not os.path.isdir(FOLDER_TO_ZIP):
    raise FileNotFoundError(
        f"Folder to zip not found: '{FOLDER_TO_ZIP}' (working directory: {os.getcwd()})"
    )

# 1. Create the archive
# The parameters are:
# - base_name: The path of the final archive, without the format extension
# - format: The archive format (e.g., 'zip', 'tar', 'gztar')
# - root_dir: The directory the archive paths are relative to (FOLDER_TO_ZIP's parent)
# - base_dir: The directory to archive, given relative to root_dir (FOLDER_TO_ZIP itself)

print(f"Creating zip file from the folder '{FOLDER_TO_ZIP}'...")

# Using the parent directory as the root and the folder itself as the base
# ensures every path inside the zip starts with the folder name
# (e.g., 'cell_tower_detection_images/train/').
archive_path = shutil.make_archive(
    base_name=OUTPUT_ZIP_NAME,
    format='zip',
    root_dir=os.getcwd(), # The current working directory in Colab
    base_dir=FOLDER_TO_ZIP
)

print(f"✅ Zip file created successfully at: {archive_path}")


# 2. Download the file (Specific to Google Colab)
# The import is guarded so the cell still completes when run outside Colab.
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Google Colab; skipping download. The archive is at: {archive_path}")
else:
    # Download the path make_archive actually returned rather than rebuilding the name.
    files.download(archive_path)

Creating zip file from the folder 'cell_tower_detection_images'...
✅ Zip file created successfully at: /content/cell_tower_detection_images.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>