# ZIP File Filtering Tool (Google Colab Version)

This interactive tool lets you **filter the contents of a ZIP archive** stored in Google Drive or uploaded from your local computer.  
You can include or exclude files based on their **extensions** (e.g., `.txt`, `.csv`) or **filename substrings** (e.g., “report”, “draft”).  
The filtered result is saved as a new ZIP and can be downloaded directly in Colab.

---

## 🧭 Step-by-Step Instructions

1. **Run this cell** once — it will load the tool and mount your Google Drive automatically.  
2. **Choose how to provide the ZIP file:**
   - Click **“Upload ZIP (Colab)”** to upload a file from your computer, **or**
   - Enter the full path to a ZIP already in Drive (e.g. `/content/drive/MyDrive/data/archive.zip`).
3. (Optional) **Set filters:**
   - `Include ext` → only keep these extensions (comma-separated, e.g. `.txt,.csv`).
   - `Exclude ext` → remove files with these extensions (default excludes image formats).
   - `Include name contains` → only keep files whose names include these substrings.
   - `Exclude name contains` → remove files whose names include these substrings.
4. (Optional) **Change output settings:**
   - `Dst suffix` → suffix for the new ZIP (default `_filtered.zip`).
   - `Dst dir` → folder to save the filtered ZIP (empty = same folder as source).
5. Click **“Run Filter”** and watch the progress bar as files are processed.
6. When finished, the tool will show:
   - The number of files copied  
   - Original and filtered ZIP sizes  
   - Compression ratio  
   - A **Download link** for the new ZIP file.

> 💡 *Tip:* You can re-run this cell with different filters without re-uploading your file.

---

## 💻 Running Locally in VS Code or Jupyter Lab

If you’re using the notebook locally instead of Colab:

1. Make sure the required packages are installed:
   ```bash
   pip install ipywidgets tqdm
   ```
2. You don’t need to mount Google Drive — simply provide a local ZIP path such as:
```
C:\Users\<username>\Documents\data\archive.zip
```
3. The **Upload ZIP (Colab)** button will not appear; instead, type or paste the path directly into
the **Source ZIP:** field.
4. When you click **“Run Filter”**, the filtered ZIP will appear in the same directory as the source (or in the one you specify).

5. You can open the resulting ZIP in File Explorer or any archive manager.

🧩 Note: Some VS Code environments hide the `(venv)` prefix in the terminal —
you can verify your virtual environment is active by running
`python -c "import sys; print(sys.prefix)"` in the integrated terminal.


In [None]:
# --- ZIP Filter Utility with Colab/VS Code UI and Progress Bar ---

import zipfile
from pathlib import Path
from IPython.display import display, clear_output
import ipywidgets as widgets
import sys
import os
from tqdm.notebook import tqdm

# --- Detect Colab environment and optionally mount Google Drive ---
# The check only looks at already-loaded modules, so it never triggers an
# import of google.colab on a local Jupyter / VS Code kernel.
in_colab = "google.colab" in sys.modules
if in_colab:
    # Imported lazily: these modules exist only inside a Colab runtime.
    from google.colab import drive
    from google.colab import files

    drive.mount("/content/drive", force_remount=False)
    print("‚úÖ Google Drive mounted at /content/drive")

# --- Core function with tqdm progress bar ---
def filter_zip(
    src_zip,
    include_ext=(),
    exclude_ext=(".jpg", ".jpeg", ".gif", ".tif"),
    include_name_contains=(),
    exclude_name_contains=(),
    dst_suffix="_filtered.zip",
    dst_dir=None,
):
    """
    Copy the members of a ZIP archive that pass the filters into a new archive.

    A member is kept when ALL of the following hold:
      * ``include_ext`` is empty, or the member's extension is listed in it;
      * the member's extension is not listed in ``exclude_ext``;
      * ``include_name_contains`` is empty, or the member's full name contains
        at least one of the substrings;
      * the member's full name contains none of ``exclude_name_contains``.

    Extension and substring matching is case-insensitive. Extensions may be
    given with or without the leading dot (``"jpg"`` == ``".JPG"``); an empty
    string stands for "no extension". Directory entries are never copied.

    Args:
        src_zip: Path (str or Path) of the archive to read.
        include_ext: Extensions to keep (iterable of str, or a single str).
        exclude_ext: Extensions to drop (iterable of str, or a single str).
        include_name_contains: Substrings, one of which must appear in the name.
        exclude_name_contains: Substrings that must not appear in the name.
        dst_suffix: Appended to the source file's stem to form the output name.
        dst_dir: Output directory; falls back to the source's directory when
            empty/None. It is created if it does not exist.

    Returns:
        Tuple ``(dst_zip, copied_count)``: the Path of the archive written and
        the number of members copied into it.

    Raises:
        ValueError: If the destination would be the source archive itself.
        FileNotFoundError / zipfile.BadZipFile: Propagated from ``zipfile``
            when the source is missing or is not a valid ZIP.
    """
    import shutil  # local import: only needed for the streamed member copy

    def _as_items(value):
        # A bare string is one entry, not an iterable of characters.
        return (value,) if isinstance(value, str) else tuple(value)

    def _normalize_ext(raw):
        cleaned = raw.strip().lower()
        # "" is left untouched so callers can still target extension-less files.
        if cleaned and not cleaned.startswith("."):
            cleaned = f".{cleaned}"
        return cleaned

    # Normalise the filters once, outside the per-member loop.
    include_exts = frozenset(_normalize_ext(e) for e in _as_items(include_ext))
    exclude_exts = frozenset(_normalize_ext(e) for e in _as_items(exclude_ext))
    include_subs = tuple(s.lower() for s in _as_items(include_name_contains))
    exclude_subs = tuple(s.lower() for s in _as_items(exclude_name_contains))

    src_zip = Path(src_zip)
    dst_dir = Path(dst_dir) if dst_dir else src_zip.parent
    dst_zip = dst_dir / f"{src_zip.stem}{dst_suffix}"

    # Opening the destination in "w" mode truncates it; if it is the source
    # (e.g. dst_suffix=".zip" in the same folder) the input would be destroyed.
    if dst_zip.resolve() == src_zip.resolve():
        raise ValueError(
            f"Destination {dst_zip} is the same file as the source; "
            "choose a different dst_suffix or dst_dir."
        )

    copied_count = 0

    with zipfile.ZipFile(src_zip, "r") as zin:
        members = [m for m in zin.infolist() if not m.filename.endswith("/")]

        # Only create the output folder once the source has opened successfully.
        dst_zip.parent.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(dst_zip, "w", compression=zipfile.ZIP_DEFLATED) as zout:
            for member in tqdm(members, desc="Filtering files", unit="file"):
                name = member.filename
                name_lower = name.lower()
                ext = Path(name).suffix.lower()

                # --- Apply filters ---
                if include_exts and ext not in include_exts:
                    continue
                if ext in exclude_exts:
                    continue
                if include_subs and not any(sub in name_lower for sub in include_subs):
                    continue
                if any(sub in name_lower for sub in exclude_subs):
                    continue

                # Write through a fresh ZipInfo: the source's own ZipInfo would be
                # mutated by the writer, and its compress_type may not be DEFLATED.
                out_info = zipfile.ZipInfo(filename=name, date_time=member.date_time)
                out_info.compress_type = zipfile.ZIP_DEFLATED
                out_info.create_system = member.create_system
                out_info.external_attr = member.external_attr  # permissions / DOS attrs
                out_info.comment = member.comment
                # Size hint lets zipfile pick ZIP64 headers up front for huge members.
                out_info.file_size = member.file_size

                # Stream in chunks so large members are never held fully in memory.
                with zin.open(member) as reader, zout.open(out_info, "w") as writer:
                    shutil.copyfileobj(reader, writer)
                copied_count += 1

    return dst_zip, copied_count


# --- UI widgets ---
# Archive location; the Colab upload handler fills this in after an upload.
src_zip_w = widgets.Text(
    placeholder="Path to .zip file (e.g. /content/drive/MyDrive/archive.zip)",
    description="Source ZIP:",
    layout=widgets.Layout(width="80%"),
)

# Comma-separated extension filters; the exclude list starts with image formats.
include_ext_w = widgets.Text(
    description="Include ext (comma):",
    value="",
    layout=widgets.Layout(width="60%"),
)
exclude_ext_w = widgets.Text(
    description="Exclude ext:",
    value=".jpg,.jpeg,.gif,.tif",
    layout=widgets.Layout(width="60%"),
)

# Comma-separated filename-substring filters.
include_name_w = widgets.Text(
    description="Include name contains:",
    value="",
    layout=widgets.Layout(width="60%"),
)
exclude_name_w = widgets.Text(
    description="Exclude name contains:",
    value="",
    layout=widgets.Layout(width="60%"),
)

# Output naming and location.
dst_suffix_w = widgets.Text(
    description="Dst suffix:",
    value="_filtered.zip",
    layout=widgets.Layout(width="50%"),
)
dst_dir_w = widgets.Text(
    description="Dst dir:",
    value="",
    placeholder="(empty = same as source)",
    layout=widgets.Layout(width="80%"),
)

# Trigger button plus the output area that both click handlers print into.
run_button = widgets.Button(
    description="Run Filter",
    icon="play",
    button_style="success",
)
output = widgets.Output()

# --- Optional Colab upload button ---
if in_colab:
    upload_button = widgets.Button(description="Upload ZIP (Colab)", icon="upload", button_style="info")

    def on_upload_clicked(b):
        """Upload a file via Colab and put its path into the Source ZIP field.

        Args:
            b: The clicked button (supplied by ipywidgets; unused).
        """
        with output:
            clear_output()
            uploaded = files.upload()
            if not uploaded:
                print("‚ùå No file uploaded.")
                return
            # Only the first uploaded file is used as the source archive.
            filename = next(iter(uploaded))
            # files.upload() saves into the current working directory, which is
            # /content only until the user changes directory; resolve it instead
            # of hard-coding the prefix.
            src_zip_w.value = os.path.abspath(filename)
            print(f"‚úÖ Uploaded: {filename}")

    upload_button.on_click(on_upload_clicked)
else:
    # Outside Colab there is no upload helper; show a hint in the button's place.
    upload_button = widgets.Label("üìÅ File picker available only in Colab.")


# --- Main logic for Run Filter button ---
def on_run_clicked(b):
    """Read the form fields, run ``filter_zip`` and report the outcome.

    Everything is printed into the shared ``output`` widget: the path of the
    new archive, the number of files copied, the original and filtered archive
    sizes with their ratio, and (in Colab) a browser download is started.
    Any exception raised while filtering is caught and shown as an error line
    so a failure never leaves the UI without feedback.

    Args:
        b: The clicked button (supplied by ipywidgets; unused).
    """

    def _split_csv(text):
        # "a, b,,c" -> ("a", "b", "c"); blank entries are dropped.
        return tuple(part.strip() for part in text.split(",") if part.strip())

    def _clean_path(text):
        # Paths pasted via "Copy as path" arrive wrapped in quotes.
        return text.strip().strip("\"'").strip()

    with output:
        clear_output()
        src_zip = _clean_path(src_zip_w.value)
        if not src_zip:
            print("‚ùå Please specify a source ZIP file path.")
            return

        include_ext = _split_csv(include_ext_w.value)
        exclude_ext = _split_csv(exclude_ext_w.value)
        include_name_contains = _split_csv(include_name_w.value)
        exclude_name_contains = _split_csv(exclude_name_w.value)
        dst_suffix = dst_suffix_w.value.strip() or "_filtered.zip"
        dst_dir = _clean_path(dst_dir_w.value) or None

        try:
            dst_zip, count = filter_zip(
                src_zip,
                include_ext=include_ext,
                exclude_ext=exclude_ext,
                include_name_contains=include_name_contains,
                exclude_name_contains=exclude_name_contains,
                dst_suffix=dst_suffix,
                dst_dir=dst_dir,
            )
            print(f"‚úÖ Filtered ZIP created: {dst_zip}")
            print(f"üì¶ Files copied: {count}")

            # Size summary promised by the usage instructions.
            src_size = Path(src_zip).stat().st_size
            dst_size = Path(dst_zip).stat().st_size
            print(f"Original ZIP size: {src_size:,} bytes")
            print(f"Filtered ZIP size: {dst_size:,} bytes")
            if src_size:  # guard against division by zero
                print(f"Compression ratio (filtered / original): {dst_size / src_size:.1%}")

            if in_colab:
                print("‚¨áÔ∏è You can now download the file:")
                files.download(str(dst_zip))
        except Exception as e:
            # Top-level UI boundary: report the problem instead of raising
            # inside the widget callback.
            print(f"‚ùå Error: {e}")

run_button.on_click(on_run_clicked)


# --- Display the UI ---
# Widgets are stacked top to bottom in the order a user fills them in:
# source selection, filters, output settings, then the action and its log.
_ui_rows = [
    upload_button,
    src_zip_w,
    include_ext_w,
    exclude_ext_w,
    include_name_w,
    exclude_name_w,
    dst_suffix_w,
    dst_dir_w,
    run_button,
    output,
]
ui = widgets.VBox(children=_ui_rows)
display(ui)
