# EBI File List Builder

In [1]:
# %% [markdown]
# # EBI File List Builder (Jupyter Notebook)
#
# This notebook generates an EBI-compatible TSV file with columns: `Files`, `MD5`, `Type`, `Size`.
#
# ## Workflow
#
# This notebook is divided into two main stages:
#
# 1.  **Stage 1: File Discovery & Hashing**: Scans your directories, finds all files, and calculates their MD5 checksums, size, and type. It produces a raw TSV file.
# 2.  **Stage 2: Finalize Output**: Reads the raw TSV file and writes the final output with the four required columns.
#
# ### Instructions
# 1.  **Configure your settings** in the next cell (`Stage 1 User Configuration`). You **must** change the `ROOTS` variable to point to your data directory.
# 2.  **Run all cells** in order (e.g., using "Run All" in the toolbar).
# 3.  The final output will be saved to the file specified in the Stage 2 configuration.

# %%
# ------------------------------------
# Stage 1) User Configuration Cell
# ------------------------------------
# Edit these variables for the initial file discovery and hashing.
from __future__ import annotations
import os
from typing import List

# --- REQUIRED SETTINGS ---
# Add the full paths to the directories you want to scan.
ROOTS: List[str] = [
    "/home/xavier/Documents/DAE_project/dataset/Roy_training/classification_model",
    # "/path/to/your/second_data_folder", # You can add more paths
]

# The base path to make all file paths relative to.
RELATIVE_TO: str | None = "/home/xavier/Documents/DAE_project/dataset/Roy_training/classification_model"

# --- OUTPUT AND CACHE (from Stage 1) ---
# This is the intermediate file that Stage 2 will use as input.
OUT_TSV: str = "/home/xavier/Documents/DAE_project/dataset/Roy_training/classification_model/file_list_generated.tsv"
CACHE_JSON: str = ".ebi_md5_cache.json"

# --- PERFORMANCE ---
WORKERS: int = min(8, (os.cpu_count() or 4))

# --- RUNNING MODES (for Stage 1) ---
# Set ONE of these to True for the initial run:
FAST_MODE: bool = False  # Full build with MD5 hashes.
# FAST_MODE: bool = True # Quick build without MD5 hashes.
FILL_MD5_MODE: bool = False  # Fills missing MD5s from a previous FAST_MODE run.

# --- FILTERS ---
EXCLUDE_GLOBS: List[str] = ["**/*.tmp", "**/.DS_Store"]

# %% [markdown]
# ---
# ## Stage 1 Helper Functions & Main Logic
# *You generally do not need to edit the code in the cell below.*

# %%
import json
import hashlib
import mimetypes
import csv
import time
from pathlib import Path
from typing import Dict, Any, Iterable
from concurrent.futures import ThreadPoolExecutor, as_completed

# Optional progress bar (Jupyter-friendly).
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable=None, *args, **kwargs):
        if iterable: print("tqdm not found. For a progress bar, run: pip install tqdm")
        return iterable if iterable is not None else range(0)

mimetypes.add_type('video/x-msvideo', '.avi')
MIME_MAP = {
    '.zip': 'application/zip', '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    '.csv': 'text/csv', '.bib': 'text/x-bibtex', '.avi': 'video/x-msvideo', '.pkl': 'application/octet-stream',
    '.pt': 'application/octet-stream', '.pth': 'application/octet-stream', '.ckpt': 'application/octet-stream',
}


def is_tb_event(name: str) -> bool: return name.startswith('events.out.tfevents.')


def guess_mime(p: Path) -> str:
    sfx = p.suffix.lower()
    if sfx in MIME_MAP: return MIME_MAP[sfx]
    if is_tb_event(p.name): return 'application/octet-stream'
    mime, _ = mimetypes.guess_type(str(p));
    return mime or 'application/octet-stream'


def file_sig(p: Path) -> tuple[int, float]: st = p.stat(); return st.st_size, st.st_mtime


def md5sum(p: Path, blocksize: int = 4 * 1024 * 1024) -> str:
    h = hashlib.md5()
    try:
        with p.open('rb') as f:
            for chunk in iter(lambda: f.read(blocksize), b''): h.update(chunk)
    except (IOError, PermissionError) as e:
        print(f"Error reading file for MD5: {p} ({e})");
        return ""
    return h.hexdigest()


def load_cache(cache_path: Path) -> Dict[str, Any]:
    if cache_path.exists():
        try:
            return json.loads(cache_path.read_text(encoding='utf-8'))
        except (json.JSONDecodeError, IOError):
            return {}
    return {}


def save_cache(cache_path: Path, data: Dict[str, Any]) -> None:
    tmp_path = cache_path.with_suffix(cache_path.suffix + '.tmp')
    try:
        tmp_path.write_text(json.dumps(data, indent=2), encoding='utf-8');
        tmp_path.replace(cache_path)
    except IOError as e:
        print(f"Error saving cache: {e}")


def iter_files(roots: Iterable[Path], exclude_globs: List[str]) -> Iterable[Path]:
    for root in roots:
        if not root.is_dir(): print(f"Warning: Root path is not a directory, skipping: {root}"); continue
        for p in root.rglob('*'):
            if p.is_file() and not any(p.match(pat) for pat in exclude_globs): yield p


def write_tsv(out_path: Path, rows: List[tuple[str, str, str, int]]) -> None:
    header = ('Files', 'MD5', 'Type', 'Size')
    with out_path.open('w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t');
        writer.writerow(header);
        writer.writerows(rows)


def build_file_list(
        roots: List[str], out_tsv: str, cache_json: str, relative_to: str | None, workers: int, fast: bool,
        exclude_globs: List[str] | None
) -> int:
    print("--- Stage 1: Starting File List Build ---");
    roots_p = [Path(r).resolve() for r in roots]
    rel_base = Path(relative_to).resolve() if relative_to else Path.cwd()
    out_path, cache_path = Path(out_tsv), Path(cache_json)
    exclude = exclude_globs or []
    cache = load_cache(cache_path);
    files = list(iter_files(roots_p, exclude))
    rows_data: Dict[str, Dict[str, Any]] = {};
    jobs: List[Path] = []
    for p in tqdm(files, desc="1/3: Collecting file metadata"):
        key = str(p)
        try:
            rel_path = str(p.relative_to(rel_base));
            size, mtime = file_sig(p)
        except (FileNotFoundError, ValueError) as e:
            print(f"Skipping {p}: {e}");
            continue
        cached_entry = cache.get(key)
        is_valid = (cached_entry and cached_entry.get('size') == size and abs(
            cached_entry.get('mtime', 0.0) - mtime) < 1e-6)
        md5 = ''
        if not fast:
            if is_valid:
                md5 = cached_entry.get('md5', '')
            else:
                jobs.append(p)
        rows_data[key] = {'rel': rel_path, 'md5': md5, 'mime': guess_mime(p), 'size': size}
    if jobs:
        print(f"Found {len(jobs)} files that need MD5 hashing.")
        with ThreadPoolExecutor(max_workers=workers) as ex:
            futures = {ex.submit(md5sum, p): p for p in jobs}
            for fut in tqdm(as_completed(futures), total=len(jobs), desc="2/3: Hashing MD5 (parallel)"):
                p = futures[fut];
                key = str(p)
                try:
                    md5_result = fut.result()
                    if md5_result:
                        size, mtime = file_sig(p);
                        rows_data[key]['md5'] = md5_result
                        cache[key] = {'size': size, 'mtime': mtime, 'md5': md5_result}
                except Exception as e:
                    print(f"Error hashing {p}: {e}")
        save_cache(cache_path, cache)
    final_rows = [(d['rel'], d['md5'], d['mime'], d['size']) for d in rows_data.values()]
    print("3/3: Writing intermediate TSV file...");
    write_tsv(out_path, final_rows)
    print(f"Stage 1 Complete. Wrote {len(final_rows)} rows to {out_path}")
    return len(final_rows)


def fill_md5_for_existing(out_tsv: str, cache_json: str, relative_to: str | None, workers: int) -> int:
    print("--- Stage 1: Starting to Fill Missing MD5s ---");
    out_path = Path(out_tsv)
    if not out_path.exists(): print(f"Error: TSV file not found at {out_path}. Run a full build first."); return 0
    cache_path = Path(cache_json);
    cache = load_cache(cache_path)
    rel_base = Path(relative_to).resolve() if relative_to else Path.cwd()
    jobs: List[Path] = [];
    print("1/3: Scanning TSV for files needing MD5...")
    try:
        with out_path.open('r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f, delimiter='\t');
            next(reader)
            for row in tqdm(reader, desc="Scanning rows"):
                if len(row) >= 2 and not row[1]:
                    p = (rel_base / row[0]).resolve()
                    if p.exists(): jobs.append(p)
    except (IOError, StopIteration) as e:
        print(f"Could not read TSV file: {e}");
        return 0
    if not jobs: print("No missing MD5s found. File is already complete."); return 0
    print(f"Found {len(jobs)} files that need MD5 hashing.")
    md5_results: Dict[str, str] = {}
    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = {ex.submit(md5sum, p): p for p in jobs}
        for fut in tqdm(as_completed(futures), total=len(jobs), desc="2/3: Hashing MD5 (parallel)"):
            p = futures[fut];
            key = str(p)
            try:
                md5_result = fut.result()
                if md5_result:
                    size, mtime = file_sig(p);
                    md5_results[str(p.relative_to(rel_base))] = md5_result
                    cache[key] = {'size': size, 'mtime': mtime, 'md5': md5_result}
            except Exception as e:
                print(f"Error hashing {p}: {e}")
    save_cache(cache_path, cache)
    print("3/3: Rewriting TSV with new MD5s...");
    tmp_path = out_path.with_suffix('.tmp')
    with out_path.open('r', encoding='utf-8', newline='') as infile, tmp_path.open('w', encoding='utf-8',
                                                                                   newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t');
        writer = csv.writer(outfile, delimiter='\t')
        writer.writerow(next(reader))
        for row in reader:
            if len(row) >= 2 and not row[1] and row[0] in md5_results: row[1] = md5_results[row[0]]
            writer.writerow(row)
    tmp_path.replace(out_path)
    print(f"Fill complete. Updated {len(md5_results)} rows in {out_path}")
    return len(md5_results)


# %% [markdown]
# ---
# ## ▶️ Execute Stage 1: File Discovery
# *This cell runs the first stage based on your configuration above.*

# %%
def run_stage1():
    """Checks configuration and runs the appropriate file discovery function."""
    if not ROOTS or "/path/to/your/" in ROOTS[0] or (RELATIVE_TO and "/path/to/your/" in RELATIVE_TO):
        print("=" * 60 + "\n!!! CONFIGURATION NEEDED !!!\n" +
              "Please edit the `ROOTS` and `RELATIVE_TO` variables in the first cell\n" +
              "to point to your actual data directories before running.\n" + "=" * 60)
        return False
    print("Stage 1 Configuration loaded. Starting process...");
    start_time = time.time()
    if FILL_MD5_MODE:
        fill_md5_for_existing(OUT_TSV, CACHE_JSON, RELATIVE_TO, WORKERS)
    else:
        build_file_list(ROOTS, OUT_TSV, CACHE_JSON, RELATIVE_TO, WORKERS,
                        fast=FAST_MODE, exclude_globs=EXCLUDE_GLOBS)
    end_time = time.time();
    print(f"\nStage 1 finished in {end_time - start_time:.2f} seconds.")
    return True


# Run the main process
stage1_success = run_stage1()


Stage 1 Configuration loaded. Starting process...
--- Stage 1: Starting File List Build ---


1/3: Collecting file metadata: 100%|██████████| 5/5 [00:00<00:00, 44810.94it/s]


Found 2 files that need MD5 hashing.


2/3: Hashing MD5 (parallel): 100%|██████████| 2/2 [00:00<00:00, 44.68it/s]

3/3: Writing intermediate TSV file...
Stage 1 Complete. Wrote 5 rows to /home/xavier/Documents/DAE_project/dataset/Roy_training/classification_model/file_list_generated.tsv

Stage 1 finished in 0.05 seconds.



