# Connect to Google Drive (exploration)

This notebook shows how to authenticate and interact with a Google Drive folder (list images, sample-download a few) without downloading the full dataset. You can reuse these helpers later in scripts.

Prerequisites:
- Create an OAuth 2.0 Client (Desktop) in Google Cloud Console.
- Download the JSON and place it at `secrets/client_secrets.json` (or set `GOOGLE_OAUTH_CLIENT_SECRETS`).
- First run will open a browser for consent; tokens are cached locally for reuse.

Notes:
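- Supports Shared Drives if your data is there (set `INCLUDE_SHARED_DRIVES=True`; the notebook reads it from the `GOOGLE_DRIVE_INCLUDE_SHARED` environment variable, which defaults to true).
- Use `TEST_MODE=True` to limit the number of files listed/processed to the first `TEST_LIMIT` files (default 25); both values can be set through environment variables of the same names.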


In [None]:
#%pip install -q pydrive2 google-auth-oauthlib tqdm pillow


In [None]:
#%pip install -q python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Load environment variables from .env if present.
# Path is imported in this cell so the cell works when the notebook is run
# top to bottom; without it, Path('.env') raises NameError here.
from pathlib import Path

from dotenv import load_dotenv

# override=False keeps variables already set in the process environment.
load_dotenv(dotenv_path=Path('.env'), override=False)
print('Loaded .env (if present).')


In [None]:
import os
from pathlib import Path
from typing import Optional

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive


def authenticate(use_local_server: bool = True, client_secrets_path: Optional[str] = None) -> GoogleDrive:
    """Authenticate to Google Drive and return a GoogleDrive client.

    Credentials sourcing (in order):
    1) explicit client_secrets_path arg
    2) env GOOGLE_OAUTH_CLIENT_SECRETS (path to client_secrets.json)
    3) default secrets/client_secrets.json

    Access/refresh tokens are cached at GOOGLE_OAUTH_CREDENTIALS_CACHE or defaults to secrets/credentials.json.

    Args:
        use_local_server: When no cached credentials are available, run the
            local-webserver consent flow if True, otherwise the command-line flow.
        client_secrets_path: Optional path to the OAuth client secrets JSON.

    Returns:
        A GoogleDrive client bound to the authenticated GoogleAuth object.

    Reading and writing the credentials cache is best-effort: a failure there
    is reported with a RuntimeWarning and never aborts authentication.
    """
    import warnings

    client_secrets = (
        client_secrets_path
        or os.environ.get("GOOGLE_OAUTH_CLIENT_SECRETS")
        or str(Path("secrets/client_secrets.json").resolve())
    )
    cred_cache = (
        os.environ.get("GOOGLE_OAUTH_CREDENTIALS_CACHE")
        or str(Path("secrets/credentials.json").resolve())
    )

    gauth = GoogleAuth()
    gauth.LoadClientConfigFile(client_secrets)

    # Try to use cached credentials if available. An unreadable cache is not
    # fatal (the interactive flow below takes over), but it is reported so a
    # corrupt file does not go unnoticed.
    try:
        if Path(cred_cache).exists():
            gauth.LoadCredentialsFile(cred_cache)
    except Exception as exc:
        warnings.warn(
            f"Ignoring unreadable credentials cache {cred_cache!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )

    if not getattr(gauth, "credentials", None):
        # No usable cached credentials: run an interactive consent flow.
        if use_local_server:
            gauth.LocalWebserverAuth()
        else:
            gauth.CommandLineAuth()
    elif getattr(gauth, "access_token_expired", False):
        gauth.Refresh()
    else:
        gauth.Authorize()

    # Persist credentials for future runs. Failure only costs a repeated
    # consent prompt next time, so warn instead of raising.
    try:
        Path(cred_cache).parent.mkdir(parents=True, exist_ok=True)
        gauth.SaveCredentialsFile(cred_cache)
    except Exception as exc:
        warnings.warn(
            f"Could not save credentials cache {cred_cache!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )

    return GoogleDrive(gauth)


In [None]:
from typing import Iterable, List, Dict, Optional


def list_files_in_folder(
    drive: GoogleDrive,
    folder_id: str,
    *,
    mime_prefix: Optional[str] = None,
    limit: Optional[int] = None,
    include_trashed: bool = False,
    include_shared_drives: bool = True,
    drive_id: Optional[str] = None,
) -> List[Dict]:
    """List files in a Drive folder. Optionally filter by MIME prefix (e.g., 'image/').

    Args:
        drive: Authenticated Drive client.
        folder_id: ID of the folder whose direct children are listed.
        mime_prefix: If set, only files whose MIME type contains this text.
        limit: If set to a positive number, at most this many files are
            requested and returned; the rest of the folder is not fetched.
            None lists everything.
        include_trashed: Include trashed files when True.
        include_shared_drives: Send the all-drives flags so items located in
            Shared Drives are visible.
        drive_id: If set, restrict the search to that Shared Drive. This
            turns the all-drives flags on regardless of include_shared_drives,
            because a drive-scoped search needs them.

    Returns:
        A list of dicts with keys "id", "name", "mimeType" and "size"
        ("size" is None when the entry carries no fileSize).
    """
    clauses = [f"'{folder_id}' in parents"]
    if not include_trashed:
        clauses.append("trashed = false")
    if mime_prefix:
        clauses.append(f"mimeType contains '{mime_prefix}'")

    all_drives = bool(include_shared_drives or drive_id)
    params = {
        "q": " and ".join(clauses),
        "supportsAllDrives": all_drives,
        "includeItemsFromAllDrives": all_drives,
    }
    if drive_id:
        # Scope the query to a single Shared Drive.
        params["corpora"] = "drive"
        params["driveId"] = drive_id

    max_items = None if limit is None else int(limit)
    if max_items is not None and max_items > 0:
        # Ask for no more than needed per page and stop paging once enough
        # entries are collected, instead of listing the whole folder first.
        # A page may hold fewer entries than requested, hence the loop.
        params["maxResults"] = min(max_items, 1000)
        entries = []
        for page in drive.ListFile(params):
            entries.extend(page)
            if len(entries) >= max_items:
                break
        entries = entries[:max_items]
    else:
        entries = drive.ListFile(params).GetList()
        if max_items is not None:
            # Zero/negative limits keep the original slice semantics.
            entries = entries[:max_items]

    # Normalize output (id, title/name, mimeType, size if present)
    return [
        {
            "id": entry["id"],
            "name": entry.get("title") or entry.get("name"),
            "mimeType": entry.get("mimeType"),
            "size": int(entry["fileSize"]) if entry.get("fileSize") else None,
        }
        for entry in entries
    ]


def download_file_bytes(drive: GoogleDrive, file_id: str) -> bytes:
    """Download a Drive file to memory and return its raw bytes.

    Args:
        drive: Authenticated Drive client.
        file_id: ID of the file to download.

    Returns:
        The file content as bytes.
    """
    drive_file = drive.CreateFile({"id": file_id})
    # PyDrive2's GoogleDriveFile does not provide GetContentBinary();
    # FetchContent() downloads the payload into the in-memory `content`
    # buffer, from which the raw bytes are read without any text decoding.
    drive_file.FetchContent()
    return drive_file.content.getvalue()


In [None]:
# --- Configuration ---
def _env_flag(name: str, default: str = "true") -> bool:
    """Return True when environment variable `name` holds 1/true/yes/y (case-insensitive)."""
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes", "y"}


FOLDER_ID = os.getenv("GOOGLE_DRIVE_FOLDER_ID", "REPLACE_WITH_DRIVE_FOLDER_ID")  # e.g., '1AbC...'
INCLUDE_SHARED_DRIVES = _env_flag("GOOGLE_DRIVE_INCLUDE_SHARED")
TEST_MODE = _env_flag("TEST_MODE")
TEST_LIMIT = int(os.getenv("TEST_LIMIT", "25"))  # Only list/process first N files when TEST_MODE is True
# An environment variable cannot hold None, so an empty value
# (GOOGLE_DRIVE_MIME_PREFIX=) is normalised to None, meaning "list all files".
MIME_PREFIX = os.getenv("GOOGLE_DRIVE_MIME_PREFIX", "image/") or None

# Authenticate and create a Drive client
drive = authenticate(use_local_server=True)
print("Authenticated to Google Drive.")


In [None]:
# List files in the Drive folder (respects TEST_MODE)
PREVIEW_ROWS = 10  # number of entries printed below, independent of the listing limit

limit = TEST_LIMIT if TEST_MODE else None
files = list_files_in_folder(
    drive,
    FOLDER_ID,
    mime_prefix=MIME_PREFIX,
    limit=limit,
    include_shared_drives=INCLUDE_SHARED_DRIVES,
)

# Report the listing limit and the preview size separately: the limit bounds
# how many files were listed, while only the first PREVIEW_ROWS are printed.
# `is None` (not truthiness) so a limit of 0 is not reported as "none".
limit_label = "none" if limit is None else limit
shown = min(len(files), PREVIEW_ROWS)
print(f"Found {len(files)} files (limit: {limit_label}); showing first {shown}:")
for i, f in enumerate(files[:PREVIEW_ROWS]):
    print(f"{i:02d} | {f['name']} | {f['id']} | {f['mimeType']}")


In [None]:
# Download one sample image to memory and preview
from io import BytesIO
from PIL import Image
from IPython.display import display

# Preview the first listed file; when the listing came back empty there is
# nothing to download, so only print a hint.
if not files:
    print("No files returned. Check FOLDER_ID and permissions.")
else:
    sample = files[0]
    # Fetch the payload into memory (nothing is written to disk).
    raw = download_file_bytes(drive, sample["id"])
    # Decode from the in-memory buffer and convert to RGB for display.
    buffer = BytesIO(raw)
    img = Image.open(buffer).convert("RGB")
    print(f"Previewing: {sample['name']} ({len(raw)/1024:.1f} KB)")
    display(img)
