# Google Drive Inventory & Security Audit

Run the cells below to authenticate and scan your Google Drive for file permissions and ownership.

In [None]:
# Install the third-party packages this notebook uses: the Google API client and
# its auth helpers, plus pandas and tqdm. `--quiet` trims pip's console output.
# The leading `!` runs the line as a shell command from the notebook.
!pip install --quiet google-api-python-client google-auth-httplib2 google-auth-oauthlib pandas tqdm

In [None]:
import google.auth
from google.colab import auth
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
from typing import List, Dict
import pandas as pd
from tqdm import tqdm

# Authenticate user (Colab flow)
# Uses the `auth` helper imported from google.colab, so this cell is expected
# to run inside a Colab runtime.
print("Authenticating...")
auth.authenticate_user()

# Build Drive Service
# Only the read-only metadata scope is requested here.
SCOPES = ['https://www.googleapis.com/auth/drive.metadata.readonly']
# google.auth.default() yields a pair; only the credentials object is kept.
creds, _ = google.auth.default(scopes=SCOPES)
# Drive API v3 client; the scan function below issues its requests through it.
drive_service = build('drive', 'v3', credentials=creds)

print("Authentication successful. Service built.")

In [None]:
def get_my_email():
    """Return the authenticated user's email address, or None if unknown.

    Two lookups are attempted, in order:

    1. The Drive ``about.get`` endpoint through the module-level
       ``drive_service``, which reports the current user directly.
    2. The ID token carried by the module-level ``creds`` (after a refresh).
       This was the original mechanism and is kept as a fallback.

    The lookup is best-effort: any failure is reported with ``print`` and
    treated as "email unknown" instead of being raised, so the caller can
    still run the scan without knowing the address.

    Returns:
        The email address as a string, or None when neither lookup succeeds.
    """
    # Preferred path: ask Drive who the current user is.
    try:
        about = drive_service.about().get(fields="user(emailAddress)").execute()
        email = about.get("user", {}).get("emailAddress")
        if email:
            return email
    except Exception as exc:  # best-effort; fall through to the ID token
        print(f"Could not read user email from Drive 'about': {exc}")

    # Fallback path: decode the ID token. The refresh now sits inside the
    # try so that a failed refresh no longer aborts the whole notebook cell.
    try:
        from google.oauth2 import id_token as google_id_token

        creds.refresh(Request())
        token = getattr(creds, "id_token", None)
        if not token:
            # Credentials without an ID token cannot tell us the email.
            return None
        info = google_id_token.verify_oauth2_token(token, Request())
        return info.get("email")
    except Exception as exc:
        print(f"Could not read user email from ID token: {exc}")
        return None

# Labels for "anyone with the link" permissions, keyed by the permission role.
_PUBLIC_LINK_LABELS = {
    "reader": "Public Link (Read)",
    "commenter": "Public Link (Comment)",
    "writer": "Public Link (Edit) ⚠️",
}


def _same_email(candidate, my_email_folded):
    """Return True when *candidate* equals the already case-folded address."""
    return bool(candidate) and candidate.casefold() == my_email_folded


def _resolve_my_role(file_meta, my_email):
    """Work out the current user's role on one file record.

    Order of precedence: an explicit permission entry for the user's email,
    then ownership (``ownedByMe`` or an owner entry with the user's email),
    then a role derived from the ``capabilities`` flags. Ownership via
    ``ownedByMe`` and the capability-derived roles do not need the email, so
    a role is still produced when *my_email* is None.
    """
    me = my_email.casefold() if my_email else None

    if me:
        # 1. Direct permission entry (first match wins).
        for perm in file_meta.get("permissions", []):
            if _same_email(perm.get("emailAddress"), me):
                role = perm.get("role")
                if role:
                    return role
                break  # matched entry without a role: fall back below

    # 2. Ownership.
    if file_meta.get("ownedByMe"):
        return "owner"
    if me and any(
        _same_email(owner.get("emailAddress"), me)
        for owner in file_meta.get("owners", [])
    ):
        return "owner"

    # 3. Derived from what the API says the user may do.
    caps = file_meta.get("capabilities", {})
    if caps.get("canEdit"):
        return "writer"
    if caps.get("canComment"):
        return "commenter"
    return "reader"  # the file was listed, so it is at least readable


def _public_exposure(perms):
    """Return a label for the first ``type == "anyone"`` permission, else ""."""
    for perm in perms:
        if perm.get("type") == "anyone":
            return _PUBLIC_LINK_LABELS.get(perm.get("role"), "Public Link (Unknown)")
    return ""


def list_all_files_with_details() -> List[Dict]:
    """Scan all non-trashed files in Drive (My Drive + Shared Drives).

    Pages through ``files.list`` on the module-level ``drive_service`` and
    flattens each file into a dict with the keys ``file_id``, ``file_name``,
    ``mimeType``, ``size_bytes``, ``createdTime``, ``modifiedTime``,
    ``owner_email``, ``owner_name``, ``my_role``, ``public_level`` and ``url``.

    Returns:
        One dict per file. If an API call fails, the error is printed and the
        records gathered so far are returned, so the list may be partial.
    """
    files_info = []
    page_token = None
    my_email = get_my_email()

    print(f"Starting scan for user: {my_email if my_email else 'Unknown'}")

    # Query: Not in trash
    query = "trashed = false"
    # ownedByMe lets ownership be detected even when the email is unknown.
    fields = (
        "nextPageToken, "
        "files("
        " id, name, mimeType, size, "
        " createdTime, modifiedTime, ownedByMe, "
        " owners(emailAddress, displayName), "
        " permissions(emailAddress, role, type), "
        " capabilities(canEdit, canComment)"
        ")"
    )

    # Total is unknown up front; the context manager guarantees the bar is
    # closed even if processing a record raises.
    with tqdm(desc="Scanning files", unit=" files") as pbar:
        while True:
            try:
                resp = drive_service.files().list(
                    q=query,
                    pageSize=1000,
                    pageToken=page_token,
                    fields=fields,
                    includeItemsFromAllDrives=True,
                    supportsAllDrives=True,
                ).execute()
            except Exception as e:
                # Deliberate best-effort: keep what was collected so far.
                print(f"API Error: {e}")
                break

            # An empty page does NOT mean the listing is finished; only a
            # missing nextPageToken does. So there is no early exit here.
            batch = resp.get("files", [])
            pbar.update(len(batch))

            for f in batch:
                fid = f["id"]
                size = f.get("size")  # null for Google Docs/Sheets
                owners = f.get("owners", [])
                # Owner info (usually the first one)
                first_owner = owners[0] if owners else {}

                files_info.append({
                    "file_id": fid,
                    "file_name": f.get("name", "Unknown"),
                    "mimeType": f.get("mimeType", ""),
                    "size_bytes": int(size) if size else 0,
                    "createdTime": f.get("createdTime"),
                    "modifiedTime": f.get("modifiedTime"),
                    "owner_email": first_owner.get("emailAddress") if owners else "",
                    "owner_name": first_owner.get("displayName") if owners else "",
                    "my_role": _resolve_my_role(f, my_email),
                    "public_level": _public_exposure(f.get("permissions", [])),
                    "url": f"https://drive.google.com/open?id={fid}",
                })

            page_token = resp.get("nextPageToken")
            if not page_token:
                break

    return files_info

# Run the scan
# Executes as soon as this cell runs. The per-file records are kept in
# `all_files_data` for the DataFrame cell. The reported count is the number of
# records collected, which can be a partial total if the scan stopped on an
# API error.
all_files_data = list_all_files_with_details()
print(f"Scan complete. Found {len(all_files_data)} files.")

In [None]:
# Convert to DataFrame
# One row per scanned file; the columns are the keys of the record dicts.
# With no records the frame is empty, which the `df.empty` checks below test for.
df = pd.DataFrame(all_files_data)

# Helper: Flag breakdown
def role_to_flags(role: str):
    """Translate a Drive role name into capability flags.

    Args:
        role: A role string such as "owner", "organizer", "fileOrganizer",
            "writer", "commenter" or "reader". Any other value (for example
            "unknown") is treated as read-only.

    Returns:
        A tuple ``(can_read, can_comment, can_edit, is_owner)`` of booleans.
    """
    # The shared-drive roles "organizer" and "fileOrganizer" include edit
    # rights, so they are grouped with "writer" rather than falling through
    # to read-only.
    edit_roles = ("owner", "organizer", "fileOrganizer", "writer")
    comment_roles = edit_roles + ("commenter",)

    is_owner = role == "owner"
    can_edit = role in edit_roles
    can_comment = role in comment_roles
    can_read = True  # Assuming if listed, it's readable
    return can_read, can_comment, can_edit, is_owner

if not df.empty:
    # Expand each role into four boolean columns. The flag tuples are
    # collected once and turned into a single DataFrame, instead of
    # constructing a pd.Series for every row.
    flag_cols = ["can_read", "can_comment", "can_edit", "is_owner"]
    df[flag_cols] = pd.DataFrame(
        df["my_role"].map(role_to_flags).tolist(),
        index=df.index,
        columns=flag_cols,
    )

    # Friendly names for the Google-native MIME types.
    google_type_labels = {
        "application/vnd.google-apps.folder": "Folder",
        "application/vnd.google-apps.spreadsheet": "Google Sheet",
        "application/vnd.google-apps.document": "Google Doc",
        "application/vnd.google-apps.presentation": "Google Slides",
        "application/vnd.google-apps.form": "Google Form",
    }

    # Helper: Human readable Doc Type
    def human_type(mime: str) -> str:
        """Return a short label for a MIME type.

        Known Google-native types map to a friendly name. Anything else
        yields the part after the last '/', or the whole string when there
        is no '/'.
        """
        label = google_type_labels.get(mime)
        if label is not None:
            return label
        return mime.rsplit('/', 1)[-1]

    df["doc_type"] = df["mimeType"].apply(human_type)

    # Reorder columns for better readability
    final_cols = [
        "file_name", "doc_type", "public_level", "my_role",
        "owner_name", "owner_email",
        "size_bytes", "modifiedTime", "url"
    ]

    # Create the final clean view (a copy, so later edits do not touch df)
    audit_df = df[final_cols].copy()

    print("DataFrame 'audit_df' created successfully.")
    # size_bytes is 0 for records without a reported size, hence "approx".
    print(f"Total size (approx): {audit_df['size_bytes'].sum() / 1024 / 1024:.2f} MB")
else:
    print("No files found. DataFrame is empty.")

In [None]:
# Show files exposed to the public
# `audit_df` only exists when the scan produced rows, hence the same
# `df.empty` guard that protected its creation.
if not df.empty:
    # A non-empty public_level marks a file that has an "anyone" permission.
    public_files = audit_df[audit_df['public_level'] != '']
    print(f"⚠️ Found {len(public_files)} publicly accessible files:")
    display(public_files.head(10))

    # Show top 5 largest files
    print("\n📦 Top 5 Largest Files:")
    display(audit_df.sort_values('size_bytes', ascending=False).head(5))