## fetch_repo_text_files.py


fetch_repo_text_files.py
Usage:
    python fetch_repo_text_files.py https://github.com/owner/repo [output_dir]

What it does:
 - Downloads the repo zip (the default branch, or the branch named in the URL if it contains /tree/<branch>)
 - Extracts the downloaded archive
 - Copies only non-binary, non-image files to output_dir, skipping directories such as node_modules, .git and __pycache__

In [1]:
import sys
import os
import re
import tempfile
import zipfile
import shutil
import pathlib

In [2]:
# --- Configuration ---
# File extensions (lower-case, with leading dot) classified as images.
IMAGE_EXTS = {
    '.png', '.jpg', '.jpeg', '.gif', '.bmp',
    '.svg', '.webp', '.ico', '.tiff', '.tif',
}

# File extensions of compiled / database / other binary artefacts to skip.
SKIP_EXTENSIONS = {
    '.exe', '.dll', '.so', '.class', '.jar',
    '.pyc', '.pyo', '.db', '.sqlite', '.bin',
}

# Directory names to skip.
SKIP_DIR_NAMES = {
    'node_modules',
    '.git',
    '__pycache__',
}

# Sample size for the text/binary check.
# NOTE(review): presumably a byte count read from the start of each file -- confirm.
TEXT_SAMPLE_SIZE = 4096

# Fraction of non-text bytes above which content is considered binary.
NON_TEXT_THRESHOLD = 0.30


In [None]:
# --- Helpers ---
def parse_github_url(url: str):
    """Split a GitHub repository URL into ``(owner, repo, branch)``.

    Accepted forms include::

        https://github.com/owner/repo
        https://github.com/owner/repo/
        https://github.com/owner/repo.git
        https://github.com/owner/repo/tree/branch
        https://github.com/owner/repo/tree/branch/some/path

    Any query string (``?...``) or fragment (``#...``) is ignored.

    Args:
        url: The repository URL; surrounding whitespace is stripped.

    Returns:
        A ``(owner, repo, branch)`` tuple. ``repo`` has a trailing ``.git``
        removed. ``branch`` is the path segment following ``tree/``, or
        ``None`` when the URL names no branch.

    Raises:
        ValueError: If ``url`` does not look like a GitHub repo URL.
    """
    # Drop the query string / fragment first so they cannot leak into the
    # repo or branch name (e.g. ".../repo?tab=readme" or ".../tree/main#top").
    cleaned = re.split(r'[?#]', url.strip(), maxsplit=1)[0]

    m = re.match(r'https?://github\.com/([^/]+)/([^/]+)(?:/(.*))?', cleaned)
    if not m:
        raise ValueError("Not a recognized GitHub repo URL.")

    owner = m.group(1)
    repo = m.group(2).removesuffix('.git')
    tail = m.group(3) or ''

    branch = None
    parts = tail.split('/')
    if len(parts) >= 2 and parts[0] == 'tree':
        # Only the first segment after "tree/" is used, so a branch name that
        # itself contains "/" cannot be recovered from the URL alone.
        # An empty segment (".../tree/") means no branch was given.
        branch = parts[1] or None
    return owner, repo, branch