In [2]:
import io
import os
import re
import tokenize

import requests

In [3]:
def fetch_model_docs_and_comments(repo_owner, repo_name, paths, token=None, timeout=30):
    """
    Fetch documentation and code comments explaining model functionalities from specific paths
    in a GitHub repository.

    Each path is looked up through the GitHub "contents" API. A path that resolves to a
    single file is downloaded as-is. A path that resolves to a directory is listed (its
    immediate entries only, no recursion) and every ``.md`` / ``.py`` file in it is
    downloaded; ``.py`` files are reduced to their comments and docstrings via
    ``extract_python_comments``.

    Parameters:
    - repo_owner: GitHub username or organization name, e.g., "huggingface"
    - repo_name: Repository name, e.g., "transformers"
    - paths: List of specific paths to target, e.g., ["README.md", "docs", "src/transformers"]
    - token: Optional GitHub access token. When omitted, the ``GITHUB_TOKEN`` environment
      variable is used; when neither is set the requests are sent unauthenticated.
    - timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
    A list of ``{'file_name': ..., 'content': ...}`` dicts, in the order the files were fetched.

    Raises:
    - requests.HTTPError: if GitHub answers any request with an error status
      (bad credentials, unknown path, rate limit, ...).
    """
    # Never embed the credential in the notebook: a placeholder token makes GitHub reject
    # every call, and a real one would leak when the notebook is shared.
    token = token or os.environ.get("GITHUB_TOKEN")
    headers = {'Authorization': f'token {token}'} if token else {}
    documentation_content = []

    # One session reuses the underlying connection across the many small downloads.
    with requests.Session() as session:
        session.headers.update(headers)

        def download(url):
            # Fail loudly on HTTP errors instead of treating the error payload as data.
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
            return response

        for path in paths:
            url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{path}"
            data = download(url).json()

            if isinstance(data, dict) and data.get('type') == 'file':
                # Handle single files like README.md (content is kept unmodified)
                file_content = download(data['download_url']).text
                documentation_content.append({'file_name': data['name'], 'content': file_content})
            elif isinstance(data, list):
                # Handle directories like docs/ or src/transformers
                for entry in data:
                    name = entry['name']
                    if entry['type'] != 'file' or not name.endswith(('.md', '.py')):
                        continue
                    file_content = download(entry['download_url']).text
                    if name.endswith('.py'):
                        # Keep only docstrings and comments from Python sources
                        file_content = extract_python_comments(file_content)
                    documentation_content.append({'file_name': name, 'content': file_content})

    return documentation_content

In [4]:
def extract_python_comments(content):
    """
    Extracts comments and docstrings from Python code.

    The source is tokenized, so a ``#`` that appears inside a string literal (for
    example a URL fragment) or inside a docstring is not mistaken for a comment, and
    triple-quoted strings are recognised with either quote style (``\"\"\"`` or ``'''``).
    If the source cannot be tokenized (e.g. it is not valid Python 3), a regex-based
    best-effort extraction is used instead.

    Parameters:
    - content: Python source code as a string.

    Returns:
    A single string containing the text inside every triple-quoted string literal,
    followed by every comment (with its leading ``#``), each group in source order,
    joined by newlines. Returns an empty string when nothing is found.
    """
    docstrings = []
    single_comments = []
    try:
        for token in tokenize.generate_tokens(io.StringIO(content).readline):
            if token.type == tokenize.COMMENT:
                single_comments.append(token.string)
            elif token.type == tokenize.STRING:
                # Drop any string prefix (r, b, u, ...) to reach the opening quote.
                # NOTE(review): newer Python versions tokenize f-strings separately,
                # so triple-quoted f-strings may not be collected here.
                literal = token.string.lstrip('rRbBuUfF')
                if literal[:3] in ('"""', "'''"):
                    docstrings.append(literal[3:-3])
    except (tokenize.TokenError, SyntaxError):
        # Source that does not tokenize: fall back to pattern matching over the raw
        # text rather than returning nothing.
        docstrings = [
            match.group(2)
            for match in re.finditer(r'("""|\'\'\')(.*?)\1', content, re.DOTALL)
        ]
        single_comments = re.findall(r'#.*', content)
    return "\n".join(docstrings + single_comments)

In [5]:
# Repository locations to scan: the top-level README plus the docs/ and
# src/transformers directories.
paths = [
    "README.md",
    "docs",
    "src/transformers",
]

In [6]:
# Collect documentation and comments from the Hugging Face Transformers repository,
# then show a short preview of each collected file.
docs_comments = fetch_model_docs_and_comments("huggingface", "transformers", paths)
for doc in docs_comments:
    print(
        f"\n--- {doc['file_name']} ---",
        f"Content length: {len(doc['content'])} characters",
        doc['content'][:500],  # Preview is limited to the first 500 characters
        sep="\n",
    )