In [1]:
import ast
import os
import json
import tokenize
from io import BytesIO
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

## Source code extraction

In [7]:
def extract_comments(source_code):
    """Collect the text of every ``#`` comment found in ``source_code``.

    The source is tokenized with :mod:`tokenize`. For each COMMENT token the
    ``#`` and space characters are stripped from both ends, then any remaining
    surrounding whitespace is removed.

    Args:
        source_code: Python source text.

    Returns:
        A list of comment strings, in the order they appear in the source.
    """
    readline = BytesIO(source_code.encode("utf-8")).readline
    return [
        token.string.strip("# ").strip()
        for token in tokenize.tokenize(readline)
        if token.type == tokenize.COMMENT
    ]

def extract_docstrings_and_defs(file_path):
    """Extract every function/class definition and all comments from a Python file.

    Args:
        file_path: Path of the ``.py`` file to read (decoded as UTF-8).

    Returns:
        A ``(results, comments)`` tuple. ``results`` holds one dict per
        ``ast.FunctionDef`` / ``ast.ClassDef`` node found by ``ast.walk`` with
        the keys ``type`` ("function" or "class"), ``name``, ``docstring``
        ("" when absent), ``source_code`` (the full lines spanned by the
        definition) and ``file_docstring`` (the module docstring, or None).
        ``comments`` is the list returned by ``extract_comments``.

    Raises:
        SyntaxError: If the file cannot be parsed by ``ast.parse``.
        OSError / UnicodeDecodeError: If the file cannot be read as UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        source = f.read()

    tree = ast.parse(source)
    module_docstring = ast.get_docstring(tree)

    # Split once, outside the loop, and on "\n" only. str.splitlines() also
    # breaks on form feeds and Unicode separators (\x0c, \x1c, \u2028, ...),
    # which the parser does not count as line ends, so indexing its result
    # with ast line numbers can return the wrong lines. Text-mode open() has
    # already normalised "\r\n" and "\r" to "\n".
    source_lines = source.split("\n")

    results = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.ClassDef)):
            continue

        start_line = node.lineno - 1  # ast line numbers are 1-based, list indices 0-based
        # Fall back to the definition line itself when end_lineno is missing,
        # rather than producing an empty slice.
        end_line = getattr(node, "end_lineno", None) or node.lineno
        results.append({
            "type": "function" if isinstance(node, ast.FunctionDef) else "class",
            "name": node.name,
            "docstring": ast.get_docstring(node) or "",
            "source_code": "\n".join(source_lines[start_line:end_line]),
            "file_docstring": module_docstring
        })

    comments = extract_comments(source)
    return results, comments

def generate_qa_from_entry(entry):
    """Turn one extracted definition into a QA seed record.

    Args:
        entry: Dict with at least ``name``, ``docstring`` and ``type``; the
            optional ``source_code`` and ``file_docstring`` keys default to "".

    Returns:
        ``None`` when the entry has no docstring, otherwise a dict with the
        keys ``name``, ``docstring`` (stripped), ``file_docstring``,
        ``source`` (always "source_code"), ``type`` and ``code``.
    """
    name, docstring = entry["name"], entry["docstring"]
    if docstring:
        code = entry.get("source_code", "")
        module_doc = entry.get("file_docstring", "")
        return dict(
            name=name,
            docstring=docstring.strip(),
            file_docstring=module_doc,
            source="source_code",
            type=entry["type"],
            code=code,
        )
    return None

def process_directory(dir_path):
    """Walk ``dir_path`` and build QA seed records for every ``.py`` file.

    Each file is passed to ``extract_docstrings_and_defs``; every entry that
    ``generate_qa_from_entry`` accepts is tagged with the file path under the
    ``file`` key. A file that raises is reported with ``print`` and skipped;
    records already collected from it are kept.

    Args:
        dir_path: Root directory to scan recursively.

    Returns:
        A list of QA seed dicts.
    """
    collected = []
    for current_dir, _, filenames in tqdm(os.walk(dir_path)):
        for filename in tqdm(filenames):
            if not filename.endswith(".py"):
                continue

            full_path = os.path.join(current_dir, filename)
            try:
                entries, _comments = extract_docstrings_and_defs(full_path)
                for entry in entries:
                    record = generate_qa_from_entry(entry)
                    if not record:
                        continue
                    record["file"] = full_path
                    collected.append(record)
            except Exception as e:
                print(f"Failed to parse {full_path}: {e}")
    return collected

In [8]:
# Scan the local transformers checkout for documented functions and classes.
directory = "/home/cc/transformers/src/transformers"
qa_data = process_directory(directory)

# Save the records as one pretty-printed JSON array (a .json file, not JSONL).
with open("source_code_qa.json", "w", encoding="utf-8") as f:
    serialized = json.dumps(qa_data, indent=4, ensure_ascii=False)
    f.write(serialized)

print(f"Extracted {len(qa_data)} QA pairs.")

0it [00:00, ?it/s]

100%|██████████| 60/60 [00:02<00:00, 25.14it/s]
100%|██████████| 21/21 [00:00<00:00, 117.14it/s]
100%|██████████| 6/6 [00:00<00:00, 93.05it/s]
100%|██████████| 1/1 [00:00<00:00, 1956.30it/s]
100%|██████████| 5/5 [00:00<00:00, 19222.29it/s]
100%|██████████| 3/3 [00:00<00:00, 46776.62it/s]
100%|██████████| 2/2 [00:00<00:00, 99.02it/s]
100%|██████████| 8/8 [00:00<00:00, 66974.91it/s]
100%|██████████| 2/2 [00:00<00:00, 27413.75it/s]
100%|██████████| 2/2 [00:00<00:00, 22733.36it/s]
100%|██████████| 4/4 [00:00<00:00, 49636.73it/s]
100%|██████████| 11/11 [00:00<00:00, 92.11it/s]
100%|██████████| 36/36 [00:00<00:00, 67.93it/s]
100%|██████████| 31/31 [00:00<00:00, 89.41it/s]
100%|██████████| 29/29 [00:00<00:00, 76.03it/s]
100%|██████████| 6/6 [00:00<00:00, 101.47it/s]
100%|██████████| 14/14 [00:00<00:00, 19.61it/s]
100%|██████████| 3/3 [00:00<00:00, 338.86it/s]
100%|██████████| 1/1 [00:00<00:00, 116.90it/s]
100%|██████████| 8/8 [00:00<00:00, 60.39it/s]
100%|██████████| 12/12 [00:00<00:00, 27.76

Extracted 12678 QA pairs.


### Extract QA

In [14]:
from getpass import getpass

from openai import OpenAI

# Read the key with getpass() so it is not echoed into the cell output,
# which would otherwise be stored in the notebook file.
api_key = getpass("Enter your openai api key: ")
# api_base = input("Enter your openai api base: ")
os.environ["OPENAI_API_KEY"] = api_key
# os.environ["OPENAI_API_BASE"] = api_base


In [20]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import random

In [24]:
# Question templates for summarize_docstring. Every entry must end with a
# comma: adjacent string literals are silently concatenated by Python, which
# would fuse two questions into one.
SUMMARY_QUESTION_POOL = (
    "What does the {type} {name} do?",
    "What is the function of the {type} {name}?",
    "How does the {type} {name} work?",
    "What role does the {type} {name} play?",
    "What is the purpose of the {type} {name}?",
    "What does the {type} {name} accomplish?",
    "Can you explain what the {type} {name} is used for?",
    "Why do we need the {type} {name}?",
    "What is the {type} {name} responsible for?",
    "What task does the {type} {name} perform?",
    "What kind of behavior does the {type} {name} define?",
)


def summarize_docstring(name, type, docstring):
    """Ask the GPT model to summarize a docstring and pair it with a question.

    Args:
        name: Name of the function or class.
        type: Kind of definition, inserted verbatim into the templates
            (the callers in this notebook pass "function" or "class").
        docstring: Docstring text to summarize.

    Returns:
        A ``(question, answer)`` tuple: ``question`` is a randomly chosen
        template from ``SUMMARY_QUESTION_POOL`` filled with ``type`` and
        ``name``; ``answer`` is the content of the first completion choice.
    """
    question = random.choice(SUMMARY_QUESTION_POOL).format(name=name, type=type)

    prompt = """
    Summarize the following docstring, tell me what does the {type} `{name}` do.
    Docstring:
    {docstring}
    """.format(name=name, type=type, docstring=docstring)

    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    summary = response.choices[0].message.content
    print(summary)
    return question, summary

In [25]:
# Reload the extracted records from disk.
with open("source_code_qa.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

qa_data_with_summary = []
with ThreadPoolExecutor(max_workers=10) as executor:
    # Only the first 10 records are submitted for summarization.
    futures = [
        executor.submit(summarize_docstring, qa["name"], qa["type"], qa["docstring"])
        for qa in qa_data[:10]
    ]
    # Pair each future with the record it was created from (zip stops at the
    # shorter sequence, i.e. the submitted futures).
    record_for_future = dict(zip(futures, qa_data))
    for future in as_completed(futures):
        qa = record_for_future[future]
        question, answer = future.result()
        qa["question"] = question
        qa["answer"] = answer
        qa_data_with_summary.append(qa)

with open("source_code_qa_with_summary.json", "w", encoding="utf-8") as f:
    json.dump(qa_data_with_summary, f, indent=4, ensure_ascii=False)

The `softmax_backward_data` function is designed to invoke the internal `_softmax_backward_data` method in PyTorch. Its main purpose is to handle and adjust the arguments it passes to this internal method based on the version of PyTorch that is being used.
The `id_tensor_storage` function provides a unique identifier for a tensor's underlying storage. It ensures that multiple tensors sharing the same storage will have the same identifier. The identifier remains constant and unique for the storage's lifetime, although different storages with non-overlapping lifetimes might share the same identifier.
The `Conv1D` class implements a 1D-convolutional layer used in models like OpenAI's GPT and GPT-2. It functions similarly to a linear layer but with transposed weights. The class takes two arguments: `nf`, the number of output features, and `nx`, the number of input features.
The function `prune_layer` reduces a given Conv1D or linear layer by keeping only the specified indices, effectively 

## Git commit extraction

In [27]:
from git import Repo
import os
import json

In [30]:
import random

from git import NULL_TREE

# Repository path: replace with the path of your local transformers clone.
REPO_PATH = "/home/cc/transformers"
repo = Repo(REPO_PATH)

# Question templates, defined once instead of being rebuilt for every commit.
CHANGE_QUESTIONS = [
    "What changed in commit {hash}?",
    "What modifications were introduced in commit {hash}?",
    "Can you summarize the changes made in commit {hash}?",
    "What updates does commit {hash} contain?",
    "What's new in commit {hash}?",
    "Describe the differences introduced by commit {hash}.",
    "What was added, removed, or modified in commit {hash}?",
    "What does commit {hash} change in the codebase?",
    "Which files or functions were affected by commit {hash}?",
    "What's the purpose of commit {hash}?",
    "How does commit {hash} alter the existing implementation?"
]
AUTHOR_QUESTIONS = [
    "Who is the author of the commit {hash}?",
    "Who made the commit {hash}?",
    "Who is responsible for commit {hash}?",
    "Can you tell me who authored commit {hash}?",
    "Who's the person behind commit {hash}?",
    "Who committed {hash}?",
    "Which developer authored commit {hash}?",
    "Who was the contributor for commit {hash}?",
    "Do you know who wrote commit {hash}?",
    "Who pushed commit {hash} to the repository?",
    "Whose work is represented by commit {hash}?"
]
DATE_QUESTIONS = [
    "When was the commit {hash} made?",
    "What is the timestamp of commit {hash}?",
    "When exactly did commit {hash} occur?",
    "At what time was commit {hash} created?",
    "Can you tell me the date of commit {hash}?",
    "On what date was commit {hash} made?",
    "Do you know when commit {hash} was pushed?",
    "Any idea when commit {hash} happened?",
    "When did commit {hash} go through?",
    "What's the date on commit {hash}?",
    "When did they make commit {hash}?"
]


def _commit_patch_text(commit):
    """Return the patch text of the changes ``commit`` introduced over its first parent."""
    if commit.parents:
        # a.diff(b) describes going from a to b, so the parent must be on the
        # left for added lines to show up as additions.
        diffs = commit.parents[0].diff(commit, create_patch=True)
    else:
        # Root commit: diff against the empty tree. Passing None would
        # compare against the current working tree instead.
        diffs = commit.diff(NULL_TREE, create_patch=True)

    diff_texts = []
    for diff in diffs:
        patch = diff.diff
        if isinstance(patch, bytes):
            patch = patch.decode("utf-8", errors="ignore")
        # Entries without patch text are skipped (best effort).
        if isinstance(patch, str):
            diff_texts.append(patch)
    return "\n".join(diff_texts)


def _make_commit_qa(question_pool, short_hash, answer, metadata):
    """Build one QA item from a random template of ``question_pool``."""
    return {
        "question": random.choice(question_pool).format(hash=short_hash),
        "answer": answer,
        "source": "git_commit",
        "metadata": metadata
    }


output = []

# Walk the most recent N commits of the main branch (N is adjustable).
for commit in repo.iter_commits('main', max_count=10000):
    short_hash = commit.hexsha[:7]
    message = commit.message.strip()
    committed_at = commit.committed_datetime.isoformat()

    # Shared metadata; the same dict is attached to all three QA items.
    commit_data = {
        "commit_hash": commit.hexsha,
        "author": commit.author.name,
        "date": committed_at,
        "message": message,
        "diff_summary": _commit_patch_text(commit)
    }

    # Three QA items per commit, in this order: what changed, who, when.
    output.append(_make_commit_qa(CHANGE_QUESTIONS, short_hash, f"{message}", commit_data))
    output.append(_make_commit_qa(AUTHOR_QUESTIONS, short_hash, f"{commit.author.name}", commit_data))
    output.append(_make_commit_qa(DATE_QUESTIONS, short_hash, f"{committed_at}", commit_data))

# Save as JSON. ensure_ascii=False writes non-ASCII characters verbatim, so
# the file encoding is pinned to UTF-8 instead of the platform default.
with open("qa_from_commits.json", "w", encoding="utf-8") as f:
    json.dump(output, f, indent=4, ensure_ascii=False)

## GitHub issue extraction

In [1]:
from getpass import getpass

from github import Github

# Read the token with getpass() so it is not echoed into the cell output,
# which would otherwise be stored in the notebook file.
g_token = getpass("Enter your github token: ")

In [13]:
g = Github(g_token)  # authenticate with your GitHub token

repo = g.get_repo("huggingface/transformers")
issues = repo.get_issues(state="closed")  # further filter arguments can be added here
# issues[0].__dict__
# real_issues = [i for i in all_issues if not i.pull_request]
# Preview the title and body of the first two entries of the listing.
for issue in issues[:2]:
    print("Title:", issue.title)
    print("Body:", issue.body)

Title: Check fork
Body: # What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet though.

Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.

Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.

Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] This PR fixes a typo or improves the docs (yo

In [23]:
# Upper bound on the number of issues collected in this sample run.
MAX_ISSUES = 10

qa_pairs = []
for issue in issues:
    # Entries that carry pull-request data are PRs, not issues; skip them.
    if issue.pull_request:
        continue

    comments_list = [
        {
            "comment_author": comment.user.login,
            "comment_body": comment.body
        }
        for comment in issue.get_comments()
    ]
    qa_pairs.append({
        "Issue Title": issue.title.strip(),
        # The body may be None (presumably an issue opened without a
        # description); calling .strip() on it directly would raise.
        "Issue Body": (issue.body or "").strip(),
        "Issue Comments": comments_list,
        "source": "github_issue",
        "metadata": {
            "issue_number": issue.number,
            "url": issue.html_url,
            "created_at": str(issue.created_at)
        }
    })
    # Stop once exactly MAX_ISSUES issues have been collected.
    if len(qa_pairs) >= MAX_ISSUES:
        break


# JSON Lines: one compact JSON document per line (no indent), written as
# UTF-8 because ensure_ascii=False emits non-ASCII characters verbatim.
with open("qa_from_issues.jsonl", "w", encoding="utf-8") as f:
    for qa in qa_pairs:
        f.write(json.dumps(qa, ensure_ascii=False) + "\n")


## PR and Code Review Extraction

In [30]:
MAX_PRS = 10  # maximum number of PRs to extract, to avoid the API rate limit
OUTPUT_FILE = "huggingface_pr_data.jsonl"

with open(OUTPUT_FILE, "w", encoding="utf-8") as f_out:
    closed_prs = repo.get_pulls(state="closed", sort="created", direction="desc")
    # enumerate() supplies the counter, so the MAX_PRS constant is not mutated.
    for pr_index, pr in enumerate(closed_prs):
        if pr_index >= MAX_PRS:
            break

        pr_data = {
            "pr_number": pr.number,
            "title": pr.title,
            "body": pr.body,
            "user": pr.user.login,
            "created_at": str(pr.created_at),
            "merged": pr.merged,
            "merge_commit_sha": pr.merge_commit_sha,
            "files": [],
            "review_comments": [],
            "general_comments": [],
        }

        # Files changed by the PR. Each section below is best effort: a
        # failure is reported and whatever was collected so far is kept.
        try:
            for file in pr.get_files():
                pr_data["files"].append({
                    "filename": file.filename,
                    "status": file.status,
                    "patch": getattr(file, "patch", None)
                })
        except Exception as e:
            print(f"Failed to fetch files for PR #{pr.number}: {e}")

        # Code review comments.
        try:
            for comment in pr.get_review_comments():
                pr_data["review_comments"].append({
                    "user": comment.user.login,
                    "path": comment.path,
                    "line": comment.position,
                    "body": comment.body
                })
        except Exception as e:
            print(f"Failed to fetch comments for PR #{pr.number}: {e}")

        # General (conversation) comments, fetched through the issue that
        # has the same number as the PR.
        try:
            issue = repo.get_issue(number=pr.number)
            for comment in issue.get_comments():
                pr_data["general_comments"].append({
                    "user": comment.user.login,
                    "created_at": str(comment.created_at),
                    "body": comment.body
                })
        except Exception as e:
            print(f"[通用评论出错] PR #{pr.number}: {e}")

        # JSON Lines: one compact JSON document per line (no indent).
        f_out.write(json.dumps(pr_data, ensure_ascii=False) + "\n")

## Document Extraction

In [47]:
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from urllib.parse import urljoin
import time

def fetch_hf_doc(url, timeout=30):
    """Download a documentation page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the "html.parser" backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is not parsed as if it were documentation.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

def extract_sections(soup):
    """Split a parsed page into one record per ``h1``/``h2``/``h3`` heading.

    For every heading, the text of each following sibling element is
    collected until the next sibling that is itself an ``h1``/``h2``/``h3``.
    Headings with no collected text are dropped. Each heading title is
    printed as it is processed.

    Args:
        soup: Parsed document, as returned by ``fetch_hf_doc``.

    Returns:
        A list of dicts with the keys ``header``, ``content``, ``source``
        (always "huggingface_doc") and ``url``. Note that ``url`` is filled
        from ``soup.title.string`` (the page title), or "N/A" when the page
        has no title element.
    """
    heading_tags = ['h1', 'h2', 'h3']
    sections = []

    for heading in tqdm(soup.find_all(heading_tags)):
        title = heading.get_text().strip()
        print(title)

        # Gather sibling text until the next heading (or the end of siblings).
        chunks = []
        node = heading.find_next_sibling()
        while node:
            if node.name in heading_tags:
                break
            chunks.append(node.get_text().strip())
            node = node.find_next_sibling()

        body = "\n".join(chunks).strip()
        if not body:
            continue

        sections.append({
            "header": title,
            "content": body,
            "source": "huggingface_doc",
            "url": soup.title.string if soup.title else "N/A"
        })

    return sections

def make_qa_from_section(title, text):
    """Build a templated question for a section, answered by the section text.

    Args:
        title: Section heading.
        text: Section body.

    Returns:
        ``(None, None)`` when ``text`` has fewer than five whitespace-separated
        words; otherwise ``(question, text)``, where the question template is
        chosen from the lower-cased title: contains "tokenizer", starts with
        "how", contains "parameters", or a generic fallback.
    """
    word_count = len(text.split())
    if word_count >= 5:
        lowered = title.lower()
        # Simple template-based question construction; first match wins.
        if "tokenizer" in lowered:
            return f"What is {title} in HuggingFace Transformers?", text
        if lowered.startswith("how"):
            return title + "?", text
        if "parameters" in lowered:
            return f"What parameters does {title} include?", text
        return f"What does '{title}' refer to in Transformers?", text

    return None, None

def save_to_jsonl(data, filename="hf_qa_output.jsonl"):
    """Write ``data`` as JSON Lines: one compact JSON document per line.

    Records are serialized without indentation so that each one occupies
    exactly one line and the file can be read back line by line with
    ``json.loads``.

    Args:
        data: Iterable of JSON-serializable items.
        filename: Destination path; the file is overwritten and written as
            UTF-8 (non-ASCII characters are kept verbatim).
    """
    with open(filename, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

# Fetch one documentation page, split it into heading-delimited sections and
# write the resulting records to the default output file of save_to_jsonl.
url = "https://huggingface.co/docs/transformers/en/main_classes/agent#transformers.Agent"  # you can replace this with another page
soup = fetch_hf_doc(url)
qa_pairs = extract_sections(soup)
save_to_jsonl(qa_pairs)
print(f"✅ Done! Extracted {len(qa_pairs)} QA pairs.")

100%|██████████| 38/38 [00:00<00:00, 3856.64it/s]

Transformers
Agents & Tools
Agents
Agent
class transformers.Agent
CodeAgent
class transformers.CodeAgent
React agents
class transformers.ReactAgent
class transformers.ReactJsonAgent
class transformers.ReactCodeAgent
ManagedAgent
class transformers.ManagedAgent
Tools
load_tool
tool
Tool
class transformers.Tool
Toolbox
class transformers.Toolbox
PipelineTool
class transformers.PipelineTool
launch_gradio_demo
stream_to_gradio
ToolCollection
class transformers.ToolCollection
Engines
TransformersEngine
class transformers.TransformersEngine
HfApiEngine
class transformers.HfApiEngine
Agent Types
AgentText
class transformers.agents.agent_types.AgentText
AgentImage
class transformers.agents.agent_types.AgentImage
AgentAudio
class transformers.agents.agent_types.AgentAudio
✅ Done! Extracted 37 QA pairs.



