In [24]:
import ast
import os
import json
import tokenize
from io import BytesIO
from tqdm import tqdm

In [25]:
def extract_comments(source_code):
    comments = []
    tokens = tokenize.tokenize(BytesIO(source_code.encode("utf-8")).readline)
    for toknum, tokval, _, _, _ in tokens:
        if toknum == tokenize.COMMENT:
            comments.append(tokval.strip("# ").strip())
    return comments

def extract_docstrings_and_defs(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        source = f.read()

    tree = ast.parse(source)
    results = []
    module_docstring = ast.get_docstring(tree)
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
            name = node.name
            docstring = ast.get_docstring(node)
            node_type = "function" if isinstance(node, ast.FunctionDef) else "class"
            source_lines = source.splitlines()
            start_line = node.lineno - 1  # ast 行号从1开始，列表索引从0开始
            end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line
            source_code = '\n'.join(source_lines[start_line:end_line])
            results.append({
                "type": node_type,
                "name": name,
                "docstring": docstring or "",
                "source_code": source_code,
                "file_docstring": module_docstring
            })

    comments = extract_comments(source)
    return results, comments

def generate_qa_from_entry(entry):
    name = entry["name"]
    doc = entry["docstring"]
    if not doc:
        return None

    # question = f"What does the {entry['type']} `{name}` do?"
    # answer = doc.strip()
    source_code = entry.get("source_code", "")
    file_docstring = entry.get("file_docstring", "")

    return {
        "name": name,
        "docstring": doc.strip(),
        "file_docstring": file_docstring,
        "source": "source_code",
        "type": entry["type"],
        "code": source_code
    }

def process_directory(dir_path):
    qa_pairs = []
    for root, _, files in tqdm(os.walk(dir_path)):
        for file in tqdm(files):
            if file.endswith(".py"):
                full_path = os.path.join(root, file)
                try:
                    entries, comments = extract_docstrings_and_defs(full_path)
                    for entry in entries:
                        qa = generate_qa_from_entry(entry)
                        if qa:
                            qa["file"] = full_path
                            qa_pairs.append(qa)
                except Exception as e:
                    print(f"Failed to parse {full_path}: {e}")
    return qa_pairs

In [26]:
directory = "/home/cc/transformers/src/transformers"
qa_data = process_directory(directory)

# 保存结果为 JSONL 文件
with open("source_code_qa.jsonl", "w", encoding="utf-8") as f:
    for qa in qa_data:
        f.write(json.dumps(qa, indent=4, ensure_ascii=False) + "\n")

print(f"Extracted {len(qa_data)} QA pairs.")

0it [00:00, ?it/s]

100%|██████████| 60/60 [00:02<00:00, 25.82it/s]
100%|██████████| 21/21 [00:00<00:00, 183.93it/s]
100%|██████████| 6/6 [00:00<00:00, 36.56it/s]
100%|██████████| 1/1 [00:00<00:00, 4466.78it/s]
100%|██████████| 5/5 [00:00<00:00, 59747.92it/s]
100%|██████████| 3/3 [00:00<00:00, 31378.83it/s]
100%|██████████| 2/2 [00:00<00:00, 29.81it/s]
100%|██████████| 8/8 [00:00<00:00, 96420.78it/s]
100%|██████████| 2/2 [00:00<00:00, 23431.87it/s]
100%|██████████| 2/2 [00:00<00:00, 22550.02it/s]
100%|██████████| 4/4 [00:00<00:00, 45964.98it/s]
100%|██████████| 11/11 [00:00<00:00, 37.20it/s]
100%|██████████| 36/36 [00:00<00:00, 63.79it/s]
100%|██████████| 31/31 [00:00<00:00, 80.20it/s]
100%|██████████| 29/29 [00:00<00:00, 75.09it/s]
100%|██████████| 6/6 [00:00<00:00, 95.52it/s]
100%|██████████| 14/14 [00:00<00:00, 19.30it/s]
100%|██████████| 3/3 [00:00<00:00, 644.81it/s]
100%|██████████| 1/1 [00:00<00:00, 116.32it/s]
100%|██████████| 8/8 [00:00<00:00, 58.26it/s]
100%|██████████| 12/12 [00:00<00:00, 27.68i

Extracted 12678 QA pairs.
