In [1]:
import ast
import os
import json
import tokenize
from io import BytesIO
from tqdm import tqdm

In [2]:
%load_ext autoreload
%autoreload 2

## Source code extraction

In [7]:
def extract_comments(source_code):
    comments = []
    tokens = tokenize.tokenize(BytesIO(source_code.encode("utf-8")).readline)
    for toknum, tokval, _, _, _ in tokens:
        if toknum == tokenize.COMMENT:
            comments.append(tokval.strip("# ").strip())
    return comments

def extract_docstrings_and_defs(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        source = f.read()

    tree = ast.parse(source)
    results = []
    module_docstring = ast.get_docstring(tree)
    for node in ast.walk(tree):
        if isinstance(node, (ast.FunctionDef, ast.ClassDef)):
            name = node.name
            docstring = ast.get_docstring(node)
            node_type = "function" if isinstance(node, ast.FunctionDef) else "class"
            source_lines = source.splitlines()
            start_line = node.lineno - 1  # ast 行号从1开始，列表索引从0开始
            end_line = node.end_lineno if hasattr(node, 'end_lineno') else start_line
            source_code = '\n'.join(source_lines[start_line:end_line])
            results.append({
                "type": node_type,
                "name": name,
                "docstring": docstring or "",
                "source_code": source_code,
                "file_docstring": module_docstring
            })

    comments = extract_comments(source)
    return results, comments

def generate_qa_from_entry(entry):
    name = entry["name"]
    doc = entry["docstring"]
    if not doc:
        return None

    # question = f"What does the {entry['type']} `{name}` do?"
    # answer = doc.strip()
    source_code = entry.get("source_code", "")
    file_docstring = entry.get("file_docstring", "")

    return {
        "name": name,
        "docstring": doc.strip(),
        "file_docstring": file_docstring,
        "source": "source_code",
        "type": entry["type"],
        "code": source_code
    }

def process_directory(dir_path):
    qa_pairs = []
    for root, _, files in tqdm(os.walk(dir_path)):
        for file in tqdm(files):
            if file.endswith(".py"):
                full_path = os.path.join(root, file)
                try:
                    entries, comments = extract_docstrings_and_defs(full_path)
                    for entry in entries:
                        qa = generate_qa_from_entry(entry)
                        if qa:
                            qa["file"] = full_path
                            qa_pairs.append(qa)
                except Exception as e:
                    print(f"Failed to parse {full_path}: {e}")
    return qa_pairs

In [8]:
directory = "/home/cc/transformers/src/transformers"
qa_data = process_directory(directory)

# 保存结果为 JSONL 文件
with open("source_code_qa.json", "w", encoding="utf-8") as f:
    json.dump(qa_data, f, indent=4, ensure_ascii=False)

print(f"Extracted {len(qa_data)} QA pairs.")

0it [00:00, ?it/s]

100%|██████████| 60/60 [00:02<00:00, 25.14it/s]
100%|██████████| 21/21 [00:00<00:00, 117.14it/s]
100%|██████████| 6/6 [00:00<00:00, 93.05it/s]
100%|██████████| 1/1 [00:00<00:00, 1956.30it/s]
100%|██████████| 5/5 [00:00<00:00, 19222.29it/s]
100%|██████████| 3/3 [00:00<00:00, 46776.62it/s]
100%|██████████| 2/2 [00:00<00:00, 99.02it/s]
100%|██████████| 8/8 [00:00<00:00, 66974.91it/s]
100%|██████████| 2/2 [00:00<00:00, 27413.75it/s]
100%|██████████| 2/2 [00:00<00:00, 22733.36it/s]
100%|██████████| 4/4 [00:00<00:00, 49636.73it/s]
100%|██████████| 11/11 [00:00<00:00, 92.11it/s]
100%|██████████| 36/36 [00:00<00:00, 67.93it/s]
100%|██████████| 31/31 [00:00<00:00, 89.41it/s]
100%|██████████| 29/29 [00:00<00:00, 76.03it/s]
100%|██████████| 6/6 [00:00<00:00, 101.47it/s]
100%|██████████| 14/14 [00:00<00:00, 19.61it/s]
100%|██████████| 3/3 [00:00<00:00, 338.86it/s]
100%|██████████| 1/1 [00:00<00:00, 116.90it/s]
100%|██████████| 8/8 [00:00<00:00, 60.39it/s]
100%|██████████| 12/12 [00:00<00:00, 27.76

Extracted 12678 QA pairs.


### Extract QA

In [3]:
from openai import OpenAI
import time
# api_key = input("Enter your openai api key: ")
# api_base = input("Enter your openai api base: ")
os.environ["OPENAI_API_KEY"] = "EMPTY"
os.environ["OPENAI_API_BASE"] = "http://localhost:6006/v1"


In [5]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import random

In [5]:
def summarize_docstring(name, type, docstring):
    # 使用 GPT 模型生成摘要
    while True:
        try:
            prompt = """
            Summarize the following docstring, tell me what does the {type} `{name}` do.
            Docstring:
            {docstring}
            """
            question_pool = [
                "What does the {type} {name} do?"
                "What is the function of the {type} {name}?",
                "How does the {type} {name} work?",
                "What role does the {type} {name} play?",
                "What is the purpose of the {type} {name}?",
                "What does the {type} {name} accomplish?",
                "Can you explain what the {type} {name} is used for?",
                "Why do we need the {type} {name}?",
                "What is the {type} {name} responsible for?",
                "What task does the {type} {name} perform?",
                "What kind of behavior does the {type} {name} define?"
            ]
            question = random.choice(question_pool)
            question = question.format(name=name, type=type)
            client = OpenAI(
                api_key="EMPTY",
                base_url="http://localhost:6006/v1"
            )
            prompt = prompt.format(name=name, type=type, docstring=docstring)
            response = client.chat.completions.create(
                model="mistralai/Ministral-8B-Instruct-2410",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            # print(response.choices[0].message.content)
            return question, response.choices[0].message.content
        except Exception as e:
            print(e)
            time.sleep(10)
            continue


In [6]:
with open("source_code_qa.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

qa_data_with_summary = []
with ThreadPoolExecutor(max_workers=100) as executor:
    futures = [executor.submit(summarize_docstring, qa["name"], qa["type"], qa["docstring"]) for qa in qa_data]
    for future in tqdm(as_completed(futures)):
        qa = qa_data[futures.index(future)]
        qa["question"], qa["answer"] = future.result()
        qa_data_with_summary.append(qa)

with open("source_code_qa_with_summary.json", "w", encoding="utf-8") as f:
    json.dump(qa_data_with_summary, f, indent=4, ensure_ascii=False)

12678it [2:28:13,  1.43it/s]


## Git commit extraction

In [27]:
from git import Repo
import os
import json

In [30]:
# 仓库路径：替换为你本地 transformers 的路径
REPO_PATH = "/home/cc/transformers"
repo = Repo(REPO_PATH)

output = []

# 遍历最近 N 个 commit（可调整）
for commit in repo.iter_commits('main', max_count=10000):
    commit_data = {
        "commit_hash": commit.hexsha,
        "author": commit.author.name,
        "date": commit.committed_datetime.isoformat(),
        "message": commit.message.strip()
    }

    # 获取 diff 的简要变化（可设置为 full_diff=True 看更多上下文）
    diffs = commit.diff(commit.parents[0] if commit.parents else None, create_patch=True)

    diff_texts = []
    for diff in diffs:
        try:
            diff_texts.append(diff.diff.decode("utf-8", errors="ignore"))
        except Exception as e:
            continue

    diff_summary = "\n".join(diff_texts)
    commit_data["diff_summary"] = diff_summary

    # 构造 QA 对
    # qa_item = {
    #     "question": f"What changed in commit {commit.hexsha[:7]}?",
    #     "answer": f"{commit.message.strip()}\n\nSummary of changes:\n{diff_summary[:1000]}...",
    #     "source": "git_commit",
    #     "metadata": commit_data
    # }
    question_pool = [
        "What changed in commit {hash}?",
        "What modifications were introduced in commit {hash}?",
        "Can you summarize the changes made in commit {hash}?",
        "What updates does commit {hash} contain?",
        "What's new in commit {hash}?",
        "Describe the differences introduced by commit {hash}.",
        "What was added, removed, or modified in commit {hash}?",
        "What does commit {hash} change in the codebase?",
        "Which files or functions were affected by commit {hash}?",
        "What's the purpose of commit {hash}?",
        "How does commit {hash} alter the existing implementation?"
    ]
    question = random.choice(question_pool)
    question = question.format(hash=commit.hexsha[:7])
    qa_item = {
        "question": question,
        "answer": f"{commit.message.strip()}",
        "source": "git_commit",
        "metadata": commit_data
    }

    output.append(qa_item)

    question_pool = [
        "Who is the author of the commit {hash}?",
        "Who made the commit {hash}?",
        "Who is responsible for commit {hash}?",
        "Can you tell me who authored commit {hash}?",
        "Who's the person behind commit {hash}?",
        "Who committed {hash}?",
        "Which developer authored commit {hash}?",
        "Who was the contributor for commit {hash}?",
        "Do you know who wrote commit {hash}?",
        "Who pushed commit {hash} to the repository?",
        "Whose work is represented by commit {hash}?"
    ]
    question = random.choice(question_pool)
    question = question.format(hash=commit.hexsha[:7])
    qa_item = {
        "question": question,
        "answer": f"{commit.author.name}",
        "source": "git_commit",
        "metadata": commit_data
    }
    output.append(qa_item)

    question_pool = [
        "When was the commit {hash} made?",
        "What is the timestamp of commit {hash}?",
        "When exactly did commit {hash} occur?",
        "At what time was commit {hash} created?",
        "Can you tell me the date of commit {hash}?",
        "On what date was commit {hash} made?",
        "Do you know when commit {hash} was pushed?",
        "Any idea when commit {hash} happened?",
        "When did commit {hash} go through?",
        "What's the date on commit {hash}?",
        "When did they make commit {hash}?"
    ]
    question = random.choice(question_pool)
    question = question.format(hash=commit.hexsha[:7])
    qa_item = {
        "question": question,
        "answer": f"{commit.committed_datetime.isoformat()}",
        "source": "git_commit",
        "metadata": commit_data
    }
    output.append(qa_item)
# 保存为 JSON
with open("qa_from_commits.json", "w") as f:
    json.dump(output, f, indent=4, ensure_ascii=False)

## Github issue extraction

In [1]:
from github import Github
g_token = input("Enter your github token: ")

In [2]:
g = Github(g_token)  # 用你的 GitHub Token

repo = g.get_repo("huggingface/transformers")
issues = repo.get_issues(state="closed")  # 可加过滤条件
qa_pairs = []
# issues[0].__dict__
# real_issues = [i for i in all_issues if not i.pull_request]
# for issue in issues[:2]:
#     print("Title:", issue.title)
#     print("Body:", issue.body)

In [24]:
from threading import Lock
lock = Lock()
def process_issue(issue):
    global qa_pairs
    while True:
        try:
            comments = issue.get_comments()
            # print("********************Comments********************")
            comments_list = []
            for comment in comments:
                comments_list.append(
                    {
                        "comment_author": comment.user.login,
                        "comment_body": comment.body
                    }
                )
                # print("Comment by", comment.user.login)
                # print(comment.body)
            print(issue.title)
            qa_pair = {
                "Issue Title": issue.title,
                "Issue Body": issue.body,
                "Issue Comments": comments_list,
                "source": "github_issue",
                "metadata": {
                    "issue_number": issue.number,
                    "url": issue.html_url,
                    "created_at": str(issue.created_at)
                }
            }
            lock.acquire()
            qa_pairs.append(qa_pair)
            with open("qa_from_issues.json", "w") as f:
                json.dump(qa_pairs, f, indent=4, ensure_ascii=False)
            lock.release()
        except Exception as e:
            print(e)
            time.sleep(10)
            continue

In [4]:
count = 0
qa_pairs = []
for issue in tqdm(issues):
    if issue.pull_request:
        continue
    # print("********************Issue********************")
    # print("Title:", issue.title)
    # print("Body:", issue.body)
    try:
        comments = issue.get_comments()
        # print("********************Comments********************")
        comments_list = []
        for comment in comments:
            comments_list.append(
                {
                    "comment_author": comment.user.login,
                    "comment_body": comment.body
                }
            )
            # print("Comment by", comment.user.login)
            # print(comment.body)
        print(issue.title)
        qa_pair = {
            "Issue Title": issue.title,
            "Issue Body": issue.body,
            "Issue Comments": comments_list,
            "source": "github_issue",
            "metadata": {
                "issue_number": issue.number,
                "url": issue.html_url,
                "created_at": str(issue.created_at)
            }
        }
        qa_pairs.append(qa_pair)
        count += 1

        with open("qa_from_issues.json", "w") as f:
            json.dump(qa_pairs, f, indent=4, ensure_ascii=False)
        if count > 10000:
            break
    except Exception as e:
        print(e)
        time.sleep(10)
        continue


TrOCR (image-to-text) produces incorrect output (':') on 12th Gen Intel CPU (i7-1260P) even with simple input
name 'json' is not defined


NameError: name 'time' is not defined

In [13]:
def summarize_comment(issue_title, issue_body, comments):
    # 使用 GPT 模型生成摘要
    prompt = """
    I will give you a github issue and its comments. Tell me how to resolve the issue based on the comments.
    If the issue isn't resolved in the comments, please just respond with '<Unsolved>'.
    Only give your response base on the comments, don't make up any solution.
    Issue Title:
    {issue_title}
    Issue Body:
    {issue_body}
    Comments:
    {comments}
    """
    while True:
        try:
            client = OpenAI(
                        api_key="EMPTY",
                        base_url="http://localhost:6006/v1"
                    )
            prompt = prompt.format(issue_title=issue_title, issue_body=issue_body, comments=comments)
            response = client.chat.completions.create(
                model="mistralai/Ministral-8B-Instruct-2410",
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            print(response.choices[0].message.content)
            return response.choices[0].message.content
        except Exception as e:
            print(e)
            time.sleep(10)
            continue


In [14]:
with open("qa_from_issues.json", "r") as f:
    qa_data = json.load(f)

with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(summarize_comment, qa["Issue Title"], qa["Issue Body"], qa["Issue Comments"]) for qa in qa_data]
    for future in as_completed(futures):
        qa = qa_data[futures.index(future)]
        qa["answer"] = future.result()

with open("qa_from_issues_with_summary.json", "w") as f:
    json.dump(qa_data, f, indent=4, ensure_ascii=False)


<Unsolved>
<Unsolved>
<Unsolved>
<Unsolved>
Based on the comments, it seems that the issue is due to a version mismatch between different dependencies, especially with NumPy. The suggested solution is to create a new conda environment or use another environment manager to manage package versions and install the required dependencies again.
Based on the comments, it is not possible to add a Fast image processor for EfficientFormer because it is in the deprecated folder and contributions to this model are no longer accepted beyond version 4.40.2. So, the issue remains unsolved.
The issue has been resolved with a PR made by `@gmlwns2000`. The mismatch between the default value of `attn_temperature_tuning` in the `transformers` library and the official implementation has been corrected by changing it to a boolean type.
**Solution:**

Change the dtype of `class_labels` from `torch.int64` to `torch.uint8`.

```python
"class_labels": torch.ones(n_objects).to(torch.uint8),  # ensure class_labe

## PR and Code Review Extraction

In [12]:
MAX_PRS = 10  # 设定最多提取几个 PR，避免 API rate limit
OUTPUT_FILE = "huggingface_pr_data.json"


for pr in repo.get_pulls(state="closed", sort="created", direction="desc"):
    if MAX_PRS <= 0:
        break
    MAX_PRS -= 1

    pr_data = {
        "pr_number": pr.number,
        "title": pr.title,
        "body": pr.body,
        "user": pr.user.login,
        "created_at": str(pr.created_at),
        "merged": pr.merged,
        "merge_commit_sha": pr.merge_commit_sha,
        "files": [],
        "review_comments": [],
        "general_comments": [],
    }

    # PR 变更文件
    try:
        for file in pr.get_files():
            pr_data["files"].append({
                "filename": file.filename,
                "status": file.status,
                "patch": file.patch if hasattr(file, "patch") else None
            })
    except Exception as e:
        print(f"Failed to fetch files for PR #{pr.number}: {e}")

    # Code Review 评论
    try:
        for comment in pr.get_review_comments():
            pr_data["review_comments"].append({
                "user": comment.user.login,
                "path": comment.path,
                "line": comment.position,
                "body": comment.body
            })
    except Exception as e:
        print(f"Failed to fetch comments for PR #{pr.number}: {e}")
    
    try:
        issue = repo.get_issue(number=pr.number)
        for comment in issue.get_comments():
            pr_data["general_comments"].append({
                "user": comment.user.login,
                "created_at": str(comment.created_at),
                "body": comment.body
            })
    except Exception as e:
        print(f"[通用评论出错] PR #{pr.number}: {e}")

with open(OUTPUT_FILE, "w") as f:
    json.dump(pr_data, f, indent=4, ensure_ascii=False)


In [None]:
def summarize_pr(issue_title, issue_body, comments):
    # 使用 GPT 模型生成摘要
    prompt = """
    Summarize the following comments, tell me how to resolve issue `{issue_title}`.
    If the issue isn't resolved, please just respond with '<Unsolved>'.
    Issue Title:
    {issue_title}
    Issue Body:
    {issue_body}
    Comments:
    {comments}
    """
    client = OpenAI()
    prompt = prompt.format(issue_title=issue_title, issue_body=issue_body, comments=comments)
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    print(response.choices[0].message.content)
    return response.choices[0].message.content

## Document Extraction

In [13]:
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from urllib.parse import urljoin
import time

def fetch_hf_doc(url):
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    return soup

def extract_sections(soup):
    qa_pairs = []
    headers = soup.find_all(['h1', 'h2', 'h3'])

    for header in tqdm(headers):
        title = header.get_text().strip()
        print(title)
        content = []
        next_sibling = header.find_next_sibling()

        # Collect paragraphs and lists until the next header
        while next_sibling and next_sibling.name not in ['h1', 'h2', 'h3']:
            # if next_sibling.name in ['p', 'ul', 'ol', 'pre', 'li']:
            content.append(next_sibling.get_text().strip())
            next_sibling = next_sibling.find_next_sibling()

        text = "\n".join(content).strip()
        if text:
            qa_pairs.append(
                {
                    "header": title,
                    "content": text,
                    "source": "huggingface_doc",
                    "url": soup.title.string if soup.title else "N/A"
                }
            )
            # question, answer = make_qa_from_section(title, text)

            # if question:
            #     qa_pairs.append({
            #         "question": question,
            #         "answer": answer,
            #         "source": "huggingface_doc",
            #         "metadata": {
            #             "section_title": title,
            #             "url": soup.title.string if soup.title else "N/A"
            #         }
            #     })

    return qa_pairs

def make_qa_from_section(title, text):
    """
    基于标题和内容生成问题模板
    """
    if len(text.split()) < 5:
        return None, None

    # 简单模板化问题构造
    if "tokenizer" in title.lower():
        q = f"What is {title} in HuggingFace Transformers?"
    elif title.lower().startswith("how"):
        q = title + "?"
    elif "parameters" in title.lower():
        q = f"What parameters does {title} include?"
    else:
        q = f"What does '{title}' refer to in Transformers?"

    return q, text

def save_to_json(data, filename="hf_qa_output.json"):
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

url = "https://huggingface.co/docs/transformers/en/main_classes/agent#transformers.Agent"  # 你可以替换成其他页面
soup = fetch_hf_doc(url)
qa_pairs = extract_sections(soup)
save_to_json(qa_pairs)
print(f"✅ Done! Extracted {len(qa_pairs)} QA pairs.")

100%|██████████| 38/38 [00:00<00:00, 3195.15it/s]

Transformers
Agents & Tools
Agents
Agent
class transformers.Agent
CodeAgent
class transformers.CodeAgent
React agents
class transformers.ReactAgent
class transformers.ReactJsonAgent
class transformers.ReactCodeAgent
ManagedAgent
class transformers.ManagedAgent
Tools
load_tool
tool
Tool
class transformers.Tool
Toolbox
class transformers.Toolbox
PipelineTool
class transformers.PipelineTool
launch_gradio_demo
stream_to_gradio
ToolCollection
class transformers.ToolCollection
Engines
TransformersEngine
class transformers.TransformersEngine
HfApiEngine
class transformers.HfApiEngine
Agent Types
AgentText
class transformers.agents.agent_types.AgentText
AgentImage
class transformers.agents.agent_types.AgentImage
AgentAudio
class transformers.agents.agent_types.AgentAudio
✅ Done! Extracted 37 QA pairs.





In [None]:
def generate_qa_from_hf_doc(header, content, few_shot_examples):
    # 使用 GPT 模型生成摘要
    prompt = """
    Generate a question and answer based on the following header and content.
    {few_shot_examples}
    Header:
    {header}
    Content:
    {content}
    """
    client = OpenAI()
    prompt = prompt.format(few_shot_examples=few_shot_examples, header=header, content=content)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    print(response.choices[0].message.content)
    return response.choices[0].message.content


In [None]:
seed_qa_pairs = [
    {
        "Header": "Transformers",
        "Content": "Transformers is a library for natural language processing tasks, including text classification, tokenization, and more.",
        "Question": "What is Transformers?",
        "Answer": "Transformers is a library for natural language processing tasks, including text classification, tokenization, and more."
    },
    {
        "Header": "Agents",
        "Content": "We provide two types of agents, based on the main Agent class:\nCodeAgent acts in one shot, generating code to solve the task, then executes it at once. ReactAgent acts step by step, each step consisting of one thought, then one tool call and execution. It has two classes:ReactJsonAgent writes its tool calls in JSON. ReactCodeAgent writes its tool calls in Python code.",
        "Question": "What is the difference between CodeAgent and ReactAgent?",
        "Answer": "CodeAgent acts in one shot, generating code to solve the task, then executes it at once. ReactAgent acts step by step, each step consisting of one thought, then one tool call and execution. It has two classes:ReactJsonAgent writes its tool calls in JSON. ReactCodeAgent writes its tool calls in Python code."
    },
    {
        "Header": "Agent Types",
        "Content": "Agents can handle any type of object in-between tools; tools, being completely multimodal, can accept and return\ntext, image, audio, video, among other types. In order to increase compatibility between tools, as well as to\ncorrectly render these returns in ipython (jupyter, colab, ipython notebooks, …), we implement wrapper classes\naround these types.\nThe wrapped objects should continue behaving as initially; a text object should still behave as a string, an image\nobject should still behave as a PIL.Image.\nThese types have three specific purposes:\nCalling to_raw on the type should return the underlying object Calling to_string on the type should return the object as a string: that can be the string in case of an AgentText\nbut will be the path of the serialized version of the object in other instances Displaying it in an ipython kernel should display the object correctly",
        "Question": "What is the purpose of wrapper classes for Agents?",
        "Answer": "In order to increase compatibility between tools, as well as to correctly render these returns in ipython (jupyter, colab, ipython notebooks, …), we implement wrapper classes around these types."
    }
]

In [1]:
import requests

def get_questions(tag='huggingface', page=1, pagesize=10):
    url = "https://api.stackexchange.com/2.3/questions"
    params = {
        'order': 'desc',
        'sort': 'activity',
        'tagged': tag,
        'site': 'stackoverflow',
        'filter': 'withbody',
        'page': page,
        'pagesize': pagesize
    }
    response = requests.get(url, params=params)
    data = response.json()
    return data['items']

questions = get_questions()
for q in questions:
    print(f"Title: {q['title']}")
    print(f"Body: {q['body'][:200]}...")
    print(f"Link: {q['link']}")
    print("-" * 40)


Title: &quot;Inconsistent Predictions in PyTorch Model: Single Image vs. Batch Processing&quot;
Body: <p>I am noticing a significant difference in model predictions when running predictions on a single image versus the whole dataset. The model, which was trained using PyTorch, gives drastically differ...
Link: https://stackoverflow.com/questions/79294216/inconsistent-predictions-in-pytorch-model-single-image-vs-batch-processing
----------------------------------------
Title: What is the proper way to fill a batch in training an LM all the way to the end eg how to correct my tokenize_and_group_texts_via_blocks?
Body: <p>I’m preparing a text dataset for next-token language-model pre-training.
Using HF datasets with batched=True, I wrote a helper that
1.  prepends a BOS token (if the tokenizer has one),
2.  appends ...
Link: https://stackoverflow.com/questions/79615125/what-is-the-proper-way-to-fill-a-batch-in-training-an-lm-all-the-way-to-the-end
----------------------------------------


In [6]:
print(questions[0]['answer_count'])

2


In [5]:
def get_answers(question_id):
    url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
    params = {
        'order': 'desc',
        'sort': 'activity',
        'site': 'stackoverflow',
        'filter': 'withbody'
    }
    res = requests.get(url, params=params)
    data = res.json()
    return data['items']

# 示例：抓第一个问题的答案
question_id = questions[0]['question_id']
answers = get_answers(question_id)
for ans in answers:
    print("Answer:", ans['body'])


Answer: <p>I have run into the same issue myself and to my best understanding these tiny differences (≈2.3e-10) are due to floating point non-associativity or microkernel behavior in batched matrix ops. Apparently, It is expected in complex models with Attention layers.</p>
<p>These differences are not a bug! They are not fixed by setting options like <strong><code>torch.use_deterministic_algorithms(True)</code></strong>, and they are not due to randomness, but due to numerical artifacts of batching.</p>

Answer: <p>Try with eval mode.
I have added it and updated the code.</p>
<pre><code>from transformers import Trainer, TrainingArguments, PreTrainedModel, PretrainedConfig
from torch.utils.data import Dataset
import torch
import torch.nn.functional as F
import numpy as np

# Number of Features
num_of_features = 128

# Dataset Class
class SequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=t

In [7]:
import requests
import time
import json

def get_questions_with_answers(tag='huggingface', target_count=100):
    collected = []
    page = 1
    page_size = 100
    while len(collected) < target_count:
        print(f"Fetching page {page}...")
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            'order': 'desc',
            'sort': 'activity',
            'tagged': tag,
            'site': 'stackoverflow',
            'filter': 'withbody',
            'pagesize': page_size,
            'page': page
        }
        res = requests.get(url, params=params)
        data = res.json()
        items = data.get('items', [])

        for item in items:
            if item.get('answer_count', 0) > 0:
                collected.append({
                    'question_id': item['question_id'],
                    'title': item['title'],
                    'body': item['body'],
                    'link': item['link']
                })
                if len(collected) >= target_count:
                    break

        if not data.get('has_more', False):
            break  # 没有更多数据
        page += 1
        time.sleep(1)  # 避免触发速率限制
    return collected

def get_answers_for_question(qid):
    url = f"https://api.stackexchange.com/2.3/questions/{qid}/answers"
    params = {
        'order': 'desc',
        'sort': 'votes',
        'site': 'stackoverflow',
        'filter': 'withbody'
    }
    res = requests.get(url, params=params)
    data = res.json()
    return [ans['body'] for ans in data.get('items', [])]

# 抓取前100个有答案的问题
questions = get_questions_with_answers()

# 获取每个问题的答案
for q in questions:
    q['answers'] = get_answers_for_question(q['question_id'])
    print(f"Fetched {len(q['answers'])} answers for question: {q['title'][:50]}...")
    time.sleep(0.5)

# 保存为 JSON 文件
with open('huggingface_qa.json', 'w', encoding='utf-8') as f:
    json.dump(questions, f, indent=2, ensure_ascii=False)

print("✅ Done! Saved 100 Q&A items to huggingface_qa.json")

Fetching page 1...
Fetching page 2...
Fetched 2 answers for question: &quot;Inconsistent Predictions in PyTorch Model: S...
Fetched 1 answers for question: How to Compute Teacher-Forced Accuracy (TFA) for H...
Fetched 4 answers for question: How to load a huggingface dataset from local path?...
Fetched 1 answers for question: Unable to connect to hugging face model...
Fetched 2 answers for question: How to broadcast a tensor from main process using ...
Fetched 1 answers for question: Hugging Face Sentence Transformer API returning 40...
Fetched 1 answers for question: Error while loading Deepseek using HuggingFace...
Fetched 3 answers for question: ImportError: cannot import name &#39;HuggingFaceIn...
Fetched 1 answers for question: How to convert safetensors model to onnx model?...
Fetched 1 answers for question: Serving models using VLLM on Huggingface Spaces...
✅ Done! Saved 100 Q&A items to huggingface_qa.json


In [8]:
import json
with open('huggingface_qa.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

formatted_data = []
for item in data:
    for i in range(len(item['answers'])):
        formatted_data.append({
            "question": item['title'] + "\n" + item['body'],
            "answer": item['answers'][i],
            "source": "stackoverflow",
            "metadata": {
            "question_id": item['question_id'],
            "link": item['link']
        }
    })

with open('huggingface_qa_formatted.json', 'w', encoding='utf-8') as f:
    json.dump(formatted_data, f, indent=2, ensure_ascii=False)

print("✅ Done! Saved 100 Q&A items to huggingface_qa_formatted.json")

✅ Done! Saved 100 Q&A items to huggingface_qa_formatted.json
