Skip to content

Commit

Permalink
add TSSB-3M dataset (#2693)
Browse files Browse the repository at this point in the history
add the TSSM-3M code bugs dataset
issue: [#1395](#1395)

---------

Co-authored-by: 张子锐 <zirui.zhang@yiducloud.cn>
Co-authored-by: Oliver Stanley <olivergestanley@gmail.com>
  • Loading branch information
3 people committed May 11, 2023
1 parent 7f1ff64 commit ea6af41
Show file tree
Hide file tree
Showing 5 changed files with 7,056 additions and 0 deletions.
14 changes: 14 additions & 0 deletions data/datasets/TSSB-3M/README.md
@@ -0,0 +1,14 @@
# Dataset summary

This dataset contains over 3 million isolated single statement bug fixes. Each
bug fix is related to a commit in a public Python that does not change more than
a single statement

1. The original dataset comes from the
[TSSB-3M](https://zenodo.org/record/5845439) dataset
2. By requesting the GitHub api to obtain the commit message, we expand and
create a new dataset
[TSSB-3M-ext](https://huggingface.co/datasets/zirui3/TSSB-3M-ext)
3. Convert `TSSB-3M-ext` into instruction form to form the
[TSSB-3M-instruction](https://huggingface.co/datasets/zirui3/TSSB-3M-instructions)
dataset
230 changes: 230 additions & 0 deletions data/datasets/TSSB-3M/generate_dataset.py
@@ -0,0 +1,230 @@
"""Convert the source TSSB-3M dataset to instruction data
"""

import json
import random
import re
from os.path import join

from tqdm import tqdm

INSTRUCTIONS_LIST = [
"Find the bug in the following code:",
"Identify the error in the code snippet provided:",
"Spot the issue within the given code segment:",
"Locate the problem in the code example below:",
"Uncover the malfunction in the following piece of code:",
"Detect the flaw in the code provided:",
"Pinpoint the glitch in the code sample below:",
"Search for the anomaly in the given code:",
"Determine the defect within the following code:",
"Discover the fault in the code segment provided:",
"Trace the irregularity in the code example below:",
"Please locate the error in the code provided.",
"Can you identify the mistake in this code?",
"There seems to be a problem with this code. Can you find it?",
"Please investigate the code and locate the bug.",
"Please examine the code and find the error.",
"Can you pinpoint the issue with this code?",
"Please review the code and identify the bug.",
"Can you detect the problem with this code?",
"Please analyze the code and find the mistake.",
"Can you spot the bug in the code provided?",
]


RESPONSE_PREFIX_WORDS = [
"The fix of the bug can be laid out as",
"The resolution of the error can be portrayed like so",
"The solution for the flaw can be summarized as such",
"The remedy of the mistake can be captured in this way",
"The correction of the fault can be depicted like this",
"The patch for the glitch can be articulated as",
"The workaround of the defect can be conveyed in this manner",
"The troubleshooting of the issue can be explained like this",
"The adjustment to the anomaly can be illustrated as follows",
"The modification for the irregularity can be exemplified like this",
]


def gen_instruction():
idx = random.randint(0, len(INSTRUCTIONS_LIST) - 1)
return INSTRUCTIONS_LIST[idx]


def gen_response_prefix():
idx = random.randint(0, len(RESPONSE_PREFIX_WORDS) - 1)
return RESPONSE_PREFIX_WORDS[idx]


TEMPLATE = """User: {}
{}
Reply: The fixed code is:
```
{}
```
"""


# template for pretty output(multiple lines with `User:` & `Reply`)
TEMPLATE_COMMIT_MSG = """User: {}
{}
Reply: {}:
{}
The fixed code is:
```
{}
```
"""

INSTRUCTON_TEMPLATE = """{}
{}
"""


# template for json output(value)

RESPONSE_TEMPLATE = """The fixed code is:
```
{}
```
"""

RESPONSE_TEMPLATE_COMMIT_MSG = """{}:
{}
The fixed code is:
```
{}
```
"""


def remove_starting_plus_minus(text):
if text.startswith("+") or text.startswith("-"):
return text[1:]
else:
return text


def remove_extraneous_diff_info(text):
pattern = "@@.*@@"
return re.sub(pattern, "", text)


def clean(text):
return remove_extraneous_diff_info(remove_starting_plus_minus(text))


def clean_PII(text):
# Remove sign-off messege generated by `git commit --signoff`, eg. "Signed-off-by: user_name <xx@yy.zz.com>"
signoff_index = text.rfind("\n\nSigned-off-by:")
if signoff_index != -1:
# Remove the sign-off string from the commit message
text = text[:signoff_index]

# remove email
email_pattern = r"[a-zA-Z0-9._%+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}"
clean_text = re.sub(email_pattern, "", text)
return clean_text


INVALID_COMMIT_MESSAGES = set([line.strip().split("\t")[0] for line in open("invalid_commit_messages.tsv").readlines()])


def is_invaid_commit_msg(text):
"""commit message that is incomplete, eg. "fix bug", "hotfix" """
return text.strip() in INVALID_COMMIT_MESSAGES


def clean_commit_msg(text):
"""
# 1. remove issue id , eg. msg: "rename (hetr_passes -> passes) #1195" -> "rename (hetr_passes -> passes)"
# 2. remove `fix` prefix:
some typical cases:
## eg. [fix] 拼写错误 -> 拼写错误
## eg. [FIX] purchase_indonesia : AttributeError 'NoneType' object has no attribute 'id' -> AttributeError 'NoneType' object has no attribute 'id'
## "fix force insert error refs #2" -> "fix force insert error"
## "Fix namespace of RPCError Fixes #76" -> "Fix namespace of RPCError"
## "fix a minor bug in survey_spec password field handling see: #5477" -> "fix a minor bug in survey_spec password field handling"
## issue #973 -> ""
## "Fixes #246" -> ""
## "Close #152." -> ""
## "wrong learning rate schedule (#2360)" -> "wrong learning rate schedule"
"""
# filter commit message that contains PII(github user name/email..)
text = clean_PII(text)

# Remove issue id
pattern = r"\(?#\d{1,6}\)?"
# re.sub(r"(.+?\s\(.+?\))\s#\d{1,6}", '\\1', text)
text = re.sub(pattern, "", text)
# Replace multiple spaces with a single space
text = re.sub(r"\s+", " ", text).strip()

# filter commit message that is too short
if len(text) < 4:
return None

if is_invaid_commit_msg(text):
return None
return text


def create(input_file, output_file, output_json=True):
fout = open(output_file, "w")
with open(input_file) as fin:
for line in tqdm(fin):
row = json.loads(line.strip())
wrong = "\n".join(clean(line) for line in row["diff"].split("\n") if not line.startswith("+"))
correct = "\n".join(clean(line) for line in row["diff"].split("\n") if not line.startswith("-"))

instruction = INSTRUCTON_TEMPLATE.format(wrong, correct)

commit_msg = clean_commit_msg(row["commit_message"]) if "commit_message" in row else None
if commit_msg:
# template: (instruct, wrong_code, resposne_prefix, commit_message, correct_code)
out_str = TEMPLATE_COMMIT_MSG.format(
gen_instruction(), wrong, gen_response_prefix(), commit_msg, correct
)
response = RESPONSE_TEMPLATE_COMMIT_MSG.format(gen_response_prefix(), commit_msg, correct)
else:
# no commit message
out_str = TEMPLATE.format(gen_instruction(), wrong, correct)
response = RESPONSE_TEMPLATE.format(correct)

if output_json:
row = {
"INSTRUCTION": instruction,
"RESPONSE": response,
"SOURCE": "TSSM-3M",
"METADATA": {
"project_url": row["project_url"],
"file_path": row["file_path"],
"commit_sha": row["commit_sha"],
},
}
out_str = json.dumps(row, ensure_ascii=False)

print(out_str, file=fout)
fout.close()


if __name__ == "__main__":
"""
# get source data from huggingface repository
!wget https://huggingface.co/datasets/zirui3/TSSB-3M-ext/blob/main/data.jsonl.gz
!gzip -d data.jsonl.gz
"""

data_dir = "."
# source TSSB-3M data
input_file = join(data_dir, "data.jsonl")

# output multiple lines
# output_file = join(data_dir, "instructions_multple_lines.txt")
# create(input_file, output_file, output_json=False)

# output jsonl
output_file = join(data_dir, "instructions.jsonl")
create(input_file, output_file, output_json=True)

0 comments on commit ea6af41

Please sign in to comment.