In [12]:
import pandas as pd

# Labels applied to the CSV columns, listed in the order the columns appear
# in the file.
col_names = [
    "PROJECT_NAME", "PROJECT_OWNER", "PROJECT_DESCRIPTION",
    "PROJECT_LABEL", "PROJECT_LANGUAGE",
    "SHA_FIXED", "SHA_BUG",
    "DIFF_CODE",
    "COMMIT_DESCRIPTION", "COMMIT_TIME",
    "OLD_CONTENT", "NEW_CONTENT",
    "OLD_PATH", "NEW_PATH",
    "PR_TITLE", "PR_DESCRIPTION",
]

# header=None makes pandas treat the first row as data, so the labels above
# are used as the column names.
df = pd.read_csv(
    "../resource/ghprdata.csv",
    names=col_names,
    header=None,
)

In [13]:
# Keep only the PR title and its diff, and discard rows whose title is missing.
sub_df = df.loc[:, ["PR_TITLE", "DIFF_CODE"]].dropna(subset=["PR_TITLE"])

# Quick inspection: first rows, remaining missing values per column, and the
# (rows, columns) shape -- each printed on its own line(s).
print(sub_df.head(), sub_df.isna().sum(), sub_df.shape, sep="\n")

                                            PR_TITLE  \
0              Roidmi: fix duplicated DeviceType key   
1  NPE fix: DayPickerView accessibilityAnnouncePa...   
2  Fix NPE in PDE, affecting color picker and oth...   
3  [BACKLOG-16118] Fixed issue with field lengths...   
4  Fix #670: Use Files.move instead of File.renam...   

                                           DIFF_CODE  
0  diff --git a/app/src/main/java/nodomain/freeyo...  
1  diff --git a/library/src/main/java/com/wdullae...  
2  diff --git a/core/src/processing/core/PApplet....  
3  diff --git a/engine/src/org/pentaho/di/trans/s...  
4  diff --git a/frontend-plugin-core/src/main/jav...  
PR_TITLE     0
DIFF_CODE    0
dtype: int64
(3022, 2)


In [20]:
# Sampling configuration: fraction of rows drawn into ``rev_df`` and the seed
# that makes the draw repeatable.
reverse_ratio = 0.3
random_state = 42

# Randomly pick the rows for the "reversed" partition ...
rev_df = sub_df.sample(frac=reverse_ratio, random_state=random_state)

# ... and keep every row whose index label was not sampled as the other one.
in_reversed_sample = sub_df.index.isin(rev_df.index)
ori_df = sub_df.loc[~in_reversed_sample]

In [21]:
from git_diff_tools import reverse_git_diff,extract_diff_payload

def _map_present(diffs, transform):
    """Apply ``transform`` to each non-missing entry of the Series ``diffs``.

    Entries for which ``pd.notna`` is false are passed through unchanged.
    Returns a new Series; ``diffs`` itself is not modified.
    """
    return diffs.apply(lambda d: transform(d) if pd.notna(d) else d)


# Rebuild both partitions from the untouched rows of ``sub_df`` (selected by
# the index labels of the existing split) instead of transforming
# ``rev_df`` / ``ori_df`` in place. Transforming in place made this cell
# non-idempotent: running it a second time fed already reversed / already
# extracted text back into ``reverse_git_diff`` and ``extract_diff_payload``.
rev_source = sub_df.loc[rev_df.index]
ori_source = sub_df.loc[ori_df.index]

# Reversed partition: reverse each diff first, then extract its payload.
# The missing-value check is repeated between the two steps, so a missing
# result from ``reverse_git_diff`` is not passed to ``extract_diff_payload``.
rev_df = rev_source.assign(
    DIFF_CODE=_map_present(
        _map_present(rev_source["DIFF_CODE"], reverse_git_diff),
        extract_diff_payload,
    ),
    IS_REVERSED=1,
)

# Original partition: only extract the payload; label it as not reversed.
ori_df = ori_source.assign(
    DIFF_CODE=_map_present(ori_source["DIFF_CODE"], extract_diff_payload),
    IS_REVERSED=0,
)

In [22]:
# Stack the two partitions (original first, reversed second) into a single
# frame; ignore_index=True replaces the old labels with a fresh 0..n-1 index.
partitions = [ori_df, rev_df]
final_df = pd.concat(partitions, ignore_index=True)

# Cell output: number of rows for each IS_REVERSED value.
final_df["IS_REVERSED"].value_counts()

IS_REVERSED
0    2115
1     907
Name: count, dtype: int64

In [24]:
# Plain-text preview of the first five rows of the combined frame.
print(final_df.head(n=5))

                                            PR_TITLE  \
0  NPE fix: DayPickerView accessibilityAnnouncePa...   
1  Fix NPE in PDE, affecting color picker and oth...   
2  [BACKLOG-16118] Fixed issue with field lengths...   
3  Fix #670: Use Files.move instead of File.renam...   
4  Fix possible NPE in configuration when no db p...   

                                           DIFF_CODE  IS_REVERSED  
0   \n     void accessibilityAnnouncePageChanged(...            0  
1             render();\n         } else {\n     ...            0  
2       boolean bEndedLineWrote = false;\n     bo...            0  
3   import java.io.File;\n import java.io.FileNot...            0  
4   \n import org.apereo.cas.authentication.adapt...            0  


In [25]:
# Write the combined frame to a CSV in the current working directory;
# index=False leaves the positional row index out of the file.
output_csv = "final_diff_dataset.csv"
final_df.to_csv(output_csv, index=False)