In [20]:
import pandas as pd

# Load the raw issue and link exports for each tracker (Jira, MongoDB,
# Hyperledger). Every file is read as a semicolon-delimited, UTF-8 CSV
# located under ./data.
jira_df = pd.read_csv(
    './data/issues_jira.csv',
    sep=';',
    encoding="UTF-8",
    low_memory=False,
)
jira_links_df = pd.read_csv(
    './data/links_jira.csv',
    sep=';',
    encoding="UTF-8",
    low_memory=False,
)

mongodb_df = pd.read_csv(
    './data/issues_mongodb.csv',
    sep=';',
    encoding="UTF-8",
    low_memory=False,
)
mongodb_links_df = pd.read_csv(
    './data/links_mongodb.csv',
    sep=';',
    encoding="UTF-8",
    low_memory=False,
)

hyperledger_df = pd.read_csv(
    './data/issues_hyperledger.csv',
    sep=';',
    encoding="UTF-8",
    low_memory=False,
)
hyperledger_links_df = pd.read_csv(
    './data/links_hyperledger.csv',
    sep=';',
    encoding="UTF-8",
    low_memory=False,
)

### Conversion of Issues into JSONL File

In [21]:
# Keep only the identifier and the two text fields of every issue; all other
# columns of the raw exports are discarded from here on.
issue_columns = ['issue_id', 'title', 'description']

jira_df = jira_df[issue_columns]
mongodb_df = mongodb_df[issue_columns]
hyperledger_df = hyperledger_df[issue_columns]

In [22]:
# Serialize each issue frame to JSON Lines text: one JSON object per row,
# rows separated by newlines. No output path is given, so to_json returns
# the text instead of writing a file.
# NOTE(review): each *_df name is rebound from a DataFrame to a str here, so
# re-running this cell without re-running the cells above it will fail.
jira_df = jira_df.to_json(path_or_buf=None, lines=True, orient='records')
mongodb_df = mongodb_df.to_json(path_or_buf=None, lines=True, orient='records')
hyperledger_df = hyperledger_df.to_json(path_or_buf=None, lines=True, orient='records')

In [23]:
# Write each tracker's serialized issues to its own .jsonl file under ./data.
# The *_df names are written directly with f.write, i.e. they are expected to
# hold the JSONL text at this point, not DataFrames.
issue_exports = (
    ('./data/issues_jira.jsonl', jira_df),
    ('./data/issues_mongodb.jsonl', mongodb_df),
    ('./data/issues_hyperledger.jsonl', hyperledger_df),
)

for jsonl_path, jsonl_text in issue_exports:
    with open(jsonl_path, 'w') as f:
        f.write(jsonl_text)

### Conversion of Links into JSONL File

In [24]:
# Restrict every link table to rows whose linktype is exactly 'Duplicate',
# then collapse rows sharing the same 'name' down to a single row
# (drop_duplicates keeps the first occurrence by default).
jira_links_df = (
    jira_links_df
    .loc[jira_links_df['linktype'].eq('Duplicate')]
    .drop_duplicates(subset=['name'])
)

mongodb_links_df = (
    mongodb_links_df
    .loc[mongodb_links_df['linktype'].eq('Duplicate')]
    .drop_duplicates(subset=['name'])
)

hyperledger_links_df = (
    hyperledger_links_df
    .loc[hyperledger_links_df['linktype'].eq('Duplicate')]
    .drop_duplicates(subset=['name'])
)

In [25]:
# Keep only the two issue identifiers of each link; 'name', 'linktype' and
# any other columns are no longer needed after the filtering step.
link_columns = ['issue_id_1', 'issue_id_2']

jira_links_df = jira_links_df[link_columns]
mongodb_links_df = mongodb_links_df[link_columns]
hyperledger_links_df = hyperledger_links_df[link_columns]

In [26]:
# Serialize each link frame to JSON Lines text: one JSON object per row,
# rows separated by newlines. No output path is given, so to_json returns
# the text instead of writing a file.
# NOTE(review): each *_links_df name is rebound from a DataFrame to a str
# here, so re-running this cell without re-running the cells above it will fail.
jira_links_df = jira_links_df.to_json(path_or_buf=None, lines=True, orient='records')
mongodb_links_df = mongodb_links_df.to_json(path_or_buf=None, lines=True, orient='records')
hyperledger_links_df = hyperledger_links_df.to_json(path_or_buf=None, lines=True, orient='records')

In [27]:
# Write each tracker's serialized duplicate links to its own .jsonl file
# under ./data. The *_links_df names are written directly with f.write, i.e.
# they are expected to hold the JSONL text at this point, not DataFrames.
link_exports = (
    ('./data/links_jira.jsonl', jira_links_df),
    ('./data/links_mongodb.jsonl', mongodb_links_df),
    ('./data/links_hyperledger.jsonl', hyperledger_links_df),
)

for jsonl_path, jsonl_text in link_exports:
    with open(jsonl_path, 'w') as f:
        f.write(jsonl_text)