In [11]:
import pymongo

For more details, refer to https://zenodo.org/record/5901956

Command used to export the data (this command takes about 15 minutes to complete).

`mongodump --db=JiraRepos --gzip --archive=mongodump-JiraRepos.archive`

Accompanying command to restore the data (this command takes about 15 minutes to complete). Expanded, this data is ~60GB inside MongoDB.

`mongorestore --gzip --archive=mongodump-JiraRepos.archive --nsFrom "JiraRepos.*" --nsTo "JiraRepos.*"`

Change the `--nsTo` option to contain the desired name for the JiraRepos database, for example:

`mongorestore --gzip --archive=mongodump-JiraRepos.archive --nsFrom "JiraRepos.*" --nsTo "JiraRepos.Apache"`


For more information see: https://docs.mongodb.com/manual/tutorial/backup-and-restore-tools/

The Jira dataset, filtered for TD, was extracted from https://zenodo.org/record/5901956 (https://arxiv.org/pdf/2201.08368.pdf) and adapted.

Montgomery, Lloyd, Lüders, Clara, & Maalej, Prof. Dr. Walid. (2022). The Public Jira Dataset [Data set]. Zenodo. https://doi.org/10.5281/zenodo.5901956



In [12]:
# Client handle for the MongoDB server at localhost:27017.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")


In [13]:
# Database named "JiraRepos", the name used by the mongodump/mongorestore commands above.
mydb = myclient["JiraRepos"]


In [14]:
# Handles for four of the repository collections stored in the database.
myapache, myjiraecosystem, mySonatype, mymongo = (
    mydb[collection_name]
    for collection_name in ("Apache", "JiraEcosystem", "Sonatype", "MongoDB")
)


In [15]:
# Names of the collections present in the database; the bare name on the
# last line makes the notebook display the list.
collist = mydb.list_collection_names()
collist


['Apache',
 'SecondLife',
 'Mindville',
 'MariaDB',
 'JiraEcosystem',
 'IntelDAOS',
 'JFrog',
 'Spring',
 'Qt',
 'Jira',
 'MongoDB',
 'Hyperledger',
 'RedHat',
 'Sonatype',
 'Sakai',
 'Mojang']

In [16]:
import pandas as pd

In [17]:
def get_keys(dictionary):
    """Return every key path found in a (possibly nested) dictionary.

    The key of a nested dictionary is reported first, followed by the
    slash-separated paths of everything beneath it.

    :param dictionary: dictionary whose values may themselves be dictionaries.
    :return: list of key paths in depth-first insertion order, e.g.
        ``{"a": {"b": 1}, "c": 2}`` gives ``["a", "a/b", "c"]``.
    """
    result = []
    for key, value in dictionary.items():
        result.append(key)
        # isinstance (not an exact type comparison) so that dict subclasses
        # such as OrderedDict are descended into, as flatten_dict does.
        if isinstance(value, dict):
            result.extend(f'{key}/{inner_key}' for inner_key in get_keys(value))
    return result


In [18]:
def flatten_dict(nested_dict):
    """Flatten a nested dictionary into a single-level one keyed by tuples.

    Each leaf value is stored under the tuple of keys that leads to it, so
    ``{"a": {"b": 1}, "c": 2}`` becomes ``{("a", "b"): 1, ("c",): 2}``.
    A value that is not a dictionary is returned as ``{(): value}``, and an
    empty dictionary contributes no entries.

    :param nested_dict: dictionary (or leaf value) to flatten.
    :return: dict mapping key-path tuples to leaf values.
    """
    # Leaf reached: the empty tuple is the path the callers prepend to.
    if not isinstance(nested_dict, dict):
        return {(): nested_dict}

    flat = {}
    for outer_key, child in nested_dict.items():
        for path, leaf in flatten_dict(child).items():
            flat[(outer_key, *path)] = leaf
    return flat


def nested_dict_to_df(values_dict):
    """Build a DataFrame from a nested dictionary.

    The dictionary is first flattened with ``flatten_dict`` so that every
    leaf value is keyed by the tuple of keys leading to it; the flattened
    mapping is then handed to pandas with ``orient="index"``, i.e. its keys
    become the row labels.

    :param values_dict: nested dictionary to convert.
    :return: the resulting ``pandas.DataFrame``.
    """
    return pd.DataFrame.from_dict(flatten_dict(values_dict), orient="index")


In [19]:
import pymongo as pm

# Number of documents per exported chunk; also passed as the cursor batch size.
CHUNK_SIZE = 500
# No URI is given, so pymongo uses its default connection settings.
client = pm.MongoClient()
coll = client['JiraRepos']['Sonatype']
# Empty filter: the cursor iterates over every document in the collection.
cursor = coll.find({}, batch_size=CHUNK_SIZE)


def yield_rows(cursor, chunk_size):
    """
    Generator to yield chunks from cursor.

    Every chunk is a fresh list, so chunks stay valid after the generator
    advances (e.g. when collected with ``list(...)``). No chunk is yielded
    for an empty cursor, and the last chunk may be shorter than
    ``chunk_size``.

    :param cursor: iterable of rows (e.g. a pymongo cursor)
    :param chunk_size: maximum number of rows per chunk; must be >= 1
    :return: iterator over lists of at most ``chunk_size`` rows
    :raises ValueError: if ``chunk_size`` is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

    chunk = []
    for row in cursor:
        chunk.append(row)
        if len(chunk) == chunk_size:
            yield chunk
            # Start a new list instead of clearing the yielded one in place,
            # which would empty the chunk the consumer may still hold.
            chunk = []
    # Emit the remainder only if there is one; an empty trailing chunk would
    # produce an empty frame downstream.
    if chunk:
        yield chunk


chunks = yield_rows(cursor, CHUNK_SIZE)
for chunk in chunks:
    # Flatten the nested issue documents into dotted column names and keep
    # only the selected issue fields.
    df = pd.json_normalize(chunk, errors='ignore').loc[:, [
        "id",
        "fields.project.name",
        "fields.created",
        "fields.labels",
        "fields.summary",
        "fields.description",
        "fields.status.name",
        "fields.status.description",
        "fields.issuetype.name",
        "fields.issuetype.description",
        "fields.issuetype.subtask",
        "fields.comments",
    ]]

    # One CSV file per chunk, named after the id of the chunk's first row.
    df.to_csv(f"Sonatype-{df.id[0]}.csv")

   

In [20]:
import dask.dataframe as dd


In [22]:
# Read every chunk file written by the export loop (glob pattern) as one dask dataframe.
data_Sonatype = dd.read_csv("Sonatype-*.csv")


In [23]:
# Materialise the dask dataframe in memory.
# NOTE(review): the name is rebound to the computed result, so re-running this
# cell without re-running the read_csv cell presumably fails (the computed
# frame is expected to have no .compute()) — confirm.
data_Sonatype = data_Sonatype.compute()


In [24]:
# Display the combined frame (the bare name is rendered as the cell output).
data_Sonatype


Unnamed: 0.1,Unnamed: 0,id,fields.project.name,fields.created,fields.labels,fields.summary,fields.description,fields.status.name,fields.status.description,fields.issuetype.name,fields.issuetype.description,fields.issuetype.subtask,fields.comments
0,0,10119,Dev - Nexus Repo,2008-06-24T11:47:21.000+0000,[],Snapshot Remover not handling artifacts using ...,Currently the snapshot remover is not removing...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
1,1,10118,Dev - Nexus Repo,2008-06-23T21:21:17.000+0000,[],Evict Service fails to remove artifacts in Rep...,\r\nThe evict scheduled service does not delet...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
2,2,10117,Dev - Nexus Repo,2008-06-23T15:45:35.000+0000,[],Repository targets,A repository may consist of several logical ta...,Closed,"The issue is considered finished, the resoluti...",Improvement,An improvement or enhancement to an existing f...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
3,3,10114,Dev - Nexus Repo,2008-06-23T15:34:33.000+0000,[],User Management,The user should be able to create and manage u...,Closed,"The issue is considered finished, the resoluti...",Improvement,An improvement or enhancement to an existing f...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
4,4,10113,Dev - Nexus Repo,2008-06-23T15:14:23.000+0000,[],After Evicting all artifacts in all repositori...,"I ran an evict task for all repositories, and ...",Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,834582,Community Support - Open Source Project Reposi...,2021-12-18T01:30:58.000+0000,[],Rule failure while trying to close staging rep...,[信息] * 本地上演的工件上传完成。\r\n [信息] * 正在关闭 ID 为“orgmv...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...
496,496,834580,Community Support - Open Source Project Reposi...,2021-12-18T01:15:56.000+0000,[],Create new com.babylonhealth.lit project,I get a `Profile com.babylonhealth.lit is not ...,Closed,"The issue is considered finished, the resoluti...",New Project,Add a new project to the repository,False,[{'self': 'https://issues.sonatype.org/rest/ap...
497,497,834579,Community Support - Open Source Project Reposi...,2021-12-18T00:20:04.000+0000,['migrate'],Please migrate com.solace and com.solacesystems,With the recent instability we have been unabl...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...
498,498,834568,Community Support - Open Source Project Reposi...,2021-12-17T23:41:19.000+0000,[],Timeout closing repository,I have been unable to publish to maven central...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...


In [25]:
# "Unnamed: 0" is the row index that to_csv wrote into each chunk file (the
# export loop did not pass index=False), so it is dropped here.
data_Sonatype1 = data_Sonatype.drop(columns="Unnamed: 0")


In [28]:
# Display the frame after dropping the redundant index column.
data_Sonatype1


Unnamed: 0,id,fields.project.name,fields.created,fields.labels,fields.summary,fields.description,fields.status.name,fields.status.description,fields.issuetype.name,fields.issuetype.description,fields.issuetype.subtask,fields.comments
0,10119,Dev - Nexus Repo,2008-06-24T11:47:21.000+0000,[],Snapshot Remover not handling artifacts using ...,Currently the snapshot remover is not removing...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
1,10118,Dev - Nexus Repo,2008-06-23T21:21:17.000+0000,[],Evict Service fails to remove artifacts in Rep...,\r\nThe evict scheduled service does not delet...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
2,10117,Dev - Nexus Repo,2008-06-23T15:45:35.000+0000,[],Repository targets,A repository may consist of several logical ta...,Closed,"The issue is considered finished, the resoluti...",Improvement,An improvement or enhancement to an existing f...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
3,10114,Dev - Nexus Repo,2008-06-23T15:34:33.000+0000,[],User Management,The user should be able to create and manage u...,Closed,"The issue is considered finished, the resoluti...",Improvement,An improvement or enhancement to an existing f...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
4,10113,Dev - Nexus Repo,2008-06-23T15:14:23.000+0000,[],After Evicting all artifacts in all repositori...,"I ran an evict task for all repositories, and ...",Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
...,...,...,...,...,...,...,...,...,...,...,...,...
495,834582,Community Support - Open Source Project Reposi...,2021-12-18T01:30:58.000+0000,[],Rule failure while trying to close staging rep...,[信息] * 本地上演的工件上传完成。\r\n [信息] * 正在关闭 ID 为“orgmv...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...
496,834580,Community Support - Open Source Project Reposi...,2021-12-18T01:15:56.000+0000,[],Create new com.babylonhealth.lit project,I get a `Profile com.babylonhealth.lit is not ...,Closed,"The issue is considered finished, the resoluti...",New Project,Add a new project to the repository,False,[{'self': 'https://issues.sonatype.org/rest/ap...
497,834579,Community Support - Open Source Project Reposi...,2021-12-18T00:20:04.000+0000,['migrate'],Please migrate com.solace and com.solacesystems,With the recent instability we have been unabl...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...
498,834568,Community Support - Open Source Project Reposi...,2021-12-17T23:41:19.000+0000,[],Timeout closing repository,I have been unable to publish to maven central...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...


In [29]:
# Number of issues per issue type, most frequent first.
data_Sonatype1["fields.issuetype.name"].value_counts()


New Project           65487
Publishing Support     9413
Bug                    6495
Improvement            2480
Problem                1597
Sub-task               1342
Story                   271
Task                    147
Technical Debt           29
New Feature               9
Epic                      7
Technical task            5
Question                  2
Name: fields.issuetype.name, dtype: int64

In [30]:
# The row labels restart for every chunk file (the previous display ends at
# label 498 although the frame has far more rows), so renumber them 0..n-1.
# Rebinding the name instead of using inplace=True avoids mutating the frame
# in place across cells.
data_Sonatype1 = data_Sonatype1.reset_index(drop=True)


In [31]:
# Display the frame again to check the renumbered row labels.
data_Sonatype1

Unnamed: 0,id,fields.project.name,fields.created,fields.labels,fields.summary,fields.description,fields.status.name,fields.status.description,fields.issuetype.name,fields.issuetype.description,fields.issuetype.subtask,fields.comments
0,10119,Dev - Nexus Repo,2008-06-24T11:47:21.000+0000,[],Snapshot Remover not handling artifacts using ...,Currently the snapshot remover is not removing...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
1,10118,Dev - Nexus Repo,2008-06-23T21:21:17.000+0000,[],Evict Service fails to remove artifacts in Rep...,\r\nThe evict scheduled service does not delet...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
2,10117,Dev - Nexus Repo,2008-06-23T15:45:35.000+0000,[],Repository targets,A repository may consist of several logical ta...,Closed,"The issue is considered finished, the resoluti...",Improvement,An improvement or enhancement to an existing f...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
3,10114,Dev - Nexus Repo,2008-06-23T15:34:33.000+0000,[],User Management,The user should be able to create and manage u...,Closed,"The issue is considered finished, the resoluti...",Improvement,An improvement or enhancement to an existing f...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
4,10113,Dev - Nexus Repo,2008-06-23T15:14:23.000+0000,[],After Evicting all artifacts in all repositori...,"I ran an evict task for all repositories, and ...",Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.sonatype.org/rest/ap...
...,...,...,...,...,...,...,...,...,...,...,...,...
87279,834582,Community Support - Open Source Project Reposi...,2021-12-18T01:30:58.000+0000,[],Rule failure while trying to close staging rep...,[信息] * 本地上演的工件上传完成。\r\n [信息] * 正在关闭 ID 为“orgmv...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...
87280,834580,Community Support - Open Source Project Reposi...,2021-12-18T01:15:56.000+0000,[],Create new com.babylonhealth.lit project,I get a `Profile com.babylonhealth.lit is not ...,Closed,"The issue is considered finished, the resoluti...",New Project,Add a new project to the repository,False,[{'self': 'https://issues.sonatype.org/rest/ap...
87281,834579,Community Support - Open Source Project Reposi...,2021-12-18T00:20:04.000+0000,['migrate'],Please migrate com.solace and com.solacesystems,With the recent instability we have been unabl...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...
87282,834568,Community Support - Open Source Project Reposi...,2021-12-17T23:41:19.000+0000,[],Timeout closing repository,I have been unable to publish to maven central...,Closed,"The issue is considered finished, the resoluti...",Publishing Support,For general OSSRH publishing support tickets,False,[{'self': 'https://issues.sonatype.org/rest/ap...


In [54]:
# Store the frame under the "sonatype" key of jira_all.h5.
# The warning shown in the output comes from the object-dtype columns, which
# PyTables pickles because it cannot map them directly to C types.
data_Sonatype1.to_hdf('jira_all.h5', key="sonatype")


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['fields.project.name', 'fields.created', 'fields.labels',
       'fields.summary', 'fields.description', 'fields.status.name',
       'fields.status.description', 'fields.issuetype.name',
       'fields.issuetype.description', 'fields.comments'],
      dtype='object')]

  data_mongo1.to_hdf('jira_all.h5', key="mongodb")


In [None]:
# Reload one frame per repository from the HDF5 store.
# NOTE(review): only the "sonatype" key is written in the visible part of this
# notebook; "apache", "mongodb" and "jiraecosystem" must already exist in
# jira_all.h5 (presumably written by equivalent cells elsewhere) — confirm.
df = pd.read_hdf('jira_all.h5', key="apache")
df1 = pd.read_hdf('jira_all.h5', key="mongodb")
df2 = pd.read_hdf('jira_all.h5', key="sonatype")
df3 = pd.read_hdf('jira_all.h5', key="jiraecosystem")
