In [1]:
import os
import random
import re
from string import punctuation

import numpy as np
import pandas as pd

In [2]:
def set_seed(seed=2022):
    '''Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed; defaults to 2022.

    Note:
        ``PYTHONHASHSEED`` is read when an interpreter starts, so the value
        set here only affects processes launched afterwards, not the hash
        randomisation of the kernel that is already running.
    '''
    random.seed(seed)
    np.random.seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed(2022)


In [3]:
def train_validate_test_split(df, train_percent=.85, validate_percent=.075, seed=2022):
    '''Randomly partition ``df`` into train, validation and test frames.

    Rows are shuffled by position, so the split works for any index (for
    example a non-contiguous index left behind by filtering). The returned
    frames keep their original index labels.

    Args:
        df: DataFrame to split.
        train_percent: Fraction of rows placed in the training frame.
        validate_percent: Fraction of rows placed in the validation frame.
        seed: Seed passed to ``np.random.seed``; note that this resets
            NumPy's global generator.

    Returns:
        Tuple ``(train, validate, test)``. The test frame receives whatever
        rows remain after the first two cuts.
    '''
    np.random.seed(seed)
    m = len(df)
    # Permute positions 0..m-1 rather than index labels: the result is fed to
    # ``iloc``, which is purely positional.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test


In [4]:
def train_test_split(df, train_percent=.8, seed=2022):
    '''Randomly partition ``df`` into a train frame and a test frame.

    Rows are shuffled by position, so the split works for any index (for
    example a non-contiguous index left behind by filtering). The returned
    frames keep their original index labels.

    Args:
        df: DataFrame to split.
        train_percent: Fraction of rows placed in the training frame.
        seed: Seed passed to ``np.random.seed``; note that this resets
            NumPy's global generator.

    Returns:
        Tuple ``(train, test)``; the test frame receives the remaining rows.
    '''
    np.random.seed(seed)
    m = len(df)
    # Permute positions 0..m-1 rather than index labels: the result is fed to
    # ``iloc``, which is purely positional.
    perm = np.random.permutation(m)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]
    return train, test


In [5]:
# Each Jira source is stored under its own key in the same HDF5 file.
df_apache, df_mongo, df_sonatype, df_jira = (
    pd.read_hdf('jira_all.h5', key=source_key)
    for source_key in ("apache", "mongodb", "sonatype", "jiraecosystem")
)


In [6]:
# Stack the four sources into one frame with a fresh 0..n-1 index.
df_jiradata = pd.concat(
    (df_apache, df_mongo, df_sonatype, df_jira),
    ignore_index=True,
)

In [7]:
# Preview the combined frame built from all four Jira sources.
df_jiradata

Unnamed: 0,id,fields.project.name,fields.created,fields.labels,fields.summary,fields.description,fields.status.name,fields.status.description,fields.issuetype.name,fields.issuetype.description,fields.issuetype.subtask,fields.comments
0,12749100,HBase,2014-10-19T13:55:54.000+0000,[],Avoid onheap buffer copying at RPCServer#serRe...,We would like to see DBB end-to-end in read pa...,Resolved,"A resolution has been taken, and it is awaitin...",Sub-task,The sub-task of the issue,True,[{'self': 'https://issues.apache.org/jira/rest...
1,13318144,Infrastructure,2020-07-20T00:15:10.000+0000,[],Create 'Heron' folder on ci-builds.a.o,"Please create a folder ""Heron"" and grant me ri...",Closed,"The issue is considered finished, the resoluti...",Task,A task that needs to be done.,False,[{'self': 'https://issues.apache.org/jira/rest...
2,12679975,Apache Flex,2013-11-19T15:42:53.000+0000,"['easyfix', 'patch']",SystemManager/mouseEventHandler replaces event...,When an application uses a class that extends ...,Open,The issue is open and ready for the assignee t...,Bug,A problem which impairs or prevents the functi...,False,[]
3,13103349,Apache Gobblin,2017-09-19T20:23:44.000+0000,['Writer:HDFS'],Support writing Kafka messages to db/table fil...,"- Add a new write file path type `DB_TABLE`, w...",Resolved,"A resolution has been taken, and it is awaitin...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.apache.org/jira/rest...
4,12816836,Groovy,2013-03-11T14:47:40.000+0000,[],making super-call safe causes a verify error,Summary says it all; simplest test case:\n\n73...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...
367985,76647,Atlassian Connect,2013-02-11T17:01:50.000-0600,"['p3_core', 'plugin_point']","As a Plugins 3 plugin developer, I have the ab...",P3 plugins shall provide developers with the a...,Closed,"The issue is considered finished, the resoluti...",New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...
367986,76606,Atlassian Connect,2013-02-28T10:22:22.000-0600,"['JIRA', 'Kentoo', 'defer_triage']",Get all users for a project or a project role,"This is a follow up of ARA-117, expandUsers re...",Closed,"The issue is considered finished, the resoluti...",New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...
367987,76791,Atlassian Connect,2012-12-23T16:27:57.000-0600,['external'],I can use a JIRA project picker control in my ...,,Closed,"The issue is considered finished, the resoluti...",New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...
367988,76832,Atlassian Connect,2012-12-04T19:44:03.000-0600,"['external', 'jira']",I can add remote tab panels to the Project Adm...,Add a tab to the Project Admin page,Under Review,,New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...


In [12]:
# Keep only the columns used below. The explicit copy makes this an
# independent frame, so later modifications of it cannot trigger
# SettingWithCopyWarning.
df_jiradataset = df_jiradata[
    ["fields.labels", "fields.description", "fields.issuetype.name"]
].copy()


In [13]:
# Preview the three-column working subset.
df_jiradataset

Unnamed: 0,fields.labels,fields.description,fields.issuetype.name
0,[],We would like to see DBB end-to-end in read pa...,Sub-task
1,[],"Please create a folder ""Heron"" and grant me ri...",Task
2,"['easyfix', 'patch']",When an application uses a class that extends ...,Bug
3,['Writer:HDFS'],"- Add a new write file path type `DB_TABLE`, w...",Bug
4,[],Summary says it all; simplest test case:\n\n73...,Bug
...,...,...,...
367985,"['p3_core', 'plugin_point']",P3 plugins shall provide developers with the a...,New Feature
367986,"['JIRA', 'Kentoo', 'defer_triage']","This is a follow up of ARA-117, expandUsers re...",New Feature
367987,['external'],,New Feature
367988,"['external', 'jira']",Add a tab to the Project Admin page,New Feature


In [14]:
# Does any issue type name mention "priority"? The match ignores case, and
# rows with a missing issue type count as "no match".
contains_priority = (
    df_jiradataset['fields.issuetype.name']
    .str.contains('priority', case=False, na=False)
    .any()
)


In [36]:
# Number of rows per issue type, most frequent first (top 50 only).
(
    df_jiradataset['fields.issuetype.name']
    .value_counts()
    .head(50)
    .to_frame()
)

Unnamed: 0_level_0,count
fields.issuetype.name,Unnamed: 1_level_1
Bug,127321
New Project,65487
Improvement,63060
Task,54591
New Feature,18089
Sub-task,12812
Publishing Support,9413
Story,3803
Build Failure,2491
Question,1862


In [37]:

# Number of rows per distinct labels value, most frequent first (top 50 only).
(
    df_jiradataset['fields.labels']
    .value_counts()
    .head(50)
    .to_frame()
)

Unnamed: 0_level_0,count
fields.labels,Unnamed: 1_level_1
[],296240
['pull-request-available'],3996
['collector-298ba4e7'],2804
['neweng'],1998
['external-user'],1830
['confirmed_bug'],672
['buildbot'],589
['host-management'],574
['supportant'],563
['Bug'],558


In [28]:
# Rows whose issue type name contains the pattern (case-sensitive).
# na=False treats a missing issue type as "no match", so the mask stays
# strictly boolean and can be used for row selection.
pattern = 'Priority'
hp = df_jiradataset[
    df_jiradataset['fields.issuetype.name'].str.contains(pattern, na=False)
]
hp

Unnamed: 0,fields.labels,fields.description,fields.issuetype.name


In [15]:
# Show whether any issue type name matched the priority check computed earlier.
contains_priority

False

In [None]:
# Keep one row per distinct description (the last occurrence wins).
# Rebinding the name instead of using inplace=True avoids mutating a frame
# that was produced by column selection.
df_jiradataset = df_jiradataset.drop_duplicates(
    subset=["fields.description"], keep='last')


In [None]:
# Drop rows with any missing value, then renumber the rows 0..n-1.
# reset_index(drop=True) discards the old index directly instead of adding
# an "index" column and deleting it again.
df_jiradataset = df_jiradataset.dropna().reset_index(drop=True)


In [None]:
# Rows whose issue type is exactly "Technical Debt". Copy so that this frame
# can be modified later without SettingWithCopyWarning.
df_jiradataset_TD = df_jiradataset[
    df_jiradataset["fields.issuetype.name"] == "Technical Debt"
].copy()

In [None]:
# Every row whose issue type is anything other than "Technical Debt". Copy so
# that this frame can be modified later without SettingWithCopyWarning.
df_jiradataset_NTD = df_jiradataset[
    df_jiradataset["fields.issuetype.name"] != "Technical Debt"
].copy()


In [None]:
# Move the current index into an "index" column and renumber the rows from 0.
df_jiradataset_NTD = df_jiradataset_NTD.reset_index()


In [None]:
# Move the current index into an "index" column and renumber the rows from 0.
df_jiradataset_TD = df_jiradataset_TD.reset_index()


In [None]:
# Inspect the rows whose issue type is "Technical Debt".
df_jiradataset_TD

In [None]:
# Inspect the rows whose issue type is not "Technical Debt".
df_jiradataset_NTD

In [None]:
# Down-sample the non-"Technical Debt" rows to 0.15% with a fixed seed.
# NOTE: the result is bound to the same name, so running this cell again
# samples from the already-reduced frame.
df_jiradataset_NTD = df_jiradataset_NTD.sample(
    frac=0.0015,
    random_state=2022,
)


In [None]:
# Inspect the down-sampled non-"Technical Debt" rows.
df_jiradataset_NTD

In [None]:
# Remove the "index" column that reset_index() added.
df_jiradataset_NTD = df_jiradataset_NTD.drop(columns=["index"])

In [None]:
# Remove the "index" column that reset_index() added.
df_jiradataset_TD = df_jiradataset_TD.drop(columns=["index"])


In [None]:
# Non-"Technical Debt" rows form the negative class (label 0).
df_jiradataset_NTD = df_jiradataset_NTD.assign(label=0)


In [None]:
# "Technical Debt" rows form the positive class (label 1). assign() returns a
# new frame, so no column is written into a frame that was produced by
# boolean filtering (which raises SettingWithCopyWarning).
df_jiradataset_TD = df_jiradataset_TD.assign(label=1)


In [None]:
# Non-TD rows first, TD rows after; ignore_index gives a fresh 0..n-1 index.
df_jiradataset_data = pd.concat(
    (df_jiradataset_NTD, df_jiradataset_TD),
    ignore_index=True,
)

In [None]:
# Inspect the combined labelled frame (non-TD rows followed by TD rows).
df_jiradataset_data

In [None]:
# Rename by column name and keep exactly the three columns used downstream.
# A positional ``.columns = [...]`` assignment fails with a length mismatch
# here because the frame also carries the "fields.labels" column.
df_jiradataset_data = df_jiradataset_data.rename(
    columns={"fields.description": "text", "fields.issuetype.name": "class"}
)[["text", "class", "label"]]

In [None]:
# Shuffle all rows (frac=1 keeps every row) with a fixed seed.
df_jira_data = df_jiradataset_data.sample(
    frac=1,
    random_state=2022,
)


In [None]:
# Renumber the shuffled rows from 0; the old index becomes an "index" column.
df_jira_data = df_jira_data.reset_index()

In [None]:
# Drop the leftover "index" column and the "class" column.
df_jira_data = df_jira_data.drop(columns=["index", "class"])

In [None]:
def clean_text(text):
    '''Normalise a piece of issue text.

    The input is converted with ``str()`` and lower-cased, then the following
    are removed, in this order: text in square brackets, links
    (``http(s)://...`` and ``www....``), angle-bracket tags, punctuation,
    newline characters, and any word containing a digit.

    Newlines are deleted rather than replaced by a space, so the words on
    either side of a line break are joined.

    Args:
        text: Value to clean; non-strings are stringified first.

    Returns:
        The cleaned string.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being parsed
    # as invalid Python string escapes. Order matters: e.g. punctuation must
    # go after links and tags, which rely on it.
    removal_patterns = (
        r'\[.*?\]',
        r'https?://\S+|www\.\S+',
        r'<.*?>+',
        '[%s]' % re.escape(punctuation),
        '\n',
        r'\w*\d\w*',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text


In [None]:
# Replace the "text" column with its cleaned version.
df_jira_data = df_jira_data.assign(text=df_jira_data["text"].map(clean_text))


In [None]:
# Persist the cleaned dataset to CSV in the working directory. With pandas'
# default index=True the row index is written as an extra leading column.
df_jira_data.to_csv("jira_TD_dataset.csv")


In [None]:
# Split with the helper's defaults (train_percent=.8, seed=2022).
train, test = train_test_split(df_jira_data)


In [None]:
# Inspect the training portion of the split.
train

In [None]:
# Inspect the test portion of the split.
test