In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
def set_seed(seed=2022):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed(2022)


In [5]:
def train_validate_test_split(df, train_percent=.85, validate_percent=.075, seed=2022):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test


In [6]:
def train_test_split(df, train_percent=.8, seed=2022):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    train = df.iloc[perm[:train_end]]
    test = df.iloc[perm[train_end:]]
    return train, test


In [7]:
df_apache = pd.read_hdf('jira_all.h5', key="apache")
df_mongo = pd.read_hdf('jira_all.h5', key="mongodb")
df_sonatype = pd.read_hdf('jira_all.h5', key="sonatype")
df_jira = pd.read_hdf('jira_all.h5', key="jiraecosystem")


In [11]:

# List all keys in jira_all.h5
keys = pd.HDFStore('jira_all.h5').keys()
keys

['/apache', '/jiraecosystem', '/mongodb', '/sonatype']

In [8]:
df_jiradata = pd.concat([df_apache, df_mongo , df_sonatype ,df_jira], ignore_index = True )

In [10]:
df_jiradata

Unnamed: 0,id,fields.project.name,fields.created,fields.labels,fields.summary,fields.description,fields.status.name,fields.status.description,fields.issuetype.name,fields.issuetype.description,fields.issuetype.subtask,fields.comments
0,12749100,HBase,2014-10-19T13:55:54.000+0000,[],Avoid onheap buffer copying at RPCServer#serRe...,We would like to see DBB end-to-end in read pa...,Resolved,"A resolution has been taken, and it is awaitin...",Sub-task,The sub-task of the issue,True,[{'self': 'https://issues.apache.org/jira/rest...
1,13318144,Infrastructure,2020-07-20T00:15:10.000+0000,[],Create 'Heron' folder on ci-builds.a.o,"Please create a folder ""Heron"" and grant me ri...",Closed,"The issue is considered finished, the resoluti...",Task,A task that needs to be done.,False,[{'self': 'https://issues.apache.org/jira/rest...
2,12679975,Apache Flex,2013-11-19T15:42:53.000+0000,"['easyfix', 'patch']",SystemManager/mouseEventHandler replaces event...,When an application uses a class that extends ...,Open,The issue is open and ready for the assignee t...,Bug,A problem which impairs or prevents the functi...,False,[]
3,13103349,Apache Gobblin,2017-09-19T20:23:44.000+0000,['Writer:HDFS'],Support writing Kafka messages to db/table fil...,"- Add a new write file path type `DB_TABLE`, w...",Resolved,"A resolution has been taken, and it is awaitin...",Bug,A problem which impairs or prevents the functi...,False,[{'self': 'https://issues.apache.org/jira/rest...
4,12816836,Groovy,2013-03-11T14:47:40.000+0000,[],making super-call safe causes a verify error,Summary says it all; simplest test case:\n\n73...,Closed,"The issue is considered finished, the resoluti...",Bug,A problem which impairs or prevents the functi...,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...
367985,76647,Atlassian Connect,2013-02-11T17:01:50.000-0600,"['p3_core', 'plugin_point']","As a Plugins 3 plugin developer, I have the ab...",P3 plugins shall provide developers with the a...,Closed,"The issue is considered finished, the resoluti...",New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...
367986,76606,Atlassian Connect,2013-02-28T10:22:22.000-0600,"['JIRA', 'Kentoo', 'defer_triage']",Get all users for a project or a project role,"This is a follow up of ARA-117, expandUsers re...",Closed,"The issue is considered finished, the resoluti...",New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...
367987,76791,Atlassian Connect,2012-12-23T16:27:57.000-0600,['external'],I can use a JIRA project picker control in my ...,,Closed,"The issue is considered finished, the resoluti...",New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...
367988,76832,Atlassian Connect,2012-12-04T19:44:03.000-0600,"['external', 'jira']",I can add remote tab panels to the Project Adm...,Add a tab to the Project Admin page,Under Review,,New Feature,"A new feature of the product, which has yet to...",False,[{'self': 'https://ecosystem.atlassian.net/res...


In [9]:
df_jiradataset = df_jiradata[["fields.description", "fields.issuetype.name"]]


In [11]:
df_jiradataset

Unnamed: 0,fields.description,fields.issuetype.name
0,We would like to see DBB end-to-end in read pa...,Sub-task
1,"Please create a folder ""Heron"" and grant me ri...",Task
2,When an application uses a class that extends ...,Bug
3,"- Add a new write file path type `DB_TABLE`, w...",Bug
4,Summary says it all; simplest test case:\n\n73...,Bug
...,...,...
367985,P3 plugins shall provide developers with the a...,New Feature
367986,"This is a follow up of ARA-117, expandUsers re...",New Feature
367987,,New Feature
367988,Add a tab to the Project Admin page,New Feature


In [12]:
df_jiradataset.drop_duplicates(
    subset=["fields.description"], keep='last', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jiradataset.drop_duplicates(


In [13]:
df_jiradataset.dropna(inplace=True)
df_jiradataset.reset_index(inplace=True)
df_jiradataset.drop(columns=["index"], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jiradataset.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jiradataset.drop(columns=["index"], inplace=True)


In [14]:
df_jiradataset_TD = df_jiradataset[df_jiradataset["fields.issuetype.name"]=="Technical Debt"]

In [27]:
df_jiradataset_NTD = df_jiradataset[df_jiradataset["fields.issuetype.name"]
                                    != "Technical Debt"]


In [28]:
df_jiradataset_NTD.reset_index(inplace=True)


In [17]:
df_jiradataset_TD.reset_index(inplace=True)


In [18]:
df_jiradataset_TD

Unnamed: 0,index,fields.description,fields.issuetype.name
0,92612,Fixes the following issues found by spotbugs:\...,Technical Debt
1,92613,These factories are part of the old type syste...,Technical Debt
2,92614,The Flink codebases uses Kafka Client and Conf...,Technical Debt
3,92615,We should update from Log4j 2.17.0 to 2.17.1 t...,Technical Debt
4,92616,"Flink uses net.sf.py4j:py4j version 0.10.8.1, ...",Technical Debt
...,...,...,...
371,278331,This issue was raised following comments on th...,Technical Debt
372,278378,Develop a functional test which asserts that n...,Technical Debt
373,282229,Currently SourceClear scan for UPM is defined ...,Technical Debt
374,282634,We are waiting for Jira to unlocked after Impo...,Technical Debt


In [29]:
df_jiradataset_NTD

Unnamed: 0,index,fields.description,fields.issuetype.name
0,0,We would like to see DBB end-to-end in read pa...,Sub-task
1,1,"Please create a folder ""Heron"" and grant me ri...",Task
2,2,When an application uses a class that extends ...,Bug
3,3,"- Add a new write file path type `DB_TABLE`, w...",Bug
4,4,Summary says it all; simplest test case:\n\n73...,Bug
...,...,...,...
304766,305142,"We need a plugin point for remote apps, simila...",New Feature
304767,305143,P3 plugins shall provide developers with the a...,New Feature
304768,305144,"This is a follow up of ARA-117, expandUsers re...",New Feature
304769,305145,Add a tab to the Project Admin page,New Feature


In [30]:
df_jiradataset_NTD = df_jiradataset_NTD.sample(frac=0.0015 , random_state= 2022)


In [31]:
df_jiradataset_NTD

Unnamed: 0,index,fields.description,fields.issuetype.name
16272,16272,The recent change to use the custom ObjectProp...,Bug
127659,127902,I'm having an issue I think is likely faced by...,Improvement
5782,5782,When a service object cannot be created due to...,Improvement
9277,9277,I've created a patch for jakarta-commons to ad...,Bug
148372,148672,The mongos forwards the {{createIndexes}} comm...,Bug
...,...,...,...
197741,198053,The synchro tests are failing due to internal ...,Bug
267882,268252,* coder-validator: help developer check Java m...,New Project
53452,53452,Fix the following issues spotted by OSS-Fuzz:\...,Bug
229082,229438,"Hi,\r\n\r\nI need a group ID for my Java proje...",New Project


In [32]:
df_jiradataset_NTD.drop(columns=["index"], inplace = True)

In [33]:
df_jiradataset_TD.drop(columns=["index"], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jiradataset_TD.drop(columns=["index"], inplace=True)


In [34]:
df_jiradataset_NTD["label"] = 0


In [35]:
df_jiradataset_TD["label"] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jiradataset_TD["label"] = 1


In [36]:
df_jiradataset_data = pd.concat([df_jiradataset_NTD,df_jiradataset_TD],ignore_index=True )

In [37]:
df_jiradataset_data

Unnamed: 0,fields.description,fields.issuetype.name,label
0,The recent change to use the custom ObjectProp...,Bug,0
1,I'm having an issue I think is likely faced by...,Improvement,0
2,When a service object cannot be created due to...,Improvement,0
3,I've created a patch for jakarta-commons to ad...,Bug,0
4,The mongos forwards the {{createIndexes}} comm...,Bug,0
...,...,...,...
828,This issue was raised following comments on th...,Technical Debt,1
829,Develop a functional test which asserts that n...,Technical Debt,1
830,Currently SourceClear scan for UPM is defined ...,Technical Debt,1
831,We are waiting for Jira to unlocked after Impo...,Technical Debt,1


In [38]:
df_jiradataset_data.columns = ["text","class", "label"]

In [39]:
df_jira_data = df_jiradataset_data.sample(frac=1 , random_state=2022)


In [41]:
df_jira_data.reset_index(inplace=True)

In [44]:
df_jira_data.drop(columns=["index" , "class"] , inplace=True)

In [48]:
import re
from string import punctuation


def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


In [49]:
df_jira_data["text"] = df_jira_data["text"].map(clean_text)


In [51]:
df_jira_data.to_csv("jira_TD_dataset.csv")


In [52]:
train , test = train_test_split(df_jira_data)


In [53]:
train

Unnamed: 0,text,label
391,made arbiters not try to create session colle...,0
817,the flink codebases uses kafka client and conf...,1
713,i dont have a lot of information about this de...,1
485,we are waiting for jira to unlocked after impo...,1
515,theres some tech debt piling up in the request...,1
...,...,...
5,add missing documentation commentsvxqueryvxque...,0
821,in btvrfydskc wtconndatacorruption is set in ...,1
445,we disabled the column store tests in the cove...,1
365,in we added a few compiler warnings for the c...,1


In [54]:
test

Unnamed: 0,text,label
487,hi\r\rwe just brought our sqs utils project to...,0
756,the sinterstore command has been implemented b...,0
528,migration from jcenter to maven central,0
364,the author of does not support his library an...,0
0,this is in kahaimplindexindexmanagerjavathe in...,0
...,...,...
240,flink currently provides support for two logg...,1
689,the streaming fieldaccessors keyedstreamkeyby ...,1
624,when page is first loaded and input is some c...,0
173,wtpagemodifylaststabletimestamp is no longer used,1
