In [1]:
import os

import pandas as pd
from pymongo import MongoClient

# Connection settings for the SmartSHARK MongoDB instance.
# Each value can be overridden through an environment variable so that the
# notebook can be pointed at another mirror/account without editing the code.
# The defaults are the values that were previously hard-coded here
# (presumably a public read-only account, judging by the user name - TODO confirm).
connection = MongoClient(
    host=os.environ.get("SMARTSHARK_HOST", "research.cassee.dev"),
    username=os.environ.get("SMARTSHARK_USERNAME", "read-shark"),
    password=os.environ.get("SMARTSHARK_PASSWORD", "msr2021shark"))

# Handle to the database that holds the `issue` collection queried below.
db = connection.smartshark_2_1

In [2]:
# Sanity-check counts, kept for reference only. The lines are commented out; if enabled,
# each one materialises every matching document with list(...) just to count them.
# NOTE(review): Collection.count() and Cursor.count() were removed in PyMongo 4, which is
# presumably why they "do not seem to work" below; db.issue.count_documents(<filter>) is the
# replacement and counts on the server - confirm against the installed PyMongo version.
#print(len(list(db.issue.find())), "total issues")  # --> 163057 db.issue.count() or db.issue.find().count() does not seem to work
#print(len(list(db.issue.find({'issue_type' : {'$exists': True}}))), "issues containing an issue_type field")  # --> 158222
#print(len(list(db.issue.find({'issue_type_verified': {'$exists': True }}, {'issue_type': 1, 'issue_type_verified' : 1}))), # --> 15517
#      "issues containing both issue_type and issue_type_verified")  # note, every issue with issue_type_verified also has an issue_type

In [3]:
# Fetch every issue that carries an issue type (158222 out of 163057 total issues),
# restricted to the fields used in the rest of this notebook.
wanted_fields = ['_id', 'external_id', 'creator_id', 'title', 'issue_type',
                 'issue_type_verified', 'priority', 'status', 'resolution']
projection = dict.fromkeys(wanted_fields, 1)
has_issue_type = {'issue_type': {'$exists': True}}
issues = list(db.issue.find(has_issue_type, projection))

# One row per issue document
df = pd.DataFrame(issues)
df.head()

Unnamed: 0,_id,external_id,title,creator_id,issue_type,priority,status,resolution,issue_type_verified
0,58bfca910ccb2667fe269dd1,ZOOKEEPER-24,Do Application based outstanding request throt...,58b938adf04620b395fccf94,New Feature,Major,Open,,
1,58bfca920ccb2667fe269dd2,ZOOKEEPER-35,Replay logs,58b938adf04620b395fccf94,New Feature,Major,Open,,
2,58bfca930ccb2667fe269dd3,ZOOKEEPER-37,WebDAV access to ZooKeeper,58b938adf04620b395fccf94,New Feature,Major,Open,,
3,58bfca930ccb2667fe269dd4,ZOOKEEPER-46,Clients should check the latencies to the vari...,58b938adf04620b39bfccf95,Improvement,Major,Open,,
4,58bfca940ccb2667fe269dd7,ZOOKEEPER-52,Session ids on the stat cmd information,58b938adf04620b39bfccf95,Improvement,Minor,Open,,


## Issue Types (Verified)

In [4]:
# Prefer the manually verified type where one exists; fall back to the reported type otherwise.
verified_or_reported = df['issue_type_verified'].combine_first(df['issue_type'])

# Replace both original type columns by the single merged `issue_type` column
# (added as the last column of the frame).
df = (
    df.drop(columns=['issue_type', 'issue_type_verified'])
      .assign(issue_type=verified_or_reported)
)

In [5]:
# Put all issue types in lowercase so that labels differing only in casing are counted together.
df['issue_type'] = df['issue_type'].str.lower()
print("All issue type currently present:\n", df['issue_type'].value_counts())

# Grouping of values which mean the same but have different labels.
# Each *_equivs list holds the (lowercased) labels that are collapsed into the matching *_val.
feature_equivs = ['improvement', 'new feature', 'wish', 'feature_request', 'brainstorming', 'request', 'proposal']
feature_val = 'feature'
task_equivs = ['sub-task', 'task', 'technical task']
task_val = 'task'
dependency_equivs = ['dependency upgrade', 'dependency']
dep_val = 'dependency'
# NOTE(review): 'new tlp ' has a trailing space; it only matches if the stored label has one too - confirm.
other_equivs = ['other', 'epic', 'refactoring', 'temp', 'umbrella', 'story', 'blog - new blog request', 'tck challenge', 'it help', 'new jira project', 'access', 'blogs - new blog user account request', 'project', 'outage', 'new tlp ', 'new git repo', 'planned work']
other_val = 'other'

# mainly not sure what to do with: test, documentation, question, epic, refactoring, temp and umbrella

# Build one label -> canonical label lookup and apply it in a single pass over the column.
# Groups are registered in the same order the separate passes used to run, and the first
# group to claim a label wins, so the outcome matches applying the groups one after another.
issue_type_groups = [
    (feature_equivs, feature_val),
    (task_equivs, task_val),
    (dependency_equivs, dep_val),
    (other_equivs, other_val),
]
canonical_issue_type = {}
for labels, canonical in issue_type_groups:
    for label in labels:
        canonical_issue_type.setdefault(label, canonical)

# Labels that are in no group (e.g. 'bug', 'test') are kept unchanged.
df['issue_type'] = df['issue_type'].map(lambda val: canonical_issue_type.get(val, val))

print("\nCombined issue types:\n", df['issue_type'].value_counts())

All issue type currently present:
 bug                                      81504
improvement                              42240
new feature                               9676
sub-task                                  9501
task                                      8436
test                                      1739
wish                                      1397
other                                     1227
documentation                             1080
feature_request                            549
question                                   310
dependency upgrade                         153
epic                                       133
refactoring                                 92
story                                       56
temp                                        31
umbrella                                    24
brainstorming                               20
dependency                                  18
blog - new blog request                      6
technical task           

### Status

In [6]:
status_counts_before = df['status'].value_counts(dropna=False)
print("Initial unique status values\n", status_counts_before)

# Every issue has a status (no NaN shows up in the counts); interestingly, "Blocked" only occurs twice.
# Only issues that are finished, i.e. Closed or Resolved, are kept for the rest of the analysis.
finished_statuses = ['Closed', 'Resolved']
df = df[df['status'].isin(finished_statuses)]

print("\nFiltered status\n", df['status'].value_counts(dropna=False))

Initial unique status values
 Closed             81182
Resolved           47828
Open               26776
Patch Available     1106
Reopened             707
In Progress          621
Blocked                2
Name: status, dtype: int64

Filtered status
 Closed      81182
Resolved    47828
Name: status, dtype: int64


### Resolution

In [7]:
print("Initial unique resolution values:\n", df['resolution'].value_counts(dropna=False))

# Grouping of values which mean the same but have different labels.
# Each *_equivs list holds the labels that are collapsed into the matching *_val.
resolved_equivs = ['Fixed', 'Done', 'Not A Bug', 'Resolved', 'Workaround', 'Works for Me', 'Delivered', 'Implemented', 'Staged']
resolved_val = 'Resolved'
unresolved_equivs = ["Won't Fix", 'Auto Closed', "Won't Do", "Abandoned", 'Unresolved', 'Pending Closed', 'REMIND']
unresolved_val = 'Unresolved'
invalid_equivs = ['Duplicate', 'Not A Problem', 'Invalid', 'Cannot Reproduce', 'Incomplete']
invalid_val = 'Invalid'

# Open questions:
# Not a problem, not a bug -> invalid or resolved?
# information provided?   About half resolved, half closed
# Feedback Received?  About half resolved, half closed
# Later ?  Again about half resolved, half closed

# Build one label -> canonical label lookup and apply it in a single pass over the column.
# Groups are registered in the order the separate passes used to run, and the first group
# to claim a label wins, so the outcome matches applying the groups one after another.
resolution_groups = [
    (resolved_equivs, resolved_val),
    (unresolved_equivs, unresolved_val),
    (invalid_equivs, invalid_val),
]
canonical_resolution = {}
for labels, canonical in resolution_groups:
    for label in labels:
        canonical_resolution.setdefault(label, canonical)

# Labels in no group (e.g. 'Later') are kept unchanged. Issues without a resolution (NaN)
# are labelled as invalid, which is what this cell has always effectively done.
# NOTE(review): the previous `df['resolution'] = df['resolution'].dropna()` never removed
# anything, because assigning the shortened Series back re-aligns it on the index. If those
# rows should be removed instead, use `df = df.dropna(subset=['resolution'])` before mapping.
df['resolution'] = (
    df['resolution']
    .map(lambda val: canonical_resolution.get(val, val))
    .fillna(invalid_val)
)
print("\nCombined unique resolution values:\n", df['resolution'].value_counts(dropna=False))

Initial unique resolution values:
 Fixed                   99497
Won't Fix                7067
Duplicate                6914
Not A Problem            4318
Invalid                  3155
Cannot Reproduce         2598
Incomplete               2478
Done                     1018
Implemented               351
Not A Bug                 326
Resolved                  298
Later                     289
Auto Closed               180
Won't Do                  128
Abandoned                 105
Information Provided       85
Workaround                 73
Unresolved                 51
Pending Closed             39
Works for Me               17
Feedback Received           9
Delivered                   7
NaN                         4
REMIND                      2
Staged                      1
Name: resolution, dtype: int64

Combined unique resolution values:
 Resolved                101588
Invalid                  19467
Unresolved                7572
Later                      289
Information Provided   