# 🎛️Program Setup

### Import Libraries

In [1]:
# Standard Python Imports
from collections import defaultdict
from pprint import pprint
import random

# Third-Party Imports (requires pip install)
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

# Local Libraries
import utils

In [2]:
pd.set_option('display.max_columns', None)  # We want to visualise all columns
pd.set_option('display.max_colwidth', None)  # Don't limit the width of the columns

### Define Globals

In [3]:
# Use Case 2 Globals
DATA_PATH = './data/'

LOG = utils.CustomLogger('CustomLogger', log_level='info', display_loglevel=False, display_datetime=False)
PICKLE_LIB = utils.PickleLib(data_path=DATA_PATH, logger=LOG)

### Load Data Sources

In [4]:
# Load in the evolution dataframe from the GenerateEvolutionDataframe script
evo_df = PICKLE_LIB.pickle_load(f"{DATA_PATH}load_evolution_dataframe(sample_data_n=10000)", 'gzip')

[Start] 🥒 Loading data from Pickle: "./data/load_evolution_dataframe(sample_data_n=10000).pgzip"


	 Data: 100%|#####################################################| 555M/555M [00:02<00:00, 231MB/s]

[ End ] Duration: 00:00:02.8913





In [5]:
evo_df.head(10)

Unnamed: 0,jira,issue_id,history_order,field,field_evo_order,field_evo_first,field_evo_last,data_from,data_to,history_author,history_created_date,issue_self,issue_creator,issue_created_date,final_issuetype,final_issuetype_theme,final_issuetype_code,final_project,minutes_since_creation,hours_since_creation,days_since_creation,field_theme,last_creator,last_reporter,last_assignee,last_commenter,last_evolver,prev_creators,prev_reporters,prev_assignees,prev_commenters,prev_evolvers
0,Apache,12851045,0,Summary,0,True,True,,Replace deprecated boxjavalibv2 with box-java-sdk,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Content,,,,,,[],[],[],[],[]
1,Apache,12851045,0,Description,0,True,True,,"camel-box component is based on boxjavalibv2 \[1\], which was made deprecated in favour of box-java-sdk \[2\] \[3\]. ""The new SDK is not backwards compatible and any code using this SDK will need to be migrated in order to take advantage of any new features and bug fixes.""\n\n\[1\] https://github.com/box/deprecated-box-java-sdk-v2\n\[2\] http://opensource.box.com/box-java-sdk/\n\[3\] https://www.box.com/blog/the-new-box-java-sdk/",Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Content,,,,,,[],[],[],[],[]
2,Apache,12851045,0,VersionsAffected,0,True,True,,2.15.2,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,MetaContent,,,,,,[],[],[],[],[]
3,Apache,12851045,0,IssueType,0,True,True,,Improvement,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,RepoStructure,,,,,,[],[],[],[],[]
4,Apache,12851045,0,Project,0,True,True,,Camel,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,RepoStructure,,,,,,[],[],[],[],[]
5,Apache,12851045,0,Components,0,True,True,,camel-box,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,RepoStructure,,,,,,[],[],[],[],[]
6,Apache,12851045,0,CreatedDate,0,True,True,,2015-08-03T08:13:50.000+0000,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[]
7,Apache,12851045,0,ResolvedDate,0,True,True,,2017-03-06T11:15:35.000+0000,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[]
8,Apache,12851045,0,Status,0,True,False,,Open,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[]
9,Apache,12851045,0,Priority,0,True,True,,Major,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[]


# Cleaning the Data

In [6]:
evo_df.shape

(1511000, 32)

In [7]:
print(f"Number of Evolutions: {evo_df.shape[0]:,}")

Number of Evolutions: 1,511,000


In [8]:
# Add to each for the combind "Jira Issue ID", which is a combination of the Jira name and the Issue ID.
# This field creates a truly unique ID across Jiras and issues.
evo_df['jira_issue_id'] = evo_df.jira + ' ' + evo_df.issue_id

In [9]:
print(f"Number of Issues: {len(evo_df.jira_issue_id.unique()):,}")

Number of Issues: 64,840


In [10]:
evo_df.shape

(1511000, 33)

In [11]:
evo_df.head(10)

Unnamed: 0,jira,issue_id,history_order,field,field_evo_order,field_evo_first,field_evo_last,data_from,data_to,history_author,history_created_date,issue_self,issue_creator,issue_created_date,final_issuetype,final_issuetype_theme,final_issuetype_code,final_project,minutes_since_creation,hours_since_creation,days_since_creation,field_theme,last_creator,last_reporter,last_assignee,last_commenter,last_evolver,prev_creators,prev_reporters,prev_assignees,prev_commenters,prev_evolvers,jira_issue_id
0,Apache,12851045,0,Summary,0,True,True,,Replace deprecated boxjavalibv2 with box-java-sdk,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Content,,,,,,[],[],[],[],[],Apache 12851045
1,Apache,12851045,0,Description,0,True,True,,"camel-box component is based on boxjavalibv2 \[1\], which was made deprecated in favour of box-java-sdk \[2\] \[3\]. ""The new SDK is not backwards compatible and any code using this SDK will need to be migrated in order to take advantage of any new features and bug fixes.""\n\n\[1\] https://github.com/box/deprecated-box-java-sdk-v2\n\[2\] http://opensource.box.com/box-java-sdk/\n\[3\] https://www.box.com/blog/the-new-box-java-sdk/",Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Content,,,,,,[],[],[],[],[],Apache 12851045
2,Apache,12851045,0,VersionsAffected,0,True,True,,2.15.2,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,MetaContent,,,,,,[],[],[],[],[],Apache 12851045
3,Apache,12851045,0,IssueType,0,True,True,,Improvement,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,RepoStructure,,,,,,[],[],[],[],[],Apache 12851045
4,Apache,12851045,0,Project,0,True,True,,Camel,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,RepoStructure,,,,,,[],[],[],[],[],Apache 12851045
5,Apache,12851045,0,Components,0,True,True,,camel-box,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,RepoStructure,,,,,,[],[],[],[],[],Apache 12851045
6,Apache,12851045,0,CreatedDate,0,True,True,,2015-08-03T08:13:50.000+0000,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[],Apache 12851045
7,Apache,12851045,0,ResolvedDate,0,True,True,,2017-03-06T11:15:35.000+0000,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[],Apache 12851045
8,Apache,12851045,0,Status,0,True,False,,Open,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[],Apache 12851045
9,Apache,12851045,0,Priority,0,True,True,,Major,Tomas Rohovsky,2015-08-03 08:13:50+00:00,https://issues.apache.org/jira/rest/api/2/issue/12851045,Tomas Rohovsky,2015-08-03 08:13:50+00:00,Improvement,Requirements,Improvement Suggestion,Camel,0.0,0.0,0.0,Workflow,,,,,,[],[],[],[],[],Apache 12851045


# NLP Techniques for Identifying Entities

In [12]:
evo_df.field.unique()

array(['Summary', 'Description', 'VersionsAffected', 'IssueType',
       'Project', 'Components', 'CreatedDate', 'ResolvedDate', 'Status',
       'Priority', 'Creator', 'Reporter', 'Assignee', 'Comments',
       'VersionsFixed', 'Resolution', 'IssueLinks', 'Labels',
       'TimeEstimateRemaining', 'TimeSpent', 'Sprint', 'Parent',
       'Environment', 'TimeEstimateOriginal', 'Rank', 'Flagged'],
      dtype=object)

# Collect Target Entities: Fields and Field States

We are interested in identifying entities that match issue fields. We want both the field names themselves, as well as
the possible states of those fields. To begin, we will create a complete list of all field names, and every value that
has ever been set to those fields, organised per field. We will segment this analysis on a per-Jira level, but you could
choose any segmentation (or not) of the data based on your level of analysis. Since fields and their available options
are set on a per-Jira level, this is a good starting point.

In [13]:
def collect_fields_and_states():

    # Collect all unique field states
    field_states = defaultdict(dict)
    
    # First, collect a list of all fields in our dataset. This is just the list of unique values in our "fields" column
    all_fields = list(evo_df.field.unique())

    # We don't want to extract the states for certain fields, such as the Summary and Description
    fields_to_ignore_state = ['Summary', 'Description', 'Comments', 'CreatedDate', 'ResolvedDate']
    fields_to_extract_states = [f for f in all_fields if f not in fields_to_ignore_state]

    # The analysis is per-Jira, so we need a list of all Jiras
    all_jiras = list(evo_df.jira.unique())

    # For each field, get all unique states this field has ever been in
    for field in fields_to_extract_states:

        # Reduce the dataset to just the relevant field entries
        evo_field_df = evo_df[evo_df.field == field]

        # Segment the unique field states that are used within each Jira
        for jira in all_jiras:

            # Reduce the dataset to just the relevant jira entries
            evo_jira_field_df = evo_field_df[evo_field_df.jira == jira]

            # Get all unqiue states this field has ever been in: stored in the data_from and data_to columns
            all_states = set(list(evo_jira_field_df.data_from) + list(evo_jira_field_df.data_to))

            # Convert all states to a string. This is required for our comparison to strings later
            all_states = set([str(state) for state in all_states if str(state).strip()])

            # Save all field jira states
            field_states[field][jira] = all_states

        # Now that we have gathered all unique states for this field across all Jiras, we want to create two more sets
        # per field: all_jiras_intersection and all_jiras_union. This allows us to check some other interesting things.
        field_states[field]['all_jiras_intersection'] = set.intersection(*list(field_states[field].values()))
        field_states[field]['all_jiras_union'] = set.union(*list(field_states[field].values()))

    return utils.defaultdict_to_dict(field_states)

field_states = collect_fields_and_states()

In [14]:
# Display the fields and field state counts. There are too many field states to reasonably visualise them.
def display_field_states_counts():

    # First, create a dict of dicts, where each dict represents a single field, and each item is the count within a Jira
    field_states_counts = {}
    for field, field_obj in field_states.items():
        field_states_counts[field] = {}
        for jira, jira_field_obj in field_obj.items():
            field_states_counts[field][jira] = len(jira_field_obj)
    
    # Convert dict of dicts into a dataframe, and display it
    display(pd.DataFrame(field_states_counts))

display_field_states_counts()

Unnamed: 0,VersionsAffected,IssueType,Project,Components,Status,Priority,Creator,Reporter,Assignee,VersionsFixed,Resolution,IssueLinks,Labels,TimeEstimateRemaining,TimeSpent,Sprint,Parent,Environment,TimeEstimateOriginal,Rank,Flagged
Apache,1293,19,433,1747,33,14,3791,3841,2734,1676,24,3508,618,75,330,73,699,774,34,3,2
Hyperledger,76,10,35,96,48,6,764,767,600,116,11,1912,340,34,31,405,687,162,33,3,2
IntelDAOS,33,12,7,83,10,18,99,99,92,66,11,2676,357,0,0,111,313,19,0,3,2
JFrog,488,16,28,149,21,13,1129,1130,167,523,11,1721,236,0,0,0,92,549,0,0,2
Jira,745,27,73,702,90,10,2257,2458,648,744,27,2771,1127,45,69,136,59,561,30,3,0
JiraEcosystem,749,18,121,349,60,6,1229,1256,429,1150,24,1934,608,37,96,309,436,683,27,3,2
MongoDB,463,22,53,258,32,12,1254,1501,475,793,12,2649,477,0,0,866,58,552,0,3,0
Qt,615,10,36,397,15,12,2214,2214,614,456,12,1448,259,21,6,238,331,1844,19,3,0
RedHat,1349,31,253,906,130,16,1605,1617,1198,1982,22,1529,686,33,31,657,409,98,19,3,2
Sakai,288,9,45,260,13,6,609,614,329,286,11,1814,149,25,20,3,212,1040,22,2,2


In [15]:
pprint(field_states['IssueType']['all_jiras_intersection'])

{'None', 'Bug', 'Task'}


In [16]:
field_states.keys()

dict_keys(['VersionsAffected', 'IssueType', 'Project', 'Components', 'Status', 'Priority', 'Creator', 'Reporter', 'Assignee', 'VersionsFixed', 'Resolution', 'IssueLinks', 'Labels', 'TimeEstimateRemaining', 'TimeSpent', 'Sprint', 'Parent', 'Environment', 'TimeEstimateOriginal', 'Rank', 'Flagged'])

# Search for Target Entities

Search Method: Text must contain 1) a field name and 2) any field value we found earlier (for that field and Jira)

In [17]:
def get_discussion_analysis_items(evo_df, field_states, fields_to_analyse=None, num_issues_to_search=None):

    def save_discussion_item(jira_issue_id, text, field, field_state):
        issue_discussion_items[jira_issue_id][field].append({
            'field_state': field_state,
            'text': text,
        })

    # If not specified, analyse all fields in evo_df
    if not fields_to_analyse:
        fields_to_analyse = list(evo_df.field.unique())

    # We are only analysing the Comments and Description (selectable)
    evo_df = evo_df[evo_df.field.isin([
        'Comments',
        # 'Description',
    ])]
    # We are not interested in analysing the "creational" evolutions
    evo_df = evo_df[evo_df.history_order > 0]

    # Get the set of unique issue ids in our evolution dataframe
    jira_issue_ids = list(evo_df.jira_issue_id.unique())
    # Shuffle the data, so people running "sample_issue_num" get different ones each time
    random.shuffle(jira_issue_ids)

    # Limit the number of issues searched, if that is requested
    if num_issues_to_search:
        jira_issue_ids = jira_issue_ids[:num_issues_to_search]
    
    # Store all identified discussion items
    issue_discussion_items = defaultdict(lambda: defaultdict(list))
    
    # For each issue, check the Description and Comments for the target fields
    for issue_index, jira_issue_id in enumerate(tqdm(jira_issue_ids, total=len(jira_issue_ids), ncols=100, ascii=True)):

        # Reduce evo_df to just the relevant data
        evo_jira_issue_df = evo_df[evo_df.jira_issue_id == jira_issue_id]

        # Extract the jira of this issue, for future use
        issue_jira = evo_jira_issue_df.iloc[0].jira

        # Analyse every "data_to" text field
        for _, evolution in evo_jira_issue_df.iterrows():

            # Extract the text
            text = evolution.data_to
            if not isinstance(text, str):
                continue  # The text must be a string

            # Check all requested fields
            for field in fields_to_analyse:

                if field not in text:
                    continue  # We didn't find any mention of this field
                
                # We want to check every past field state
                for field_state in field_states[field][issue_jira]:
                    if field_state in text:
                        # Save this item
                        save_discussion_item(jira_issue_id, text, field, field_state)

    print(f"Number of discussion items found: {sum(
        [len(n_item) for item in issue_discussion_items.values() for n_item in item.values()])}")
    
    return utils.defaultdict_to_dict(issue_discussion_items)

issue_discussion_items = get_discussion_analysis_items(
    evo_df, field_states, fields_to_analyse=['Priority'], num_issues_to_search=10000)

100%|########################################################| 10000/10000 [01:05<00:00, 152.05it/s]

Number of discussion items found: 6





In [18]:
print(f"Number of issues with Discussion Items: {len(issue_discussion_items)}")
print(f"Number of total Discussion Items: {sum(
    [len(n_item) for item in issue_discussion_items.values() for n_item in item.values()])}")

Number of issues with Discussion Items: 6
Number of total Discussion Items: 6


In [19]:
def display_issue_discussion_items(issue_discussion_items, sample_data_n=None):

    # If there are no ambiguities, this provides a helpful message instead of a blank output
    if not issue_discussion_items:
        print('There are no discussion items to display')
        return

    # Shuffle the keys so we can print a random order every time.
    issues_ambs_found_keys = list(issue_discussion_items.keys())
    random.shuffle(list(issue_discussion_items.keys()))

    # Sample the data displayed, if requested
    if sample_data_n:
        issues_ambs_found_keys = issues_ambs_found_keys[:sample_data_n]

    # Use those keys to build a shuffled dataset for displaying
    issue_discussion_items = {key: issue_discussion_items[key] for key in issues_ambs_found_keys}
    
    # Display each group of ambiguities, one issue at a time
    for jira_issue_id, issue_discussion_items in issue_discussion_items.items():
        for field, issue_field_discussion_items in issue_discussion_items.items():
            # Print the identifiers of this discussion item
            print(f"\n{'-'*50} New Discussion Item {'-'*50}\n")
            print(f"{jira_issue_id} {field}")
            pprint(issue_field_discussion_items)

# Display the discussion items found, one issue at a time
display_issue_discussion_items(issue_discussion_items, sample_data_n=None)


-------------------------------------------------- New Discussion Item --------------------------------------------------

Apache 12928477 Priority
[{'field_state': 'P0',
  'text': 'Standard Output\n'
          '{noformat}\n'
          'Previously run tests: [ClientServerTransactionDUnitTest, '
          'ClearDAckDUnitTest, SizingFlagDUnitTest, '
          'EvictionObjectSizerDUnitTest, GridAdvisorDUnitTest, '
          'PartitionedRegionDestroyDUnitTest, Bug40632DUnitTest, '
          'NetSearchMessagingDUnitTest, ClientServerGetAllDUnitTest, '
          'ConnectDisconnectDUnitTest, PartitionedRegionEntryCountDUnitTest, '
          'PartitionedRegionBucketCreationDistributionDUnitTest, '
          'ConcurrentMapOpsDUnitTest, PartitionedRegionAPIDUnitTest, '
          'Bug41091DUnitTest, PartitionedRegionLocalMaxMemoryDUnitTest, '
          'OffHeapEvictionStatsDUnitTest, '
          'ClientServerTransactionCCEDUnitTest]\n'
          '[vm_1][info 2016/01/06 11:13:13.900 PST <RMI TCP 