# 🎛️ Program Setup

### Import Libraries

In [1]:
# Standard Python Imports
from collections import defaultdict
from pprint import pprint
import random

# Third-Party Imports (requires pip install)
import numpy as np
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from tqdm import tqdm

# Local Libraries
import utils

In [2]:
# Silence pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes; taking an explicit
# .copy() of filtered frames before adding columns would make this setting unnecessary.
pd.options.mode.chained_assignment = None

### Define Globals

In [3]:
# Use Case 2 Globals
# Directory (relative to the notebook) that holds the pickled data sources
DATA_PATH = './data/'

# Shared logger; it is also handed to the pickle helper below
LOG = utils.CustomLogger(
    'CustomLogger',
    log_level='info',
    display_loglevel=False,
    display_datetime=False,
)
# Helper used to load the pickled dataframes from DATA_PATH
PICKLE_LIB = utils.PickleLib(logger=LOG, data_path=DATA_PATH)

### Load Data Sources

In [4]:
# (Disabled) Load the Jira Data Sources JSON.
# NOTE(review): `json` is not imported in this notebook, so this must stay commented out
# unless the import is added as well.
# with open('./data/jira_issuetype_thematic_analysis.json') as f:
#     jira_issuetypes = json.load(f)

# Load in the evolution dataframe from the GenerateEvolutionDataframe script.
# The file is read from DATA_PATH as a gzip-compressed pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load files produced by a trusted run.
evo_df = PICKLE_LIB.pickle_load(f"{DATA_PATH}load_evolution_dataframe(sample_data_n=10000)", 'gzip')

[Start] 🥒 Loading data from Pickle: "./data/load_evolution_dataframe(sample_data_n=10000).pgzip"


	 Data: 100%|#####################################################| 555M/555M [00:03<00:00, 160MB/s]

[ End ] Duration: 00:00:04.1036





In [5]:
# Count how many issues we have from each Jira
def report_issues_per_jira(evo_df):
    """Print the total number of issues and the number of issues per Jira.

    An issue is one distinct (jira, issue_id) pair in ``evo_df``.

    Args:
        evo_df: Dataframe with at least the columns ``jira`` and ``issue_id``.

    Returns:
        None. The total and a ``{jira: issue_count}`` dict are printed to stdout.
    """
    # Group the rows by jira and issue id, which creates one group for each ticket
    issues_df_groupby = evo_df.groupby(['jira', 'issue_id'])
    # One entry per ticket; counting those entries per jira gives the tickets per jira
    # without materialising every group in a Python loop
    rows_per_issue = issues_df_groupby.size()
    issues_per_jira = rows_per_issue.groupby(level='jira').size()
    # Print the resulting count of issues per jira
    print(f"Total Issues: {issues_df_groupby.ngroups:,}")
    # Cast to plain int so the printed dict does not depend on numpy's scalar repr
    pprint({jira: int(count) for jira, count in issues_per_jira.items()})

report_issues_per_jira(evo_df)

Total Issues: 64,840
{'Apache': 7103,
 'Hyperledger': 6999,
 'IntelDAOS': 6134,
 'JFrog': 5150,
 'Jira': 4399,
 'JiraEcosystem': 6755,
 'MongoDB': 4588,
 'Qt': 5298,
 'RedHat': 4728,
 'Sakai': 4748,
 'SecondLife': 451,
 'Sonatype': 569,
 'Spring': 7918}


In [6]:
evo_df.head(10)

Unnamed: 0,jira,issue_id,history_order,field,field_evo_order,field_evo_first,field_evo_last,data_from,data_to,history_author,...,last_creator,last_reporter,last_assignee,last_commenter,last_evolver,prev_creators,prev_reporters,prev_assignees,prev_commenters,prev_evolvers
0,Apache,12851045,0,Summary,0,True,True,,Replace deprecated boxjavalibv2 with box-java-sdk,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
1,Apache,12851045,0,Description,0,True,True,,camel-box component is based on boxjavalibv2 \...,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
2,Apache,12851045,0,VersionsAffected,0,True,True,,2.15.2,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
3,Apache,12851045,0,IssueType,0,True,True,,Improvement,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
4,Apache,12851045,0,Project,0,True,True,,Camel,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
5,Apache,12851045,0,Components,0,True,True,,camel-box,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
6,Apache,12851045,0,CreatedDate,0,True,True,,2015-08-03T08:13:50.000+0000,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
7,Apache,12851045,0,ResolvedDate,0,True,True,,2017-03-06T11:15:35.000+0000,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
8,Apache,12851045,0,Status,0,True,False,,Open,Tomas Rohovsky,...,,,,,,[],[],[],[],[]
9,Apache,12851045,0,Priority,0,True,True,,Major,Tomas Rohovsky,...,,,,,,[],[],[],[],[]


### Cleaning the Data

In [7]:
evo_df.shape

(1511000, 32)

In [8]:
# Clean the data to just be description changes.
# .copy() yields an independent frame, so the 'jira_issue_id' column assigned in a later cell
# is written to this frame itself rather than to a slice of the originally loaded dataframe.
evo_df = evo_df[evo_df.field == 'Description'].copy()

In [9]:
# Add the combined "Jira Issue ID" to each row: the Jira name and the Issue ID joined by a space.
# This field creates a truly unique ID across Jiras and issues.
evo_df['jira_issue_id'] = evo_df['jira'] + ' ' + evo_df['issue_id']

In [10]:
# Minimum number of description evolutions an issue needs to be analysed
# (the creation itself counts as an evolution).
minimum_evolutions = 2
# Number of rows of the issue each row belongs to, aligned with evo_df's index
evolutions_per_issue = evo_df.groupby('jira_issue_id')['jira_issue_id'].transform('size')
evo_df = evo_df[evolutions_per_issue >= minimum_evolutions]

In [11]:
evo_df.shape

(35639, 33)

In [12]:
evo_df.head(10)

Unnamed: 0,jira,issue_id,history_order,field,field_evo_order,field_evo_first,field_evo_last,data_from,data_to,history_author,...,last_reporter,last_assignee,last_commenter,last_evolver,prev_creators,prev_reporters,prev_assignees,prev_commenters,prev_evolvers,jira_issue_id
23,Apache,13004080,0,Description,0,True,False,,I was surprised how little contention was bein...,Christopher Batey,...,,,,,[],[],[],[],[],Apache 13004080
35,Apache,13004080,1,Description,1,False,True,I was surprised how little contention was bein...,I was surprised how little contention was bein...,Christopher Batey,...,Christopher Batey,Christopher Batey,,Christopher Batey,[Christopher Batey],[Christopher Batey],[Christopher Batey],[],[Christopher Batey],Apache 13004080
88,Apache,13401522,1,Description,0,True,False,,TonY has update some versions an introduce mor...,Junfan Zhang,...,Junfan Zhang,,,Junfan Zhang,[Junfan Zhang],[Junfan Zhang],[],[],[Junfan Zhang],Apache 13401522
89,Apache,13401522,2,Description,1,False,True,TonY has update some versions an introduce mor...,TonY has update some versions an introduce mor...,Junfan Zhang,...,Junfan Zhang,,,Junfan Zhang,[Junfan Zhang],[Junfan Zhang],[],[],"[Junfan Zhang, Junfan Zhang]",Apache 13401522
98,Apache,12695730,0,Description,0,True,False,,"A route as\n{code}\n from(""dire...",Claus Ibsen,...,,,,,[],[],[],[],[],Apache 12695730
112,Apache,12695730,1,Description,1,False,True,"A route as\n{code}\n from(""dire...","A route as\n{code}\n from(""dire...",Claus Ibsen,...,Claus Ibsen,Claus Ibsen,,Claus Ibsen,[Claus Ibsen],[Claus Ibsen],[Claus Ibsen],[],[Claus Ibsen],Apache 12695730
256,Apache,13073652,0,Description,0,True,False,,"Currently, Non-windowed group aggregate is ea...",sunjincheng,...,,,,,[],[],[],[],[],Apache 13073652
269,Apache,13073652,1,Description,1,False,True,"Currently, Non-windowed group aggregate is ea...","Currently, Non-windowed group aggregate is ea...",sunjincheng,...,sunjincheng,sunjincheng,,sunjincheng,[sunjincheng],[sunjincheng],[sunjincheng],[],[sunjincheng],Apache 13073652
355,Apache,12715573,0,Description,0,True,False,,The underlying hive job failed because hive-si...,Sowmya Ramesh,...,,,,,[],[],[],[],[],Apache 12715573
365,Apache,12715573,1,Description,1,False,True,The underlying hive job failed because hive-si...,The hive job failed because hive-site.xml was ...,Sowmya Ramesh,...,Sowmya Ramesh,,,Sowmya Ramesh,[Sowmya Ramesh],[Sowmya Ramesh],[],[],[Sowmya Ramesh],Apache 12715573


### NLP Techniques for Checking Sentiment

In [13]:
# Load spaCy's small English pipeline and append the TextBlob component,
# which exposes sentiment results through the `doc._.blob` extension (see the example below).
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')  # https://spacy.io/universe/project/spacy-textblob
# text = 'I had a really horrible day. It was the worst day ever! But every now and then I have a really good day that makes me happy.'
# doc = nlp(text)
# doc._.blob.polarity                            # Polarity: -0.125
# doc._.blob.subjectivity                        # Subjectivity: 0.9
# doc._.blob.sentiment_assessments.assessments   # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]
# doc._.blob.ngrams()                            # [WordList(['I', 'had', 'a']), WordList(['had', 'a', 'really']), WordList(['a', 'really', 'horrible']), WordList(['really', 'horrible', 'day']), WordList(['horrible', 'day', 'It']), WordList(['day', 'It', 'was']), WordList(['It', 'was', 'the']), WordList(['was', 'the', 'worst']), WordList(['the', 'worst', 'day']), WordList(['worst', 'day', 'ever']), WordList(['day', 'ever', 'But']), WordList(['ever', 'But', 'every']), WordList(['But', 'every', 'now']), WordList(['every', 'now', 'and']), WordList(['now', 'and', 'then']), WordList(['and', 'then', 'I']), WordList(['then', 'I', 'have']), WordList(['I', 'have', 'a']), WordList(['have', 'a', 'really']), WordList(['a', 'really', 'good']), WordList(['really', 'good', 'day']), WordList(['good', 'day', 'that']), WordList(['day', 'that', 'makes']), WordList(['that', 'makes', 'me']), WordList(['makes', 'me', 'happy'])]

<spacytextblob.spacytextblob.SpacyTextBlob at 0x169561a30>

In [14]:
def get_sentiment(text):
    """Return the TextBlob polarity of ``text``.

    The text is run through the notebook-level spaCy pipeline ``nlp`` and the
    polarity is read from the ``blob`` extension of the resulting document.

    Args:
        text: The text to analyse.

    Returns:
        The polarity score reported by TextBlob (negative values indicate
        negative sentiment, positive values positive sentiment).
    """
    return nlp(text)._.blob.polarity

In [15]:
def get_sentiment_trends(evo_df, sample_data_n=None, random_seed=None):
    """Measure how the description sentiment changed between an issue's first and last version.

    Args:
        evo_df: Evolution dataframe with at least the columns ``jira_issue_id`` and
            ``data_to``. The first row of an issue is treated as its first description
            and the last row as its latest one, so rows are assumed to already be in
            chronological order per issue -- TODO confirm against the script that
            builds the dataframe.
        sample_data_n: Optional number of issues to analyse. When set and smaller than
            the number of unique issues, that many issues are drawn at random; a falsy
            value analyses every issue.
        random_seed: Optional seed that makes the random sample reproducible. When
            ``None`` the module-level ``random`` generator is used, as before.

    Returns:
        dict mapping each analysed ``jira_issue_id`` to a dict with the keys
        ``text_first``, ``text_first_sentiment``, ``text_last``,
        ``text_last_sentiment`` and ``sentiment_diff`` (last minus first).
        Issues whose first or last description is not a string, or whose sentiment
        could not be computed (``ValueError``), are left out.
    """

    # Get the (sorted) unique issue ids in our evolution dataframe
    jira_issue_ids_unique = list(np.unique(evo_df.jira_issue_id))

    # For time reasons, it is helpful to check just a sample of the issues
    if sample_data_n and sample_data_n < len(jira_issue_ids_unique):
        # A dedicated generator keeps a seeded sample independent of the global random state
        rng = random if random_seed is None else random.Random(random_seed)
        # Get a random sample of issues equal to the requested sample size
        jira_issue_ids_unique = rng.sample(jira_issue_ids_unique, sample_data_n)
        # Reduce the evolution dataframe to just the issues we are going to check
        evo_df = evo_df[evo_df.jira_issue_id.isin(jira_issue_ids_unique)]

    # Collect the first and last description of every issue in a single pass, instead of
    # re-filtering the whole dataframe once per issue inside the loop below
    first_last_by_issue = {
        issue_id: (descriptions.iloc[0], descriptions.iloc[-1])
        for issue_id, descriptions in evo_df.groupby('jira_issue_id', sort=False)['data_to']
    }

    # Store all sentiment trends
    issues_sentiment_trends = dict()

    # Go through each issue and compare the sentiment of its first and last description
    for jira_issue_id in tqdm(jira_issue_ids_unique, total=len(jira_issue_ids_unique), ncols=100, ascii=True):
        # Extract the first and last description text
        text_first, text_last = first_last_by_issue[jira_issue_id]

        # Only real text can be scored; missing values (None / NaN) are skipped
        if not isinstance(text_first, str) or not isinstance(text_last, str):
            continue

        # Get the sentiment of the first and last description, and then get the difference (the change)
        try:
            sentiment_first = get_sentiment(text_first)
            sentiment_last = get_sentiment(text_last)
        except ValueError:
            # Best effort: the pipeline rejected this text (presumably e.g. because it is
            # too long -- TODO confirm), so skip the issue instead of aborting the run
            continue
        sentiment_diff = sentiment_last - sentiment_first

        # Save this sentiment trend for future analysis
        issues_sentiment_trends[jira_issue_id] = {
            'text_first': text_first,
            'text_first_sentiment':  sentiment_first,
            'text_last': text_last,
            'text_last_sentiment': sentiment_last,
            'sentiment_diff': sentiment_diff,
        }

    # Return all sentiment trends found
    return issues_sentiment_trends

In [16]:
def display_issue_sentiment_trends(issues_sentiment_trends, sample_data_n=None, max_desc_len=None, min_sent_delta=None):
    """Print the first and last description of randomly ordered issues with their sentiment change.

    Args:
        issues_sentiment_trends: dict keyed by issue id; each value is a dict with at
            least the keys ``text_first``, ``text_last`` and ``sentiment_diff``.
        sample_data_n: Optional maximum number of issues to print; a falsy value prints
            every issue that passes the filters.
        max_desc_len: Optional filter; issues whose first or last description is longer
            than this many characters are not printed.
        min_sent_delta: Optional filter; issues whose absolute sentiment change is below
            this value are not printed.

    Returns:
        None. Everything is written to stdout.
    """

    # Nothing to show: say so explicitly instead of producing a blank output
    if not issues_sentiment_trends:
        print('There are no sentiment trends to display')
        return

    # Visit the issues in a fresh random order, so every call shows different samples
    shuffled_issue_ids = list(issues_sentiment_trends.keys())
    random.shuffle(shuffled_issue_ids)

    banner = '-' * 50
    rule = '------------------------------'
    shown_count = 0

    for issue_key in shuffled_issue_ids:
        trend = issues_sentiment_trends[issue_key]
        first_text = trend['text_first']
        last_text = trend['text_last']
        delta = trend['sentiment_diff']

        # Apply the optional filters before printing anything for this issue
        if max_desc_len and any(len(text) > max_desc_len for text in (first_text, last_text)):
            continue
        if min_sent_delta and abs(delta) < min_sent_delta:
            continue

        print(f"\n{banner} Issue ID: {issue_key} {banner}\n")
        print('\n'.join([rule, '------ FIRST DESCRIPTION -----', rule, f"{first_text}\n"]))
        print('\n'.join([rule, '------ LAST DESCRIPTION ------', rule, f"{last_text}\n"]))
        # Print the sentiment change between the two descriptions
        print(f"Sentiment Trend: {delta}")

        # Only issues that were actually printed count towards the sample size
        shown_count += 1
        if sample_data_n and shown_count == sample_data_n:
            break

    if shown_count == 0:
        print(f"!! No samples were displayed. !!\nThis is likely due to your filter parameters (max_desc_len or min_sent_delta) being too restrictive. There are {len(issues_sentiment_trends)} possible sentiment trends to display in 'issues_sentiment_trends'.")

### Sentiment Trend Detection

In [27]:
# Compute the sentiment trend (first vs. last description) for a random sample of issues
issues_sentiment_trends = get_sentiment_trends(evo_df, sample_data_n=1000)

100%|#########################################################| 10000/10000 [13:47<00:00, 12.09it/s]


In [30]:
# Display a few of the sentiment trends found, one issue at a time:
# at most 5 issues, descriptions of at most 200 characters, absolute sentiment change of at least 0.5
display_issue_sentiment_trends(issues_sentiment_trends, sample_data_n=5, max_desc_len=200, min_sent_delta=.5)


-------------------------------------------------- Issue ID: MongoDB 1637513 --------------------------------------------------

------------------------------
------ FIRST DESCRIPTION -----
------------------------------
For instance, 

------------------------------
------ LAST DESCRIPTION ------
------------------------------
For instance, we should raise InvalidBSON exceptions when we are unable to decode a BSON stream using libbson.

Sentiment Trend: -0.5

-------------------------------------------------- Issue ID: IntelDAOS 16900 --------------------------------------------------

------------------------------
------ FIRST DESCRIPTION -----
------------------------------
Triage 10 snapshot regression failures from CI regression (for DAOS-2473).

https://build.hpdd.intel.com/job/daos-stack/job/daos//view/change-requests/job/PR-574/8/testReport/(root)/

 

 

 

------------------------------
------ LAST DESCRIPTION ------
------------------------------
 

Pull the latest and bu