# 🎛️Program Setup

### Import Libraries

In [1]:
# Standard Python Imports
from collections import defaultdict
import csv
import json
from pprint import pprint
import random
import re
import sys

# Third-Party Imports (requires pip install)
import nltk
from nltk.corpus import wordnet as wn
from pymongo import MongoClient
import spacy
from tqdm import tqdm

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Hermes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Load Data Sources

In [6]:
# Load the Jira Data Sources JSON
with open('./data/jira_issuetype_thematic_analysis.json') as f:
    jira_issuetypes = json.load(f)

### Define Globals

In [7]:
# Load the spacy English model. Must first run "python -m spacy download en" in terminal to download the package
nlp_spacy_en = spacy.load('en_core_web_sm')
# Use Case 1 Globals
data_folder = './data'

### Get Use-Case Data: 1️⃣ Use Database or 2️⃣ Attached TSV

##### 1️⃣ Export from DB and into a CSV

In [8]:
# Connect to the Mongo database
mongo_client = MongoClient()
db = mongo_client['JiraRepos']

In [9]:
# Gather list of Jiras and their "Story" issue type
jira_story_map = []
for jira, issuetypes in jira_issuetypes.items():
    for issuetype, issuetype_obj in issuetypes.items():
        if issuetype_obj['theme'] == 'Requirements' and issuetype_obj['code'] == 'Story':
            jira_story_map.append((jira, issuetype))

# Retrieve counts of the "Story" issue type per Jira
results = []
for jira, story_issuetype in tqdm(jira_story_map, total=len(jira_story_map), ncols=100, ascii=True):
    result = list(db[jira].aggregate([
        # Match only certain issues
        { '$match': { '$and': [
            # Limit the data to resolved issues, otherwise we have "in progress" issues which are not yet complete
            { 'fields.resolution': { '$ne': None } },
            # Match the Jira-specific name for their Story issue type
            { 'fields.issuetype.name': story_issuetype },
        ]}},
        # Count the number of documents at this point in the pipeline
        { '$count': 'count' }
    ], allowDiskUse=True))
    issuetype_num_issues = int(result[0]['count']) if len(result) and 'count' in result[0] else 0
    # Save this result to be printed later
    results.append((jira, story_issuetype, issuetype_num_issues))

for jira, story_issuetype, issuetype_num_issues in results:
    print(f"{issuetype_num_issues: >6,} ({jira}, {story_issuetype})")
    
# RESULT (for those without the database data)
#  1,538 (Apache, Story)
#  3,942 (Hyperledger, Story)
#  4,536 (IntelDAOS, Story)
#      0 (JFrog, Story)
#  2,788 (JiraEcosystem, Story)
#    245 (MongoDB, Story)
#  2,375 (Qt, User Story)
# 13,452 (RedHat, Story)
#    337 (RedHat, Requirement)
#    220 (Sonatype, Story)
#  2,286 (Spring, Story)
#    301 (SecondLife, Story)

100%|###############################################################| 12/12 [00:53<00:00,  4.49s/it]

 1,538 (Apache, Story)
 3,942 (Hyperledger, Story)
 4,536 (IntelDAOS, Story)
     0 (JFrog, Story)
 2,788 (JiraEcosystem, Story)
   245 (MongoDB, Story)
 2,375 (Qt, User Story)
13,452 (RedHat, Story)
   337 (RedHat, Requirement)
   220 (Sonatype, Story)
 2,286 (Spring, Story)
   301 (SecondLife, Story)





In [10]:
# Select the desired Jira repo and issue type
chosen_jira = 'RedHat'
chosen_issuetype = 'Story'

# Get the issues for the chosen Jira
issues = list(db[chosen_jira].aggregate([
    # Match only certain issues
    { '$match': { '$and': [
        # Limit the data to resolved issues, otherwise we have "in progress" issues which are not yet complete
        { 'fields.resolution': { '$ne': None } },
        # Match the Jira-specific name for their Story issue type
        { 'fields.issuetype.name': chosen_issuetype },
        # Only get documents where the description is not empty
        { 'fields.description': { '$ne': None } },
    ]}},
    # Retrieve and rename fields 
    { '$project': {
        '_id': 0, 'id': 1,
        'description': '$fields.description',
    }}]))

In [11]:
# Write these issues to CSV so they can be easily loaded later without the full database.
with open(f"{data_folder}/use_case_1_data.tsv", 'w', newline='') as tsv_file:
    writer = csv.writer(tsv_file, delimiter='\t', quotechar='"', lineterminator='\n')
    writer.writerow(issues[0].keys())
    writer.writerows([issue.values() for issue in issues])

##### 2️⃣ Load TSV

In [12]:
# Increase the limit of the csv data we can read in
csv.field_size_limit(sys.maxsize)

with open(f"{data_folder}/use_case_1_data.tsv") as tsv_file:
    reader = csv.reader(tsv_file, delimiter='\t', quotechar='"')
    headers = next(reader)
    issues = [dict(zip(headers, row)) for row in reader]

In [13]:
print(f"There are {len(issues):,} issues.")

There are 11,911 issues.


### Cleaning the Data

In [14]:
# Remove all issues that don't have at least "As a" in the description, which is a minimum for a user story
issues = [issue for issue in issues if 'As a' in issue['description']]
print(f"Remaining Issues: {len(issues):,}")

Remaining Issues: 1,601


In [15]:
# Get random samples of issues for display
number_of_sample_issues = 1
for issue in random.sample(issues, number_of_sample_issues):
    print(f"\n{'-'*50} RANDOM ISSUE {'-'*50}\n")
    print(f"Issue ID: {issue['id']}\n")
    print(issue['description'])


-------------------------------------------------- RANDOM ISSUE --------------------------------------------------

Issue ID: 13269613

h1. Story

As an administrator of cluster logging, I want the operator to raise a kubernete event if the top ClusterLogForwarder Status has conditions Invalid=true or Degraded=true.
h1. Acceptance Criteria
 * Operator raises event when the CLF reconciles as invalid or degraded.
 * Event contains human readable summary and status information (review format of existing k8s events)
 * Once an Event is raised, no further events are raised unless/until status changes (e.g. degraded for a new reason)
 * The CLO must-gather is updated to capture events
 * Document created events


### NLP Techniques for Detecting Ambiguities

In [16]:
def check_lexical(issue_id, text, ambiguity_type_obj, ambs_found):

    def whole_phrase_regexp(phrase):
        # Handle spaces in strings
        phrase = phrase.replace(' ', r'\s')
        try:
            return re.compile(r'\b{0}\b'.format(phrase), flags=re.I|re.X)
        except re.error:
            return re.compile(r'\b\{0}\b'.format(phrase), flags=re.I|re.X)

    # Go over all phrases in lexicon
    for word_phrase in ambiguity_type_obj['lexicon']:
        # Search for all word phrases in sentence
        for match in re.finditer(whole_phrase_regexp(word_phrase), text):
            ambs_found[issue_id].append({
                'text': match[0],
                'amb_type': ambiguity_type_obj['title'],
                'index_start': match.start(),
                'index_end': match.end()
            })

In [17]:
def check_regexes(issue_id, text, ambiguity_type_obj, ambs_found):
    # Create Python regular expression object
    regexp = re.compile(ambiguity_type_obj['regexp'], flags=re.I|re.X)
    # Search for all regexps in requirement
    for match in re.finditer(regexp, text):
        ambs_found[issue_id].append({
            'text': match[0],
            'amb_type': ambiguity_type_obj['title'],
            'index_start': match.start(),
            'index_end': match.end()
        })

In [18]:
def check_pos_regexes(issue_id, text, ambiguity_type_obj, ambs_found):
    # Get the original indexes, before the truple design messed with it
    def get_original_indexes(_req_original_string, _req_tokenized_string, _req_truple_string, _match):
        # Add up extra letters (indexes) due to truple design
        def count_extra_indexes(up_to_index):
            # Count the extra letters in a given truple
            def count_extra_letters(req_truple):
                try:
                    split = req_truple.split('°')
                    return len(split[1]) + len(split[2]) + 2
                except:
                    return 0

            # Calculate space added by tokenization process
            def count_tokenize_space(_req_original_string, _req_tokenized_string):
                orig_i = 0
                tokn_i = 0
                while tokn_i < len(_req_tokenized_string):
                    if _req_original_string[orig_i] != _req_tokenized_string[tokn_i]:
                        tokn_i += 1
                        continue
                    orig_i += 1
                    tokn_i += 1
                return tokn_i - orig_i

            # Remove string after the index
            words_pre_index = _req_truple_string[:up_to_index].split()
            # Calculate extra indexes added by the truple system
            extra_truple_indexes = sum([count_extra_letters(req_truple) for req_truple in words_pre_index])
            # Update the 'up_to_index' to reflect the newly discovered mistakes
            up_to_index = up_to_index - extra_truple_indexes
            # Calculate the extra indexes added by the tokenizing process
            extra_tokenize_space = count_tokenize_space(_req_original_string[:up_to_index], _req_tokenized_string[:up_to_index])

            return extra_truple_indexes + extra_tokenize_space
        return (
            _match.start() - count_extra_indexes(_match.start()),
            _match.end() - count_extra_indexes(_match.end()))

    doc = nlp_spacy_en(text)

    # Create list of truples strings (word, POS tag, lemma) with degree symbol in between each part
    truple_list = ['{0}°{1}°{2}'.format(token.text, token.tag_, token.lemma_) for token in doc]

    # Create variables for easier and more readable use later
    req_original_string = text
    req_tokenized_string = ' '.join([token.text for token in doc])
    req_truple_string = ' '.join(truple_list)  # Convert into string so regex can be performed

    # Create Python regular expression object
    regexp = re.compile(ambiguity_type_obj['regexp'], flags=re.I|re.X)
    # Search for all regexps in requirement
    for match in re.finditer(regexp, req_truple_string):
        # Get the original indexes, since the truple string design messes with them
        orig_indexes = get_original_indexes(
            req_original_string, req_tokenized_string, req_truple_string, match)

        orig_text = ' '.join([req_truple.split('°')[0] for req_truple in match[0].split()])
        # Save this found ambiguity
        ambs_found[issue_id].append({
            'text': match[0],
            'amb_type': ambiguity_type_obj['title'],
            'index_start': orig_indexes[0],
            'index_end': orig_indexes[1],
        })

In [19]:
def check_compound_nouns(issue_id, text, ambiguity_type_obj, ambs_found):

    # Initialise a spacy doc object for advanced NLP applications
    doc = nlp_spacy_en(text)

    # Search each noun chunk for consecutive nouns 
    for chunk in doc.noun_chunks:
        compound_list = [token for token in chunk if (token.dep_ == 'compound')
                            or (token.tag_ in ('NN', 'NNS', 'NNP', 'NNPS') and token.dep_ in ('nmod', 'amod'))
                            or (token.tag_ == 'VBG' and token.dep_ == 'nmod')
                            or token == chunk.root]
        if len(compound_list) > 2:
            new_indexes = [compound_list[0].idx, compound_list[-1].idx + len(compound_list[-1].text)]
            original_text = text[new_indexes[0]:new_indexes[1]]
            # Save this found ambiguity
            ambs_found[issue_id].append({
                'text': original_text,
                'amb_type': ambiguity_type_obj['title'],
                'index_start': new_indexes[0],
                'index_end': new_indexes[1],
            })

In [20]:
def check_nominalisations(issue_id, text, ambiguity_type_obj, ambs_found):

    # Initialise a spacy doc object for advanced NLP applications
    doc = nlp_spacy_en(text)

    # Generate a list of gerund nouminalizations that have pos VB
    nominalizations = [[t for t in token.subtree] for token in doc
                    if(token.text[-3:] in ambiguity_type_obj['gerund']
                        or token.text[-4:] in ambiguity_type_obj['gerund_plural'])
                        and token.tag_ == 'VBG'
                        and token.dep_ not in ('root', 'aux', 'advmod', 'compound', 'acl')
                        and doc[token.i - 1].dep_ != 'aux'
                        and token.text.lower() not in ambiguity_type_obj['rule_exceptions']]

    # Generate a list of nominalizations with pos NN based on suffixes
    nouns = [token for token in doc if (token.lemma_[-4:] in ambiguity_type_obj['suffixes_len4']
                                        or token.lemma_[-3:] in ambiguity_type_obj['suffixes_len3']
                                        or token.lemma_[-2:] in ambiguity_type_obj['suffixes_len2'])
            and token.tag_ in ('NN', 'NNS')
            and wn.synsets(token.text)]
    # Filter list of nouns based on semantic hierarchy
    for token in nouns:
        # Generate and flatten the list of hypernyms for each noun
        hypernyms = list(
            map(lambda x: x.name().split('.')[0],
                sum(wn.synsets(token.text)[0].hypernym_paths(), [])))
        # Only consider nouns that express an event or a process
        if [l for l in hypernyms if l in ['event', 'process', 'act']] \
                and token.text.lower() not in ambiguity_type_obj['rule_exceptions']:
            nominalizations.append([t for t in token.subtree])

    # Return all ambiguous nominalization sequences found
    for token_seq in nominalizations:
        if token_seq:
            new_text = ' '.join([t.text for t in token_seq])
            new_indexes = [token_seq[0].idx, token_seq[-1].idx + len(token_seq[-1].text)]
            # Save this found ambiguity
            ambs_found[issue_id].append({
                'text': new_text,
                'amb_type': ambiguity_type_obj['title'],
                'index_start': new_indexes[0],
                'index_end': new_indexes[1],
            })

In [21]:
nlp_method_map = {
    'check_lexical': check_lexical,
    'check_regexes': check_regexes,
    'check_pos_regexes': check_pos_regexes,
    'check_compound_nouns': check_compound_nouns,
    'check_nominalisations': check_nominalisations,
}

In [22]:
def check_for_ambiguities(issues, lexicons, sample_data_n=None):

    # For time reasons, it is helpful to check just a sample of the issues
    issues_to_check = issues
    if sample_data_n and sample_data_n < len(issues):
        issues_to_check = random.sample(issues_to_check, sample_data_n)

    # Store all ambiguities
    issues_ambs_found = defaultdict(list)

    # Go through each issue description, and check for ambiguities
    for issue in tqdm(issues_to_check, total=len(issues_to_check), ncols=100, ascii=True):
        # Iterate through all lexicons, where each one represents a different form of NLP algorithms
        for nlp_check, lexicon in lexicons.items():
            # Get the NLP method as needed by the lexicon
            lexicon_nlp_method = nlp_method_map[nlp_check]
            # For each lexicon, iterate through the available ambiguity checks that can be performed
            for ambiguity_type_obj in lexicon.values():
                # Apply the NLP method, which will add any found ambiguities to the "ambs_found" list
                lexicon_nlp_method(issue['id'], issue['description'], ambiguity_type_obj, issues_ambs_found)
        
    # Return all ambiguits found
    return issues_ambs_found

In [23]:
def display_issue_ambiguities(issues_ambs_found, sample_data_n=None):

    # If there are no ambiguities, this provides a helpful message instead of a blank output
    if not issues_ambs_found:
        print('There are no ambiguities to display')
        return

    # For time reasons, it may be better to just display a sample of issues
    if sample_data_n and sample_data_n < len(issues_ambs_found):
        # Get a random sample of keys
        issues_ambs_found_keys = random.sample(list(issues_ambs_found.keys()), sample_data_n)
        # Use those keys to build a sample of the original dict
        issues_ambs_found = {key: issues_ambs_found[key] for key in issues_ambs_found_keys}

    # Display each group of ambiguities, one issue at a time
    for amb_issue_id, amb_found in issues_ambs_found.items():
        # Print the issue description
        amb_issue = [issue for issue in issues if issue['id'] == amb_issue_id][0]
        print(f"\n{'-'*50} Issue ID: {amb_issue['id']} {'-'*50}\n")
        print(f"{amb_issue['description']}\n")
        # Print the ambs found
        print('Ambiguities Found:')
        pprint(amb_found)

### Ambiguit Detection: Subjective Language

In [24]:
# Some example Subjective Language detection lexicons
subject_language_lexicons = {
    'check_lexical': {
        'dangerous_plural' : {
            'title'             : 'Dangerous Plural',
            'description'       : 'Potentially dangerous plural.',
            'lexicon'           : ['all', 'each', 'every', 'any', 'few', 'little', 'many', 'much', 'several', 'some', 'a lot'],
            'language_construct': 'Subjective Language',
            'lit_reference'     : 'Tjong SF, Berry DM. The design of SREE—a prototype potential ambiguity finder for requirements specifications and lessons learned. InInternational Working Conference on Requirements Engineering: Foundation for Software Quality 2013 Apr 8 (pp. 80-95). Berlin, Heidelberg: Springer Berlin Heidelberg.',
        },
        'inside_behaviour' : {
            'title'             : 'Inside Behaviour',
            'description'       : 'These expressions do not specify the "outside boundaries" behaviour.',
            'lexicon'           : ['until', 'during', 'through', 'after', 'at'],
            'language_construct': 'Subjective Language',
            'lit_reference'     : 'Tjong SF, Berry DM. The design of SREE—a prototype potential ambiguity finder for requirements specifications and lessons learned. InInternational Working Conference on Requirements Engineering: Foundation for Software Quality 2013 Apr 8 (pp. 80-95). Berlin, Heidelberg: Springer Berlin Heidelberg.',
        },
    },
    'check_regexes': {
        'unclear_inclusion' : {
            'title'             : 'Unclear Inclusion',
            'description'       : 'Up to with unclear inclusion.',
            'regexp'            : 'up\\sto\\s(?!.*including|excluding)',
            'language_construct': 'Subjective Language',
            'lit_reference'     : 'Gleich B, Creighton O, Kof L. Ambiguity detection: Towards a tool explaining ambiguity sources. InRequirements Engineering: Foundation for Software Quality: 16th International Working Conference, REFSQ 2010, Essen, Germany, June 30–July 2, 2010. Proceedings 16 2010 (pp. 218-232). Springer Berlin Heidelberg.',
        },
        'dangerous_plural'  : {
            'title'             : 'Dangerous Reference Plural',
            'description'       : 'Dangerous plural with ambiguous reference.',
            'regexp'            : '(?:\\ball\\b|\\beach\\b|\\bevery\\b) .* (?:\\bhis\\b|\\bher\\b|\\bits\\b|\\btheir\\b|\\bthey\\b)',
            'language_construct': 'Subjective Language',
            'lit_reference'     : 'Gleich B, Creighton O, Kof L. Ambiguity detection: Towards a tool explaining ambiguity sources. InRequirements Engineering: Foundation for Software Quality: 16th International Working Conference, REFSQ 2010, Essen, Germany, June 30–July 2, 2010. Proceedings 16 2010 (pp. 218-232). Springer Berlin Heidelberg.',
        },
    },
    'check_pos_regexes': {
        'passive_ambiguity' : {
            'title'             : 'Passive Voice Ambiguity',
            'description'       : 'Authors should state requirements in active form, as passive conceals who is responsible for the action.',
            'regexp'            : '\\b\\w+?°V[^°]*°be (\\W[^°]+?°(?!VB.)[^°]*°[^ ]+?)* \\W\\w+?°VBN°\\w+',
            'language_construct': 'Subjective Language',
            'lit_reference'     : 'Gleich B, Creighton O, Kof L. Ambiguity detection: Towards a tool explaining ambiguity sources. InRequirements Engineering: Foundation for Software Quality: 16th International Working Conference, REFSQ 2010, Essen, Germany, June 30–July 2, 2010. Proceedings 16 2010 (pp. 218-232). Springer Berlin Heidelberg.',
        },
    }
}

In [25]:
#  Check for Subjective Language ambiguities using the lexicon defined above
issues_ambs_found_subjective_language = check_for_ambiguities(issues, subject_language_lexicons, sample_data_n=100)

100%|#############################################################| 100/100 [00:03<00:00, 29.34it/s]


In [26]:
# Display the ambiguites found, one issue at a time
display_issue_ambiguities(issues_ambs_found_subjective_language, sample_data_n=5)


-------------------------------------------------- Issue ID: 13313275 --------------------------------------------------

As a user of OpsnShift I want to be able to tell if my underlying network is providing a solid foundation for my SDN.

I also want to be sure that my SDN is performing correctly.

We need to make sure there is an easy way to see the node-to-node performance numbers across the cluster.  We also need to see if we can alert on bad underlay performance (without too many false positives) and we need to see if we can push any meaningful metrics up to insights (can we do a median and a standard distribution across nodes?)

Ambiguities Found:
[{'amb_type': 'Dangerous Plural',
  'index_end': 423,
  'index_start': 420,
  'text': 'any'},
 {'amb_type': 'Dangerous Plural',
  'index_end': 368,
  'index_start': 364,
  'text': 'many'},
 {'amb_type': 'Unclear Inclusion',
  'index_end': 449,
  'index_start': 443,
  'text': 'up to '}]

------------------------------------------------

### Ambiguit Detection: Coordination

In [27]:
# Some example Coordination detection lexicons
coordination_lexicons = {
    'check_lexical': {
        # None 
    },
    'check_regexes': {
        # None 
    },
    'check_pos_regexes': {
        'coordination_adj' : {
            'title'         : 'Coordination of two nouns modified by an adjective',
            'description'   : 'Following an adjective by two nouns joint by "and" or "or" makes it unclear if the adjective describes the first noun only or both nouns.',
            'regexp'        : '(?<=\\s)\\S*°JJ[^°]*°[\\S]+\\s[\\S]+°NN[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s[\\S]+°NN[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_vb' : {
            'title'         : 'Coordination of two nouns preceded by a verb',
            'description'   : 'Following a verb by two nouns joint by "and" or "or" makes it unclear if the verb describes the first noun only or both nouns.',
            'regexp'        : '(?<=\\s)\\S*°VBN*°[\\S]+\\s[\\S]+°NN[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s[\\S]+°NN[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_nn' : {
            'title'         : 'Coordination of two nouns preceded by another noun',
            'description'   : 'Following a noun by two nouns joint by "and" or "or" makes it unclear if the noun describes the first noun only or both nouns.',
            'regexp'        : 'w+?°NN[^°]*°[\\S]+\\s([\\S]+°IN°[\\S]+\\s)*[\\S]+°NN[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s[\\S]+°NN[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_post_nn' : {
            'title'         : 'Coordination of two nouns followed by another noun',
            'description'   : 'Following two nouns joint by "and" or "or" by a third noun makes the word association unclear.',
            'regexp'        : '([\\S]+°DT°[\\S]+\\s)*([\\S]+°JJ[^°]*°[\\S]+\\s)*[\\S]+°NN[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s([\\S]+°DT°[\\S]+\\s)*([\\S]+°JJ[^°]*°[\\S]+\\s)*[\\S]+°NN[^°]*°[\\S]+\\s([\\S]+°IN°[\\S]+\\s)*([\\S]+°DT°[\\S]+\\s)*([\\S]+°JJ[^°]*°[\\S]+\\s)*[\\S]+°NN[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_adv' : {
            'title'         : 'Coordination of two verbs modified by an adverb',
            'description'   : 'Following an adverb by two verbs joint by "and" or "or" makes it unclear if the adverb describes the first verb only or both verbs.',
            'regexp'        : '(?<=\\s)\\S*°RB[^°]*°[\\S]+\\s[\\S]+°VB[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s[\\S]+°VB[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_nn_vb' : {
            'title'         : 'Coordination of two verbs preceded by a noun',
            'description'   : 'Following a noun with two verbs joint by "and" or "or" makes it unclear if the noun describes the first verb only or both verbs.',
            'regexp'        : '[\\S]+°NN[^°]*°[\\S]+\\s([\\S]+°IN°[\\S]+\\s)*([\\S]+°JJ[^°]*°[\\S]+\\s)*[\\S]+°VB[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s([\\S]+°JJ[^°]*°[\\S]+\\s)*[\\S]+°VB[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_vb_nn' : {
            'title'         : 'Coordination of two verbs followed by a noun',
            'description'   : 'Following two verbs joint by "and" or "or" by a noun makes it unclear if the noun describes the second verb only or both verbs.',
            'regexp'        : '(?<=\\s)\\S*°VB[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s[\\S]+°VB[^°]*°[\\S]+\\s(\\S]+°IN°[\\S]+\\s)*([\\S]+°DT°[\\S]+\\s)*([\\S]+°(JJ[^°]*|VBN)°[\\S]+\\s)*[\\S]+°NN[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        },
        'coordination_vb_adv' : {
            'title'         : 'Coordination of two verbs followed by an adverb',
            'description'   : 'Following two verbs joint by "and" or "or" by an adverb makes it unclear if the adverb describes the second verb only or both verbs.',
            'regexp'        : '(?<=\\s)\\S*°VB[^°]*°[\\S]+\\s(and|or)°CC°(and|or)\\s[\\S]+°VB[^°]*°[\\S]+\\s[\\S]+°RB[^°]*°[\\S]+',
            'lit_reference'  : 'Yang H, Willis A, De Roeck A, Nuseibeh B. Automatic detection of nocuous coordination ambiguities in natural language requirements. InProceedings of the 25th IEEE/ACM International Conference on Automated Software Engineering 2010 Sep 20 (pp. 53-62).',
            'language_construct': 'Coordination'
        } 
    }
}

In [28]:
#  Check for Coordination ambiguities using the lexicon defined above
issues_ambs_found_coordination = check_for_ambiguities(issues, coordination_lexicons, sample_data_n=100)

100%|#############################################################| 100/100 [00:23<00:00,  4.20it/s]


In [29]:
# Display the ambiguites found, one issue at a time
display_issue_ambiguities(issues_ambs_found_coordination, sample_data_n=5)


-------------------------------------------------- Issue ID: 13335622 --------------------------------------------------

As a release engineer
I want to be able to attach RHCOS builds to the OCP errata
So that I can easily include RHCOS artifacts as part of the OCP release process.

*Acceptance Criteria*

  - pipeline step is added that does the tagging of the RHCOS build uploaded to Brew
  - jobspec.yaml has config knobs to control tagging and tag used
  - tagging can be performed successfully with existing keytab
  - tagging can be performed successfully for all arches

*Notes*
Tag values can be found at https://github.com/openshift/ocp-build-data/blob/openshift-4.7/erratatool.yml.
Substitute the release version in the URL for 4.6, 4.7, 4.8

Ambiguities Found:
[{'amb_type': 'Coordination of two nouns preceded by a verb',
  'index_end': 340,
  'index_start': 319,
  'text': 'control°VB°control tagging°NN°tagging and°CC°and tag°NN°tag'}]

----------------------------------------------

### Ambiguit Detection: Compound Nouns

In [30]:
# Some example Compound Nouns language detection lexicons
compound_nouns_lexicons = {
    'check_compound_nouns': {
        'ambiguous_compounds'  : {
            'title'             : 'Ambiguous Compound Nouns',
            'description'       : 'A sequence of more than two consecutive nouns may have more than one interpretation depending on the possible associations between the words.',
            'lit_reference'     : 'NOVEL',
            'language_construct': 'Compound Noun'
        }
    }
}

In [31]:
#  Check for Compound Nouns ambiguities using the lexicon defined above
issues_ambs_found_compound_nouns = check_for_ambiguities(issues, compound_nouns_lexicons, sample_data_n=100)

100%|#############################################################| 100/100 [00:03<00:00, 31.62it/s]


In [32]:
# Display the ambiguites found, one issue at a time
display_issue_ambiguities(issues_ambs_found_compound_nouns, sample_data_n=100)


-------------------------------------------------- Issue ID: 14244911 --------------------------------------------------

**USER STORY:**

As a customer, I want receive alerts when credentials are inadequate so that I can scale storage and capacity as needed.

**DESCRIPTION:**

Hemant reached out to me to see if we could integrate [https://github.com/rvanderp3/vsphere-priv-check] in to the vSphere problem detector.  

**Required:**

- Pull request 

**Nice to have:**

**ACCEPTANCE CRITERIA:**

- Must check that credentials required for machinesets to scale up/down and storage to provision

**ENGINEERING DETAILS:**

 

Ambiguities Found:
[{'amb_type': 'Ambiguous Compound Nouns',
  'index_end': 295,
  'index_start': 271,
  'text': 'vSphere problem detector'}]

-------------------------------------------------- Issue ID: 13380031 --------------------------------------------------

h3. Description

As a user,
h3. Acceptance Criteria
 # <criteria>

h3. Additional Details:

Ambiguities Foun

### Ambiguit Detection: Nominalisations

In [33]:
# Some example Nominalisations detection lexicons
nominalisations_lexicons = {
    'check_nominalisations': {
        'ambiguous_nominalization'  : {
            'title'         : 'Ambiguous Nominalization',
            'description'   : 'A nominalization means the use of the noun form of a verb which may lead to loss of information about the nominalized action(subject, time, location).',
            'suffixes_len2' : ['ty'] ,
            'suffixes_len3' : ['ism', 'ion', 'ing'],
            'suffixes_len4' : ['ment', 'ness', 'ance', 'ence'],
            'hypernyms'     : ['event', 'process', 'act'],
            'gerund'        : ['ing'],
            'gerund_plural' : ['ings'],
            'rule_exceptions' : ['activity',
                                'application',
                                'navigation',
                                'notification',
                                'feedback',
                                'question',
                                'reading',
                                'clicking',
                                'registration',
                                'shopping',
                                'rating',
                                'monitoring',
                                'following',
                                'selection',
                                'shopping',
                                'transaction',
                                'transmission',],
            'lit_reference'  : 'NOVEL',
            'language_construct': 'Nominalization'
        }
    },
}

In [34]:
#  Check for Nominalisations ambiguities using the lexicon defined above
issues_ambs_found_nominalisations = check_for_ambiguities(issues, nominalisations_lexicons, sample_data_n=100)

100%|#############################################################| 100/100 [00:04<00:00, 24.96it/s]


In [35]:
# Display the ambiguites found, one issue at a time
display_issue_ambiguities(issues_ambs_found_nominalisations, sample_data_n=5)


-------------------------------------------------- Issue ID: 13307813 --------------------------------------------------

As a mesh administrator, I want to be able to join two meshes into a federation that do not share a root certificate, so that administrative domains can be completely separate

There has been some work upstream to support SPIFFE TrustBundles - they offer exactly the functionality required here, by mapping trust domains to certificate chains. We should look at cherry-picking that work, if possible.

Acceptance Criteria:
 * Every mesh can define its own trust domain and cert chain
 * Proxies validate remote certificates depending on the trust domain 

This story covers exchange of certificate chains at Federation initialization- for continuous updates of cert chains, see MAISTRA-2238

Ambiguities Found:
[{'amb_type': 'Ambiguous Nominalization',
  'index_end': 376,
  'index_start': 362,
  'text': 'cherry - picking'}]

--------------------------------------------------