# Evaluation of the CiRA Tool

This notebook contains the evaluation of CiRA's capability to automatically generate test case descriptions from natural language requirements.

In [None]:
import os
import json

import requests
import pandas as pd

from difflib import SequenceMatcher

### Data Loading

First, load both the data set of sentences (which also contains the ground truth regarding causality) and the ground truth of test case descriptions.

In [None]:
# Restrict the frame to the three columns the evaluation works with:
# the sentence identifier, the sentence text, and the gold-standard causality label.
sentences = pd.read_csv(
    './../data/cwa-acceptance-criteria.csv',
    usecols=['ID', 'Acceptance Criterion', 'Causal'],
)

In [None]:
data_location = './../data/ground-truth/'

# Maps the numeric sentence id (taken from the file name) to the parsed JSON content of that file.
ground_truth = {}
for filename in os.listdir(data_location):
    # Only files whose name starts with 's' are treated as ground-truth files.
    if not filename.startswith('s'):
        continue

    # The id is the part of the file name between the leading 's' and the first '.', e.g. 's12.json' -> 12.
    sentence_id = int(filename.split('.')[0][1:])

    # Decode explicitly as UTF-8 so that loading does not depend on the platform's default encoding.
    with open(os.path.join(data_location, filename), encoding='utf-8') as file:
        data = json.load(file)
        ground_truth[sentence_id] = data

Check that CiRA is running locally. If the `api/health` request receives a response resembling `{'status': 'up', 'cira-version': '0.9.4'}` then the tool is available and the evaluation can be performed.

In [None]:
# A timeout keeps this availability check from blocking indefinitely
# when the port accepts the connection but the service never answers.
response = requests.get('http://localhost:8000/api/health', timeout=10)
response.json()

### Classification

Evaluate CiRA's ability to correctly classify sentences as either causal or non-causal.

In [None]:
# CiRA's verdict (value of the 'causal' field of the response) for every sentence, in row order of `sentences`.
cira_classification = []
for sentence in sentences['Acceptance Criterion']:
    classification = requests.put('http://localhost:8000/api/classify', 
                                  data=json.dumps({'sentence': sentence}), 
                                  headers={'content-type':'application/json'})
    # Surface a failed request as an HTTP error right here instead of as a missing 'causal' key below.
    classification.raise_for_status()
    cira_classification.append(classification.json()['causal'])

In [None]:
# Gold-standard causality labels, one per row of `sentences`.
gold_standard = list(sentences['Causal'].values)

# Number of positions at which CiRA's verdict equals the gold-standard label.
agreement = sum(
    1
    for position in range(len(gold_standard))
    if gold_standard[position] == cira_classification[position]
)

accuracy = float(agreement) / len(gold_standard)
print(f'CiRA classified {accuracy:.2%} of all sentences correctly.')

In [None]:
# Store CiRA's predictions as a new column next to the gold-standard 'Causal' labels.
sentences['CiRA Causal'] = cira_classification

### Test Case Generation

Finally, evaluate the generation of test case descriptions. To this end, functions that compare a generated test suite against its ground truth are defined first. Afterwards, the notebook iterates through all causal sentences, generates a test suite for each of them, and calculates the similarity between the generated test suite and the ground truth.

In [None]:
def get_variable_by_id(testsuite, id: str) -> dict:
    """Look up a single variable of a test suite by its identifier.
    
    parameters:
      testsuite -- dictionary holding a list of condition variables under 'conditions' and a list of expected variables under 'expected'
      id -- identifier to search for; it is compared against the 'id' entry of every variable
      
    returns:
      the first variable whose 'id' equals the given id (condition variables are searched before expected variables)
      None -- if no variable in either list carries the given id"""
    
    pool = testsuite['conditions'] + testsuite['expected']

    # Every variable is inspected before the first hit is picked.
    matches = list(filter(lambda entry: entry['id'] == id, pool))
    return matches[0] if matches else None

def calculate_variable_similarity(ground_truth: dict, generated: dict) -> float:
    """Calculate the average similarity of the variables (conditions and expected) between the ground truth and a generated test suite.

    For every ground-truth variable, the generated variable with the same id is looked up. Two scores are
    recorded per ground-truth variable: the SequenceMatcher ratio of the 'variable' texts and the
    SequenceMatcher ratio of the 'condition' texts. A ground-truth variable without a generated counterpart
    contributes 0.0 for both. Generated variables whose id does not occur in the ground truth are ignored.
    
    parameters:
      ground_truth -- a test suite dictionary generated manually
      generated -- a test suite dictionary generated automatically by CiRA
      
    returns: similarity score between 0 (no match) and 1 (perfect match). If the ground truth contains no
      variables at all, the score is 1.0 when the generated suite contains none either and 0.0 otherwise."""

    reference_variables = ground_truth['conditions'] + ground_truth['expected']

    if not reference_variables:
        # There is nothing to average over; two empty variable sets count as a perfect match.
        generated_variables = generated['conditions'] + generated['expected']
        return 0.0 if generated_variables else 1.0

    similarity_scores: list[float] = []

    for variable in reference_variables:
        variable_generated = get_variable_by_id(generated, variable['id'])

        # Default for a ground-truth variable that has no generated counterpart.
        variable_similarity = 0.0
        condition_similarity = 0.0
        
        if variable_generated is not None:
            variable_similarity = SequenceMatcher(None, variable['variable'], variable_generated['variable']).ratio()
            condition_similarity = SequenceMatcher(None, variable['condition'], variable_generated['condition']).ratio()

        similarity_scores.append(variable_similarity)
        similarity_scores.append(condition_similarity)
        
    avg_similarity = sum(similarity_scores)/len(similarity_scores)
    return avg_similarity

In [None]:
def calculate_combinatorics_similarity(ground_truth: dict, generated: dict) -> float: 
    """Calculate the average combinatorics similarity between the ground truth and a generated test suite.

    The test cases of both suites are compared position by position: a ground-truth case scores 1.0 if the
    generated case at the same position is equal to it and 0.0 otherwise. A ground-truth case for which the
    generated suite has no case at that position scores 0.0. Generated cases beyond the length of the
    ground truth are ignored.

    parameters:
      ground_truth -- a test suite dictionary generated manually
      generated -- a test suite dictionary generated automatically by CiRA
      
    returns: similarity score between 0 (no match) and 1 (perfect match). If the ground truth contains no
      cases at all, the score is 1.0 when the generated suite contains none either and 0.0 otherwise."""
    
    tcs_manual = ground_truth['cases']
    tcs_generated = generated['cases']

    if not tcs_manual:
        # There is nothing to average over; two empty case lists count as a perfect match.
        return 0.0 if tcs_generated else 1.0

    # zip stops at the shorter list, so ground-truth cases without a generated
    # counterpart are never counted as matches instead of raising an IndexError.
    matching_cases = sum(1 for manual, automatic in zip(tcs_manual, tcs_generated) if manual == automatic)

    return matching_cases/len(tcs_manual)

In [None]:
def _put_json(endpoint: str, payload: dict) -> dict:
    """Send `payload` as a JSON body via PUT to the local CiRA API and return the decoded JSON response.

    parameters:
      endpoint -- last path segment of the API route, e.g. 'label'
      payload -- JSON-serializable dictionary that is sent as the request body

    returns: the response body parsed as JSON

    raises: requests.HTTPError if CiRA answers with an error status"""
    response = requests.put(f'http://localhost:8000/api/{endpoint}',
                            data=json.dumps(payload),
                            headers={'content-type':'application/json'})
    # Fail at the request that went wrong instead of at a later key lookup on the error body.
    response.raise_for_status()
    return response.json()


# NOTE(review): the row with DataFrame index 42 is excluded from the evaluation, but the reason
# is not documented anywhere in this notebook -- confirm and record why.
EXCLUDED_ROW_INDEX = 42

similarity_scores_variable: list[float] = []
similarity_scores_combinatorics: list[float] = []

for index, row in sentences.iterrows():
    # Test suites are only generated for sentences that the gold standard marks as causal.
    # (An earlier variant additionally required row['CiRA Causal'] to be true.)
    if not row['Causal'] or index == EXCLUDED_ROW_INDEX:
        continue

    sentence = row['Acceptance Criterion']
    sentence_id = row['ID']

    # The three API steps build on each other: labels -> graph -> test suite.
    labels = _put_json('label', {'sentence': sentence})
    ceg = _put_json('graph', {'sentence': sentence, 'labels': labels['labels']})
    tests = _put_json('testsuite', {'sentence': sentence, 'graph': ceg['graph']})

    variable_similarity = calculate_variable_similarity(ground_truth=ground_truth[sentence_id]['testsuite'], generated=tests['suite'])
    similarity_scores_variable.append(variable_similarity)

    combinatorics_similarity = calculate_combinatorics_similarity(ground_truth=ground_truth[sentence_id]['testsuite'], generated=tests['suite'])
    similarity_scores_combinatorics.append(combinatorics_similarity)

if similarity_scores_variable:
    print(f'Average variable similarity over {len(similarity_scores_variable)} sentences: {sum(similarity_scores_variable)/len(similarity_scores_variable)}')
    print(f'Average combinatorics similarity over {len(similarity_scores_combinatorics)} sentences: {sum(similarity_scores_combinatorics)/len(similarity_scores_combinatorics)}')
else:
    # Both lists are filled together, so one check covers both averages.
    print('No sentence was evaluated, so no average similarity can be reported.')