In [None]:
"""
Analysis of ambiguity, inconsistency, and incompleteness
"""

import re
import sys
from pathlib import Path

import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer, util

# Locate the project root so that project-level modules can be imported.
try:
    # When executed as a file, the root is two levels above this file.
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # `__file__` is not defined here (e.g. interactive session): use the
    # parent of the current working directory instead.
    ROOT = Path.cwd().parent

# Register the root on the import path once; skip it if already present.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from config import DATA_PROCESSED


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read both processed splits and stack them into a single requirements frame
train_df, test_df = (
    pd.read_csv(DATA_PROCESSED / name) for name in ("train.csv", "test.csv")
)
# ignore_index=True gives the combined frame a fresh 0..n-1 index
reqs = pd.concat([train_df, test_df], ignore_index=True)

# Embed every requirement text; row i of `embeddings` belongs to reqs row i
model = SentenceTransformer("all-MiniLM-L6-v2")
requirement_texts = reqs["requirement"].tolist()
embeddings = model.encode(
    requirement_texts,
    convert_to_tensor=True,
    show_progress_bar=True,
)


Batches: 100%|██████████| 187/187 [00:29<00:00,  6.26it/s]


In [None]:
# Check ambiguity
# Vague words/phrases that make a requirement hard to verify objectively.
AMBIGUOUS_TERMS = [
    "adequate", "fast", "user-friendly", "as soon as possible",
    "efficient", "minimal", "reliable", "sufficient", "normal", "various"
]

def check_ambiguity(req: str):
    """Return the ambiguous terms found in a requirement.

    Each entry of ``AMBIGUOUS_TERMS`` is searched as a whole word/phrase,
    ignoring case.

    Parameters
    ----------
    req : str
        Requirement text. Non-string values (e.g. the NaN pandas uses for an
        empty CSV cell) are treated as containing no ambiguous terms.

    Returns
    -------
    str or None
        The matched terms joined by ", " in ``AMBIGUOUS_TERMS`` order, or
        ``None`` when nothing matched.
    """
    if not isinstance(req, str):
        # re.search would raise TypeError on a missing value.
        return None
    # re.escape keeps terms literal, so a term containing regex
    # metacharacters (".", "+", "(", ...) cannot change the pattern.
    found = [
        term
        for term in AMBIGUOUS_TERMS
        if re.search(rf"\b{re.escape(term)}\b", req, re.IGNORECASE)
    ]
    return ", ".join(found) if found else None

# Record the matched terms per requirement; None means nothing was matched
reqs["ambiguous_terms"] = reqs["requirement"].apply(check_ambiguity)
# A requirement is ambiguous as soon as at least one term was matched
reqs["is_ambiguous"] = reqs["ambiguous_terms"].notna()

# Preview the first flagged requirements together with their matched terms
ambiguous_mask = reqs["is_ambiguous"]
reqs.loc[ambiguous_mask, ["requirement", "ambiguous_terms"]].head()


Unnamed: 0,requirement,ambiguous_terms
46,"Fees for various features like job posting, pr...",various
60,The system shall allow users to create new for...,various
99,The system shall be compatible with various op...,various
115,"System shall process up to 10,000 transactions...",normal
120,System shall ensure that the V Mart website is...,user-friendly


In [None]:
# Check inconsistencies
# Heuristic: a pair of requirements is reported as a potential contradiction
# when their embeddings are very similar and one of them negates ("shall not")
# while the other contains an affirmative "shall".
SIMILARITY_THRESHOLD = 0.75  # cosine similarity a pair must exceed
ROW_BATCH = 512  # similarity-matrix rows computed per step (bounds memory)

# Word boundaries keep "shall notify" from being read as "shall not"; the
# negative lookahead keeps the "shall" inside "shall not" from counting as an
# affirmative statement. Case is ignored, as in the ambiguity check.
NEGATED_SHALL = re.compile(r"\bshall\s+not\b", re.IGNORECASE)
AFFIRMATIVE_SHALL = re.compile(r"\bshall\b(?!\s+not\b)", re.IGNORECASE)

# Positional list, in the same order as the rows of `embeddings`.
texts = reqs["requirement"].tolist()
# Classify every text once instead of once per candidate pair.
is_negated = [bool(NEGATED_SHALL.search(text)) for text in texts]
is_affirmative = [bool(AFFIRMATIVE_SHALL.search(text)) for text in texts]

contradictions = []
for start in range(0, len(texts), ROW_BATCH):
    # One matrix call per batch of rows replaces millions of scalar
    # util.cos_sim calls made from a Python double loop.
    block = util.cos_sim(embeddings[start:start + ROW_BATCH], embeddings)
    # nonzero() lists hits row by row, so pairs come out ordered by (i, j).
    for row, j in (block > SIMILARITY_THRESHOLD).nonzero().tolist():
        i = start + row
        if j <= i:
            # Skip self-matches and the mirrored lower triangle.
            continue
        if (is_negated[i] and is_affirmative[j]) or (is_negated[j] and is_affirmative[i]):
            contradictions.append((texts[i], texts[j], block[row, j].item()))

pd.DataFrame(contradictions, columns=["req1", "req2", "similarity"])


Unnamed: 0,req1,req2,similarity
0,System shall restrict users from viewing other...,User shall not be able to view other users' de...,0.813284
1,Customer's password shall not be displayed in ...,User's password shall be displayed in the web ...,0.848544
2,The system shall notify customers about the st...,User shall be notified by the system about the...,0.807620
3,The THEMAS system shall ensure the temperature...,The THEMAS system shall be able to accept temp...,0.794071
4,The THEMAS system shall ensure the temperature...,System shall set overtemperature values to pre...,0.807042
...,...,...,...
65,The system shall allow the user to view their ...,The system shall notify the user of any schedu...,0.755514
66,System’s back-end servers shall only be access...,System’s back-end servers shall not display a ...,0.862594
67,The estimator shall not apply recycled parts t...,Only collision estimators shall search for rec...,0.817669
68,System’s back-end servers shall not display a ...,System's back-end servers shall not display th...,0.828677


In [None]:
# Check incompleteness
nlp = spacy.load("en_core_web_sm")

# Dependency labels that mark a grammatical subject.
SUBJECT_DEPS = ("nsubj", "nsubjpass")
# Dependency labels of helper verbs attached to another verb ("shall" in
# "shall provide"); such tokens are not a predicate on their own.
HELPER_VERB_DEPS = ("aux", "auxpass")

def is_incomplete(req: str):
    """Return True when a requirement lacks a subject or a predicate.

    A predicate is either a token tagged ``VERB`` or a token tagged ``AUX``
    that is not merely a helper of another verb. The second case covers
    copular sentences such as "System shall be compatible with Windows",
    where the head "be" is tagged ``AUX``; counting only ``VERB`` flags such
    sentences as incomplete.

    Parameters
    ----------
    req : str
        Requirement text. Non-string or blank values are reported as
        incomplete because there is no sentence to analyse.

    Returns
    -------
    bool
        ``True`` if no subject or no predicate was found, else ``False``.
    """
    if not isinstance(req, str) or not req.strip():
        return True
    doc = nlp(req)
    has_predicate = any(
        t.pos_ == "VERB" or (t.pos_ == "AUX" and t.dep_ not in HELPER_VERB_DEPS)
        for t in doc
    )
    has_subject = any(t.dep_ in SUBJECT_DEPS for t in doc)
    return not (has_predicate and has_subject)

# Flag every requirement that is missing a subject or a verb
reqs["is_incomplete"] = reqs["requirement"].apply(is_incomplete)

# Preview the first flagged requirements
incomplete_mask = reqs["is_incomplete"]
reqs.loc[incomplete_mask, ["requirement"]].head()


Unnamed: 0,requirement
12,Second field will be date of task completion.
30,Audit trail data must be available for inspect...
46,"Fees for various features like job posting, pr..."
50,The system may be adaptable to different langu...
74,System shall be compatible with Windows and Li...


In [None]:
# Summary
# Absolute counts: corpus size and number of requirements raised by each check
summary = dict(
    total=len(reqs),
    ambiguous=reqs["is_ambiguous"].sum(),
    incomplete=reqs["is_incomplete"].sum(),
)

# One-row report; each count is also expressed as a percentage of the total
report = pd.DataFrame([summary])
report = report.assign(
    **{
        "ambiguous_%": 100 * report["ambiguous"] / report["total"],
        "incomplete_%": 100 * report["incomplete"] / report["total"],
    }
)

report


Unnamed: 0,total,ambiguous,incomplete,ambiguous_%,incomplete_%
0,5977,247,189,4.132508,3.162121


In [None]:
# Write the annotated requirements (including the flag columns) without the index
output_path = DATA_PROCESSED / "validated_requirements.csv"
reqs.to_csv(output_path, index=False)
print("Validation report saved to:", output_path)
