In [10]:
# Install the required libraries (only needed the first time the notebook is run).
!pip install spacy
!pip install rdflib
!pip install owlrl
!pip3 install pydotplus graphviz
!pip install scikit-learn
!pip install rdflib
!pip install sentence_transformers



In [11]:
import os
import csv
import json
from rdflib import Graph
from sentence_transformers import SentenceTransformer, util

def load_ontology(file_path):
    """Parse the ontology file at *file_path* and return it as an rdflib ``Graph``.

    No explicit serialization format is passed to ``Graph.parse``; the choice
    of parser is left to rdflib.
    """
    ontology_graph = Graph()
    ontology_graph.parse(file_path)
    return ontology_graph

def extract_classes_properties(g):
    """Collect the textual annotations (label, definition, comment) found in a graph.

    Args:
        g: An iterable of ``(subject, predicate, object)`` triples, e.g. an
            rdflib ``Graph``.

    Returns:
        A list of ``{"iri", "predicate", "value"}`` dicts (all values are
        strings), one per matching triple, in iteration order.

    An object is kept when it carries no language tag, or when its tag is
    English (``en`` or a regional variant such as ``en-GB``). Objects tagged
    with any other language are skipped.
    """
    annotation_predicates = {
        'http://www.w3.org/2000/01/rdf-schema#label',
        'http://www.w3.org/2004/02/skos/core#definition',
        'http://www.w3.org/2000/01/rdf-schema#comment',
        'http://purl.org/spar/pro/definition',
    }

    classes_properties = []
    for s, p, o in g:
        predicate = str(p)
        if predicate not in annotation_predicates:
            continue

        # An object can expose a ``language`` attribute whose value is None
        # (untagged literal); comparing that to 'en' would wrongly discard it,
        # so only a *present* non-English tag is a reason to skip.
        language = getattr(o, 'language', None)
        if language and language.lower().split('-')[0] != 'en':
            continue

        classes_properties.append({"iri": str(s), "predicate": predicate, "value": str(o)})
    return classes_properties

def combine_metadata(classes_properties):
    """Merge per-triple annotation records into one description string per IRI.

    Args:
        classes_properties: Iterable of dicts with the keys ``"iri"``,
            ``"predicate"`` and ``"value"``.

    Returns:
        A dict mapping each IRI (in first-seen order) to text shaped like
        ``"label: definition (comment)"``. Multiple labels are joined with
        ``" / "``, multiple definitions or comments with a single space, and
        missing parts are simply left out.
    """
    known_kinds = ("label", "definition", "comment")
    grouped = {}

    for record in classes_properties:
        subject = record['iri']
        pred = record['predicate']
        text = record['value']

        bucket = grouped.setdefault(
            subject,
            {"label": [], "definition": [], "comment": [], "other": []},
        )
        # The predicate's suffix decides which bucket receives the value;
        # anything unrecognised is parked under "other" (not rendered below).
        kind = next((k for k in known_kinds if pred.endswith(k)), "other")
        bucket[kind].append(text)

    formatted = {}
    for subject, bucket in grouped.items():
        labels = " / ".join(bucket["label"])
        definitions = " ".join(bucket["definition"])
        comments = " ".join(bucket["comment"])

        rendered = (
            (f"{labels}:" if labels else "")
            + (f" {definitions}" if definitions else "")
            + (f" ({comments})" if comments else "")
        )
        formatted[subject] = rendered.strip()

    return formatted

def find_relevant_classes_properties(requirement, combined_data, model, threshold=0.0):
    """Rank ontology entries by cosine similarity to a requirement text.

    Args:
        requirement: Text to match against the ontology descriptions.
        combined_data: Mapping of IRI -> description string.
        model: Object exposing ``encode(..., convert_to_tensor=True)``
            (a SentenceTransformer in this notebook).
        threshold: Minimum cosine similarity an entry needs to be returned.

    Returns:
        List of ``(iri, similarity)`` tuples with ``similarity >= threshold``,
        sorted by similarity in descending order.
    """
    if not combined_data:
        return []

    iris = list(combined_data)
    descriptions = [combined_data[iri] for iri in iris]

    requirement_embedding = model.encode(requirement, convert_to_tensor=True)
    # Encode all descriptions in one batched call instead of one forward pass
    # per entry; scores may differ from per-item encoding only by
    # floating-point noise.
    description_embeddings = model.encode(descriptions, convert_to_tensor=True)

    # Row 0 holds the similarity of the single requirement to every description.
    similarities = util.pytorch_cos_sim(requirement_embedding, description_embeddings)[0].tolist()

    results = [(iri, sim) for iri, sim in zip(iris, similarities) if sim >= threshold]
    results.sort(key=lambda pair: pair[1], reverse=True)
    return results

def load_ground_truth(file_path):
    """Read the ground-truth IRIs listed one per line in *file_path*.

    Args:
        file_path: Path to a text file with one IRI per line.

    Returns:
        A set of the non-blank lines with surrounding whitespace removed, or
        an empty set when the file does not exist.
    """
    if not os.path.exists(file_path):
        return set()
    # 'utf-8-sig' reads plain UTF-8 as well as files saved with a BOM; a BOM
    # left on the first line would stop that IRI from ever matching a prediction.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return {line for line in stripped_lines if line}

def save_results(output_path_txt, output_path_csv, results):
    """Write ranked matches to a plain IRI list and to a CSV with scores.

    Args:
        output_path_txt: Destination of the text file (one IRI per line).
        output_path_csv: Destination of the CSV file with the columns
            ``IRI`` and ``Similarity`` (score formatted to four decimals).
        results: Iterable of ``(iri, similarity)`` pairs, already ordered.

    Parent directories of both files are created when missing.
    """
    # Materialise once so a generator argument can feed both files.
    rows = list(results)

    for path in (output_path_txt, output_path_csv):
        parent = os.path.dirname(path)
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Same explicit encoding as the CSV so non-ASCII IRIs are written
    # identically on every platform.
    with open(output_path_txt, 'w', encoding='utf-8') as f:
        f.writelines(f"{iri}\n" for iri, _ in rows)

    with open(output_path_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['IRI', 'Similarity'])
        writer.writerows([iri, f"{sim:.4f}"] for iri, sim in rows)

def save_evaluation(output_path, results, ground_truth, top_n, combined_data):
    """Score the top-N predictions against the ground truth and write a report.

    Args:
        output_path: Destination of the evaluation text file.
        results: Ranked list of ``(iri, similarity)`` pairs.
        ground_truth: Set of IRIs considered correct.
        top_n: Number of leading results treated as predictions.
        combined_data: Mapping of IRI -> description; used to print the
            description of every predicted IRI (may be ``None``).

    The report lists precision, recall and F1, the sizes of both sets, and
    one ``predicted <iri>: <description>`` line per evaluated prediction.
    """
    top_results = results[:top_n]
    predicted_set = {iri for iri, _ in top_results}

    true_positives = len(predicted_set & ground_truth)
    precision = true_positives / len(predicted_set) if predicted_set else 0
    recall = true_positives / len(ground_truth) if ground_truth else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    descriptions = combined_data or {}

    parent = os.path.dirname(output_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("Evaluation Metrics:\n")
        f.write(f"Precision: {precision:.4f}\n")
        f.write(f"Recall: {recall:.4f}\n")
        f.write(f"F1-score: {f1:.4f}\n")
        f.write(f"GroundTruth: {len(ground_truth)}\n")
        f.write(f"Predicted: {len(predicted_set)}\n")
        # Each result is (iri, similarity); the human-readable description has
        # to be looked up in combined_data rather than taken from the tuple.
        for iri, _ in top_results:
            f.write(f"predicted {iri}: {descriptions.get(iri, '')}\n")

def parse_structured_requirements(requirements_dict):
    """Split each requirement text into its ``term: definition`` entries.

    Args:
        requirements_dict: Mapping of requirement id -> text made of
            ``term: definition.`` entries written one after another.

    Returns:
        Dict mapping each requirement id to a list of ``"term: definition"``
        strings (surrounding whitespace and the trailing period removed).
        Fragments without a colon are discarded.
    """
    import re

    # Cut only at a period that is followed by the next "term:" label. Cutting
    # at every period would truncate definitions at abbreviations such as
    # "e.g." (leaving "... (e" as the definition).
    entry_boundary = re.compile(r'\.\s*(?=[^.:]+:)')

    parsed = {}
    for req_id, text in requirements_dict.items():
        fragments = (frag.strip().rstrip('.').strip() for frag in entry_boundary.split(text))
        parsed[req_id] = [frag for frag in fragments if ':' in frag]
    return parsed

def main(ontologies_dir, terms_dir, output_dir, similarity_threshold, top_n,
         model_name='multi-qa-mpnet-base-dot-v1', requirements=None):
    """Match the requirement terms against every ontology and evaluate the result.

    For each ontology file (``.ttl``, ``.owl``, ``.rdf``) in *ontologies_dir*
    and each requirement, the matches of all term definitions are merged,
    de-duplicated per IRI (keeping the best score), written to
    ``<output_dir>/<ontology>/<requirement>.txt|.csv`` and evaluated against
    ``<terms_dir>/<ontology>/<requirement>.txt``.

    Args:
        ontologies_dir: Directory containing the ontology files.
        terms_dir: Directory containing the ground-truth IRI lists.
        output_dir: Directory that receives the result and evaluation files.
        similarity_threshold: Minimum cosine similarity for a match.
        top_n: Number of top-ranked IRIs used for the evaluation.
        model_name: SentenceTransformer model to load (other models used in
            this notebook include ``all-MiniLM-L6-v2``).
        requirements: Optional mapping of requirement id -> structured
            ``term: definition.`` text; defaults to the four built-in
            requirements below.
    """
    if requirements is None:
        requirements = {
            "requirement1": "process: A transformation unit within a system that converts input entities (products, energy, or information) into outputs using defined operators. process step: A single, identifiable action within a larger process, often part of a hierarchical decomposition. sub-process: A detailed decomposition of a higher-level process operator, used to represent functional granularity. system boundary: A conceptual or physical limit within which the process and its operators are defined and executed. next process: Indicates sequential flow where one process follows another. parallel process: Processes that operate simultaneously within the same system boundary.",
            "requirement2": "input: An entity (product, energy, or information) that flows into a process operator. output: An entity resulting from a process operator’s transformation. has input: A relation connecting an entity as input to a process operator. has output: A relation connecting an entity as output from a process operator. product: Material or physical substance involved in or resulting from a process. energy: Power or force (e.g., heat, electricity) required or emitted in a process. information: Data or knowledge controlling, monitoring, or resulting from a process. technical resource: Equipment or tools used by a process operator to perform the transformation. attribute: A property (identifier or characteristic) assigned to an object. identification: A unique code or label used to reference an object across systems and versions. characteristic: A measurable or descriptive quality of an object; consists of a descriptive part and a relational part.parameter: A controllable variable influencing the process outcome (e.g., temperature, pressure).",
            "requirement3": "technical resource: Equipment, tool, or device. instrument: Measurement device providing input data. usage context: Specifies how/where a resource is applied in the system.",
            "requirement4": "process operator: A functional unit that transforms an input state (ante) into an output state (post), typically using technical resources. project: A coordinated set of activities with defined scope, resources, and objectives related to one or more processes. project state: A stage within the project lifecycle. has project: A relationship assigning a process (or group of processes) to one or more projects. project status: A descriptor of project progress: planned, in progress, completed, cancelled. project role: The function an actor (person/organization) has in the context of the project. project document: Technical documents, drawings, or specifications associated with a project. change request: A formal proposal to modify a process under the context of a project. project scope: The boundaries and extent of a project. project goal: The intended outcome or purpose of a project, often linked to a strategic initiative or problem to be solved. KPIs: Key Performance Indicators (KPIs) are measurable values that demonstrate progress toward goals. deliverables: Deliverables are tangible or documented outputs that must be produced.",
        }

    model = SentenceTransformer(model_name)

    # The parsed requirements do not depend on the ontology, so parse them once.
    parsed_requirements = parse_structured_requirements(requirements)

    for ontology_file in os.listdir(ontologies_dir):
        if not ontology_file.lower().endswith(('.ttl', '.owl', '.rdf')):
            continue
        ontology_name = os.path.splitext(ontology_file)[0]
        ontology_path = os.path.join(ontologies_dir, ontology_file)
        g = load_ontology(ontology_path)

        classes_properties = extract_classes_properties(g)
        combined_data = combine_metadata(classes_properties)
        output_subdir = os.path.join(output_dir, ontology_name)

        for requirement_id, term_defs in parsed_requirements.items():
            all_matches = []
            for term_def in term_defs:
                all_matches.extend(
                    find_relevant_classes_properties(
                        term_def, combined_data, model, threshold=similarity_threshold
                    )
                )

            # Walk the matches best-first so the first time an IRI is seen it
            # carries its highest score; later duplicates are dropped.
            seen = set()
            results = []
            for iri, sim in sorted(all_matches, key=lambda x: x[1], reverse=True):
                if iri not in seen:
                    results.append((iri, sim))
                    seen.add(iri)

            # Save results
            iri_path_csv = os.path.join(output_subdir, f"{requirement_id}.csv")
            iri_path_txt = os.path.join(output_subdir, f"{requirement_id}.txt")
            save_results(iri_path_txt, iri_path_csv, results)

            # Evaluation
            gt_path = os.path.join(terms_dir, ontology_name, f"{requirement_id}.txt")
            ground_truth = load_ground_truth(gt_path)
            eval_path = os.path.join(output_subdir, f"{requirement_id}_evaluation.txt")
            save_evaluation(eval_path, results, ground_truth, top_n, combined_data)


In [12]:
from sentence_transformers import SentenceTransformer, util

# Models that are encoded together with an instruction string. Each entry
# provides one instruction per role; the keys follow the pattern
# "instruction_<role>" that encode_with_optional_instruction looks up.
INSTRUCTOR_MODELS = {
    "hkunlp/instructor-large": {
        "instruction_req": "Represent the requirement for process modeling",
        "instruction_term": "Represent the ontology concept for process modeling"
    }
}

# Models that are encoded from the raw text alone (no per-model settings).
BASELINE_MODELS = { 
    #"multi-qa-mpnet-base-dot-v1": {},
    "all-MiniLM-L6-v2": {},
    "all-distilroberta-v1": {},
    "all-mpnet-base-v2": {}
}

# Requirement id -> structured text made of consecutive "term: definition."
# entries. Read as a module-level global by main_extended and
# hyperparameter_search, which pass it to parse_structured_requirements.
requirements = {
        "requirement1": "process: A transformation unit within a system that converts input entities (products, energy, or information) into outputs using defined operators. process step: A single, identifiable action within a larger process, often part of a hierarchical decomposition. sub-process: A detailed decomposition of a higher-level process operator, used to represent functional granularity. system boundary: A conceptual or physical limit within which the process and its operators are defined and executed. next process: Indicates sequential flow where one process follows another. parallel process: Processes that operate simultaneously within the same system boundary.",
        "requirement2": "input: An entity (product, energy, or information) that flows into a process operator. output: An entity resulting from a process operator’s transformation. has input: A relation connecting an entity as input to a process operator. has output: A relation connecting an entity as output from a process operator. product: Material or physical substance involved in or resulting from a process. energy: Power or force (e.g., heat, electricity) required or emitted in a process. information: Data or knowledge controlling, monitoring, or resulting from a process. technical resource: Equipment or tools used by a process operator to perform the transformation. attribute: A property (identifier or characteristic) assigned to an object. identification: A unique code or label used to reference an object across systems and versions. characteristic: A measurable or descriptive quality of an object; consists of a descriptive part and a relational part.parameter: A controllable variable influencing the process outcome (e.g., temperature, pressure).",
        "requirement3": "technical resource: Equipment, tool, or device. instrument: Measurement device providing input data. usage context: Specifies how/where a resource is applied in the system.",
        "requirement4": "process operator: A functional unit that transforms an input state (ante) into an output state (post), typically using technical resources. project: A coordinated set of activities with defined scope, resources, and objectives related to one or more processes. project state: A stage within the project lifecycle. has project: A relationship assigning a process (or group of processes) to one or more projects. project status: A descriptor of project progress: planned, in progress, completed, cancelled. project role: The function an actor (person/organization) has in the context of the project. project document: Technical documents, drawings, or specifications associated with a project. change request: A formal proposal to modify a process under the context of a project. project scope: The boundaries and extent of a project. project goal: The intended outcome or purpose of a project, often linked to a strategic initiative or problem to be solved. KPIs: Key Performance Indicators (KPIs) are measurable values that demonstrate progress toward goals. deliverables: Deliverables are tangible or documented outputs that must be produced.",  
    }
def encode_with_optional_instruction(model_name, model, text, role):
    """Encode *text*, prepending a role-specific instruction when the model needs one.

    Args:
        model_name: Name used to look the model up in ``INSTRUCTOR_MODELS``.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        text: The text to embed.
        role: Suffix selecting the instruction, i.e. the entry
            ``"instruction_<role>"`` of the model's configuration.

    Returns:
        Whatever ``model.encode`` returns: for instruction models the encoding
        of a one-element batch ``[[instruction, text]]``, otherwise the
        encoding of the bare text.
    """
    if model_name not in INSTRUCTOR_MODELS:
        # Baseline model: the text is embedded as-is.
        return model.encode(text, convert_to_tensor=True)

    prompt = INSTRUCTOR_MODELS[model_name][f"instruction_{role}"]
    paired_input = [[prompt, text]]
    return model.encode(paired_input, convert_to_tensor=True)

def main_extended(ontologies_dir, terms_dir, output_dir, similarity_threshold, top_n):
    """Run the term-matching pipeline once per configured embedding model.

    For every model in ``BASELINE_MODELS`` and ``INSTRUCTOR_MODELS`` and every
    ontology file (``.ttl``, ``.owl``, ``.rdf``) in *ontologies_dir*, the term
    definitions of each requirement are compared with all ontology
    descriptions. Matches at or above *similarity_threshold* are de-duplicated
    per IRI (best score wins), written to
    ``<output_dir>/<model>/<ontology>/<requirement>.txt|.csv`` and evaluated
    against ``<terms_dir>/<ontology>/<requirement>.txt``.

    Args:
        ontologies_dir: Directory containing the ontology files.
        terms_dir: Directory containing the ground-truth IRI lists.
        output_dir: Root directory for result and evaluation files.
        similarity_threshold: Minimum cosine similarity for a match.
        top_n: Number of top-ranked IRIs used for the evaluation.
    """
    all_models = {**BASELINE_MODELS, **INSTRUCTOR_MODELS}

    # Independent of model and ontology, so parse only once.
    parsed_requirements = parse_structured_requirements(requirements)

    for model_name in all_models:
        print(f"\nRunning with model: {model_name}")
        model = SentenceTransformer(model_name)

        # Requirement embeddings depend only on the model: compute them once
        # instead of once per ontology.
        requirement_embeddings = {
            requirement_id: [
                encode_with_optional_instruction(model_name, model, term_def, role='req')
                for term_def in term_defs
            ]
            for requirement_id, term_defs in parsed_requirements.items()
        }

        for ontology_file in os.listdir(ontologies_dir):
            if not ontology_file.lower().endswith(('.ttl', '.owl', '.rdf')):
                continue

            ontology_name = os.path.splitext(ontology_file)[0]
            ontology_path = os.path.join(ontologies_dir, ontology_file)
            g = load_ontology(ontology_path)
            classes_properties = extract_classes_properties(g)
            combined_data = combine_metadata(classes_properties)

            # Embed every ontology description exactly once per model and
            # ontology; re-encoding them for each term definition multiplied
            # the dominant cost by the number of terms.
            description_embeddings = [
                (iri, encode_with_optional_instruction(model_name, model, desc, role='term'))
                for iri, desc in combined_data.items()
            ]

            output_subdir = os.path.join(output_dir, model_name.replace("/", "_"), ontology_name)
            os.makedirs(output_subdir, exist_ok=True)

            for requirement_id, req_embs in requirement_embeddings.items():
                all_matches = []
                for req_emb in req_embs:
                    for iri, desc_emb in description_embeddings:
                        sim = util.pytorch_cos_sim(req_emb, desc_emb).item()
                        if sim >= similarity_threshold:
                            all_matches.append((iri, sim))

                # Best-first walk: the first occurrence of an IRI carries its
                # highest score, later duplicates are dropped.
                seen = set()
                results = []
                for iri, sim in sorted(all_matches, key=lambda x: x[1], reverse=True):
                    if iri not in seen:
                        results.append((iri, sim))
                        seen.add(iri)

                # Save results
                txt_path = os.path.join(output_subdir, f"{requirement_id}.txt")
                csv_path = os.path.join(output_subdir, f"{requirement_id}.csv")
                eval_path = os.path.join(output_subdir, f"{requirement_id}_evaluation.txt")

                save_results(txt_path, csv_path, results)

                gt_path = os.path.join(terms_dir, ontology_name, f"{requirement_id}.txt")
                ground_truth = load_ground_truth(gt_path)
                save_evaluation(eval_path, results, ground_truth, top_n, combined_data)



In [13]:
# Run all configured models on the 'Ontologies' directory
# (similarity threshold 0.4, top 20 results evaluated).
main_extended('Ontologies', 'GroundtruthTerms', 'ExtractedTerms', 0.4, 20)


Running with model: multi-qa-mpnet-base-dot-v1

Running with model: all-MiniLM-L6-v2

Running with model: all-distilroberta-v1

Running with model: all-mpnet-base-v2

Running with model: hkunlp/instructor-large


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
#main('Ontologies', 'GroundtruthTerms', 'ExtractedTerms', 0.4, 20)

In [None]:
# Same run with the 'ODPs' directory as ontology source; ground truth and
# output live in sub-directories of 'ODPs'.
main_extended('ODPs', 'ODPs/GroundtruthTerms', 'ODPs/ExtractedTerms', 0.4, 20)

In [33]:
import numpy as np

def hyperparameter_search(
    ontologies_dir,
    terms_dir,
    output_base_dir,
    model_name,
    similarity_thresholds=(0.3, 0.4, 0.5, 0.6),
    top_ks=(5, 10, 15, 20),
    max_ontologies=1,
):
    """Grid-search similarity threshold and top-k by average F1.

    Args:
        ontologies_dir: Directory containing the ontology files.
        terms_dir: Directory containing the ground-truth IRI lists
            (``<terms_dir>/<ontology>/<requirement>.txt``).
        output_base_dir: Accepted for interface compatibility; nothing is
            written by this function.
        model_name: SentenceTransformer model to load.
        similarity_thresholds: Candidate minimum cosine similarities.
        top_ks: Candidate numbers of top-ranked predictions to evaluate.
        max_ontologies: Evaluate at most this many ontology files (in
            ``os.listdir`` order); ``None`` evaluates all of them.

    Prints the average F1 of every combination, followed by the five best
    combinations. Nothing is returned.
    """
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer(model_name)
    parsed_requirements = parse_structured_requirements(requirements)

    # Filter by extension *before* limiting the count, so a stray
    # non-ontology directory entry cannot leave the search with nothing to do.
    ontology_files = [
        name for name in os.listdir(ontologies_dir)
        if name.lower().endswith(('.ttl', '.owl', '.rdf'))
    ]
    if max_ontologies is not None:
        ontology_files = ontology_files[:max_ontologies]

    # The similarity ranking depends on neither threshold nor top_k, so build
    # it once per (ontology, requirement) and reuse it for the whole grid.
    ranked_cases = []  # list of (ranked [(iri, sim), ...], ground_truth set)
    for ontology_file in ontology_files:
        ontology_name = os.path.splitext(ontology_file)[0]
        ontology_path = os.path.join(ontologies_dir, ontology_file)
        g = load_ontology(ontology_path)
        combined_data = combine_metadata(extract_classes_properties(g))
        print(ontology_name)

        for requirement_id, term_defs in parsed_requirements.items():
            all_matches = []
            for term_def in term_defs:
                # -inf keeps every entry; thresholds are applied further down.
                all_matches.extend(
                    find_relevant_classes_properties(
                        term_def, combined_data, model, threshold=float('-inf')
                    )
                )

            # Deduplicate best-first so each IRI keeps its highest score.
            seen = set()
            ranked = []
            for iri, sim in sorted(all_matches, key=lambda x: x[1], reverse=True):
                if iri not in seen:
                    ranked.append((iri, sim))
                    seen.add(iri)

            gt_path = os.path.join(terms_dir, ontology_name, f"{requirement_id}.txt")
            ranked_cases.append((ranked, load_ground_truth(gt_path)))

    # (threshold, top_k) -> average F1 over all evaluated cases
    results_dict = {}

    for threshold in similarity_thresholds:
        for top_k in top_ks:
            print(f"\n>> Evaluating: threshold={threshold}, top_k={top_k}")
            f1_scores = []

            for ranked, ground_truth in ranked_cases:
                kept = [iri for iri, sim in ranked if sim >= threshold]
                predicted_set = set(kept[:top_k])

                true_positives = len(predicted_set & ground_truth)
                precision = true_positives / len(predicted_set) if predicted_set else 0
                recall = true_positives / len(ground_truth) if ground_truth else 0
                f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
                f1_scores.append(f1)

            # np.mean([]) would yield NaN (with a warning) and break the ranking.
            avg_f1 = float(np.mean(f1_scores)) if f1_scores else 0.0
            results_dict[(threshold, top_k)] = avg_f1
            print(f"Average F1@{top_k} for threshold={threshold:.2f}: {avg_f1:.4f}")

    # Print top configurations
    print("\n=== Top Configurations ===")
    for (thr, tk), f1 in sorted(results_dict.items(), key=lambda x: x[1], reverse=True)[:5]:
        print(f"Threshold: {thr:.2f}, Top-K: {tk}, F1: {f1:.4f}")

# Grid search for the all-mpnet-base-v2 model: six thresholds x four top-k values.
hyperparameter_search(
    ontologies_dir="Ontologies",
    terms_dir="GroundtruthTerms",
    output_base_dir="ExtractedTerms",
    model_name="all-mpnet-base-v2",
    similarity_thresholds=[0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    top_ks=[5, 10, 15, 20]
)


>> Evaluating: threshold=0.4, top_k=5
GPO


In [None]:
from sentence_transformers import util

def is_instructor_model(model):
    """Return True when *model* looks like an INSTRUCTOR model.

    Two conditions must both hold: the object has a
    ``smart_batching_collate`` attribute, and the textual form of its type
    contains "instructor" (case-insensitive).
    """
    if not hasattr(model, 'smart_batching_collate'):
        return False
    type_description = str(type(model)).lower()
    return 'instructor' in type_description

def find_relevant_classes_properties(
    requirement_summary,
    combined_data,
    model,
    threshold=0.0,
    instruction="Represent the ontology concept for semantic matching"
):
    """
    Rank ontology entries by their cosine similarity to a requirement summary.

    Args:
        requirement_summary (str): Summary text describing the requirement.
        combined_data (dict): Mapping of IRI to metadata string.
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        threshold (float): Minimum cosine similarity for an entry to be kept.
        instruction (str): Prepended to every text (summary and descriptions
            alike) when ``is_instructor_model(model)`` is true; ignored
            otherwise.

    Returns:
        List of (IRI, similarity) sorted by similarity descending.
    """
    # The model type cannot change between entries, so decide once.
    with_instruction = is_instructor_model(model)

    def embed(text):
        """Embed one text, pairing it with the instruction when required."""
        if with_instruction:
            return model.encode([[instruction, text]], convert_to_tensor=True)[0]
        return model.encode(text, convert_to_tensor=True)

    query_vector = embed(requirement_summary)

    scored = []
    for iri, description in combined_data.items():
        score = util.pytorch_cos_sim(query_vector, embed(description)).item()
        if score >= threshold:
            scored.append((iri, score))

    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored


def main_summary_mode(ontologies_dir, terms_dir, output_dir, model_name, similarity_threshold, top_n):
    """Match one summary query per requirement against every ontology.

    For each ontology file (``.ttl``, ``.owl``, ``.rdf``) in *ontologies_dir*
    the ontology descriptions are ranked against each requirement summary.
    The top *top_n* matches at or above *similarity_threshold* are written to
    ``<output_dir>/<ontology>/<requirement>.txt|.csv`` and evaluated against
    ``<terms_dir>/<ontology>/<requirement>.txt``.

    Args:
        ontologies_dir: Directory containing the ontology files.
        terms_dir: Directory containing the ground-truth IRI lists.
        output_dir: Root directory for result and evaluation files.
        model_name: SentenceTransformer model to load.
        similarity_threshold: Minimum cosine similarity for a match.
        top_n: Maximum number of matches saved and evaluated per requirement.
    """
    from sentence_transformers import SentenceTransformer

    # One natural-language query per requirement.
    requirement_summaries = {
        "requirement1": "Retrieve all ontology terms related to process structure: process, process step, sub-process, sequential and parallel execution, and system boundaries.",
        "requirement2": "Identify terms describing inputs, outputs, and parameters in a process including material, energy, information, product, and associated attributes like characteristics and identification.",
        "requirement3": "Extract terms that represent technical resources, instruments, and their usage context within a process.",
        "requirement4": "Find ontology terms related to project and organizational context: process operator, project stages, roles, documents, change requests, goals, KPIs, and deliverables."
    }

    model = SentenceTransformer(model_name)

    for ontology_file in os.listdir(ontologies_dir):
        if not ontology_file.lower().endswith(('.ttl', '.owl', '.rdf')):
            continue

        ontology_name = os.path.splitext(ontology_file)[0]
        ontology_path = os.path.join(ontologies_dir, ontology_file)
        g = load_ontology(ontology_path)
        classes_properties = extract_classes_properties(g)
        combined_data = combine_metadata(classes_properties)

        # Results go to <output_dir>/<ontology>. (The original also built a
        # model-specific path but overwrote it before use; the effective
        # location is kept.)
        output_subdir = os.path.join(output_dir, ontology_name)
        os.makedirs(output_subdir, exist_ok=True)

        for requirement_id, summary_query in requirement_summaries.items():
            results = find_relevant_classes_properties(
                summary_query, combined_data, model, threshold=similarity_threshold
            )

            # The ranking holds one entry per IRI (combined_data is a dict)
            # and is already sorted best-first, so truncating is sufficient.
            top_results = results[:max(top_n, 0)]

            iri_path_csv = os.path.join(output_subdir, f"{requirement_id}.csv")
            iri_path_txt = os.path.join(output_subdir, f"{requirement_id}.txt")
            save_results(iri_path_txt, iri_path_csv, top_results)

            # Evaluation
            gt_path = os.path.join(terms_dir, ontology_name, f"{requirement_id}.txt")
            ground_truth = load_ground_truth(gt_path)
            eval_path = os.path.join(output_subdir, f"{requirement_id}_evaluation.txt")
            save_evaluation(eval_path, top_results, ground_truth, top_n, combined_data)

# Summary-mode run with all-mpnet-base-v2; results go to 'ExtractedTerms_SummaryMode'.
main_summary_mode(
    ontologies_dir='Ontologies',
    terms_dir='GroundtruthTerms',
    output_dir='ExtractedTerms_SummaryMode',
    model_name='all-mpnet-base-v2', #"hkunlp/instructor-large"
    similarity_threshold=0.4,
    top_n=20
)
