In [1]:
from ontology_generation.Evaluation import Evaluation
from ontology_generation.OntologyGen import OntologyGen
from ontology_generation.OntologyEncap import OntologyEncap
import re
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
from rdflib import Namespace, Graph

In [2]:
ontogen = OntologyGen(model_name="gpt-4-1106-preview", deployment_name="gpt_chat_test_preview")
topic_name = "natural language processing"

#### CSO

In [3]:
# Evaluation
file_path = "data/CSO.3.3.ttl"
parent_topic_uri = "https://cso.kmi.open.ac.uk/topics/natural_language_processing"
deduplicated_topics = Evaluation.extract_concepts_and_deduplicate(file_path, parent_topic_uri)
cso_concepts = Evaluation.process_concepts(deduplicated_topics)

In [4]:
print("Number of concepts in CSO before deduplication: ", len(cso_concepts))
cso_concepts

Number of concepts in CSO before deduplication:  55


['dependency_parser',
 'part-of-speech_tagging',
 'pseudo_relevance_feedback',
 'relevance_models',
 'part_of_speech_tagging',
 'sequence_labeling',
 'syntactic_analysis',
 'syntactic_features',
 'abstracting_and_indexing',
 'statistical_language_models',
 'syntactic_structure',
 'information_extraction',
 'maximum_entropy_models',
 'semantic_similarity_measures',
 'syntactic_parsing',
 'parse_trees',
 'spoken_language_processing',
 'cross-language_information_retrieval',
 'topic_model',
 'parsing_algorithm',
 'computing_with_words',
 'lexical_semantics',
 'sentiment_analysis',
 'named_entity',
 'spoken_dialogue_system',
 'maximum_entropy_model',
 'named_entities',
 'natural_language_generation',
 'syntactic_information',
 'spoken_dialogue_systems',
 'treebanks',
 'machine_translation',
 'computing_with_word_%28cww%29',
 'information_retrieval_technology',
 'n-gram_models',
 'lexical_database',
 'synsets',
 'natural_language_text',
 'statistical_language_modeling',
 'lexical_resources'

In [29]:
from difflib import SequenceMatcher

# Function to normalize topics
def normalize_topic(topic):
    # Convert to lowercase and replace hyphens with underscores
    topic = topic.lower().replace("-", "_")
    # Remove trailing 's' for plurals
    topic = re.sub(r's$', '', topic)
    # Remove parentheses and their contents
    topic = re.sub(r'\([^)]*\)', '', topic)
    # Remove any remaining special characters and extra whitespace
    topic = re.sub(r'[^a-z0-9_]', '', topic)
    return topic

# Function to check if two topics are similar based on a threshold
def are_similar(a, b, threshold=0.85):
    return SequenceMatcher(None, a, b).ratio() > threshold

# Set for normalized topics to track seen ones
seen = set()
# List to store unique topics
unique_topics = []

# First pass: Remove exact duplicates using normalization
for topic in cso_concepts:
    normalized = normalize_topic(topic)
    if normalized not in seen:
        seen.add(normalized)
        unique_topics.append(topic)

# Second pass: Check for similar topics using SequenceMatcher
final_topics = []
for topic in unique_topics:
    if not any(are_similar(normalize_topic(topic), normalize_topic(existing)) for existing in final_topics):
        final_topics.append(topic)

# Print the number of unique topics and the topics themselves
print(len(final_topics))
final_topics

45


['dependency_parser',
 'part-of-speech_tagging',
 'pseudo_relevance_feedback',
 'relevance_models',
 'sequence_labeling',
 'syntactic_analysis',
 'syntactic_features',
 'abstracting_and_indexing',
 'statistical_language_models',
 'syntactic_structure',
 'information_extraction',
 'maximum_entropy_models',
 'semantic_similarity_measures',
 'syntactic_parsing',
 'parse_trees',
 'spoken_language_processing',
 'cross-language_information_retrieval',
 'topic_model',
 'parsing_algorithm',
 'computing_with_words',
 'lexical_semantics',
 'sentiment_analysis',
 'named_entity',
 'spoken_dialogue_system',
 'natural_language_generation',
 'syntactic_information',
 'treebanks',
 'machine_translation',
 'computing_with_word_%28cww%29',
 'information_retrieval_technology',
 'n-gram_models',
 'lexical_database',
 'synsets',
 'natural_language_text',
 'lexical_resources',
 'parsing',
 'text_processing',
 'question_answering_system',
 'word_embeddings',
 'word_segmentation',
 'computational_grammars',
 

In [24]:
# role = "You are an ontology engineer"
# prompt = f'''You are a model tasked with deleting duplicate topics from a list for ontology creation for the {topic_name} domain. 
# The list: ''' + ', '.join(unique_topics) + '''
# Return the response only in the dictionary format. Do not add new topics. Ensure proper use of quotations:
# {
# 'topic1',
# 'topic2
# }
# '''
# cso_list = ontogen.prompt_extract(role, prompt)

In [25]:
# print("Number of concepts in CSO: ", len(cso_list))
# cso_list

Number of concepts in CSO:  42


{'dependency_parser': 'dependency_parsing',
 'part-of-speech_tagging': 'part-of-speech_tagging',
 'pseudo_relevance_feedback': 'pseudo_relevance_feedback',
 'relevance_models': 'relevance_models',
 'sequence_labeling': 'sequence_labeling',
 'syntactic_analysis': 'syntactic_analysis',
 'syntactic_features': 'syntactic_features',
 'abstracting_and_indexing': 'abstracting_and_indexing',
 'statistical_language_models': 'statistical_language_modeling',
 'syntactic_structure': 'syntactic_structure',
 'information_extraction': 'information_extraction',
 'maximum_entropy_models': 'maximum_entropy_models',
 'semantic_similarity_measures': 'semantic_similarity',
 'syntactic_parsing': 'syntactic_parsing',
 'parse_trees': 'parse_trees',
 'spoken_language_processing': 'spoken_language_processing',
 'cross-language_information_retrieval': 'cross-language_information_retrieval',
 'topic_model': 'topic_model',
 'parsing_algorithm': 'parsing_algorithm',
 'computing_with_words': 'computing_with_word_%28

#### AutOnto with OA

In [26]:
concept_uri = "http://fraunhofer.de/example/Natural_Language_Processing"
graph = Graph()
graph.parse("output/taxonomy_withOA.ttl", format="ttl")  # Load your RDF data

descendants_withOA = Evaluation.get_descendants(concept_uri, graph)
concepts_onto_withOA = Evaluation.clean_concept_names(descendants_withOA)

#### AutOnto w/o OA

In [27]:
concept_uri = "http://fraunhofer.de/example/Natural_language_processing"
file_path = "output/taxonomy_withoutOA.ttl"

graph = Graph()
graph.parse("output/taxonomy_withoutOA.ttl", format="ttl")  # Load your RDF data

descendants_withoutOA = Evaluation.get_descendants(concept_uri, graph)
concepts_onto__withoutOA = Evaluation.clean_concept_names(descendants_withoutOA)

### Comparison

In [30]:
print("Number of concepts in CSO: ", len(final_topics))
print("Number of concepts in OntoNLP with OA: ", len(concepts_onto_withOA))
print("Number of concepts in OntoNLP with OA: ", len(concepts_onto__withoutOA))

Number of concepts in CSO:  45
Number of concepts in OntoNLP with OA:  75
Number of concepts in OntoNLP with OA:  56


In [31]:
evalinst = Evaluation()

cso_list_processed = evalinst.preprocess_list(final_topics)
concepts_onto_withOA_processed = evalinst.preprocess_list(concepts_onto_withOA)
concepts_onto_withoutOA_processed = evalinst.preprocess_list(concepts_onto__withoutOA)

In [32]:
# Load a pre-trained SentenceTransformer model
whaleloops_model = SentenceTransformer("whaleloops/phrase-bert")

# Calculate metrics for preprocessed_list1
phrase_embeddings1 = whaleloops_model.encode(cso_list_processed)
reference_embedding = whaleloops_model.encode('natural-language-processing')
metrics1 = Evaluation.calculate_metrics(phrase_embeddings1, reference_embedding)

# Calculate metrics for preprocessed_list2
phrase_embeddings2 = whaleloops_model.encode(concepts_onto_withOA_processed)
metrics2 = Evaluation.calculate_metrics(phrase_embeddings2, reference_embedding)

# Calculate metrics for preprocessed_list2
phrase_embeddings3 = whaleloops_model.encode(concepts_onto_withoutOA_processed)
metrics3 = Evaluation.calculate_metrics(phrase_embeddings3, reference_embedding)



In [33]:
# Create a list of dictionaries to store the metrics
data = [
    {
        "List": "CSO",
        "Number of Terms": len(cso_list_processed),
        **metrics1
    },
    {
        "List": "AutOnto with OA concepts",
        "Number of Terms": len(concepts_onto_withOA_processed),
        **metrics2
    },
        {
        "List": "AutOnto without OA concepts",
        "Number of Terms": len(concepts_onto_withoutOA_processed),
        **metrics3
    }
]

# Create the DataFrame from the list of dictionaries
comparison = pd.DataFrame(data)

# Export the DataFrame to a CSV file
comparison.to_csv("output/metrics_comparison.csv", index=False)