In [1]:
# File: proc_ont.py
# Description:
# This file accepts user-supplied strings as input.
# It cleans the input and extracts keywords from it.
# The tool then looks up the ontology concept for each keyword.
# Afterwards, the concepts are sent as a query to a search tool (GeoNetwork by default).
# The output is converted to a human-readable format.
# Author: Mitchell Verhaar

from owlready2 import *
from nltk.corpus import stopwords
import os
import string
import re
import requests

class Ontology_Parser:
    """Expand user keywords with ontology concepts and query a search tool.

    Intended call order (as used by the experiment cell in this notebook):
    ``load_ontologies()`` -> ``search()`` -> ``send_query()``.

    Concept counts and per-ontology results accumulate on the instance across
    calls to ``search()``; create a fresh instance for an independent query.
    """

    # Prompt shown when no query string is passed to process_input()/search().
    _QUERY_PROMPT = "Enter the desired query here: "
    # Directory (relative to the working directory) where responses are saved.
    _OUTPUT_DIR = "Search_Results"
    # Seconds to wait for the search tool before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self, ont_list, filt_punc = False, limit = 5, query_ip = '212.189.145.37:8080'):
        """Store the configuration and prepare empty result containers.

        Args:
            ont_list: Iterable of ontology IRIs/paths passed to ``get_ontology``.
            filt_punc: When True, ``process_input`` also strips punctuation.
            limit: Stored on the instance; not referenced by any method of
                this class.
            query_ip: ``host:port`` of the GeoNetwork instance that
                ``send_query`` talks to.
        """
        self.ont_list = ont_list
        # Requires the NLTK 'stopwords' corpus to be available locally.
        self.stopwords = set(stopwords.words('english'))
        # Translation table mapping each punctuation character to a space.
        # Not referenced by any method of this class.
        self.punc_table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        self.filter_punc = filt_punc
        # {(ontology name, ontology object): {concept label: concept}}
        self.ont_dict_split = {}
        # {concept label: number of times the concept was added}
        self.ont_dict_count = {}
        self.query_ip = query_ip
        self.limit = limit

    def load_ontologies(self):
        """Load every ontology in ``ont_list`` and register an empty result dict for it.

        Ontologies that fail to load are reported on stdout and skipped.
        """
        for ont in self.ont_list:
            try:
                proc_ont = get_ontology(ont).load()
                self.ont_dict_split[(proc_ont.name, proc_ont)] = {}
            except Exception as err:
                print("Cannot load ontology: " + str(ont) + "\nDue to error: " + str(err))

    def process_input(self, u_input = None):
        """Turn a comma-separated query string into a list of clean search terms.

        Each term is lower-cased and stripped of surrounding whitespace, so
        that ``"ocean, sea"`` yields ``"sea"`` rather than ``" sea"`` (which
        could never match a label or a stopword). Stopwords and empty terms
        are dropped. When ``filter_punc`` is set, punctuation runs inside a
        term become a single space and trailing punctuation is removed.

        Args:
            u_input: The query string. When it is None or empty the user is
                prompted interactively via ``input()``.

        Returns:
            A list of non-empty, lower-case terms (possibly empty).
        """
        raw_query = u_input if u_input else input(self._QUERY_PROMPT)
        terms = []
        for chunk in raw_query.split(','):
            term = chunk.strip().lower()
            if term in self.stopwords:
                continue
            if self.filter_punc:
                term = re.sub(r"\W+$", "", re.sub(r"\W+\b", " ", term)).strip()
            if term:
                terms.append(term)
        return terms

    def search(self, u_input = None):
        """Look up every input term in every loaded ontology.

        For each (ontology, term) pair ``search_one(label=term)`` is called on
        the ontology; a hit is handed to ``add_concepts``, a miss is reported
        on stdout.

        Args:
            u_input: Optional query string, forwarded to ``process_input``.
        """
        proc_input = self.process_input(u_input)
        if not proc_input:
            print("Input must be provided!")
            return
        for name, ont in self.ont_dict_split.keys():
            for term in proc_input:
                ont_concept = ont.search_one(label = term)
                if ont_concept:
                    self.add_concepts(ont_concept, name, ont)
                else:
                    print("No ontological concepts have been found")

    def add_concepts(self, result, name, ont):
        """Record a matched concept and the labelled concepts listed in its ``is_a``.

        Entries of ``result.is_a`` that have a ``label`` attribute are used
        directly. Entries without one but with a ``value`` attribute
        (presumably property restrictions - confirm against owlready2)
        contribute their ``value`` instead. Candidates with no label are
        skipped.

        Args:
            result: The concept returned by the ontology search.
            name: Name of the ontology the concept was found in.
            ont: The ontology object; ``(name, ont)`` is the key into
                ``ont_dict_split``.
        """
        self._store_concept(result, name, ont)
        for related in result.is_a:
            if hasattr(related, 'label'):
                candidate = related
            elif hasattr(related, 'value'):
                candidate = related.value
            else:
                continue
            # The restriction value may be something without a label
            # attribute at all; treat that the same as an empty label.
            labels = getattr(candidate, 'label', None)
            if labels:
                print(labels)
                self._store_concept(candidate, name, ont)

    def _store_concept(self, concept, name, ont):
        """Save ``concept`` under its first label and bump that label's count."""
        label = concept.label[0]
        self.ont_dict_split[(name, ont)][label] = concept
        self.add_count(label)

    def process_concepts(self):
        """Return the collected concept labels, most frequently added first.

        Labels with equal counts keep the order in which they were first added.
        """
        ranked = sorted(self.ont_dict_count.items(), key=lambda item: item[1], reverse=True)
        return [label for label, _count in ranked]

    def add_count(self, label):
        """Increment the occurrence count stored for ``label``."""
        self.ont_dict_count[label] = self.ont_dict_count.get(label, 0) + 1

    def send_query(self):
        """Send the collected concepts to the search tool and save the response.

        The labels from ``process_concepts`` are joined with spaces and sent
        as the ``or`` parameter of the GeoNetwork ``q`` service. The response
        body is printed and written to
        ``Search_Results/output_<query>.xml``. Connection problems and file
        system problems are reported separately on stdout; neither raises.
        """
        ordered_query = ' '.join(self.process_concepts())
        print(ordered_query)
        if not ordered_query:
            # NOTE(review): the message announces a fallback to the original
            # user query, but no request is sent in this branch.
            print("No ontology concepts could be added, using original user query as input...")
            return

        url = "http://" + self.query_ip + "/geonetwork/srv/eng/q"
        # Passing params lets requests URL-encode the labels (spaces, '&', ...).
        params = {
            "or": ordered_query,
            "from": 1,
            "to": 20,
            "resultType": "details",
            "fast": "index",
            "_content_type": "xml",
        }
        try:
            response = requests.get(url, params=params, timeout=self._REQUEST_TIMEOUT)
        except requests.exceptions.RequestException:
            print("Connection to the search tool failed!")
            return
        print(response.text)

        # Keep the file name free of path separators and other unsafe characters.
        safe_name = re.sub(r"[^\w.-]+", "_", ordered_query)
        out_path = os.path.join(self._OUTPUT_DIR, "output_" + safe_name + ".xml")
        try:
            os.makedirs(self._OUTPUT_DIR, exist_ok=True)
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(response.text)
        except OSError as err:
            print("Could not save output to " + out_path + "\nDue to error: " + str(err))
            return
        print("Output saved to " + out_path)
    
# Experiments to show keyword generation and the connection to GeoNetwork




In [7]:
# Build a parser over the ENVO and OBA ontologies without punctuation filtering.
ont_parse = Ontology_Parser(
    ont_list=[
        "http://purl.obolibrary.org/obo/envo.owl",
        "http://purl.obolibrary.org/obo/oba.owl",
    ],
    filt_punc=False,
)
ont_parse.load_ontologies()
# No query string is passed, so the user is prompted for one interactively.
ont_parse.search()

# Send the expanded keyword list to the search tool.
ont_parse.send_query()

Enter the desired query here: Ocean
['marine water body']
['saline water body']
['sea water']
['marine water body']
['saline water body']
['sea water']
ocean marine water body saline water body sea water
Connection to the search tool failed!


In [440]:
# Show the per-ontology concepts, then the occurrence count of each label.
print(ont_parse.ont_dict_split, ont_parse.ont_dict_count, sep="\n")

{('envo', get_ontology("http://purl.obolibrary.org/obo/envo.owl#")): {'ocean': obo.ENVO_00000015, 'marine water body': obo.ENVO_00001999, 'saline water body': obo.ENVO_01001319, 'sea water': obo.ENVO_00002149}, ('oba', get_ontology("http://purl.obolibrary.org/obo/oba.owl#")): {'ocean': obo.ENVO_00000015, 'marine water body': obo.ENVO_00001999, 'saline water body': obo.ENVO_01001319, 'sea water': obo.ENVO_00002149}}
{'ocean': 2, 'marine water body': 2, 'saline water body': 2, 'sea water': 2}


In [None]:
# "http://purl.obolibrary.org/obo/po.owl"