In [None]:
# File: proc_ont.py
# Description:
# This file accepts user-made strings as input.
# It cleans the input and extracts keywords from it.
# The tool then looks up the ontology concepts for each keyword.
# Afterwards, the concepts are sent as a query to a search tool (default is GeoNetwork).
# The output is converted to a human-readable format.
# Author: Mitchell Verhaar

from owlready2 import *
from nltk.corpus import stopwords
from timeit import default_timer as timer
import string
import re
import requests
import matplotlib.pyplot as plt
import numpy as np

In [None]:
class Ontology_Parser:
    """Expand user keywords with ontology concepts and query a search tool.

    Typical use: create the parser with a list of ontology locations, call
    ``load_ontologies()`` once, then ``search()`` followed by
    ``process_query()`` for every query.

    Attributes:
        ont_list: Ontology locations handed to ``get_ontology``.
        stopwords: English stop words; input terms equal to one are dropped.
        filter_punc: When true, punctuation is cleaned from the input terms.
        ont_dict_split: ``{ontology: {label: concept}}`` for the last search.
        ont_dict_count: ``{label: number of times it was found}``.
        url: Search-tool URL template with one ``%s`` slot for the query.
        u_input: Cleaned terms of the last search.
        processed_query: Query string built by the last ``process_query()``.
    """

    def __init__(self, ont_list, filt_punc = False):
        """Store the configuration; ontologies are loaded separately.

        Args:
            ont_list: Iterable of ontology locations (IRIs or paths).
            filt_punc: Whether ``process_input`` should clean punctuation.
        """
        self.ont_list = ont_list
        self.stopwords = set(stopwords.words('english'))
        # Not read by any method of this class; kept so existing attribute
        # access from outside keeps working.
        self.punc_table = str.maketrans(string.punctuation, ' '*len(string.punctuation))
        self.filter_punc = filt_punc
        self.ont_dict_split = {}
        self.ont_dict_count = {}
        self.url = "http://212.189.145.37:8080/geonetwork/srv/eng/q?or=%s&from=1&resultType=details&fast=index&_content_type=xml"
        self.u_input = []
        self.processed_query = ''

    def load_ontologies(self):
        """Load every ontology in ``ont_list``, each into its own ``World``.

        A failure to load one ontology is reported and does not stop the
        remaining ones from being loaded.
        """
        for ont in self.ont_list:
            try:
                self.ont_dict_split[World().get_ontology(ont).load()] = {}
            except Exception as err:
                # Best effort by design: report and carry on with the rest.
                print("Cannot load ontology: " + ont + "\nDue to error: " + str(err))

    def process_input(self, u_input = None):
        """Turn a comma-separated query string into a list of clean terms.

        Args:
            u_input: The query string. When empty or ``None`` the user is
                prompted on standard input instead.

        Returns:
            Lower-cased terms with surrounding whitespace removed. Empty
            terms (e.g. from a trailing comma) and stop words are dropped.
            When ``filter_punc`` is set, punctuation inside a term becomes a
            single space and trailing punctuation is removed.
        """
        if not u_input:
            u_input = input("Enter the desired query here: ")

        terms = []
        for raw_term in u_input.split(','):
            term = raw_term.lower()
            if self.filter_punc:
                # Inner regex: non-word runs before a word char -> one space.
                # Outer regex: drop whatever non-word run is left at the end.
                term = re.sub(r"\W+$", "", re.sub(r"\W+\b", " ", term))
            # "a, b" must yield "b", not " b", or it can never match a label.
            term = term.strip()
            if term and term not in self.stopwords:
                terms.append(term)
        return terms

    def search(self, u_input = None):
        """Look up every input term in all loaded ontologies.

        Results of the previous search are discarded first. Matches are
        recorded through ``add_concepts``; a term that ``search_one`` cannot
        find in an ontology is counted as-is so it still ends up in the query.

        Args:
            u_input: Comma-separated query string, see ``process_input``.
        """
        self.clean_data()
        self.u_input = self.process_input(u_input)
        if not self.u_input:
            print("Input must be provided!")
            return

        for ont in self.ont_dict_split:
            # Read and lower-case the labels once per ontology instead of
            # once per (term, individual) pair.
            labelled = [
                (ind, {label.lower() for label in ind.label})
                for ind in ont.individuals()
                if hasattr(ind, 'label')
            ]
            for term in self.u_input:
                for ind, labels in labelled:
                    if term in labels:
                        print('passed with term: ' + term)
                        self.add_concepts(ind, ont)
                ont_concept = ont.search_one(label = term)
                if ont_concept:
                    self.add_concepts(ont_concept, ont)
                elif term not in (keyword.lower() for keyword in self.ont_dict_count):
                    # Terms are already lower-case (see process_input).
                    self.add_count(term)

    def add_concepts(self, result, ont):
        """Record a matched concept and the concepts listed in its ``is_a``.

        Args:
            result: The matched ontology entity; must have a non-empty label.
            ont: The ontology (a key of ``ont_dict_split``) it was found in.
        """
        found = self.ont_dict_split[ont]
        primary = result.label[0]
        found[primary] = result
        # NOTE(review): the matched label is counted in its original case
        # while related labels below are lower-cased; kept as-is.
        self.add_count(primary)

        for related in result.is_a:
            if hasattr(related, 'label'):
                if related.label and 'Obsolete Class' not in related.label:
                    found[related.label[0]] = related
                    self.add_count(related.label[0].lower())
            elif hasattr(related, 'value'):
                # No label of its own: use the label of whatever ``value``
                # refers to, and skip entries whose value has no usable label.
                try:
                    target = related.value
                    label = target.label[0]
                    counted = label.lower()
                except (AttributeError, IndexError, TypeError):
                    continue
                found[label] = target
                self.add_count(counted)

    def process_concepts(self):
        """Return the counted labels ordered by ascending count.

        NOTE(review): the original comment said "sorted by most occurring
        concept", but the sort has always been ascending (least frequent
        first); the order is kept unchanged - confirm which one is intended.
        """
        ranked = sorted(self.ont_dict_count.items(), key=lambda item: item[1])
        return [label for label, _count in ranked]

    def add_count(self, label):
        """Increase the occurrence counter of ``label`` by one."""
        self.ont_dict_count[label] = self.ont_dict_count.get(label, 0) + 1

    def send_query(self, query, timeout = 30):
        """Send ``query`` to the search tool and save the response as XML.

        The response body is written to
        ``Search_Results/output_<query with spaces as underscores>.xml``;
        the directory is created when missing.

        Args:
            query: The query string placed into the ``%s`` slot of ``url``.
            timeout: Seconds to wait for the server before giving up.
        """
        # Imported here so the method does not rely on file-level imports
        # that the rest of the module does not need.
        import os
        from urllib.parse import quote

        try:
            # Percent-encode the query so characters such as '&' or '#'
            # cannot break out of the "or=" parameter.
            response = requests.get(self.url % quote(query, safe=""), timeout=timeout)
        except requests.RequestException:
            print("Connection to the search tool failed!")
            return

        file_stem = "Search_Results/output_" + query.replace(" ", "_")
        try:
            os.makedirs("Search_Results", exist_ok=True)
            with open(file_stem + ".xml", "w+", encoding="utf-8") as f:
                f.write(response.text)
        except OSError as err:
            # A local file problem is not a connection problem; say so.
            print("Could not save the search results: " + str(err))
            return
        print("Output saved to " + file_stem)

    def process_query(self, keywords = ''):
        """Build the query string and send it to the search tool.

        The first available source is used: the explicit ``keywords``, then
        the concepts gathered by the last ``search()``, then the cleaned user
        input of that search. ``processed_query`` is refreshed in every case.

        Args:
            keywords: Optional query to send verbatim, bypassing the
                ontology results.
        """
        self.processed_query = ' '.join(self.process_concepts())

        if keywords:
            print("Given input query: " + keywords)
            self.send_query(keywords)
            return
        if self.processed_query:
            print("Ontological supported query: " + self.processed_query)
            self.send_query(self.processed_query)
            return
        if self.u_input:
            u_input = ' '.join(self.u_input)
            print("No ontology concepts could be found, using original user query as input: " + u_input)
            self.send_query(u_input)
            return
        print('No query has been given, aborting search!')

    def clean_data(self):
        """Forget the counts and concepts gathered by previous searches.

        The loaded ontologies themselves are kept.
        """
        self.ont_dict_count = {}
        for ont in self.ont_dict_split:
            self.ont_dict_split[ont] = {}

In [None]:
### Class initialization and ontology loading (excessive runtime warning!)

### Creates the parser with the list of ontologies to load; punctuation filtering is switched off
ont_parse = Ontology_Parser(
    [
        "http://purl.obolibrary.org/obo/envo.owl",
        "http://purl.obolibrary.org/obo/geo.owl",
    ],
    filt_punc = False,
)

### Loads the ontologies into the parser instance
ont_parse.load_ontologies()

### Searches the ontologies and retrieves all concepts that relate to the input
#ont_parse.search()

### Processes the input into a query and sends it to GeoNetwork
#ont_parse.process_query()

In [None]:
# Maps each query string that was sent to its runtime in whole milliseconds.
timed_q = {}

def time_search(q, process = False):
    """Run one query through ``ont_parse`` and record how long it took.

    With ``process`` set, the query is first expanded by the ontology search
    and the runtime is stored under the expanded query string; otherwise the
    query is sent as given and stored under ``q``.
    """
    start = timer()
    if process:
        ont_parse.search(q)
        ont_parse.process_query()
    else:
        ont_parse.process_query(q)
    elapsed_ms = (timer() - start) * 1000

    # The expanded query is only known after process_query() has run.
    key = ont_parse.processed_query if process else q
    timed_q[key] = round(elapsed_ms)
    print("Duration of search & send: " + str(elapsed_ms) + ' ms\n')
    
# Each pair is (query sent as typed, query expanded through the ontologies).
# The runs alternate plain/expanded, which the graph cell relies on.
benchmark_pairs = [
    ('Ocean', 'ocean'),
    ('Ocean,water body', 'Ocean,water body'),
    ('Continent', 'Continent'),
    ('Ocean,World Ocean,Marine water body,Cave', 'Ocean,World Ocean,Marine water body,Cave'),
    ('Texel,Island,Sea', 'Texel,Island,Sea'),
    ('Earth,Ocean', 'Earth,Ocean'),
]

for plain_query, expanded_query in benchmark_pairs:
    time_search(plain_query, False)
    time_search(expanded_query, True)



In [None]:
### Shows the contents of the nested dictionary that keeps track of the ontologies and the concepts found within each ontology
#print(ont_parse.ont_dict_split)

### Shows the contents of the count dictionary, which counts the number of times a concept has been found across all the ontologies
#print(ont_parse.ont_dict_count)

In [None]:
### This cell is responsible for creating the graph only. Delete this in case it's not needed! ###

# One label per benchmark pair.
labels = ["Set " + str(number) for number in range(1, 7)]

# timed_q holds the runs in execution order: even positions are the original
# queries, odd positions the ontology-enhanced ones.
recorded_runtimes = list(timed_q.values())
query_1 = recorded_runtimes[0::2]
query_2 = recorded_runtimes[1::2]

x = np.arange(len(labels))  # positions of the groups on the x-axis
width = 0.35  # width of a single bar

fig, ax = plt.subplots()
# The two bars of a group sit left and right of the group's tick.
rects1 = ax.bar(x - width / 2, query_1, width=width, label='Original Query')
rects2 = ax.bar(x + width / 2, query_2, width=width, label='Enhanced Query')

# Axis title, tick labels and a legend placed just below the plot area.
ax.set_ylabel('Runtime (ms)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.075),
    fancybox=True,
    shadow=True,
    ncol=6,
)



def autolabel(rects):
    """Write each bar's height as a text label just above the bar."""
    for bar in rects:
        bar_height = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            format(bar_height),
            xy=(bar_centre, bar_height),
            xytext=(0, 3),  # lift the text 3 points above the bar top
            textcoords="offset points",
            ha='center',
            va='bottom',
        )


# Label both bar series, original queries first.
for bar_series in (rects1, rects2):
    autolabel(bar_series)

# Fit everything inside the figure before writing it to disk.
fig.tight_layout()

plt.savefig('graph.png')