In [87]:
import pymongo, re
from bson import ObjectId

%run ../.env/credentialsDB.py

In [88]:
class Connection:
    """Class that connects to the database creating all necessary methods for connection and closing connection, as well as the base database for adding new HTS records and the string_dict collection too
    """

    def __init__(self, db_path: str):
        """_init_ function of the class, defines the connection variables

        Args:
            db_path (str): Path to the database connection on MongoDB
        """

        self.client = pymongo.MongoClient(db_path)
        self.db = self.client['hts']
        self.collection_records = self.db['hts_records']
        self.collection_string_dict = self.db['string_dict']

    def closeConnection(self):
        """Close connection function, closes current connection created in the Connection class
        """
        self.client.close()

In [89]:
#Patterns to clean up and organize input of HTS number from user
remove_punctuation = r'[!\"#$%&\'()*+,-./:;<=>?@\[\]\^_`{|}~—]'

key_words = ["a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]

Functions that process the initial string query:

In [90]:
#Helper function for string manipulation
def checkKeyWords(string: str) -> bool:
    """Checks against the keywords list for any matches in the string input

    Args:
        string (str): String that will be checked

    Returns:
        bool: True if the string does not coincide with the list, False if there is a match
    """

    for keyword in key_words:
        match = re.search(rf'^{keyword}$', string=string)
        if(match):
            return False
    
    return True


In [91]:
#Function to process the string query
def queryStringProcessing(string: str) -> list[str]:
    """Function to process and normalize the query string in order to get an array of strings using the checkKeyWords method to remove all banned keywords from the query and punctuation signs

    Args:
        string (str): Query string input

    Returns:
        list: List of strings in the original input after processing with no punctuation or banned keywords
    """

    list_string = string.split(' ')
    result_string = []

    for st in list_string:

        normal_string = re.sub(remove_punctuation, '', st.lower())
        if(checkKeyWords(normal_string)):
            
            result_string.append(normal_string) 

    
    return result_string


In [92]:
#Testing
queryStringProcessing('Hola mundo ourselves como estas bien these!, AIMING.')

['hola', 'mundo', 'como', 'estas', 'bien', 'aiming']

Functions that process the query list of keywords parsed and query the DB:

In [93]:
#Helper functions for db document manipulation:
def processRawQuery(raw_results: list[dict[str, any]]) -> list[dict[str, any]]:
    """Function that removes all null value keys from the original raw_query on the document gathered from the db.

    Args:
        raw_results (list[dict[str, any]]): Raw list of documents gathered on the first query of the db with the main and sub_groups

    Returns:
        list[dict[str, any]]: Returns a list of documents parsed without empty keys for the final result of query
    """

    processed_result = []

    for result in raw_results:

        new_result = {}

        for key in result.keys():

            if result[key]:
                
                new_result[key] = result[key]
        
        processed_result.append(new_result)

    return processed_result

In [98]:
def mapStringQuery(query: list, str_collection: pymongo.collection.Collection) -> dict[str, any]:

    queryMap = {
        'map': [],
        'query': []
    }

    for word in query:

        document = str_collection.find_one({'string': word})

        if document:
            queryMap['map'].extend(document['chaps'])
            queryMap['query'].append(
                {
                    'keyword': word,
                    'record_ids': document['chaps'] 
                }
            )

    #REMOVES THE DUPLICATES IN THE LIST
    queryMap['map'] = list(dict.fromkeys(queryMap['map']))

    return queryMap

def mapHierarchy(queryMap: dict, hts_collection: pymongo.collection.Collection) -> dict[str, any]:

    mapCoincidences = {}

    for query in queryMap['query']:
        
        for chap in queryMap['map']:

            object_id = ObjectId(chap)
            hts_document = hts_collection.find_one({'_id': object_id})
            header = hts_document['header']

            if(chap in query['record_ids']):
                
                if(chap in mapCoincidences.keys()):
                    mapCoincidences[header]['keyword'].append(query['keyword'])
                    mapCoincidences[header]['coincidence'] =  mapCoincidences[header]['coincidence'] + 1
                else:
                    mapCoincidences[header] = {
                        'coincidence': 1,
                        'keyword': [query['keyword']],
                    }
    
    return mapCoincidences

#def gatherInnerCoincidences(data: list[dict[str, any]], keyword: str) -> list[dict[str, any]]:





In [101]:
#Testing
connection = Connection(f'{PATH_DB}{USER_DB}:{PW_DB}@{CLUSTER_DB}')
query_test = ['breeding', 'purebred']
query_map = mapStringQuery(query_test, connection.collection_string_dict)
map_hierarchy = mapHierarchy(query_map, connection.collection_records)
connection.closeConnection()

map_hierarchy

{'0105': {'coincidence': 1, 'keyword': ['purebred']},
 '0102': {'coincidence': 1, 'keyword': ['purebred']},
 '9813': {'coincidence': 1, 'keyword': ['breeding']},
 '0103': {'coincidence': 1, 'keyword': ['purebred']},
 '0101': {'coincidence': 1, 'keyword': ['purebred']}}