In [None]:
import pymongo, re
from bson import ObjectId

Connection to MongoDB

In [None]:
class Connection:
    """Wrap a MongoDB client and expose the HTS database and its collections.

    The instance keeps the client, the 'hts' database, and handles to the
    'hts_records' and 'string_dict' collections. The connection can be released
    explicitly with close() or automatically by using the instance as a
    context manager::

        with Connection(db_path) as conn:
            conn.collection_records.find_one(...)

    Attributes:
        client: The pymongo.MongoClient created from ``db_path``.
        db: The 'hts' database of that client.
        collection_records: The 'hts_records' collection.
        collection_string_dict: The 'string_dict' collection.
    """

    def __init__(self, db_path: str):
        """Create the client and look up the database and collection handles.

        Args:
            db_path (str): Value passed straight to pymongo.MongoClient
                (presumably a MongoDB connection URI).
        """

        self.client = pymongo.MongoClient(db_path)
        self.db = self.client['hts']
        self.collection_records = self.db['hts_records']
        self.collection_string_dict = self.db['string_dict']

    def close(self) -> None:
        """Close the underlying MongoDB client."""

        self.client.close()

    def __enter__(self) -> "Connection":
        """Return the connection itself so it can be bound in a with-statement."""

        return self

    def __exit__(self, exc_type, exc_value, traceback) -> bool:
        """Close the client on leaving the with-block.

        Returns:
            bool: Always False, so an exception raised inside the block is
            propagated instead of being swallowed.
        """

        self.close()
        return False

Definition of pipelines and criteria for pymongo queries, and conversion of the HTS query string

- Below are the regex patterns and the helper functions that process the HTS string entered by the user:

In [None]:
# Patterns used to clean up and classify the HTS number entered by the user.

# Separator characters stripped from the raw input before classification.
# The hyphen is escaped so it is an explicit literal (it used to be matched
# only because ",-." happens to be a range that contains "-"), and whitespace
# (\s) is included so values such as "0101.21.00 10" reduce to plain digits.
remove_punctuation = r'[!\"#$%&\'()*+,\-./:;<=>?@\[\]\^_`{|}~—\s]'

# (pattern, query type) pairs, listed from the longest HTS number to the
# shortest. Every pattern is anchored at both ends and has a fixed digit count
# (10, 8, 6 or 4), so at most one of them can match a given digit string.
# Group 1 is always the 4-digit chapter; the remaining groups are the 2-digit
# sub-records.
gather_hts_number = [
    (r'(^[\d]{4})([\d]{2})([\d]{2})([\d]{2})$', 'Complete_record'),
    (r'(^[\d]{4})([\d]{2})([\d]{2})$', 'Base_semifull'),
    (r'(^[\d]{4})([\d]{2})$', 'Base_subrecord'),
    (r'(^[\d]{4})$', 'Base_chapter'),
]

def processString(test_string: str):
    """Clean the raw user input and classify it against the HTS patterns.

    Characters matched by ``remove_punctuation`` are deleted from the input,
    and the result is tried against each entry of ``gather_hts_number`` in
    list order; the first pattern that matches wins.

    Args:
        test_string (str): Raw HTS number as typed by the user

    Returns:
        dict | None: ``{'type': <query type>, 'groups': <re.Match>}`` for the
        first matching pattern, or None when no pattern matches
    """

    cleaned = re.sub(remove_punctuation, '', test_string)

    for regex, query_type in gather_hts_number:
        hit = re.match(regex, cleaned)
        if hit is not None:
            return {'type': query_type, 'groups': hit}

    # No pattern recognised the cleaned input.
    return None
        
def processGroups(processed_str: dict):
    """Turn a classified HTS match into the pieces needed for a database query.

    Args:
        processed_str (dict): Object with the 'type' label and the 'groups'
            Match object, as produced by processString()

    Returns:
        dict: Object with 'type' and 'main_group' (the first captured group);
        'sub_groups' is added only when gatherGroups() returns a non-empty list
    """

    match = processed_str['groups']
    chapter = match.group(1)
    sub_records = gatherGroups(match)

    query = {
        'type': processed_str['type'],
        'main_group': chapter,
    }
    # A chapter-only input has no sub-records, so the key is left out entirely.
    if sub_records:
        query['sub_groups'] = sub_records

    return query


def gatherGroups(groups: re.Match):
    """Build the cumulative, dot-separated sub-record numbers from a match.

    Each entry joins the first k captured groups with '.', for k from 2 up to
    the number of groups, e.g. ('0101', '21', '00') gives
    ['0101.21', '0101.21.00'].

    Args:
        groups (re.Match): Match object whose captured groups are the
            individual parts of the HTS number

    Returns:
        list: Cumulative sub-record strings; empty when the match has fewer
        than two groups
    """

    parts = groups.groups()

    # Prefixes of length 2..len(parts); the bare first group is never emitted.
    return ['.'.join(parts[:end]) for end in range(2, len(parts) + 1)]


In [None]:
def createQueryGroups(test_list: list):
    """Convert raw HTS inputs into query objects, one per input (bulk queries).

    Every element is passed through processString() and the result through
    processGroups(). An element that matches no HTS pattern makes
    processString() return None, which processGroups() cannot subscript, so
    such an element raises TypeError.

    Args:
        test_list (list): Raw HTS strings from the user; a single query is
            passed as a one-element list

    Returns:
        list: One object per input, in input order, with the query type, the
        main chapter and, where applicable, the sub-records
    """

    return [processGroups(processString(raw_hts)) for raw_hts in test_list]

Function definition for executing the queries with the Connection class