In [13]:
# Banned words (English function words and contractions). Kept in alphabetical
# order, ten entries per row, so additions are easy to review.
key_words = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being",
    "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did",
    "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few",
    "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having",
    "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him",
    "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if",
    "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me",
    "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off",
    "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
    "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't",
    "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them",
    "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've",
    "this", "those", "through", "to", "too", "under", "until", "up", "very", "was",
    "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's",
    "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why",
    "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've",
    "your", "yours", "yourself", "yourselves",
]

# Regex character class matching a single punctuation character (ASCII
# punctuation plus the em dash). Inside the class, `,-.` is a range that
# covers exactly ',', '-' and '.'.
punctuation_pattern = r'[!\"#$%&\'()*+,-./:;<=>?@\[\]\^_`{|}~—]'

In [14]:
import json
import os
import re

# TODO: duplicated from the chapterOrganizer file; consolidate both copies into one shared helper.
def checkKeyWords(string: str, keywords=None):
    """Tell whether a token is allowed, i.e. is NOT one of the banned keywords.

    The token is compared for exact equality against every banned word. No
    regular expression is built from the keywords, so a keyword containing
    regex metacharacters (".", "+", "(" ...) is treated literally instead of
    mis-matching or raising ``re.error``.

    Args:
        string (str): String that will be checked.
        keywords (collection of str, optional): Banned words to check against.
            Defaults to the module-level ``key_words`` list.

    Returns:
        bool: True if the string is not a banned keyword, False if it is.
    """
    banned_words = key_words if keywords is None else keywords

    return string not in banned_words

In [15]:
def queryStringProcessing(string: str):
    """Normalize a raw query string into a list of searchable tokens.

    The query is split on whitespace; each token is lowercased and stripped of
    every character matched by ``punctuation_pattern``. Tokens that end up
    empty, or that ``checkKeyWords`` reports as banned, are dropped.

    Args:
        string (str): Query string input

    Returns:
        list: Lowercase tokens of the input, in their original order, with no
        punctuation, no empty entries and no banned keywords
    """
    # One or more punctuation characters at the very start or very end of a token.
    edge_punctuation = rf'^(?:{punctuation_pattern})+|(?:{punctuation_pattern})+$'

    result_string = []

    # split() without an argument splits on any run of whitespace, so repeated
    # spaces, tabs and newlines never produce empty tokens.
    for token in string.split():

        lowered = token.lower()
        normal_string = re.sub(punctuation_pattern, '', lowered)

        # A token made only of punctuation (e.g. "--") normalizes to '' and
        # carries no searchable content.
        if not normal_string:
            continue

        # The banned list stores contractions with their apostrophe ("don't"),
        # but full normalization removes it ("dont"). Also test the token with
        # only its surrounding punctuation trimmed so contractions are caught.
        trimmed_string = re.sub(edge_punctuation, '', lowered)

        if checkKeyWords(normal_string) and checkKeyWords(trimmed_string):
            result_string.append(normal_string)

    return result_string



# Demo call: punctuation is stripped, tokens are lowercased and the banned words 'then' and 'for' are dropped.
queryStringProcessing('Hola, soy Ricardo, then, for')

['hola', 'soy', 'ricardo']

In [16]:
def mapStringQuery(query: list, dict_path: str = '../db_hts/temp/NEW_test_string_dict/string_dict.json'):
    """Look up every processed query keyword in the string dictionary file and build the queryMap object.

    Args:
        query (list): Processed string query input as a list with the banned words removed
        dict_path (str, optional): Path of the JSON string dictionary. The file
            is indexed by keyword and each value is used as a list of sections.
            Defaults to the project's string_dict.json.

    Returns:
        dict: queryMap dictionary with two keys: 'map', the de-duplicated list of
        every section matched by any keyword (first-seen order), and 'query', a
        list of {'keyword', 'sections'} objects, one per keyword found in the file.
    """
    # JSON text is UTF-8 by specification; an explicit encoding keeps the read
    # independent of the platform's locale default.
    with open(dict_path, 'r', encoding='utf-8') as data:
        string_dict = json.load(data)

    queryMap = {
        'map': [],
        'query': []
    }

    for string in query:

        # Keywords absent from the dictionary simply contribute nothing.
        if string in string_dict:
            sections = string_dict[string]
            queryMap['map'].extend(sections)
            queryMap['query'].append(
                {
                    'keyword': string,
                    'sections': sections
                }
            )

    # Remove duplicated sections while keeping their first-seen order.
    queryMap['map'] = list(dict.fromkeys(queryMap['map']))

    return queryMap

def mapHierarchy(queryMap: dict):
    """Group the queryMap results by section, listing the query keywords found in each one.

    Args:
        queryMap (dict): queryMap dictionary containing the initial result of the
            query, with a 'map' list of sections and a 'query' list of
            {'keyword', 'sections'} objects

    Returns:
        dict: Dictionary keyed by section. Each value holds 'keyword', the list of
        distinct query keywords present in that section, and 'coincidence', the
        number of those keywords. Keys appear in the order sections are first matched.
    """
    mapCoincidences = {}

    for query in queryMap['query']:

        keyword = query['keyword']
        # Set membership avoids rescanning the keyword's section list once per
        # section in the map.
        keyword_sections = set(query['sections'])

        for section in queryMap['map']:

            if section not in keyword_sections:
                continue

            entry = mapCoincidences.setdefault(section, {'coincidence': 0, 'keyword': []})

            # A word typed twice in the query must not be counted twice for the
            # same section, otherwise the coincidence count is inflated.
            if keyword in entry['keyword']:
                continue

            entry['keyword'].append(keyword)
            entry['coincidence'] += 1

    return mapCoincidences
        



In [17]:
def processCoincidences(coincidences: dict, files_dir: str = '../db_hts/temp/NEW_test_files'):
    """Load the JSON file of every matched HTS section and collect the items that match the query keywords.

    Args:
        coincidences (dict): mapHierarchy object returned by that function that shows the initial coincidences of the search query and their corresponding HTS sections
        files_dir (str, optional): Directory holding one ``<section>.json`` file
            per key of ``coincidences``. Defaults to the project's NEW_test_files folder.

    Returns:
        list: final_result list with one entry per section, in the iteration order
        of ``coincidences``; each entry is whatever traverseFile returns for that
        section's file and keyword list

    Raises:
        FileNotFoundError: If a section has no JSON file in ``files_dir``.
    """
    final_result = []

    for key, section_info in coincidences.items():
        # The section files contain non-ASCII text (e.g. the cent sign in duty
        # rates), so decode explicitly as UTF-8 rather than with the
        # locale-dependent default.
        with open(f'{files_dir}/{key}.json', encoding='utf-8') as file:
            json_file = json.load(file)

        final_result.append(traverseFile(json_file, section_info['keyword']))

    return final_result
    


def traverseFile(file: dict, list_string: list):
    """Helper function that walks the items of one HTS JSON file and groups the ones relevant to the query.

    A result entry starts at an item whose description matches at least one
    keyword (see traverseDescription). While traverseItem reports 'go' for the
    following items, measured against the HTS number of the item that opened the
    entry, they are appended to that entry together with any keywords found in
    their descriptions. The first item for which traverseItem reports no 'go'
    closes the entry and is itself tested as the possible start of a new one.

    Args:
        file (dict): Loaded content of the JSON file; it is iterated as a sequence
            of item dictionaries that provide 'htsno' and 'description'
        list_string (list): List of query keywords already processed that pertain to that specific file

    Returns:
        list: One dictionary per result entry, as built by traverseDescription,
        whose 'matchedInformation' holds the accumulated 'keyword' list (repeats
        included) and the raw items in 'data'
    """

    result = []
    # Entry currently being extended, or None when no run is in progress.
    current_entry = None
    current_hts = ''

    for item in file:

        if current_entry is not None:
            step = traverseItem(item, current_hts, list_string)

            if step['go']:
                # Still inside the run opened by current_hts: attach the item.
                current_entry['matchedInformation']['data'].append(item)
                current_entry['matchedInformation']['keyword'].extend(step['additionalstr'])
                continue

            # The run ended on this item. Do not skip it: fall through so it is
            # checked as the start of a new entry, otherwise a matching item
            # placed right after a run would be lost.
            current_entry = None

        candidate = traverseDescription(list_string, item['description'].lower())

        # Checks if the list_string has matches in the item['description'] string
        if candidate['match']:
            candidate['matchedInformation']['data'].append(item)
            result.append(candidate)

            # Following items are only attached when traverseItem accepts the
            # opening item against its own HTS number.
            if traverseItem(item, item['htsno'], list_string)['go']:
                current_entry = candidate
                current_hts = item['htsno']

    return result
    
def traverseDescription(list_string: list, description: str):
    """Check which keywords of the list are present in a description string.

    A keyword counts as present when it appears right after a space or an opening
    parenthesis, or at the very start of the description followed by one of
    ") ,.:" or by the end of the string. Keywords are matched literally (regex
    metacharacters are escaped) and empty keywords are ignored. No case folding
    is done here.

    Args:
        list_string (list): List of string query
        description (str): Full description string to be checked

    Returns:
        dict: Dictionary with a 'matchedInformation' key holding the matched
        keywords under 'keyword' and an empty 'data' list for the caller to fill,
        and a 'match' key that is True when at least one keyword matched
    """
    result = {

        'matchedInformation': {'keyword': [], 'data': []},
        'match': False
    }

    for string in list_string:
        # An empty keyword would match nearly every description.
        if not string:
            continue

        # Escape so characters such as "." or "+" are matched literally instead
        # of being interpreted as regex syntax (or raising re.error).
        word = re.escape(string)
        # "[\( ]word" already covers "[\( ]word[\) ,.:]", so that alternative is
        # not repeated. "|$" lets a description that is exactly the keyword match.
        pattern = rf'[\( ]{word}|^{word}(?:[\) ,.:]|$)'

        if re.search(pattern, description):
            result['matchedInformation']['keyword'].append(string)
            result['match'] = True

    return result


def traverseItem(item: dict, current_pattern: str, list_string: list):
    """Decide whether an item belongs to the HTS number currently being processed and collect the query keywords found in its description.

    The item belongs when its 'htsno' starts with ``current_pattern`` (and is not
    made only of spaces) or when its 'htsno' is empty. ``current_pattern`` and the
    keywords are matched literally; regex metacharacters in them are escaped.

    Args:
        item (dict): item dictionary object from the JSON HTS file being processed; 'htsno' and 'description' are read
        current_pattern (str): HTS number currently being processed
        list_string (list): Original query list of strings being processed

    Returns:
        dict: {'go': True, 'additionalstr': [...]} with the keywords found in the
        lowercased description when the item belongs, or {'go': False} when it
        does not
    """
    # Escaped so the dots of an HTS number such as "0105.11" match only a dot.
    final_pattern = rf'^(?! +$){re.escape(current_pattern)}|^$'

    if not re.search(final_pattern, item['htsno']):
        return {
            'go': False,
        }

    description = item['description'].lower()
    additional_keywords = []

    for string in list_string:
        # An empty keyword would match nearly every description.
        if not string:
            continue

        word = re.escape(string)
        # Keyword after a space or "(", or at the start of the description
        # followed by one of ") ,.:" or by the end of the string.
        pattern = rf'[\( ]{word}|^{word}(?:[\) ,.:]|$)'

        if re.search(pattern, description):
            additional_keywords.append(string)

    return {
        'go': True,
        'additionalstr': additional_keywords
    }
    


In [18]:
# Step 1: normalize the raw query into lowercase tokens with punctuation and banned words removed.
query_string = queryStringProcessing('Live horses, asses mammals')

print(query_string)

# Step 2: look every token up in the string dictionary file to get the sections it appears in.
queryMap = mapStringQuery(query_string)

# Step 3: group the matches by section, with the keywords found in each section and their count.
coincidence = mapHierarchy(queryMap)

# Last expression of the cell: rendered as the cell output.
coincidence

['live', 'horses', 'asses', 'mammals']


{'0105': {'coincidence': 1, 'keyword': ['live']},
 '0309': {'coincidence': 1, 'keyword': ['live']},
 '0102': {'coincidence': 1, 'keyword': ['live']},
 '0104': {'coincidence': 1, 'keyword': ['live']},
 '0307': {'coincidence': 1, 'keyword': ['live']},
 '0103': {'coincidence': 1, 'keyword': ['live']},
 '0308': {'coincidence': 1, 'keyword': ['live']},
 '0106': {'coincidence': 2, 'keyword': ['live', 'mammals']},
 '0301': {'coincidence': 1, 'keyword': ['live']},
 '0306': {'coincidence': 1, 'keyword': ['live']},
 '0101': {'coincidence': 3, 'keyword': ['live', 'horses', 'asses']},
 '0602': {'coincidence': 1, 'keyword': ['live']},
 '0206': {'coincidence': 2, 'keyword': ['horses', 'asses']},
 '9804': {'coincidence': 1, 'keyword': ['horses']},
 '0205': {'coincidence': 2, 'keyword': ['horses', 'asses']},
 '0208': {'coincidence': 1, 'keyword': ['mammals']},
 '0210': {'coincidence': 1, 'keyword': ['mammals']},
 '1504': {'coincidence': 1, 'keyword': ['mammals']},
 '9902': {'coincidence': 1, 'keyword'

In [19]:
# Step 4: open the JSON file of every matched section and collect the items matching the query keywords.
final_result = processCoincidences(coincidence)

# Last expression of the cell: rendered as the cell output.
final_result

[[{'matchedInformation': {'keyword': ['live'],
    'data': [{'htsno': '0105',
      'indent': 0,
      'description': 'Live poultry of the following kinds: Chickens, ducks, geese, turkeys and guineas:',
      'superior': None,
      'units': [],
      'general': '',
      'special': '',
      'other': '',
      'footnotes': [],
      'quotaQuantity': '',
      'additionalDuties': '',
      'addiitionalDuties': None},
     {'htsno': '0105.11.00',
      'indent': 2,
      'description': 'Chickens | Weighing not more than 185 g:',
      'superior': None,
      'units': [],
      'general': '0.9¢ each',
      'special': 'Free (A+,AU,BH,CL,CO,D,E, IL,JO,KR, MA,OM, P,PA,PE,S,SG)',
      'other': '4¢ each',
      'footnotes': [{'columns': ['general'],
        'marker': '1',
        'value': 'See 9903.88.15. ',
        'type': 'endnote'}],
      'quotaQuantity': None,
      'additionalDuties': None,
      'addiitionalDuties': None},
     {'htsno': '0105.11.00.10',
      'indent': 4,
      'des