In [1]:
# English stop words, all lowercase and including contractions.
# checkKeyWords() returns False for any string that matches one of these entries.
key_words = ["a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]

# Regex character class that queryStringProcessing() removes from every query
# token: ASCII punctuation plus the em dash.
# NOTE: inside the class `,-.` is a range (comma through period); the hyphen is
# matched only because it falls inside that range.
punctuation_pattern = r'[!\"#$%&\'()*+,-./:;<=>?@\[\]\^_`{|}~—]'

In [2]:
import json
import os
import re

# TODO: duplicated from the chapterOrganizer file; consolidate both copies into one shared helper.
def checkKeyWords(string: str):
    """Check a string against the module-level ``key_words`` list.

    Args:
        string (str): String that will be checked

    Returns:
        bool: True if the string matches none of the keywords, False if at
        least one keyword matches it
    """
    # re.escape() keeps every keyword literal, so an entry that contains a
    # regex metacharacter can never be interpreted as a pattern.
    # The ^...$ anchors are unchanged; note that `$` also matches right before
    # a single trailing newline.
    return not any(
        re.search(rf'^{re.escape(keyword)}$', string=string)
        for keyword in key_words
    )

In [3]:
def queryStringProcessing(string: str):
    """Normalize a free-text query into a list of searchable tokens.

    The query is split on whitespace. Every token is lowercased, stripped of
    the characters matched by ``punctuation_pattern`` and kept only when
    checkKeyWords() accepts it.

    Args:
        string (str): Query string input

    Returns:
        list: Lowercase tokens in their original order, with no punctuation,
        no banned keywords and no empty strings
    """
    result_string = []

    # split() without an argument collapses runs of whitespace, so repeated
    # spaces do not produce empty tokens.
    for raw_token in string.split():
        lowered = raw_token.lower()
        normal_string = re.sub(punctuation_pattern, '', lowered)

        # A token made only of punctuation (e.g. "-") normalizes to '' and
        # carries no search value.
        if not normal_string:
            continue

        # Second form that keeps apostrophes inside the word: the keyword list
        # stores contractions such as "don't", which the fully stripped form
        # ("dont") would never match.
        contraction_form = re.sub(
            punctuation_pattern,
            lambda found: found.group(0) if found.group(0) == "'" else '',
            lowered,
        ).strip("'")

        if checkKeyWords(normal_string) and checkKeyWords(contraction_form):
            result_string.append(normal_string)

    return result_string



# Demo: punctuation is stripped and the stop words "then" and "for" are dropped.
queryStringProcessing('Hola, soy Ricardo, then, for')

['hola', 'soy', 'ricardo']

In [4]:
def mapStringQuery(query: list, dict_path: str = 'test_string_dict/string_dict.json'):
    """Look up every query token in the string dictionary file and build the queryMap object.

    Args:
        query (list): Processed string query input as a list with the banned words removed
        dict_path (str): Path of the JSON file that maps a keyword to the list
            of sections containing it. Defaults to the path used so far by
            this notebook.

    Returns:
        dict: queryMap dictionary with two keys:
            'map': every section hit by at least one token, duplicates
                removed, first-seen order kept
            'query': one ``{'keyword': ..., 'sections': [...]}`` entry per
                token found in the dictionary, in query order
    """
    # JSON text is UTF-8 by specification; pinning the encoding keeps the read
    # independent of the platform default.
    with open(dict_path, 'r', encoding='utf-8') as data:
        string_dict = json.load(data)

    queryMap = {
        'map': [],
        'query': []
    }

    for string in query:
        if string in string_dict:
            sections = string_dict[string]
            queryMap['map'].extend(sections)
            queryMap['query'].append(
                {
                    'keyword': string,
                    'sections': sections
                }
            )

    # dict.fromkeys() removes duplicates while preserving insertion order
    queryMap['map'] = list(dict.fromkeys(queryMap['map']))

    return queryMap

def mapHierarchy(queryMap: dict):
    """Group the keyword hits of a queryMap by section.

    Args:
        queryMap (dict): queryMap dictionary with the 'map' list of sections
            and the 'query' list of keyword/sections entries

    Returns:
        dict: One key per section that was hit. Each value holds
        'coincidence' (number of keyword entries that list the section) and
        'keyword' (those keywords, in query order)
    """
    hits_by_section = {}

    for entry in queryMap['query']:
        # Walk the sections in 'map' order so that new keys are inserted in
        # that order.
        for section in (sec for sec in queryMap['map'] if sec in entry['sections']):
            record = hits_by_section.get(section)

            if record is None:
                hits_by_section[section] = {
                    'coincidence': 1,
                    'keyword': [entry['keyword']]
                }
            else:
                record['keyword'].append(entry['keyword'])
                record['coincidence'] += 1

    return hits_by_section
        



In [57]:
def processCoincidences(coincidences: dict, files_dir: str = 'test_files'):
    """Load the JSON file of every matched section and collect its matching items.

    Args:
        coincidences (dict): Dictionary keyed by section; every value must
            hold a 'keyword' list (the shape returned by mapHierarchy())
        files_dir (str): Directory that contains one ``<section>.json`` file
            per section. Defaults to the directory used so far by this
            notebook.

    Returns:
        list: One element per section, in the iteration order of
        ``coincidences``; each element is the list returned by traverseFile()
        for that section's file and keywords
    """
    final_result = []

    for key, section_info in coincidences.items():
        # JSON text is UTF-8 by specification; do not rely on the platform
        # default encoding.
        with open(f'{files_dir}/{key}.json', encoding='utf-8') as file:
            json_file = json.load(file)

        # The file is fully parsed at this point, so it can be closed before
        # the traversal starts.
        final_result.append(traverseFile(json_file, section_info['keyword']))

    return final_result
    


def traverseFile(file, list_string):
    """Collect the items of a loaded section file whose description matches a keyword.

    Every item is checked independently; no state is carried from one item to
    the next.

    Args:
        file: Iterable of item dictionaries; each one must provide a
            'description' string
        list_string (list): Keywords to look for in each description

    Returns:
        list: The dictionaries returned by traverseDescription() for the items
        that matched at least one keyword, in file order
    """
    result = []

    for item in file:
        # The description is lowercased before matching, so the keywords are
        # expected in lowercase as well.
        temp_result = traverseDescription(list_string, item['description'].lower(), item)
        if temp_result['match']:
            result.append(temp_result)

    return result
    
def traverseDescription(list_string, description, item):
    """Report which keywords occur in a description.

    A keyword counts as found when it appears right after '(' or a space, or
    at the very start of the description followed by one of ``) ,.:``.

    Args:
        list_string (list): Keywords to look for
        description (str): Text to search; matching is case-sensitive
        item: Object stored under 'data' when at least one keyword is found

    Returns:
        dict: ``{'matchedInformation': {'keyword': [...], 'data': ...}, 'match': bool}``
        where 'keyword' lists the keywords found (input order), 'data' is
        ``item`` on a match and ``{}`` otherwise, and 'match' tells whether
        any keyword was found
    """
    matched_keywords = []

    for string in list_string:
        # Escape the keyword so regex metacharacters in it are matched
        # literally instead of changing (or breaking) the pattern.
        word = re.escape(string)

        # NOTE(review): the last alternative has no trailing delimiter, so a
        # keyword also matches as the prefix of a longer word (" live" inside
        # " livestock"), which makes the first alternative redundant. Confirm
        # whether prefix matching is intended.
        pattern = rf'[\( ]{word}[\) ,.:]|^{word}[\) ,.:]|[\( ]{word}'

        if re.search(pattern, description):
            matched_keywords.append(string)

    return {
        'matchedInformation': {
            'keyword': matched_keywords,
            'data': item if matched_keywords else {}
        },
        'match': bool(matched_keywords)
    }


def traverseItem(item, current_pattern):
    """Test the 'htsno' value of an item against a regular expression.

    Args:
        item (dict): Item dictionary providing an 'htsno' string
        current_pattern: Regular expression searched inside ``item['htsno']``

    Returns:
        dict | None: ``{'continue': True, 'data': item}`` when the pattern is
        found, None otherwise
    """
    if re.search(current_pattern, item['htsno']) is None:
        return None

    return {'continue': True, 'data': item}
    


In [48]:
# Sample run of the query pipeline: normalize the query text into tokens.
query_string = queryStringProcessing('Live horses, asses')

print(query_string)

# Look the tokens up in the string dictionary file ...
queryMap = mapStringQuery(query_string)

# ... and group the hits by section.
coincidence = mapHierarchy(queryMap)

# Last expression of the cell: displays the per-section keyword hits.
coincidence

['live', 'horses', 'asses']


{'0106': {'coincidence': 1, 'keyword': ['live']},
 '0307': {'coincidence': 1, 'keyword': ['live']},
 '0309': {'coincidence': 1, 'keyword': ['live']},
 '0105': {'coincidence': 1, 'keyword': ['live']},
 '0104': {'coincidence': 1, 'keyword': ['live']},
 '0602': {'coincidence': 1, 'keyword': ['live']},
 '0301': {'coincidence': 1, 'keyword': ['live']},
 '0306': {'coincidence': 1, 'keyword': ['live']},
 '0308': {'coincidence': 1, 'keyword': ['live']},
 '0103': {'coincidence': 1, 'keyword': ['live']},
 '0101': {'coincidence': 3, 'keyword': ['live', 'horses', 'asses']},
 '0102': {'coincidence': 1, 'keyword': ['live']},
 '0206': {'coincidence': 2, 'keyword': ['horses', 'asses']},
 '9804': {'coincidence': 1, 'keyword': ['horses']},
 '0205': {'coincidence': 2, 'keyword': ['horses', 'asses']}}

In [58]:
# Load the file of every matched section and keep the items whose description
# mentions one of that section's keywords.
final_result = processCoincidences(coincidence)

# Last expression of the cell: displays the collected matches.
final_result

[[{'matchedInformation': {'keyword': ['live'],
    'data': {'htsno': '0106',
     'indent': 0,
     'description': 'Other live animals:',
     'superior': None,
     'units': [],
     'general': '',
     'special': '',
     'other': '',
     'footnotes': [],
     'quotaQuantity': '',
     'additionalDuties': '',
     'addiitionalDuties': None}},
   'match': True}],
 [{'matchedInformation': {'keyword': ['live'],
    'data': {'htsno': '0307',
     'indent': 0,
     'description': 'Molluscs, whether in shell or not, live, fresh, chilled, frozen, dried, salted or in brine; smoked molluscs, whether in shell or not, whether or not cooked before or during the smoking process:',
     'superior': None,
     'units': [],
     'general': '',
     'special': '',
     'other': '',
     'footnotes': [],
     'quotaQuantity': None,
     'additionalDuties': None,
     'addiitionalDuties': None}},
   'match': True},
  {'matchedInformation': {'keyword': ['live'],
    'data': {'htsno': '0307.11.00',
    