In [25]:
import os, json, re    

In [26]:
# TODO: replace this with a database-backed function that returns the appropriate collection record.
def importJSONFiles(files: list, path: str) -> list[dict[str, any]]:
    """Load a set of JSON files from a directory into header/data records.

    Args:
        files (list): File names, relative to ``path``, to load.
        path (str): Directory that contains the files.

    Returns:
        list[dict[str, any]]: One dict per file, in the order given, with the keys
        ``'header'`` (the file name without its trailing ``.json``) and ``'data'``
        (the parsed JSON content).
    """

    hts_data = []

    for file in files:

        # Strip only the trailing extension; a plain substitution would also remove
        # a '.json' that appears in the middle of the name.
        header = file.removesuffix('.json')

        # Read as UTF-8 explicitly so the result does not depend on the platform's
        # default encoding.
        with open(os.path.join(path, file), encoding='utf-8') as f:
            hts_data.append(
                {
                    'header': header,
                    'data': json.load(f)
                })

    return hts_data

def getHTSRecord(hts_data: list[dict[str, any]], queryStr: str):
    """Return the data of the first entry whose header equals ``queryStr``.

    Args:
        hts_data (list[dict[str, any]]): Entries with ``'header'`` and ``'data'`` keys.
        queryStr (str): Header value to look for.

    Returns:
        The ``'data'`` value of the first matching entry, or the string
        ``'No record found'`` when no header matches.
    """

    matching_payloads = (
        entry['data'] for entry in hts_data if entry['header'] == queryStr
    )

    # next() with a default yields the first match or falls back to the sentinel.
    return next(matching_payloads, 'No record found')

In [27]:
def grabQueryRecords(hts_record: list[dict[str, any]], query: list[str]) -> list[dict[str, any]]:
    """Collect the HTS records whose ``htsno`` equals one of the query numbers.

    Records are gathered query number by query number (in ``query`` order) and,
    within each query number, in ``hts_record`` order. After collection, indent
    levels that are skipped between the matched records are tagged on the
    preceding match under the ``'missing'`` key.

    Args:
        hts_record (list[dict[str, any]]): Full HTS record grabbed from db
        query (list[str]): List of query numbers in the original query input (for each query there may be multiple of these)

    Returns:
        list[dict[str, any]]: One dict per match with the keys ``'htsno'``, ``'indent'``,
        ``'description'`` and ``'indexHTSRec'`` (position in ``hts_record``), plus
        ``'missing'`` (list of skipped indent levels) where applicable. Empty when
        nothing matches.
    """

    def checkResultQuery(result_query: list[dict[str, any]]):
        """Tag, in place, the indent levels that are absent from the matched records.

        Every level below the last record's indent that no matched record has is
        appended to the ``'missing'`` list of the record preceding the gap.

        Args:
            result_query (list[dict[str, any]]): Matches collected by grabQueryRecords()
        """

        # Nothing matched: there is no last record to take the target indent from.
        if not result_query:
            return

        present_indents = {record['indent'] for record in result_query if 'indent' in record}

        # NOTE(review): `index` advances once per present level, so this assumes the
        # matches are ordered by strictly increasing indent -- confirm with callers.
        index = 0

        for level in range(result_query[-1]['indent']):

            if level in present_indents:
                index += 1
            elif index > 0:
                result_query[index - 1].setdefault('missing', []).append(level)
            # else: the gap precedes the first match, so there is no earlier record
            # to tag; indexing with -1 here would wrongly tag the last record.

    result = []

    for query_number in query:

        for position, record in enumerate(hts_record):

            # Compare literally: query numbers contain '.', which a regex would treat
            # as a wildcard.
            if 'htsno' in record and record['htsno'] == query_number:

                result.append({
                    'htsno': record['htsno'],
                    'indent': record['indent'],
                    'description': record['description'],
                    'indexHTSRec': position
                })

    checkResultQuery(result)

    return result

In [28]:
def searchEHIndents(result_query: list[dict[str, any]], hts_record: list[dict[str, any]]) -> list[dict[str, any]]:
    """Insert the EH (empty HTS, i.e. without ``htsno``) records for the missing indents.

    For every entry of ``result_query`` that carries a ``'missing'`` list, the slice
    of ``hts_record`` between that entry and the next one is scanned for records
    without an ``htsno`` whose indent is one of the missing levels. One EH record
    per missing level is inserted right after the entry; when several candidates
    share a level, the last one in the slice provides the description.

    Args:
        result_query (list[dict[str, any]]): Result of the original query to the hts_record with all valid matches for the original query string
        hts_record (list[dict[str, any]]): Main record where all the query information is located

    Returns:
        list[dict[str, any]]: A list of dictionaries with the result_query entries and the EH records (if present).
        An EH record only has an ``'indent'`` key when no matching description was found.
    """

    result_final = []

    for key, result in enumerate(result_query):

        result_final.append(result)

        if 'missing' not in result:
            continue

        # The scan is bounded by the next matched record; without one there is no
        # slice to search (and indexing key + 1 would raise IndexError).
        if key + 1 >= len(result_query):
            continue

        eh_records = [{'indent': indent} for indent in result['missing']]

        for i in range(result['indexHTSRec'], result_query[key + 1]['indexHTSRec']):

            candidate = hts_record[i]

            # Records that have an htsno are regular entries, not EH headers.
            if 'htsno' in candidate:
                continue

            for rec in eh_records:

                if rec['indent'] == candidate['indent']:

                    # Later candidates overwrite earlier ones on purpose.
                    rec['description'] = candidate['description']

        result_final.extend(eh_records)

    return result_final



In [29]:
path = '../../db_hts/temp/NEW_final_json_files'
query = ["8802", "8802.20", "8802.20.01", "8802.20.01.20"]
# os.listdir() returns every directory entry in arbitrary order: keep only the JSON
# files (skipping e.g. hidden files or checkpoint folders) and sort them so the load
# order is reproducible across runs.
files = sorted(name for name in os.listdir(path) if name.endswith('.json'))
hts_data = importJSONFiles(files, path)

In [30]:
hts_record = getHTSRecord(hts_data, query[0])
# getHTSRecord() reports a miss with a sentinel string instead of raising; stop here
# so the following cells do not try to process that string as a record list.
if hts_record == 'No record found':
    raise LookupError(f'No HTS record found for {query[0]!r}')
hts_record

[{'htsno': '8802',
  'indent': 0,
  'description': 'Other aircraft (for example, helicopters, airplanes, except unmanned aircraft of heading 8806); spacecraft (including satellites) and suborbital and spacecraft launch vehicles:'},
 {'indent': 1, 'description': 'Helicopters:', 'superior': 'true'},
 {'htsno': '8802.11.01',
  'indent': 2,
  'description': 'Of an unladen weight not exceeding 2,000 kg',
  'general': 'Free',
  'other': '30%',
  'footnotes': [{'columns': ['general'],
    'marker': '1',
    'value': 'See 9903.88.01. ',
    'type': 'endnote'}]},
 {'indent': 3, 'description': 'New:', 'superior': 'true'},
 {'htsno': '8802.11.01.15',
  'indent': 4,
  'description': 'Military',
  'units': ['No.']},
 {'indent': 4, 'description': 'Other:', 'superior': 'true'},
 {'htsno': '8802.11.01.30',
  'indent': 5,
  'description': 'Of an unladen weight not exceeding 998 kg',
  'units': ['No.']},
 {'htsno': '8802.11.01.45',
  'indent': 5,
  'description': 'Of an unladen weight exceeding 998 but 

In [31]:
# Collect the records of hts_record that match the query numbers; grabQueryRecords()
# also tags skipped indent levels under the 'missing' key.
result = grabQueryRecords(hts_record, query)
result

[{'htsno': '8802',
  'indent': 0,
  'description': 'Other aircraft (for example, helicopters, airplanes, except unmanned aircraft of heading 8806); spacecraft (including satellites) and suborbital and spacecraft launch vehicles:',
  'indexHTSRec': 0},
 {'htsno': '8802.20.01',
  'indent': 1,
  'description': 'Airplanes and other aircraft, of an unladen weight not exceeding 2,000 kg',
  'indexHTSRec': 18,
  'missing': [2, 3, 4]},
 {'htsno': '8802.20.01.20',
  'indent': 5,
  'description': 'Airplanes',
  'indexHTSRec': 23}]

In [32]:
# Fill the gaps tagged as 'missing' with the matching records of hts_record that
# have no 'htsno' (the EH records).
EHrecords = searchEHIndents(result, hts_record)
EHrecords

[{'htsno': '8802',
  'indent': 0,
  'description': 'Other aircraft (for example, helicopters, airplanes, except unmanned aircraft of heading 8806); spacecraft (including satellites) and suborbital and spacecraft launch vehicles:',
  'indexHTSRec': 0},
 {'htsno': '8802.20.01',
  'indent': 1,
  'description': 'Airplanes and other aircraft, of an unladen weight not exceeding 2,000 kg',
  'indexHTSRec': 18,
  'missing': [2, 3, 4]},
 {'indent': 2, 'description': 'Other:'},
 {'indent': 3, 'description': 'New:'},
 {'indent': 4, 'description': 'Military aircraft:'},
 {'htsno': '8802.20.01.20',
  'indent': 5,
  'description': 'Airplanes',
  'indexHTSRec': 23}]