In [65]:
import json
import os
import re
from typing import Any

In [66]:
def importJSONFiles(files: list, path: str) -> list[dict[str, Any]]:
    """Load each named JSON file from ``path`` into a header/data record.

    Args:
        files: File names (not full paths) to read from ``path``.
        path: Directory that contains the files.

    Returns:
        One dict per file, in input order, with the keys ``'header'`` (the
        file name minus a trailing ``.json``) and ``'data'`` (the parsed
        JSON content).

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    hts_data = []

    for file in files:
        # Strip only a trailing extension; an unanchored r'\.json' pattern
        # would also remove '.json' from the middle of a file name.
        header = file.removesuffix('.json')

        # JSON is UTF-8 by specification, so do not depend on the locale's
        # default encoding when reading it.
        with open(os.path.join(path, file), encoding='utf-8') as f:
            hts_data.append({'header': header, 'data': json.load(f)})

    return hts_data

def getHTSRecord(hts_data: list[dict[str, any]], queryStr: str):
    """Return the ``'data'`` of the first entry whose ``'header'`` is ``queryStr``.

    Args:
        hts_data: Entries with ``'header'`` and ``'data'`` keys.
        queryStr: Header value to look for (compared with ``==``).

    Returns:
        The matching entry's ``'data'``, or the string ``'No record found'``
        when no entry matches. Callers have to check for that string
        themselves; no exception is raised on a miss.
    """
    # Lazy scan: stops at the first hit, falls back to the sentinel string.
    matching_payloads = (
        entry['data'] for entry in hts_data if entry['header'] == queryStr
    )
    return next(matching_payloads, 'No record found')

In [67]:
def searchEHIndents(result_query: list[dict[str, Any]], hts_record: list[dict[str, Any]]):
    """Insert placeholder rows for the indent levels flagged as missing.

    Every entry of ``result_query`` is copied to the output in order. When an
    entry carries a ``'missing'`` list, one ``{'indent': level}`` placeholder
    per listed level is inserted right after it. A placeholder takes its
    ``'description'`` from the ``hts_record`` rows at positions
    ``entry['indexHTSRec']`` up to (not including) the next entry's
    ``'indexHTSRec'``: the last row in that span that has no ``'htsno'`` key
    and the same ``'indent'`` wins. A placeholder with no such row keeps only
    its ``'indent'`` key.

    Args:
        result_query: Query hits with ``'indexHTSRec'`` and, optionally,
            ``'missing'`` (a list of indent levels).
        hts_record: The rows the ``'indexHTSRec'`` values point into.

    Returns:
        A new list holding the original entry dicts (the same objects, not
        copies) interleaved with the placeholder dicts.
    """
    result_final = []

    for key, result in enumerate(result_query):
        result_final.append(result)

        if 'missing' not in result:
            continue

        eh_records = [{'indent': indent} for indent in result['missing']]

        start = result['indexHTSRec']
        if key + 1 < len(result_query):
            stop = result_query[key + 1]['indexHTSRec']
        else:
            # The final entry has no successor to bound the scan. Indexing
            # result_query[key + 1] here used to raise IndexError; with no
            # span to search, the placeholders simply stay without a
            # description.
            stop = start

        for i in range(start, stop):
            row = hts_record[i]

            # Only rows without an 'htsno' may lend their description.
            if 'htsno' in row:
                continue

            for rec in eh_records:
                if rec['indent'] == row['indent']:
                    # Later rows overwrite earlier ones on purpose: the row
                    # closest to the next hit ends up as the description.
                    rec['description'] = row['description']

        result_final.extend(eh_records)

    return result_final



In [68]:
def grabQueryRecords(hts_record: list[dict[str, Any]], query: list[str]) -> list[dict[str, Any]]:
    """Collect the rows of ``hts_record`` whose ``'htsno'`` equals a query entry.

    Hits are grouped by query entry (in ``query`` order) and, within one
    entry, kept in ``hts_record`` order. Each hit is a new dict with the row's
    ``'htsno'``, ``'indent'`` and ``'description'`` plus ``'indexHTSRec'``, the
    row's position in ``hts_record``.

    After matching, every indent level from 0 up to (not including) the last
    hit's indent that no hit has is appended to the ``'missing'`` list of an
    earlier hit (see ``checkResultQuery`` below).

    Args:
        hts_record: Rows to search; rows without an ``'htsno'`` key are ignored.
        query: Literal ``'htsno'`` values to look up.

    Returns:
        The list of hits; empty when nothing matches.
    """

    def checkResultQuery(result_query: list[dict[str, Any]]) -> None:
        """Record, in place, the indent levels that no hit covers."""
        if not result_query:
            # Nothing matched, so there is no last indent to count up to
            # (result_query[-1] would raise IndexError).
            return

        present = {record['indent'] for record in result_query}
        index = 0  # how many covered levels have been passed so far

        for level in range(result_query[-1]['indent']):
            if level in present:
                index += 1
            elif index > 0:
                # Attach the gap to the hit just before it. This positional
                # lookup assumes the hits' indents increase along the list.
                result_query[index - 1].setdefault('missing', []).append(level)
            # else: the gap lies before the first hit, so there is no
            # earlier hit to carry it. Indexing with index - 1 == -1 would
            # wrongly attach it to the last hit.

    result = []

    for wanted in query:
        for key, record in enumerate(hts_record):
            # Compare literally: interpolating the query into a regex made
            # the dots in e.g. '8802.20' match any character.
            if 'htsno' in record and record['htsno'] == wanted:
                result.append({
                    'htsno': record['htsno'],
                    'indent': record['indent'],
                    'description': record['description'],
                    'indexHTSRec': key
                })

    checkResultQuery(result)

    return result

In [69]:
path = '../../db_hts/temp/NEW_final_json_files'
query = ["8802", "8802.20", "8802.20.01", "8802.20.01.20"]
# os.listdir returns every directory entry in arbitrary order. Keep only the
# JSON files, so a stray non-JSON entry cannot break the import, and sort
# them so hts_data comes out in the same order on every run.
files = sorted(name for name in os.listdir(path) if name.endswith('.json'))
hts_data = importJSONFiles(files, path)

In [70]:
hts_record = getHTSRecord(hts_data, query[0])
# getHTSRecord reports a miss by returning the string 'No record found'
# instead of raising. Stop here so that string is never passed on to the
# following cells as if it were a list of rows.
if hts_record == 'No record found':
    raise LookupError(f'No loaded file has header {query[0]!r}')

In [71]:
# Look up every query entry in the selected rows. Indent levels that no hit
# covers are recorded on an earlier hit under the 'missing' key.
result = grabQueryRecords(hts_record, query)
result

[{'htsno': '8802',
  'indent': 0,
  'description': 'Other aircraft (for example, helicopters, airplanes, except unmanned aircraft of heading 8806); spacecraft (including satellites) and suborbital and spacecraft launch vehicles:',
  'indexHTSRec': 0},
 {'htsno': '8802.20.01',
  'indent': 1,
  'description': 'Airplanes and other aircraft, of an unladen weight not exceeding 2,000 kg',
  'indexHTSRec': 18,
  'missing': [2, 3, 4]},
 {'htsno': '8802.20.01.20',
  'indent': 5,
  'description': 'Airplanes',
  'indexHTSRec': 23}]

In [72]:
# Insert one placeholder row per 'missing' level after the hit that carries
# it, taking descriptions from hts_record rows that have no 'htsno'.
EHrecords = searchEHIndents(result, hts_record)
EHrecords

[{'htsno': '8802',
  'indent': 0,
  'description': 'Other aircraft (for example, helicopters, airplanes, except unmanned aircraft of heading 8806); spacecraft (including satellites) and suborbital and spacecraft launch vehicles:',
  'indexHTSRec': 0},
 {'htsno': '8802.20.01',
  'indent': 1,
  'description': 'Airplanes and other aircraft, of an unladen weight not exceeding 2,000 kg',
  'indexHTSRec': 18,
  'missing': [2, 3, 4]},
 {'indent': 2, 'description': 'Other:'},
 {'indent': 3, 'description': 'New:'},
 {'indent': 4, 'description': 'Military aircraft:'},
 {'htsno': '8802.20.01.20',
  'indent': 5,
  'description': 'Airplanes',
  'indexHTSRec': 23}]