This notebook contains all the code for creating the HTS and string_dict files in the temp folder, so that they can later be added to MongoDB.

In [10]:
# English stop words. checkKeyWords() returns False for any token equal to one
# of these, and writeFiles() skips such tokens when building the keyword index.
# NOTE(review): writeFiles() strips apostrophes (via punctuation_pattern) before
# calling checkKeyWords(), so the contraction entries such as "aren't" look
# unreachable from that code path -- confirm whether that is intended.
key_words = ["a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"]

# Regex character class of the punctuation that writeFiles() deletes from each
# description before splitting it into tokens.
# Inside the class, `,-.` is a character range from ',' to '.'; that range
# consists of exactly ',', '-' and '.', so all three are matched.
punctuation_pattern = r'[!\"#$%&\'()*+,-./:;<=>?@\[\]\^_`{|}~—]'

In [11]:
import pandas as pd
import re
import json
import os

def addRowsToDataframe(df: pd.DataFrame, row: list):
    """Append ``row`` to ``df`` in place as one additional row.

    Args:
        df (pandas.DataFrame): DataFrame that receives the new row.
        row (list): Values to store in the new row.
    """

    # The current row count is used as the label of the row being added.
    next_label = len(df)
    df.loc[next_label] = row

def checkKeyWords(string: str):
    """Report whether ``string`` is absent from the ``key_words`` stop-word list.

    The comparison is exact, whole-string equality against each entry of the
    module-level ``key_words`` list.

    Args:
        string (str): Token to check.

    Returns:
        bool: True if the token is not one of the stop words, False if it
        equals one of them.
    """

    # A membership test states the intended "whole string equals a keyword"
    # check directly. It avoids building one regex per keyword from unescaped
    # text, and avoids `$` also matching just before a trailing newline.
    return string not in key_words

def openJSON(path: str):
    """Read a JSON file and return its parsed content.

    Args:
        path (str): Path to the JSON file.

    Returns:
        The object decoded from the file. For the HTS files handled in this
        notebook that is a list of record dictionaries.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)

def processRecord(record: list):
    """Drop sections with an empty ``htsno`` and push their text onto their children.

    A section whose ``htsno`` is ``''`` is treated as a header: it is removed
    from the output and remembered. Each following section whose ``indent`` is
    exactly one more than the header's gets ``' | <header description>'``
    appended to its own description. The first following section with any
    other indent ends the header's effect (until another header is seen).

    Args:
        record (list): List of section dictionaries, each providing the keys
            ``'htsno'``, ``'description'`` and ``'indent'``.

    Returns:
        list: The sections with a non-empty ``htsno``, in their original
        order. These are the same dictionary objects that were passed in, so
        the description changes are also visible through ``record``.
    """

    result = []
    # Most recent header section that is still in effect, or None.
    header = None

    for section in record:

        if section['htsno'] == '':
            header = section
            continue

        if header is not None:
            if header['indent'] == section['indent'] - 1:
                # Double-quoted f-string: reusing the same quote type inside
                # the replacement field is only valid from Python 3.12 on.
                section['description'] += f" | {header['description']}"
            else:
                # Left the header's direct children; stop applying it.
                header = None

        result.append(section)

    return result

In [12]:
def createHTSDict(path: str):
    """Group the rows of an HTS JSON file by the first four digits of ``htsno``.

    The file is loaded with ``pandas.read_json``. Rows are visited in file
    order; a row whose ``htsno`` starts with four digits is filed under that
    4-digit heading, and a row whose ``htsno`` does not (for example an empty
    ``htsno``) is filed under the heading of the closest preceding row that
    had one.

    Args:
        path (str): Path of the original HTS JSON file downloaded from CBP.

    Returns:
        dict: Maps each 4-digit heading string to a ``pandas.DataFrame`` with
        the same columns as the source file, holding that heading's rows.

    Raises:
        IndexError: If a row without a 4-digit ``htsno`` prefix appears before
            any row that has one, so there is no heading to file it under.
    """

    df = pd.read_json(path)
    columns_df = df.columns.tolist()
    final_dict = {}
    heading_pattern = re.compile(r'^\d{4}')
    current_heading = None

    for _, row in df.iterrows():

        found = heading_pattern.match(row['htsno'])

        if found:
            current_heading = found.group()
        elif current_heading is None:
            raise IndexError(
                'first HTS row has no 4-digit htsno prefix to group it under'
            )

        # Create the frame only the first time a heading is seen, so a heading
        # that shows up again later adds to its rows instead of replacing them.
        if current_heading not in final_dict:
            final_dict[current_heading] = pd.DataFrame(columns=columns_df)

        addRowsToDataframe(final_dict[current_heading], row)

    return final_dict

def writeFiles(HTS_dict: dict, path_hts: str, path_strings: str):
    """Write one JSON file per HTS heading plus a keyword index JSON file.

    For every ``key``/DataFrame pair, the DataFrame is written to
    ``<path_hts>/<key>.json`` (``orient='records'``). Each description is then
    stripped of the characters in ``punctuation_pattern``, lower-cased and
    split on whitespace; every token accepted by ``checkKeyWords`` is indexed
    under the headings whose descriptions contain it. The index is written to
    ``<path_strings>/string_dict.json``.

    Args:
        HTS_dict (dict): Maps heading code to a DataFrame with a
            ``'description'`` column.
        path_hts (str): Folder that receives the per-heading JSON files.
        path_strings (str): Folder that receives ``string_dict.json``.
    """

    string_dict = {}

    for key, df in HTS_dict.items():

        # os.path.join copes with folders given with or without a trailing
        # separator, for both output locations.
        df.to_json(os.path.join(path_hts, f'{key}.json'), orient='records')

        for description in df['description']:
            # NOTE(review): apostrophes are removed here, so contraction stop
            # words (e.g. "aren't") are compared as "arent" -- confirm intent.
            desc = re.sub(pattern=punctuation_pattern, repl='', string=description).lower()

            for string in desc.split():
                if not checkKeyWords(string):
                    continue

                headings = string_dict.setdefault(string, [])
                # Each key is processed in a single contiguous pass, so a
                # duplicate can only be the most recently appended entry.
                # This keeps the lists unique and in a stable order.
                if not headings or headings[-1] != key:
                    headings.append(key)

    with open(os.path.join(path_strings, 'string_dict.json'), 'w') as json_file:
        json.dump(string_dict, json_file, indent=4)

def removeEmptyHTS(folder: str):
    """Rewrite the HTS JSON files in ``folder`` without their empty-``htsno`` sections.

    Every ``.json`` file directly inside ``folder`` is loaded with
    ``openJSON``, passed through ``processRecord`` and written back to the
    same path (indented by 4 spaces), replacing the previous content.

    Args:
        folder (str): Folder where the already processed JSON files are
            located; a trailing path separator is optional.
    """

    for file in os.listdir(folder):

        file_path = os.path.join(folder, file)

        # Skip sub-directories and stray non-JSON entries instead of failing
        # while trying to parse them.
        if not (file.lower().endswith('.json') and os.path.isfile(file_path)):
            continue

        processed_record = processRecord(openJSON(file_path))

        with open(file_path, 'w') as final_file:
            json.dump(processed_record, final_file, indent=4)

In [13]:
# Group the source HTS JSON by 4-digit heading, then write one JSON file per
# heading into NEW_test_files/ and the keyword index (string_dict.json) into
# NEW_test_string_dict/.
writeFiles(createHTSDict('../db_hts/htsdata/htsdata.json'), '../db_hts/temp/NEW_test_files/', '../db_hts/temp/NEW_test_string_dict/')


In [14]:
# Overwrite the per-heading files written to NEW_test_files/ by the previous
# cell, dropping the sections whose htsno is empty.
removeEmptyHTS('../db_hts/temp/NEW_test_files/')