In [33]:
import re

# Sample number strings passed to createQueryGroups further down.
# Order is significant: results are produced in the same order as this list.
test_numbers = [
    # Dot-separated (or bare four-digit) entries of varying depth.
    "1234.56.78.90", "9876.54.32.10", "3456.78.90.12", "1111.22",
    "5555.66.77.88", "9999.88.77.66", "1010.20.30.40", "3030.40.50.60",
    "4040", "5050.60.70.80", "7777.88.99.00", "8888.99.00.11",
    "2222.33", "6666.77.88.99",
    # Four digits, a dot, then four more digits.
    "1234.5678", "9876.5432", "3456.7890",
    "1111.2233", "5555.6677", "9999.8877",
    # A single separator character per entry: '-', '_' or ','.
    "1234-56-78-90", "9876_54_32_10", "3456,78,90", "1111-22-33-44",
    "5555_66_77_88", "9999,88,77,66", "1010-20-30-40", "3030_40_50_60",
    "4040,50", "5050-60-70-80", "7777_88_99_00", "8888,99,00,11",
    "2222-33-44-55", "6666_77_88_99",
    # Several different separators mixed inside one entry.
    "1234.56-78,90", "9876_54.32-10", "3456-78.90_12",
    "1111.22-33,44", "5555-66.77_88", "9999.88-77_66",
]

# Character class of the punctuation stripped from the raw input before the
# digit patterns below are tried.
# The hyphen is escaped so it is an explicit literal. Left unescaped, ``,-.``
# is parsed as a character *range* from ',' to '.', which contains '-' only
# because of code-point order; inserting or reordering a neighbouring
# character would silently stop hyphens from being removed.
remove_punctuation = r'[!\"#$%&\'()*+,\-./:;<=>?@\[\]\^_`{|}~—]'

# (regex, label) pairs tried in order against the punctuation-free string.
# Every regex is anchored at both ends and has a fixed digit count
# (10, 8, 6 and 4 digits respectively), so at most one of them can match.
# Capture group 1 is always the leading four digits; each further group is
# one two-digit segment.
# NOTE: with str patterns ``\d`` also accepts non-ASCII decimal digits, and
# ``$`` also matches just before a single trailing newline.
gather_hts_number = [
    (r'^(\d{4})(\d{2})(\d{2})(\d{2})$', 'Complete_record'),
    (r'^(\d{4})(\d{2})(\d{2})$', 'Base_semifull'),
    (r'^(\d{4})(\d{2})$', 'Base_subrecord'),
    (r'^(\d{4})$', 'Base_chapter'),
]

def processString(test_string: str):
    """Strip punctuation from ``test_string`` and classify what remains.

    Every character matched by ``remove_punctuation`` is deleted, then the
    cleaned text is tried against each ``(regex, label)`` pair of
    ``gather_hts_number`` in list order.

    Args:
        test_string: Raw number string, possibly containing separators.

    Returns:
        ``{'type': label, 'groups': match}`` for the first regex that matches.
        Note that ``'groups'`` holds the ``re.Match`` object itself, not a
        list of strings. ``None`` is returned when no regex matches.
    """
    cleaned = re.sub(remove_punctuation, '', test_string)

    for regex, label in gather_hts_number:
        hit = re.match(regex, cleaned)
        if hit is not None:
            return {'type': label, 'groups': hit}

    # No pattern recognised the cleaned text.
    return None
        
def processGroups(processed_str: dict):
    """Turn one ``processString`` result into a query description.

    Args:
        processed_str: Dict with a ``'type'`` label and a ``'groups'`` entry
            holding the ``re.Match`` object produced by ``processString``.

    Returns:
        A dict with ``'type'`` and ``'main_group'`` (capture group 1 of the
        match). When ``gatherGroups`` yields at least one entry the dict also
        carries ``'sub_groups'``; otherwise that key is omitted.

    Raises:
        TypeError: If ``processed_str`` is ``None``, which is what
            ``processString`` returns when no pattern matched.
    """
    if processed_str is None:
        # Same exception type as the former implicit failure
        # ("'NoneType' object is not subscriptable"), but with a message that
        # explains the cause.
        raise TypeError(
            "processGroups expected the dict returned by processString, got None "
            "(the input did not match any known pattern)"
        )

    match = processed_str['groups']
    query_chap = match.group(1)
    sub_groups = gatherGroups(match)

    result = {
        'type': processed_str['type'],
        'main_group': query_chap,
    }
    # Only expose 'sub_groups' when there is something beyond the main group.
    if sub_groups:
        result['sub_groups'] = sub_groups
    return result


def gatherGroups(groups: re.Match):
    """List the cumulative dotted prefixes of a match's capture groups.

    The first capture group is the starting prefix and is not itself part of
    the result. Each following group is appended to the running prefix with a
    ``'.'`` and the extended prefix is recorded.

    Example: groups ``('1234', '56', '78')`` give
    ``['1234.56', '1234.56.78']``.

    Args:
        groups: A ``re.Match`` object (despite the parameter name).

    Returns:
        A list of strings, empty when the match has fewer than two capture
        groups.
    """
    segments = groups.groups()
    if not segments:
        return []

    prefix = segments[0]
    cumulative = []
    for segment in segments[1:]:
        prefix = prefix + '.' + segment
        cumulative.append(prefix)

    return cumulative



In [34]:
def createQueryGroups(test_list: list):
    """Run every entry of ``test_list`` through the parsing pipeline.

    Each entry is passed to ``processString`` and the result is handed to
    ``processGroups``.

    Args:
        test_list: Raw number strings to process.

    Returns:
        A list with one ``processGroups`` result per entry, in input order.

    Raises:
        ValueError: If an entry matches none of the known patterns
            (``processString`` returned ``None``). Previously this surfaced as
            an opaque ``TypeError`` from inside ``processGroups`` that did not
            identify the offending entry.
    """
    list_of_results = []

    for element in test_list:
        string_processed = processString(element)
        if string_processed is None:
            # Fail fast and name the bad input instead of crashing downstream.
            raise ValueError(f"unrecognised number format: {element!r}")
        list_of_results.append(processGroups(string_processed))

    return list_of_results

In [35]:
# Process every sample string; the returned list is this cell's displayed output.
createQueryGroups(test_numbers)

[{'type': 'Complete_record',
  'main_group': '1234',
  'sub_groups': ['1234.56', '1234.56.78', '1234.56.78.90']},
 {'type': 'Complete_record',
  'main_group': '9876',
  'sub_groups': ['9876.54', '9876.54.32', '9876.54.32.10']},
 {'type': 'Complete_record',
  'main_group': '3456',
  'sub_groups': ['3456.78', '3456.78.90', '3456.78.90.12']},
 {'type': 'Base_subrecord', 'main_group': '1111', 'sub_groups': ['1111.22']},
 {'type': 'Complete_record',
  'main_group': '5555',
  'sub_groups': ['5555.66', '5555.66.77', '5555.66.77.88']},
 {'type': 'Complete_record',
  'main_group': '9999',
  'sub_groups': ['9999.88', '9999.88.77', '9999.88.77.66']},
 {'type': 'Complete_record',
  'main_group': '1010',
  'sub_groups': ['1010.20', '1010.20.30', '1010.20.30.40']},
 {'type': 'Complete_record',
  'main_group': '3030',
  'sub_groups': ['3030.40', '3030.40.50', '3030.40.50.60']},
 {'type': 'Base_chapter', 'main_group': '4040'},
 {'type': 'Complete_record',
  'main_group': '5050',
  'sub_groups': ['5050