**Data Parcing flow**

Importing necessary libraries and modules

In [1]:
import csv
import re
from pprint import pprint

Open file: "source_data/Input_File_01.txt"

In [2]:
try:
    with open("source_data/Input_File_01.txt", 'rb') as f:
        content = f.read()
except FileNotFoundError:
    print(f"Error: Input file not found at 'source_data/Input_File_01.txt'")

Parsing and Record Extraction. The content is split by the null delimiter. We decode using 'latin-1' because it can represent every possible byte, preventing errors with mixed text/binary data. Empty fields resulting from the split are filtered out.

In [3]:
fields = content.split(b'\\x00')   # split by null byte
all_fields = []                   # make an empty list

for field in fields:
    if field:  # skip empty parts
        text = field.decode('latin-1')  # convert bytes → string
        text = text.strip()             # remove spaces/newlines
        all_fields.append(text)         # add to the list

pprint(all_fields)

['1',
 'X530108146000',
 'INTERNAL RETAINING RING',
 "ANNEAU D\\'ARRET POUR ALESAGE",
 'INNENHALTERING',
 'ANELLO DI SICUREZZA INTERNO',
 'ANILLO DE RETENCION INTERNO',
 'PIER\\xc5\\x9aCIE\\xc5\\x83 USTALAJ\\xc4\\x84CY WEWN\\xc4\\x98TRZNY',
 'X530.108.146.000',
 '72311106',
 'DIN472 - 90X3',
 '2',
 '139100150130',
 'SHIM RING',
 'ANNEAU ENTRETOISE',
 'UNTERLEGRING',
 'DISCO SPESSORE',
 'ANILLO DE CALCE',
 'PIER\\xc5\\x9aCIE\\xc5\\x83 REGULACYJNY',
 '139.100.150.130',
 '72316227',
 '0,10 MM',
 '139100150131',
 '139.100.150.131',
 '72316228',
 '0,15 MM',
 '139100150132',
 '139.100.150.132',
 '72316229',
 '0,25 MM',
 '139100150133',
 '139.100.150.133',
 '72316230',
 '0,30 MM',
 '139100150134',
 '139.100.150.134',
 '72316231',
 '0,80 MM',
 '3',
 'X548967266000',
 'O-RING',
 'JOINT TORIQUE',
 'GIUNTO DI TENUTA',
 'JUNTA TOROIDAL',
 'X548.967.266.000',
 '72313499',
 '82,14X3,53-NBR 70 FWN56402',
 '4',
 '178100010020',
 'SEALING COVER',
 'CACHE ETANCHE',
 'DICHTDECKEL',
 'COPERTURA TENUTA',
 

Records are identified by a numeric "Serial Number". We find the start index of each record. The check is now more specific: a short, purely numeric field. This prevents long numeric fields (like Reference Numbers) from being incorrectly marked as a new record start.

In [4]:
start_indices = []

# Use enumerate() to get both the index (i) and the value (field) for each item.
for i, field in enumerate(all_fields):
    
    # This is the same condition as in the list comprehension.
    # It checks if the field is a number AND is short.
    if field.isdigit() and len(field) < 4:
        
        # If the conditions are met, add the index 'i' to our list.
        start_indices.append(i)

pprint(start_indices)

[0,
 11,
 38,
 47,
 57,
 68,
 79,
 89,
 100,
 110,
 119,
 131,
 135,
 145,
 155,
 166,
 179,
 190,
 195,
 199,
 206,
 216,
 217,
 221,
 222,
 242,
 247,
 257,
 262,
 272,
 293,
 294,
 304,
 315,
 320,
 331,
 341,
 346,
 357,
 367,
 377,
 388,
 398,
 403,
 413,
 423,
 428,
 438,
 448,
 459,
 464,
 469,
 474,
 479,
 489,
 498]


Processes a single record's data (a "chunk") and maps its fields to the defined CSV columns using pattern matching and positional logic.

Args:
    chunk (list): A list of strings representing one record, starting with a serial number.

Returns:
    dict: A dictionary with keys matching the CSV headers and values from the chunk.

In [5]:
def process_chunk(chunk):
    
    # --- Step 3: Field Mapping and Logic ---
    headers = [
        "Serial Number", "Part Number", "Part Name English", 
        "Part Name Language 1", "Part Name Language 2", "Part Name Language 3", 
        "Part Name Language 4", "Part Name Language 5", 
        "Part Number in Other Format", "Reference Number", 
        "Additional Information", "Extra Data"
    ]
    record = {h: '' for h in headers}
    
    # The first item in a chunk is always the Serial Number.
    record['Serial Number'] = chunk[0]
    data_fields = chunk[1:]

    # Handle empty records (like #22, #24)
    if not data_fields:
        return record

    # Regex to identify part numbers with dots (e.g., "X530.108.146.000")
    # and reference numbers (e.g., "72311106").
    part_num_other_format_regex = re.compile(r'^[A-Z0-9\.]+$')
    ref_num_regex = re.compile(r'^[0-9]{8,}$') # Reference numbers are typically long

    # The first field is usually the Part Number, but not always (e.g., record #20).
    # We will tentatively assign it and correct it later if needed.
    current_pos = 0
    if current_pos < len(data_fields):
        record['Part Number'] = data_fields[current_pos]
        current_pos += 1

    # Consume all name/description fields. The sequence of names ends when we encounter
    # a field that strongly matches the pattern of a formatted part number or a reference number.
    names_end_pos = current_pos
    for i in range(current_pos, len(data_fields)):
        field = data_fields[i]
        is_formatted_part_num = bool(part_num_other_format_regex.match(field) and '.' in field)
        is_ref_num = bool(ref_num_regex.match(field))
        
        # This condition marks the end of the variable-length name fields.
        if is_formatted_part_num or is_ref_num:
            break
        names_end_pos += 1
    
    names = data_fields[current_pos:names_end_pos]
    
    # Handle special case for record #20 which has no part number.
    if record['Serial Number'] == '20':
        names.insert(0, record['Part Number']) # The tentative part number was actually the first name.
        record['Part Number'] = ''
        
    # Assign the collected names to the appropriate columns.
    name_headers = headers[2:8] # "Part Name English" through "Part Name Language 5"
    for i, name in enumerate(names):
        if i < len(name_headers):
            record[name_headers[i]] = name
            
    current_pos = names_end_pos

    # Sequentially assign the next fields based on their patterns.
    # Part Number in Other Format
    if current_pos < len(data_fields) and part_num_other_format_regex.match(data_fields[current_pos]) and '.' in data_fields[current_pos]:
        record['Part Number in Other Format'] = data_fields[current_pos]
        current_pos += 1
    
    # Reference Number
    if current_pos < len(data_fields) and ref_num_regex.match(data_fields[current_pos]):
        record['Reference Number'] = data_fields[current_pos]
        current_pos += 1
        
    # Additional Information is the single next field, if it exists.
    if current_pos < len(data_fields):
        record['Additional Information'] = data_fields[current_pos]
        current_pos += 1

    # --- Step 4: Handling Extra Data ---
    
    # Special case: Delete "white noise" from position 61. 
    # These data have no sense and most probably are distractor
    if record['Serial Number'] == '61':
        data_fields[current_pos:] = []
    
    # Any remaining fields are considered "Extra Data" and are joined with '___'.
    extra_data = data_fields[current_pos:]
    if extra_data:
        record['Extra Data'] = '___'.join(extra_data)
    else:
        record['Extra Data'] = '-' # Use a placeholder if no extra data exists.
        
    return record

Deviding 'all_fields' into chunks based on the identified record start indices. Each chunk represents one complete record. Applying 'process_chunk' to each chunk to transform it into a structured dictionary. 

In [6]:
records_data = []
for i in range(len(start_indices)):
    start = start_indices[i]
    # The end of a chunk is the start of the next one, or the end of the file.
    end = start_indices[i + 1] if i + 1 < len(start_indices) else len(all_fields)
    chunk = all_fields[start:end]
    if chunk:
        records_data.append(process_chunk(chunk))

pprint(records_data)

[{'Additional Information': 'DIN472 - 90X3',
  'Extra Data': '-',
  'Part Name English': 'INTERNAL RETAINING RING',
  'Part Name Language 1': "ANNEAU D\\'ARRET POUR ALESAGE",
  'Part Name Language 2': 'INNENHALTERING',
  'Part Name Language 3': 'ANELLO DI SICUREZZA INTERNO',
  'Part Name Language 4': 'ANILLO DE RETENCION INTERNO',
  'Part Name Language 5': 'PIER\\xc5\\x9aCIE\\xc5\\x83 USTALAJ\\xc4\\x84CY '
                          'WEWN\\xc4\\x98TRZNY',
  'Part Number': 'X530108146000',
  'Part Number in Other Format': 'X530.108.146.000',
  'Reference Number': '72311106',
  'Serial Number': '1'},
 {'Additional Information': '0,10 MM',
  'Extra Data': '139100150131___139.100.150.131___72316228___0,15 '
                'MM___139100150132___139.100.150.132___72316229___0,25 '
                'MM___139100150133___139.100.150.133___72316230___0,30 '
                'MM___139100150134___139.100.150.134___72316231___0,80 MM',
  'Part Name English': 'SHIM RING',
  'Part Name Language 1': 'ANN

Saving data in CSV file: "output_data/Output_File_01.csv"

In [7]:
if not records_data:
    print("No records to write.")
    
with open('output_data/output.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=records_data[0].keys())
    writer.writeheader()
    writer.writerows(records_data)
    print(f"Successfully parsed data and created output_data/output.csv")

Successfully parsed data and created output_data/output.csv
