DOwload model


In [326]:
import spacy
import csv
from spacy.matcher import Matcher

In [327]:
try:
    with open("source/Input_File_01.txt", 'rb') as f:
        binary_content = f.read()
except FileNotFoundError:
    print(f"Error: Input file not found at 'source_data/Input_File_01.txt'")

 Read the file in binary mode and split it by the null-byte delimiter (\x00) to get a list of the data fields.

In [328]:
fields = binary_content.split(b'\\x00')   # split by null byte
all_fields = []                   # make an empty list

for field in fields:
    if field:  # skip empty parts
        text = field.decode('latin-1')  # convert bytes → string
        text = text.strip()             # remove spaces/newlines
        all_fields.append(text)         # add to the list

print(all_fields)

['1', 'X530108146000', 'INTERNAL RETAINING RING', "ANNEAU D\\'ARRET POUR ALESAGE", 'INNENHALTERING', 'ANELLO DI SICUREZZA INTERNO', 'ANILLO DE RETENCION INTERNO', 'PIER\\xc5\\x9aCIE\\xc5\\x83 USTALAJ\\xc4\\x84CY WEWN\\xc4\\x98TRZNY', 'X530.108.146.000', '72311106', 'DIN472 - 90X3', '2', '139100150130', 'SHIM RING', 'ANNEAU ENTRETOISE', 'UNTERLEGRING', 'DISCO SPESSORE', 'ANILLO DE CALCE', 'PIER\\xc5\\x9aCIE\\xc5\\x83 REGULACYJNY', '139.100.150.130', '72316227', '0,10 MM', '139100150131', '139.100.150.131', '72316228', '0,15 MM', '139100150132', '139.100.150.132', '72316229', '0,25 MM', '139100150133', '139.100.150.133', '72316230', '0,30 MM', '139100150134', '139.100.150.134', '72316231', '0,80 MM', '3', 'X548967266000', 'O-RING', 'JOINT TORIQUE', 'GIUNTO DI TENUTA', 'JUNTA TOROIDAL', 'X548.967.266.000', '72313499', '82,14X3,53-NBR 70 FWN56402', '4', '178100010020', 'SEALING COVER', 'CACHE ETANCHE', 'DICHTDECKEL', 'COPERTURA TENUTA', 'CUBIERTA SELLANTE', 'POKRYWA USZCZELNIAJ\\xc4\\x84CA

In [329]:

nlp = spacy.load("en_core_web_sm")  # Load English model


In [330]:
def process_chunk(chunk):
    
    headers = [
        "Serial Number", "Part Number", "Part Name English", 
        "Part Name Language 1", "Part Name Language 2", "Part Name Language 3", 
        "Part Name Language 4", "Part Name Language 5", 
        "Part Number in Other Format", "Reference Number", 
        "Additional Information", "Extra Data"
    ]
    record = {h: '' for h in headers}
    
    # The first item in a chunk is always the Serial Number.
    record['Serial Number'] = chunk[0]
    data_fields = chunk[1:]

    # Handle empty records (like #22, #24)
    if not data_fields:
        return record
    # Join the fields into a single text for processing
    # (or process chunks of fields that constitute a record)
    chunk_text = "\n".join(chunk)

    # Process the text with spaCy
    doc = nlp(chunk_text)

    # Initialize the Matcher with the shared vocab
    matcher = Matcher(nlp.vocab)

    # Refined part number pattern
    part_number_pattern = [{"TEXT": {"REGEX": r"^(?=.*\d)[A-Z0-9]{12,13}$"}}]
    matcher.add("Part Number", [part_number_pattern])

    # Defined part number in other formats
    other_part_number_pattern = [{"TEXT": {"REGEX": r"^(?=.*\d)(?=.*\.)([A-Z0-9.]{15,16})$"}}]
    matcher.add("Part Number in Other Format", [other_part_number_pattern])

    # Reference number
    reference_number_pattern = [{"TEXT": {"REGEX": r"^[0-9]{8}$"}}]
    matcher.add("Reference Number", [reference_number_pattern])

    # Find all matches in the doc
    matches = matcher(doc)

    # You can then iterate through the matches to extract the data
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get the pattern name
        span = doc[start:end]  # The matched span
        record[string_id] = span.text
    
    
    if record['Serial Number'] == '20':
        current_pos = 0  # Since record #20 does not have a Part Number
    else:
        current_pos = 1  # Start after Serial Number
   
    # Determine the end position of the variable-length name fields
    
    names_end_pos = current_pos
    
    for i in range(current_pos, len(data_fields)):
        field = data_fields[i]
        is_formatted_part_num = bool(record['Part Number in Other Format'] == field)
        is_ref_num = bool(record['Reference Number'] == field)
        
        # This condition marks the end of the variable-length name fields.
        if is_formatted_part_num or is_ref_num:
            break
        names_end_pos += 1
    
    names = data_fields[current_pos:names_end_pos]
     # Assign the collected names to the appropriate columns.
    name_headers = headers[2:8] # "Part Name English" through "Part Name Language 5"
    mapping = {
            0: "Part Name Language 2",
            1: "Part Name English",
            2: "Part Name Language 4",
            3: "Part Name Language 1",
            4: "Part Name Language 3",
            5: "Part Name Language 5",
        }
    
    for i, name in enumerate(names):
        if record['Serial Number'] == '20':
            if i in mapping: # Use custom mapping for record #20
                record[mapping[i]] = name
        else:
            if i < len(name_headers):
                record[name_headers[i]] = name
    
    
    if record.get('Reference Number'):
        current_pos = int(chunk.index(record['Reference Number']))
    elif record.get('Part Number in Other Format'):
        current_pos = int(chunk.index(record['Part Number in Other Format']))
    else:
        current_pos = 0  # fallback if neither exists

    
    if current_pos + 1 < len(data_fields):
        record['Additional Information'] = data_fields[current_pos]
        current_pos += 1
        
    print(record)
    

In [331]:
start_indices = []
# Use enumerate() to get both the index (i) and the value (field) for each item.
for i, field in enumerate(all_fields):
    
    # This is the same condition as in the list comprehension.
    # It checks if the field is a number AND is short.
    if field.isdigit() and len(field) < 4:
        
        # If the conditions are met, add the index 'i' to our list.
        start_indices.append(i)
    
records_data = []
for i in range(len(start_indices)):
    start = start_indices[i]
    # The end of a chunk is the start of the next one, or the end of the file.
    end = start_indices[i + 1] if i + 1 < len(start_indices) else len(all_fields)
    chunk = all_fields[start:end]
    if chunk:
        processed_chunk = process_chunk(chunk)
        records_data.append(processed_chunk)

{'Serial Number': '1', 'Part Number': 'X530108146000', 'Part Name English': 'INTERNAL RETAINING RING', 'Part Name Language 1': "ANNEAU D\\'ARRET POUR ALESAGE", 'Part Name Language 2': 'INNENHALTERING', 'Part Name Language 3': 'ANELLO DI SICUREZZA INTERNO', 'Part Name Language 4': 'ANILLO DE RETENCION INTERNO', 'Part Name Language 5': 'PIER\\xc5\\x9aCIE\\xc5\\x83 USTALAJ\\xc4\\x84CY WEWN\\xc4\\x98TRZNY', 'Part Number in Other Format': 'X530.108.146.000', 'Reference Number': '72311106', 'Additional Information': '', 'Extra Data': ''}
{'Serial Number': '2', 'Part Number': '139100150134', 'Part Name English': 'SHIM RING', 'Part Name Language 1': 'ANNEAU ENTRETOISE', 'Part Name Language 2': 'UNTERLEGRING', 'Part Name Language 3': 'DISCO SPESSORE', 'Part Name Language 4': 'ANILLO DE CALCE', 'Part Name Language 5': 'PIER\\xc5\\x9aCIE\\xc5\\x83 REGULACYJNY', 'Part Number in Other Format': '139.100.150.134', 'Reference Number': '72316231', 'Additional Information': '', 'Extra Data': ''}
{'Seria

In [332]:


# Improved record extraction: group tokens between serial numbers
records = []
matches = matcher(doc)

# Find all serial number match positions
serial_positions = [(start, end) for match_id, start, end in matches if nlp.vocab.strings[match_id] == "SERIAL_NUMBER"]

if serial_positions:
    for i, (start, end) in enumerate(serial_positions):
        serial_text = doc[start:end].text
        next_start = serial_positions[i+1][0] if i+1 < len(serial_positions) else len(doc)
        data_tokens = doc[end:next_start]
        # Extract part numbers from this data
        part_numbers = []
        for match_id, p_start, p_end in matches:
            if nlp.vocab.strings[match_id] == "PART_NUMBER" and end <= p_start < next_start:
                part_text = doc[p_start:p_end].text
                part_numbers.append(part_text)
        records.append({
            "Serial Number": serial_text,
            "Data": [token.text for token in data_tokens],
            "Part Numbers": part_numbers
        })

print(records)

KeyError: "[E018] Can't retrieve string for hash '11796146433234896099'. This usually refers to an issue with the `Vocab` or `StringStore`."