DOwload model


In [190]:
import spacy
import csv
from spacy.matcher import Matcher

In [191]:
try:
    with open("source/Input_File_01.txt", 'rb') as f:
        binary_content = f.read()
except FileNotFoundError:
    print(f"Error: Input file not found at 'source_data/Input_File_01.txt'")

 Read the file in binary mode and split it by the null-byte delimiter (\x00) to get a list of the data fields.

In [192]:
fields = binary_content.split(b'\\x00')   # split by null byte
all_fields = []                   # make an empty list

for field in fields:
    if field:  # skip empty parts
        text = field.decode('latin-1')  # convert bytes → string
        text = text.strip()             # remove spaces/newlines
        all_fields.append(text)         # add to the list

print(all_fields)

['1', 'X530108146000', 'INTERNAL RETAINING RING', "ANNEAU D\\'ARRET POUR ALESAGE", 'INNENHALTERING', 'ANELLO DI SICUREZZA INTERNO', 'ANILLO DE RETENCION INTERNO', 'PIER\\xc5\\x9aCIE\\xc5\\x83 USTALAJ\\xc4\\x84CY WEWN\\xc4\\x98TRZNY', 'X530.108.146.000', '72311106', 'DIN472 - 90X3', '2', '139100150130', 'SHIM RING', 'ANNEAU ENTRETOISE', 'UNTERLEGRING', 'DISCO SPESSORE', 'ANILLO DE CALCE', 'PIER\\xc5\\x9aCIE\\xc5\\x83 REGULACYJNY', '139.100.150.130', '72316227', '0,10 MM', '139100150131', '139.100.150.131', '72316228', '0,15 MM', '139100150132', '139.100.150.132', '72316229', '0,25 MM', '139100150133', '139.100.150.133', '72316230', '0,30 MM', '139100150134', '139.100.150.134', '72316231', '0,80 MM', '3', 'X548967266000', 'O-RING', 'JOINT TORIQUE', 'GIUNTO DI TENUTA', 'JUNTA TOROIDAL', 'X548.967.266.000', '72313499', '82,14X3,53-NBR 70 FWN56402', '4', '178100010020', 'SEALING COVER', 'CACHE ETANCHE', 'DICHTDECKEL', 'COPERTURA TENUTA', 'CUBIERTA SELLANTE', 'POKRYWA USZCZELNIAJ\\xc4\\x84CA

In [193]:

nlp = spacy.load("en_core_web_sm")  # Load English model


In [None]:
def process_chunk(chunk):
    # Join the fields into a single text for processing
    # (or process chunks of fields that constitute a record)
    chunk_text = "\n".join(chunk)

    # Process the text with spaCy
    doc = nlp(chunk_text)

    # Initialize the Matcher with the shared vocab
    matcher = Matcher(nlp.vocab)

    # Pattern for Serial Numbers (e.g., a number with 1 to 3 digits)
    serial_number_pattern = [
        {
            "IS_DIGIT": True,      # Ensures the token is composed of only digits
            "LENGTH": {">=": 1, "<=": 3} # Ensures the token has 1, 2 or 3 characters
        }
    ]
    matcher.add("SERIAL_NUMBER", [serial_number_pattern])

    # Refined part number pattern
    part_number_pattern = [{"TEXT": {"REGEX": r"^(?=.*\d)[A-Z0-9]{12,13}$"}}]
    matcher.add("PART_NUMBER", [part_number_pattern])

    # Defined part number in other formats
    other_part_number_pattern = [{"TEXT": {"REGEX": r"^(?=.*\d)(?=.*\.)([A-Z0-9.]{15,16})$"}}]
    matcher.add("PART_NUMBER_ALT", [other_part_number_pattern])

    # Reference number
    reference_number_pattern = [{"TEXT": {"REGEX": r"^[0-9]{8}$"}}]
    matcher.add("REFERENCE_NUMBER", [reference_number_pattern])

    # Find all matches in the doc
    matches = matcher(doc)

    # You can then iterate through the matches to extract the data
    for match_id, start, end in matches:
        string_id = nlp.vocab.strings[match_id]  # Get the pattern name
        span = doc[start:end]  # The matched span
        print(f"Found '{string_id}': {span.text}")

    

In [195]:
start_indices = []
# Use enumerate() to get both the index (i) and the value (field) for each item.
for i, field in enumerate(all_fields):
    
    # This is the same condition as in the list comprehension.
    # It checks if the field is a number AND is short.
    if field.isdigit() and len(field) < 4:
        
        # If the conditions are met, add the index 'i' to our list.
        start_indices.append(i)
    
records_data = []
for i in range(len(start_indices)):
    start = start_indices[i]
    # The end of a chunk is the start of the next one, or the end of the file.
    end = start_indices[i + 1] if i + 1 < len(start_indices) else len(all_fields)
    chunk = all_fields[start:end]
    if chunk:
        processed_chunk = process_chunk(chunk)
        records_data.append(processed_chunk)

Found 'SERIAL_NUMBER': 1
Found 'PART_NUMBER': X530108146000
Found 'PART_NUMBER_ALT': X530.108.146.000
Found 'REFERENCE_NUMBER': 72311106
[(11796146433234896099, 0, 1), (7507135594510279738, 2, 3), (4972689857689866458, 29, 30), (4546272708814385746, 31, 32)]
Found 'SERIAL_NUMBER': 2
Found 'PART_NUMBER': 139100150130
Found 'PART_NUMBER_ALT': 139.100.150.130
Found 'REFERENCE_NUMBER': 72316227
Found 'PART_NUMBER': 139100150131
Found 'PART_NUMBER_ALT': 139.100.150.131
Found 'REFERENCE_NUMBER': 72316228
Found 'PART_NUMBER': 139100150132
Found 'PART_NUMBER_ALT': 139.100.150.132
Found 'REFERENCE_NUMBER': 72316229
Found 'PART_NUMBER': 139100150133
Found 'PART_NUMBER_ALT': 139.100.150.133
Found 'REFERENCE_NUMBER': 72316230
Found 'PART_NUMBER': 139100150134
Found 'PART_NUMBER_ALT': 139.100.150.134
Found 'REFERENCE_NUMBER': 72316231
[(11796146433234896099, 0, 1), (7507135594510279738, 2, 3), (4972689857689866458, 22, 23), (4546272708814385746, 24, 25), (7507135594510279738, 29, 30), (497268985768

In [196]:


# Improved record extraction: group tokens between serial numbers
records = []
matches = matcher(doc)

# Find all serial number match positions
serial_positions = [(start, end) for match_id, start, end in matches if nlp.vocab.strings[match_id] == "SERIAL_NUMBER"]

if serial_positions:
    for i, (start, end) in enumerate(serial_positions):
        serial_text = doc[start:end].text
        next_start = serial_positions[i+1][0] if i+1 < len(serial_positions) else len(doc)
        data_tokens = doc[end:next_start]
        # Extract part numbers from this data
        part_numbers = []
        for match_id, p_start, p_end in matches:
            if nlp.vocab.strings[match_id] == "PART_NUMBER" and end <= p_start < next_start:
                part_text = doc[p_start:p_end].text
                part_numbers.append(part_text)
        records.append({
            "Serial Number": serial_text,
            "Data": [token.text for token in data_tokens],
            "Part Numbers": part_numbers
        })

print(records)

[{'Serial Number': '1', 'Data': ['\n', 'X530108146000', '\n', 'INTERNAL', 'RETAINING', 'RING', '\n', 'ANNEAU', "D\\'ARRET", 'POUR', 'ALESAGE', '\n', 'INNENHALTERING', '\n', 'ANELLO', 'DI', 'SICUREZZA', 'INTERNO', '\n', 'ANILLO', 'DE', 'RETENCION', 'INTERNO', '\n', 'PIER\\xc5\\x9aCIE\\xc5\\x83', 'USTALAJ\\xc4\\x84CY', 'WEWN\\xc4\\x98TRZNY', '\n', 'X530.108.146.000', '\n', '72311106', '\n', 'DIN472', '-', '90X3', '\n'], 'Part Numbers': ['X530108146000', 'INTERNAL', 'RETAINING', 'RING', 'ANNEAU', 'POUR', 'ALESAGE', 'INNENHALTERING', 'ANELLO', 'SICUREZZA', 'INTERNO', 'ANILLO', 'RETENCION', 'INTERNO', '72311106', 'DIN472', '90X3']}, {'Serial Number': '2', 'Data': ['\n', '139100150130', '\n', 'SHIM', 'RING', '\n', 'ANNEAU', 'ENTRETOISE', '\n', 'UNTERLEGRING', '\n', 'DISCO', 'SPESSORE', '\n', 'ANILLO', 'DE', 'CALCE', '\n', 'PIER\\xc5\\x9aCIE\\xc5\\x83', 'REGULACYJNY', '\n', '139.100.150.130', '\n', '72316227', '\n', '0,10', 'MM', '\n', '139100150131', '\n', '139.100.150.131', '\n', '72316228'