In [1]:
import re

line = "HURESPLI     2     LINE NUMBER OF THE CURRENT           (22 - 23)"

# Each candidate regex captures the same five groups, in this order:
#   variable name, size, description, start position, end position.
# They all share the trailing "(start - end)" part, which accepts any amount
# of whitespace (including none) around the hyphen; they differ only in how
# the variable name and the description are matched.

# Pattern 1: name is \w+, description is a lazy ".*?" and may therefore be empty.
pattern1 = r"(\w+)\s+(\d+)\s+(.*?)\s+\((\d+)\s*-\s*(\d+)\)"

# Pattern 2: same as pattern 1, except the lazy ".+?" description must contain
# at least one character.
pattern2 = r"(\w+)\s+(\d+)\s+(.+?)\s+\((\d+)\s*-\s*(\d+)\)"

# Pattern 3: name restricted to uppercase letters and underscores.
# NOTE(review): a name that contains a digit will not match this pattern.
pattern3 = r"([A-Z_]+)\s+(\d+)\s+(.+?)\s+\((\d+)\s*-\s*(\d+)\)"

# Pattern 4: description restricted to ASCII letters, digits and whitespace,
# so punctuation inside the description (apostrophes, parentheses, ...)
# prevents a match.
pattern4 = r"(\w+)\s+(\d+)\s+([A-Za-z0-9\s]+?)\s+\((\d+)\s*-\s*(\d+)\)"

# Pattern 5: description restricted to ASCII letters and whitespace (no digits).
pattern5 = r"(\w+)\s+(\d+)\s+([A-Za-z\s]+?)\s+\((\d+)\s*-\s*(\d+)\)"

candidate_patterns = [
    ("Pattern 1", pattern1),
    ("Pattern 2", pattern2),
    ("Pattern 3", pattern3),
    ("Pattern 4", pattern4),
    ("Pattern 5", pattern5),
]

# Captured groups per label (None when the pattern does not match), kept so the
# candidates can be compared programmatically and not only by reading stdout.
extraction_results = {}

for candidate_label, candidate_pattern in candidate_patterns:
    # re.match anchors at the start of the line only; trailing text is allowed.
    match = re.match(candidate_pattern, line)
    if match is None:
        # Report misses explicitly: a silent skip looks the same as "not run".
        extraction_results[candidate_label] = None
        print(candidate_label + ":", "no match")
        continue
    extraction_results[candidate_label] = match.groups()
    var_name, var_len, desc, start_pos, end_pos = match.groups()
    print(candidate_label + ":", var_name, var_len, desc, start_pos, end_pos)


Pattern 1: HURESPLI 2 LINE NUMBER OF THE CURRENT 22 23
Pattern 2: HURESPLI 2 LINE NUMBER OF THE CURRENT 22 23
Pattern 3: HURESPLI 2 LINE NUMBER OF THE CURRENT 22 23
Pattern 4: HURESPLI 2 LINE NUMBER OF THE CURRENT 22 23
Pattern 5: HURESPLI 2 LINE NUMBER OF THE CURRENT 22 23


In [4]:
text = """                CPS RECORD LAYOUT FOR BASIC LABOR FORCE ITEMS
                            January - March 1994
                          A1. HOUSEHOLD INFORMATION

NAME         SIZE  DESCRIPTION                          LOCATION

HRHHID          12     HOUSEHOLD IDENTIFIER                   (1 - 12)

                   EDITED UNIVERSE: ALL HHLD's IN SAMPLE

HUINTTYP     2     TYPE OF INTERVIEW                    (13 - 14)

                   VALID ENTRIES

                   0   CATI INTERVIEW
                   1   PERSONAL (CAPI)
                   2   TELEPHONE (CAPI)"""

# A field-definition line is recognised by the parenthesised column range that
# ends it, e.g. "(13 - 14)", preceded by whitespace and optionally followed by
# trailing whitespace. Whitespace around the hyphen is optional so that this
# filter accepts every "(start - end)" form the extraction patterns
# (which use \s*-\s*) can parse; requiring exactly one space on each side would
# silently drop lines such as "(13- 14)".
line_pattern = r".*\s\(\d+\s*-\s*\d+\)\s*$"


def is_layout_line(candidate):
    """Return True when ``candidate`` ends with a "(start - end)" column range.

    Args:
        candidate: A single line of the record-layout text (no newline).

    Returns:
        bool: True if the whole line matches ``line_pattern``, else False.
    """
    return re.match(line_pattern, candidate) is not None


filtered_lines = [raw_line for raw_line in text.splitlines() if is_layout_line(raw_line)]
print("Number of filtered lines:", len(filtered_lines))


Number of filtered lines: 2
