In [5]:
# ============================
#        IMPORTS
# ============================
import re
import statistics
import pandas as pd

# ============================
#     HELPER FUNCTIONS
# ============================
def count_tokens(line, delimiter):
    """Return how many non-empty pieces `line` yields when split on `delimiter`.

    `delimiter` is a regular-expression pattern; the line is stripped of
    surrounding whitespace before splitting.
    """
    pieces = re.split(delimiter, line.strip())
    return sum(1 for piece in pieces if piece)

def is_numeric(token):
    """Return True if `token` is a number or a "value ± uncertainty" pair.

    A token counts as numeric when `float()` accepts it, or when it consists
    of one or two float-parsable parts joined by an uncertainty separator.
    Two separators are recognised:

    * U+00B1 (the real plus-minus sign), and
    * U+FFFD (the Unicode replacement character, which is what a plus-minus
      sign turns into when the input was decoded with the wrong encoding).

    The separators are written as escapes so this source file stays correct
    regardless of the encoding it is saved or re-exported in.
    """
    try:
        float(token)
        return True
    except ValueError:
        pass

    separators = ("\ufffd", "\u00b1")
    # Only split when a separator is actually present; otherwise the single
    # "part" would be the token itself and the recursion would never end.
    if not any(sep in token for sep in separators):
        return False

    parts = [p.strip() for p in re.split("[\ufffd\u00b1]", token) if p.strip()]
    # One part covers a dangling separator ("±5"); two parts is "value±error".
    # Zero parts or three and more are not a recognised numeric form.
    if len(parts) in (1, 2):
        return all(is_numeric(p) for p in parts)
    return False

def numeric_ratio(line, delimiter):
    """Return the fraction of non-empty tokens in `line` that are numeric.

    The line is stripped and split on the regex `delimiter`; a line with no
    tokens yields 0.0.
    """
    pieces = list(filter(None, re.split(delimiter, line.strip())))
    if len(pieces) == 0:
        return 0.0
    numeric_hits = 0
    for piece in pieces:
        if is_numeric(piece):
            numeric_hits += 1
    return numeric_hits / len(pieces)

def most_common(lst):
    """Return the most frequent element of `lst`, or None if `lst` is empty.

    Frequencies are tallied in a single pass. When several elements share
    the highest frequency, the one that appears first in `lst` is returned,
    so the result does not depend on set/hash iteration order.
    """
    if not lst:
        return None
    tally = {}
    for item in lst:
        tally[item] = tally.get(item, 0) + 1
    # dicts preserve insertion order and max() keeps the first maximum,
    # which gives the first-seen tie-breaking described above.
    return max(tally, key=tally.get)

def safe_mean(x):
    """Return the arithmetic mean of `x`, or 0 when `x` is empty."""
    if len(x) == 0:
        return 0
    return statistics.mean(x)

def safe_var(x):
    """Return the sample variance of `x`, or 0 when it has fewer than two items."""
    if len(x) <= 1:
        return 0
    return statistics.variance(x)

def safe_cv(x):
    """Return the coefficient of variation (std / mean) of `x`.

    Yields 0 for an empty sequence or when the mean is zero, so callers
    never hit a division by zero.
    """
    if len(x) == 0:
        return 0
    centre = safe_mean(x)
    if centre == 0:
        return 0
    spread = safe_var(x) ** 0.5
    return spread / centre

def compute_interval_overlap(interval1, interval2):
    """Return the number of columns shared by two inclusive intervals.

    Intervals are `(start, end)` pairs with both ends inclusive, which is the
    convention used by the token intervals built in this file
    (`start = pos + 1`, `end = pos + len(part)`). The shared width is
    therefore `min(end) - max(start) + 1`; without the `+ 1` a
    single-character token would score 0 against every header, even the one
    that fully contains it.

    Returns 0 when the intervals share no column.
    """
    start1, end1 = interval1
    start2, end2 = interval2
    shared = min(end1, end2) - max(start1, start2) + 1
    return max(0, shared)

def intervals_overlap(interval1, interval2):
    """Return True if two inclusive `(start, end)` intervals share a column.

    Both ends are inclusive, matching the token intervals produced in this
    file, so the comparison is `<=`. With a strict `<`, a one-column token
    such as `(5, 5)` could never overlap anything, and neither could two
    intervals that share exactly one column.
    """
    latest_start = max(interval1[0], interval2[0])
    earliest_end = min(interval1[1], interval2[1])
    return latest_start <= earliest_end

def generate_row_pattern(tokens):
    """Encode `tokens` as a string with 'N' per numeric token and 'S' otherwise."""
    symbols = []
    for tok in tokens:
        if is_numeric(tok):
            symbols.append('N')
        else:
            symbols.append('S')
    return ''.join(symbols)

def get_token_intervals_multi(line):
    """Split `line` on runs of two or more whitespace characters and locate each token.

    Returns a list of dicts, one per non-empty segment, in left-to-right order:

    * "display":  the segment text with surrounding whitespace stripped,
    * "key":      the display text, suffixed with " <n>" from its second
                  occurrence on the line onwards,
    * "interval": `(start, end)` column positions of the unstripped segment,
                  1-indexed with both ends inclusive.
    """
    # Locate every separator run, then add a zero-width sentinel at the end of
    # the line so the final segment is flushed by the same loop.
    separators = [match.span() for match in re.finditer(r'\s{2,}', line)]
    separators.append((len(line), len(line)))

    found = []
    occurrences = {}
    cursor = 0
    for gap_start, gap_end in separators:
        if gap_start > cursor:
            label = line[cursor:gap_start].strip()
            occurrences[label] = occurrences.get(label, 0) + 1
            nth = occurrences[label]
            found.append({
                "key": f"{label} {nth}" if nth > 1 else label,
                "display": label,
                "interval": (cursor + 1, gap_start)
            })
        cursor = gap_end
    return found

# ============================
#        CORE FUNCTIONS
# ============================
def segregate_blocks(lines, start_idx, end_idx):
    """Group consecutive non-blank lines of `lines[start_idx:end_idx]` into blocks.

    A block is a maximal run of lines whose stripped text is non-empty; blank
    lines separate blocks and are not stored.

    Each block is a dict holding its sequence number ("idx"), the first and
    last line index it covers ("start", "end"), one record per line under
    "lines", and a set of fields that are only initialised here ("block_type",
    "delimiter", "headers", "header_extent", "title", "df",
    "used_as_header_for", "stats", "row_patterns", "modal_token_count",
    "token_count_cv").

    Every line record carries the line index, the raw text, its length, and
    token counts and numeric ratios under three candidate delimiters: runs of
    two or more whitespace characters, any whitespace, and a tab.

    Returns the list of blocks in file order.
    """
    # NOTE: an earlier, half-commented-out version of compute_statistics used
    # to sit after the return statement of this function. Only its `def` line
    # was commented, so its body was unreachable code inside this function;
    # it has been removed.
    blocks = []
    block = None

    for i in range(start_idx, end_idx):
        line = lines[i]

        if not line.strip():
            # A blank line closes whatever block is currently open.
            if block is not None:
                block["end"] = i - 1
                blocks.append(block)
                block = None
            continue

        if block is None:
            block = {
                "idx": len(blocks),
                "start": i,
                "end": None,
                "lines": [],
                "block_type": None,
                "delimiter": None,
                "headers": [],
                "header_extent": 0,
                "title": None,
                "df": None,
                "used_as_header_for": [],
                "stats": {},
                "row_patterns": [],
                "modal_token_count": 0,
                "token_count_cv": 0.0
            }

        block["lines"].append({
            "line_idx": i,
            "text": line,
            "count_multispace_tokens": count_tokens(line, r"\s{2,}"),
            "count_single_tokens": count_tokens(line, r"\s+"),
            "count_tab_tokens": count_tokens(line, r"\t"),
            "numeric_multispace_ratio": numeric_ratio(line, r"\s{2,}"),
            "numeric_single_ratio": numeric_ratio(line, r"\s+"),
            "numeric_tab_ratio": numeric_ratio(line, r"\t"),
            "line_len": len(line)
        })

    # The range ended while a block was still open: close it on the last line.
    if block is not None:
        block["end"] = end_idx - 1
        blocks.append(block)

    return blocks

def compute_statistics(block):
    """Summarise the per-line metrics of `block` and store them in `block["stats"]`.

    For the three token-count metrics and the line length, the mean, sample
    variance and coefficient of variation are recorded; for the three numeric
    ratios only the mean is recorded. All values go through the `safe_*`
    helpers. The function mutates `block` and returns None.
    """
    records = block["lines"]

    def column(field):
        # Pull one metric out of every line record, preserving line order.
        return [record[field] for record in records]

    single_counts = column("count_single_tokens")
    multispace_counts = column("count_multispace_tokens")
    tab_counts = column("count_tab_tokens")
    line_lengths = column("line_len")

    block["stats"] = {
        "mean_single_tokens": safe_mean(single_counts),
        "var_single_tokens": safe_var(single_counts),
        "cv_single_tokens": safe_cv(single_counts),

        "mean_multispace_tokens": safe_mean(multispace_counts),
        "var_multispace_tokens": safe_var(multispace_counts),
        "cv_multispace_tokens": safe_cv(multispace_counts),

        "mean_tab_tokens": safe_mean(tab_counts),
        "var_tab_tokens": safe_var(tab_counts),
        "cv_tab_tokens": safe_cv(tab_counts),

        "mean_line_len": safe_mean(line_lengths),
        "var_line_len": safe_var(line_lengths),
        "cv_line_len": safe_cv(line_lengths),

        "mean_numeric_single": safe_mean(column("numeric_single_ratio")),
        "mean_numeric_multispace": safe_mean(column("numeric_multispace_ratio")),
        "mean_numeric_tab": safe_mean(column("numeric_tab_ratio"))
    }


def detect_header_extent(block, delimiter):
    """Find how many leading lines of `block` look like header rows.

    Each line is split on the regex `delimiter` and encoded as an N/S pattern.
    If the very first line is a single non-numeric token it is treated as a
    title. Header rows are then the consecutive lines, starting right after
    the title (or at the first line when there is none), whose patterns
    contain nothing but 'S'.

    Returns `(extent, title_line)`: the number of header rows, and the index
    of the title line within the block (0) or None when there is no title.
    """
    row_patterns = []
    for record in block["lines"]:
        pieces = [p for p in re.split(delimiter, record["text"].strip()) if p]
        row_patterns.append(generate_row_pattern(pieces))

    has_title = bool(row_patterns) and row_patterns[0] == "S"
    title_line = 0 if has_title else None
    candidates = row_patterns[1:] if has_title else row_patterns

    # Count the leading run of all-'S' patterns (an empty pattern qualifies).
    extent = 0
    while extent < len(candidates):
        pattern = candidates[extent]
        if pattern.count("S") != len(pattern):
            break
        extent += 1
    return extent, title_line

def merge_headers_by_overlap(token_maps):
    """Combine several header rows into a single list of column headers.

    `token_maps` holds one list of token dicts ("display", "interval") per
    header row. The first row seeds the columns. Each token of a later row is
    appended (space-separated) to the first existing column whose interval it
    overlaps, widening that column's interval to cover both; a token that
    overlaps no column starts a new one.

    Returns the columns as `{"name", "interval"}` dicts sorted by the start
    of their interval.
    """
    columns = [
        {"name": tok["display"], "interval": tok["interval"]}
        for tok in token_maps[0]
    ]

    for later_row in token_maps[1:]:
        for tok in later_row:
            span = tok["interval"]
            target = next(
                (col for col in columns if intervals_overlap(col["interval"], span)),
                None,
            )
            if target is None:
                columns.append({"name": tok["display"], "interval": span})
                continue
            left, right = target["interval"]
            target["name"] += " " + tok["display"]
            target["interval"] = (min(left, span[0]), max(right, span[1]))

    return sorted(columns, key=lambda col: col["interval"][0])

def extract_headers(block, delimiter):
    """Detect the title and header rows of `block` and build its column headers.

    Side effects: stores the header row count in `block["header_extent"]` and
    the title text (or None) in `block["title"]`.

    Returns `(headers, extent)`, where `headers` is a list of
    `{"name", "interval"}` dicts (empty when no header row was found) and
    `extent` is the number of header rows. Multi-row headers are merged by
    column overlap.
    """
    extent, title_line = detect_header_extent(block, delimiter)
    has_title = title_line is not None

    block["header_extent"] = extent
    block["title"] = block["lines"][title_line]["text"] if has_title else None

    if extent == 0:
        return [], 0

    # Header rows start right after the title line when there is one.
    first_header = title_line + 1 if has_title else 0
    header_rows = block["lines"][first_header:first_header + extent]
    token_maps = [get_token_intervals_multi(row["text"]) for row in header_rows]

    if extent == 1:
        single_row = [
            {"name": tok["display"], "interval": tok["interval"]}
            for tok in token_maps[0]
        ]
        return single_row, extent
    return merge_headers_by_overlap(token_maps), extent

def generate_df(headers, header_extent, lines, delimiter):
    """Build a DataFrame by splitting each data line on `delimiter`.

    The first `header_extent` entries of `lines` are skipped. Every remaining
    line is stripped, split on the regex `delimiter`, and its non-blank pieces
    (each stripped) become one row. Column names come from the "name" field of
    `headers`; cells are assigned to columns purely by position.
    """
    records = []
    for record in lines[header_extent:]:
        pieces = (piece.strip() for piece in re.split(delimiter, record["text"].strip()))
        records.append([piece for piece in pieces if piece])
    column_names = [header["name"] for header in headers]
    return pd.DataFrame(records, columns=column_names)

def _interval_gap(interval_a, interval_b):
    """Return the distance between two `(start, end)` intervals, 0 if they share a column."""
    return max(0, max(interval_a[0], interval_b[0]) - min(interval_a[1], interval_b[1]))


def assign_tokens_by_overlap(headers, lines, delimiter):
    """Build a DataFrame by placing each token under the header it lines up with.

    Every line is tokenised on runs of two or more whitespace characters, and
    each token goes to the header whose interval overlaps the token's interval
    the most. A token that overlaps no header at all is placed under the
    nearest header; previously such a token always landed in the first column,
    because `max` over all-zero scores returns index 0. Several tokens
    assigned to the same cell are joined with a single space; cells that
    receive no token stay None.

    `delimiter` is not used: tokenisation here is always positional. The
    parameter is kept so the signature stays parallel to `generate_df`.

    Returns a DataFrame with one row per entry of `lines` and one column per
    header.
    """
    n_cols, n_rows = len(headers), len(lines)
    matrix = [[None for _ in range(n_cols)] for _ in range(n_rows)]
    col_names = [h["name"] for h in headers]
    header_spans = [h["interval"] for h in headers]

    for i, line in enumerate(lines):
        if not n_cols:
            # Nothing to assign to; leave the (empty) row as is.
            continue
        for tok in get_token_intervals_multi(line["text"]):
            span = tok["interval"]
            overlaps = [compute_interval_overlap(span, hs) for hs in header_spans]
            best_match = max(range(n_cols), key=overlaps.__getitem__)
            if overlaps[best_match] <= 0:
                # No header overlaps this token: pick the closest one instead
                # of silently defaulting to column 0.
                best_match = min(
                    range(n_cols),
                    key=lambda j: _interval_gap(span, header_spans[j]),
                )
            existing = matrix[i][best_match]
            matrix[i][best_match] = f"{existing} {tok['display']}" if existing else tok["display"]

    return pd.DataFrame(matrix, columns=col_names)

def _attach_previous_header(blocks, block_idx, data_lines, delimiter):
    """Parse headerless `data_lines` using the nearest earlier "header-only" block.

    Searches `blocks[:block_idx]` from the closest block backwards. When a
    "header-only" block is found, its headers are used to build the
    DataFrame: by position if the most common token count of the data lines
    equals the header count, otherwise by column overlap. The current block
    is marked as "data" and recorded in the header block's
    "used_as_header_for" list.

    Returns True if a header block was found and applied, False otherwise.
    """
    block = blocks[block_idx]
    for prev in reversed(blocks[:block_idx]):
        if prev["block_type"] != "header-only":
            continue
        prev_headers = prev["headers"]
        data_mode = most_common([count_tokens(l["text"], delimiter) for l in data_lines])
        if data_mode == len(prev_headers):
            df = generate_df(prev_headers, 0, data_lines, delimiter)
        else:
            df = assign_tokens_by_overlap(prev_headers, data_lines, delimiter)
        block["df"], block["block_type"] = df, "data"
        prev["used_as_header_for"].append(block["idx"])
        return True
    return False


def process_block(blocks, block_idx):
    """Classify `blocks[block_idx]` and, for tabular blocks, build its DataFrame.

    The block is mutated in place; nothing is returned. Its "block_type" ends
    up as one of:

    * "header-only":      mostly non-numeric, multi-column, fewer than 5 lines;
                          only "headers" is filled in.
    * "narrative":        free text, or rows for which no header is available.
    * "complete-tabular": the block carries its own header rows; "df" is built
                          from the rows below them.
    * "data":             no header rows of its own; "df" is built with the
                          headers of the nearest earlier "header-only" block.

    A delimiter (tab, else two-or-more whitespace) is considered reliable
    when every line yields the same token count, greater than one, under it.
    With a reliable delimiter rows are split positionally. Otherwise the
    multi-space delimiter is used and rows are split positionally only if the
    data rows themselves have a uniform token count, falling back to
    assignment by column overlap.

    Data rows start after the title line (if one was detected) and the header
    rows. The title offset used to be ignored, which let the last header line
    leak into the DataFrame whenever a block had a title.
    """
    block = blocks[block_idx]
    lines = block["lines"]
    compute_statistics(block)
    stats = block["stats"]
    mode_multi = most_common([l["count_multispace_tokens"] for l in lines])

    # Mostly non-numeric content: a short multi-column block is a detached
    # header, anything else is narrative text.
    if stats["mean_numeric_single"] < 0.3:
        if mode_multi > 1:
            if len(lines) < 5:
                headers, _ = extract_headers(block, r"\s{2,}")
                block["headers"], block["block_type"] = headers, "header-only"
            else:
                block["block_type"] = "narrative"
            return
        if mode_multi == 1:
            block["block_type"] = "narrative"
            return

    mode_tab = most_common([l["count_tab_tokens"] for l in lines])
    if stats["cv_tab_tokens"] == 0 and mode_tab > 1:
        delimiter, reliable = r"\t", True
    elif stats["cv_multispace_tokens"] == 0 and mode_multi > 1:
        delimiter, reliable = r"\s{2,}", True
    else:
        delimiter, reliable = r"\s{2,}", False

    block["delimiter"] = delimiter
    headers, extent = extract_headers(block, delimiter)
    block["headers"], block["header_extent"] = headers, extent

    # extract_headers stores the title (if any) in block["title"]; the title
    # occupies one line ahead of the header rows.
    data_start = extent + (0 if block["title"] is None else 1)
    data_lines = lines[data_start:]

    if extent == 0:
        if not _attach_previous_header(blocks, block_idx, data_lines, delimiter):
            block["block_type"] = "narrative"
        return

    if reliable or safe_cv([count_tokens(l["text"], delimiter) for l in data_lines]) == 0:
        block["df"] = generate_df(headers, 0, data_lines, delimiter)
    else:
        block["df"] = assign_tokens_by_overlap(headers, data_lines, delimiter)
    block["block_type"] = "complete-tabular"

# ============================
#        ENTRY POINT
# ============================
def parse(file_path):
    """Read the file at `file_path` and parse everything after its "DATA:" line.

    The file is read as UTF-8. The lines following the first line that starts
    with "DATA:" are grouped into blocks, and each block is then classified
    and processed in order.

    Returns the list of processed blocks.

    Raises:
        ValueError: if no line starts with "DATA:".
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = handle.read().splitlines()

    marker = None
    for position, content in enumerate(lines):
        if content.startswith("DATA:"):
            marker = position
            break
    if marker is None:
        raise ValueError("No 'DATA:' line found. Not a templated NOAA file.")

    blocks = segregate_blocks(lines, marker + 1, len(lines))
    # Blocks are processed in file order, by index.
    for position, _ in enumerate(blocks):
        process_block(blocks, position)
    return blocks


In [7]:
# Parse the sample file, then dump each block's headers, title and type,
# followed by its DataFrame (None for blocks without one).
blocks = parse("test.txt")

for block in blocks:
    for field in ("headers", "title", "block_type"):
        print(block[field])
    display(block["df"])

[]
None
narrative


None

[{'name': 'Sample Number', 'interval': (1, 6)}, {'name': '238U (ppb)', 'interval': (15, 19)}, {'name': '232Th (ppt)', 'interval': (27, 31)}, {'name': '230Th/232Th (atomic x10-6)', 'interval': (36, 49)}, {'name': 'd234U* (measured)', 'interval': (53, 62)}, {'name': '230Th/238U (activity)', 'interval': (68, 77)}, {'name': '230Th Age (uncorrected)', 'interval': (82, 94)}, {'name': '230Th AgeBP (corrected)', 'interval': (96, 107)}, {'name': '230Th Age (AD) (corrected)', 'interval': (111, 124)}]
None
header-only


None

[]
None
data


Unnamed: 0,Sample Number,238U (ppb),232Th (ppt),230Th/232Th (atomic x10-6),d234U* (measured),230Th/238U (activity),230Th Age (uncorrected),230Th AgeBP (corrected),230Th Age (AD) (corrected)
0,CAS-D-10,354.7�0.4,1�10,30000�420000,1163�2,0.00335�0.00007,169�4,112�4,1838�4
1,CAS-D-t,331.5�0.6,4�10,12000�34000,1177�6,0.00812�0.00013,407�6,351�6,1599�6
2,CAS-D-5,308.7�0.3,50�10,1000�250,1134�2,0.00937�0.00009,478�5,421�5,1529�5
3,CAS-D-6,281.8�0.3,10�10,4500�4600,1184�4,0.01106�0.00011,553�5,496�5,1454�5
4,CAS-D-2,212.5�0.2,10�10,6000�6000,1190�2,0.01847�0.00014,923�7,866�7,1084�7
5,CAS-D-12,286.1�0.3,10�10,11000�14000,1190�2,0.01918�0.00011,958�6,901�6,1049�6
6,CAS-D-13,326.3�0.3,20�10,6500�4100,1195�2,0.02168�0.00011,1081�6,1024�6,926�6
7,CAS-D-14,269.0�0.3,1�10,120000�1600000,1203�2,0.02468�0.00013,1227�7,1170�7,780�7
8,CAS-D-b,424.7�0.8,90�10,2300�250,1209�5,0.02907�0.00016,1439�9,1383�9,567�9
9,CAS-A-t,394.5�0.8,390�10,34�2,1166�5,0.00202�0.00007,89�7,33�7,1917�7


[]
None
narrative


None

[]
None
narrative


None

[{'name': 'Depth(cm)', 'interval': (1, 9)}, {'name': 'YearAD', 'interval': (12, 17)}, {'name': 'd18O (per mil PDB)', 'interval': (25, 42)}]
None
complete-tabular


Unnamed: 0,Depth(cm),YearAD,d18O (per mil PDB)
0,0.005,2005.670,-6.102
1,0.01,2004.366,-6.987
2,0.015,2003.062,-6.841
3,0.02,2001.758,-7.136
4,0.025,2000.454,-6.968
...,...,...,...
75,0.535,1907.872,-7.053
76,0.54,1906.568,-6.978
77,0.545,1905.264,-7.054
78,0.55,1903.960,


[]
None
narrative


None

[{'name': 'Depth(cm)', 'interval': (1, 9)}, {'name': 'YearAD', 'interval': (13, 18)}, {'name': 'd18O (per mil PDB)', 'interval': (25, 42)}]
None
complete-tabular


Unnamed: 0,Depth(cm),YearAD,d18O (per mil PDB)
0,0.004,1907.462,-6.751
1,0.020,1905.214,-6.971
2,0.040,1902.404,-6.876
3,0.060,1899.594,-6.824
4,0.080,1896.784,-6.771
...,...,...,...
516,13.830,1094.966,-7.969
517,13.855,1093.372,-7.357
518,13.880,1091.778,-7.893
519,13.905,1090.184,-7.822
