In [1]:
# ============================
#        IMPORTS
# ============================
import re
import statistics
import pandas as pd


In [2]:
# ============================
#     HELPER FUNCTIONS
# ============================
def count_tokens(line, delimiter):
    tokens = re.split(delimiter, line.strip())
    return len([t for t in tokens if t])

def is_numeric(token):
    try:
        float(token)
        return True
    except ValueError:
        pass
    if "�" in token:
        parts = [p.strip() for p in token.split("�") if p.strip()]
        if len(parts) == 2:
            return all(is_numeric(p) for p in parts)
        elif len(parts) == 1:
            return is_numeric(parts[0])
    return False

def numeric_ratio(line, delimiter):
    tokens = [t for t in re.split(delimiter, line.strip()) if t]
    if not tokens:
        return 0.0
    return sum(is_numeric(t) for t in tokens) / len(tokens)

def most_common(lst):
    return max(set(lst), key=lst.count) if lst else None

def safe_mean(x):
    return statistics.mean(x) if len(x) > 0 else 0

def safe_var(x):
    return statistics.variance(x) if len(x) > 1 else 0

def safe_cv(x):
    if len(x) == 0:
        return 0
    mean_val = safe_mean(x)
    if mean_val == 0:
        return 0
    return safe_var(x) ** 0.5 / mean_val

def compute_interval_overlap(interval1, interval2):
    start1, end1 = interval1
    start2, end2 = interval2
    return max(0, min(end1, end2) - max(start1, start2))

def intervals_overlap(interval1, interval2):
    return max(interval1[0], interval2[0]) < min(interval1[1], interval2[1])

def generate_row_pattern(tokens):
    return ''.join('N' if is_numeric(tok) else 'S' for tok in tokens)

def get_token_intervals_multi(line):
    tokens = []
    token_counts = {}
    parts = re.split(r'(\s{2,})', line)
    pos = 0
    for part in parts:
        if re.fullmatch(r'\s{2,}', part):
            pos += len(part)
        elif part:
            start = pos + 1
            end = pos + len(part)
            base = part.strip()
            token_counts[base] = token_counts.get(base, 0) + 1
            tokens.append({
                "key": f"{base} {token_counts[base]}" if token_counts[base] > 1 else base,
                "display": base,
                "interval": (start, end)
            })
            pos += len(part)
    return tokens

In [23]:
# ============================
#        CORE FUNCTIONS
# ============================
def segregate_blocks(lines, start_idx, end_idx):
    blocks = []
    block = None
    block_idx = 0

    for i in range(start_idx, end_idx):
        line = lines[i]
        if line.strip():
            if block is None:
                block = {
                    "idx": block_idx,
                    "start": i,
                    "end": None,
                    "lines": [],
                    "block_type": None,
                    "delimiter": None,
                    "headers": [],
                    "header_extent": 0,
                    "title": None,
                    "df": None,
                    "used_as_header_for": [],
                    "stats": {},
                    "row_patterns": [],
                    "modal_token_count": 0,
                    "token_count_cv": 0.0
                }
            block["lines"].append({
                "line_idx": i,
                "text": line,
                "count_multispace_tokens": count_tokens(line, r"\s{2,}"),
                "count_single_tokens": count_tokens(line, r"\s+"),
                "count_tab_tokens": count_tokens(line, r"\t"),
                "numeric_multispace_ratio": numeric_ratio(line, r"\s{2,}"),
                "numeric_single_ratio": numeric_ratio(line, r"\s+"),
                "numeric_tab_ratio": numeric_ratio(line, r"\t"),
                "line_len": len(line)
            })
        elif block is not None:
            block["end"] = i - 1
            blocks.append(block)
            block_idx += 1
            block = None

    if block is not None:
        block["end"] = end_idx - 1
        blocks.append(block)

    return blocks

# def compute_statistics(block):
    # get = lambda attr: [line[attr] for line in block["lines"]]
    # s, m, t = get("count_single_tokens"), get("count_multispace_tokens"), get("count_tab_tokens")
    # ns, nm, nt = get("numeric_single_ratio"), get("numeric_multispace_ratio"), get("numeric_tab_ratio")
    # lens = get("line_len")
    # block["stats"] = {
    #     "mean_single_tokens": statistics.mean(s), "var_single_tokens": statistics.variance(s), "cv_single_tokens": safe_cv(s),
    #     "mean_multispace_tokens": statistics.mean(m), "var_multispace_tokens": statistics.variance(m), "cv_multispace_tokens": safe_cv(m),
    #     "mean_tab_tokens": statistics.mean(t), "var_tab_tokens": statistics.variance(t), "cv_tab_tokens": safe_cv(t),
    #     "mean_line_len": statistics.mean(lens), "var_line_len": statistics.variance(lens), "cv_line_len": safe_cv(lens),
    #     "mean_numeric_single": statistics.mean(ns), "mean_numeric_multispace": statistics.mean(nm), "mean_numeric_tab": statistics.mean(nt)
    # }

def compute_statistics(block):
    s = []
    m = []
    t = []
    ns = []
    nm = []
    nt = []
    lens = []

    for line in block["lines"]:
        s.append(line["count_single_tokens"])
        m.append(line["count_multispace_tokens"])
        t.append(line["count_tab_tokens"])
        ns.append(line["numeric_single_ratio"])
        nm.append(line["numeric_multispace_ratio"])
        nt.append(line["numeric_tab_ratio"])
        lens.append(line["line_len"])


    block["stats"] = {
        "mean_single_tokens": safe_mean(s),
        "var_single_tokens": safe_var(s),
        "cv_single_tokens": safe_cv(s),

        "mean_multispace_tokens": safe_mean(m),
        "var_multispace_tokens": safe_var(m),
        "cv_multispace_tokens": safe_cv(m),

        "mean_tab_tokens": safe_mean(t),
        "var_tab_tokens": safe_var(t),
        "cv_tab_tokens": safe_cv(t),

        "mean_line_len": safe_mean(lens),
        "var_line_len": safe_var(lens),
        "cv_line_len": safe_cv(lens),

        "mean_numeric_single": safe_mean(ns),
        "mean_numeric_multispace": safe_mean(nm),
        "mean_numeric_tab": safe_mean(nt)
    }


def detect_header_extent(block, delimiter):
    patterns, title_line = [], None
    for i, line in enumerate(block["lines"]):
        tokens = [t for t in re.split(delimiter, line["text"].strip()) if t]
        pattern = generate_row_pattern(tokens)
        patterns.append(pattern)
        if i == 0 and pattern == "S":
            title_line = i
    start_i = title_line + 1 if title_line is not None else 0
    extent = 0
    for pattern in patterns[start_i:]:
        if all(c == "S" for c in pattern): extent += 1
        else: break
    return extent, title_line

def merge_headers_by_overlap(token_maps):
    base_row = token_maps[0]
    merged_headers = [{"name": tok["display"], "interval": tok["interval"]} for tok in base_row]
    for row in token_maps[1:]:
        for tok in row:
            matched = False
            for hdr in merged_headers:
                if intervals_overlap(hdr["interval"], tok["interval"]):
                    hdr["name"] += " " + tok["display"]
                    hdr["interval"] = (min(hdr["interval"][0], tok["interval"][0]), max(hdr["interval"][1], tok["interval"][1]))
                    matched = True
                    break
            if not matched:
                merged_headers.append({"name": tok["display"], "interval": tok["interval"]})
    return sorted(merged_headers, key=lambda x: x["interval"][0])

def extract_headers(block, delimiter):
    extent, title_line = detect_header_extent(block, delimiter)
    block["header_extent"] = extent
    block["title"] = block["lines"][title_line]["text"] if title_line is not None else None
    if extent == 0:
        return [], 0
    header_lines = block["lines"][title_line+1:title_line+1+extent] if title_line is not None else block["lines"][:extent]
    if extent == 1:
        token_objs = get_token_intervals_multi(header_lines[0]["text"])
        return [{"name": t["display"], "interval": t["interval"]} for t in token_objs], extent
    token_maps = [get_token_intervals_multi(line["text"]) for line in header_lines]
    return merge_headers_by_overlap(token_maps), extent

def generate_df(headers, header_extent, lines, delimiter):
    data_lines = lines[header_extent:]
    col_names = [h["name"] for h in headers]
    rows = []
    for line in data_lines:
        tokens = [t.strip() for t in re.split(delimiter, line["text"].strip()) if t.strip()]
        rows.append(tokens)
    return pd.DataFrame(rows, columns=col_names)

def assign_tokens_by_overlap(headers, lines, delimiter):
    n_cols, n_rows = len(headers), len(lines)
    matrix = [[None for _ in range(n_cols)] for _ in range(n_rows)]
    col_names = [h["name"] for h in headers]
    for i, line in enumerate(lines):
        tokens = get_token_intervals_multi(line["text"])
        for tok in tokens:
            best_match = max(
                range(n_cols),
                key=lambda j: compute_interval_overlap(tok["interval"], headers[j]["interval"]),
                default=None
            )
            if best_match is not None:
                matrix[i][best_match] = (matrix[i][best_match] or "") + (" " if matrix[i][best_match] else "") + tok["display"]
    return pd.DataFrame(matrix, columns=col_names)

def process_block(blocks, block_idx):
    block = blocks[block_idx]
    compute_statistics(block)
    lst = [l["count_multispace_tokens"] for l in block["lines"]]
    print(lst)
    mode_multi = most_common(lst)
    mean_numeric_single = block["stats"]["mean_numeric_single"]
    print(block_idx, mode_multi, mean_numeric_single)
    if mean_numeric_single < 0.3:
        if mode_multi > 1:
            if len(block["lines"]) < 6:
                headers, _ = extract_headers(block, r"\s{2,}")
                block["headers"], block["block_type"] = headers, "header-only"
                return
            block["block_type"] = "narrative"
            return
        elif mode_multi == 1:
            block["block_type"] = "narrative"
            return

    cv_multi = block["stats"]["cv_multispace_tokens"]
    cv_tab = block["stats"]["cv_tab_tokens"]
    mode_tab = most_common([l["count_tab_tokens"] for l in block["lines"]])

    delimiter = r"\t" if cv_tab == 0 and mode_tab > 1 else r"\s{2,}" if cv_multi == 0 and mode_multi > 1 else None
    if delimiter:
        block["delimiter"] = delimiter
        headers, extent = extract_headers(block, delimiter)
        block["headers"], block["header_extent"] = headers, extent
        if extent == 0:
            for prev in reversed(blocks[:block_idx]):
                if prev["block_type"] == "header-only":
                    prev_headers = prev["headers"]
                    prev_width = len(prev_headers)
                    data_mode = most_common([count_tokens(line["text"], delimiter) for line in block["lines"]])
                    df = generate_df(prev_headers, 0, block["lines"], delimiter) if data_mode == prev_width else assign_tokens_by_overlap(prev_headers, block["lines"], delimiter)
                    block["df"], block["block_type"] = df, "data"
                    prev["used_as_header_for"].append(block["idx"])
                    return
            block["block_type"] = "narrative"
            return
        block["df"] = generate_df(headers, extent, block["lines"], delimiter)
        block["block_type"] = "complete-tabular"
        return

    delimiter = r"\s{2,}"
    block["delimiter"] = delimiter
    headers, extent = extract_headers(block, delimiter)
    block["headers"], block["header_extent"] = headers, extent
    if extent > 0:
        data_lines = block["lines"][extent:]
        token_counts = [count_tokens(l["text"], delimiter) for l in data_lines]
        new_cv = safe_cv(token_counts)
        block["df"] = generate_df(headers, extent, block["lines"], delimiter) if new_cv == 0 else assign_tokens_by_overlap(headers, data_lines, delimiter)
        block["block_type"] = "complete-tabular"
    else:
        for prev in reversed(blocks[:block_idx]):
            if prev["block_type"] == "header-only":
                prev_headers = prev["headers"]
                prev_width = len(prev_headers)
                data_mode = most_common([count_tokens(l["text"], delimiter) for l in block["lines"]])
                df = generate_df(prev_headers, 0, block["lines"], delimiter) if data_mode == prev_width else assign_tokens_by_overlap(prev_headers, block["lines"], delimiter)
                block["df"], block["block_type"] = df, "data"
                prev["used_as_header_for"].append(block["idx"])
                return
        block["block_type"] = "narrative"

# ============================
#        ENTRY POINT
# ============================
def parse(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    lines = text.splitlines()
    i = next((j for j, line in enumerate(lines) if line.startswith("DATA:")), None)
    if i is None:
        raise ValueError("No 'DATA:' line found. Not a templated NOAA file.")
    blocks = segregate_blocks(lines, i + 1, len(lines))
    for idx in range(len(blocks)):
        process_block(blocks, idx)
    return blocks


In [None]:
blocks = parse("test.txt")

for id, block in enumerate(blocks):
    print(id)
    print(block["title"])
    print(block["block_type"])
    display(block["df"])

[]
None
narrative


None

[{'name': 'Sample Number', 'interval': (1, 6)}, {'name': '238U (ppb)', 'interval': (15, 19)}, {'name': '232Th (ppt)', 'interval': (27, 31)}, {'name': '230Th/232Th (atomic x10-6)', 'interval': (36, 49)}, {'name': 'd234U* (measured)', 'interval': (53, 62)}, {'name': '230Th/238U (activity)', 'interval': (68, 77)}, {'name': '230Th Age (uncorrected)', 'interval': (82, 94)}, {'name': '230Th AgeBP (corrected)', 'interval': (96, 107)}, {'name': '230Th Age (AD) (corrected)', 'interval': (111, 124)}]
None
header-only


None

[]
None
data


Unnamed: 0,Sample Number,238U (ppb),232Th (ppt),230Th/232Th (atomic x10-6),d234U* (measured),230Th/238U (activity),230Th Age (uncorrected),230Th AgeBP (corrected),230Th Age (AD) (corrected)
0,CAS-D-10,354.7�0.4,1�10,30000�420000,1163�2,0.00335�0.00007,169�4,112�4,1838�4
1,CAS-D-t,331.5�0.6,4�10,12000�34000,1177�6,0.00812�0.00013,407�6,351�6,1599�6
2,CAS-D-5,308.7�0.3,50�10,1000�250,1134�2,0.00937�0.00009,478�5,421�5,1529�5
3,CAS-D-6,281.8�0.3,10�10,4500�4600,1184�4,0.01106�0.00011,553�5,496�5,1454�5
4,CAS-D-2,212.5�0.2,10�10,6000�6000,1190�2,0.01847�0.00014,923�7,866�7,1084�7
5,CAS-D-12,286.1�0.3,10�10,11000�14000,1190�2,0.01918�0.00011,958�6,901�6,1049�6
6,CAS-D-13,326.3�0.3,20�10,6500�4100,1195�2,0.02168�0.00011,1081�6,1024�6,926�6
7,CAS-D-14,269.0�0.3,1�10,120000�1600000,1203�2,0.02468�0.00013,1227�7,1170�7,780�7
8,CAS-D-b,424.7�0.8,90�10,2300�250,1209�5,0.02907�0.00016,1439�9,1383�9,567�9
9,CAS-A-t,394.5�0.8,390�10,34�2,1166�5,0.00202�0.00007,89�7,33�7,1917�7


[]
None
narrative


None

[]
None
narrative


None

[{'name': 'Depth(cm)', 'interval': (1, 9)}, {'name': 'YearAD', 'interval': (12, 17)}, {'name': 'd18O (per mil PDB)', 'interval': (25, 42)}]
None
complete-tabular


Unnamed: 0,Depth(cm),YearAD,d18O (per mil PDB)
0,0.005,2005.670,-6.102
1,0.01,2004.366,-6.987
2,0.015,2003.062,-6.841
3,0.02,2001.758,-7.136
4,0.025,2000.454,-6.968
...,...,...,...
75,0.535,1907.872,-7.053
76,0.54,1906.568,-6.978
77,0.545,1905.264,-7.054
78,0.55,1903.960,


[]
None
narrative


None

[{'name': 'Depth(cm)', 'interval': (1, 9)}, {'name': 'YearAD', 'interval': (13, 18)}, {'name': 'd18O (per mil PDB)', 'interval': (25, 42)}]
None
complete-tabular


Unnamed: 0,Depth(cm),YearAD,d18O (per mil PDB)
0,0.004,1907.462,-6.751
1,0.020,1905.214,-6.971
2,0.040,1902.404,-6.876
3,0.060,1899.594,-6.824
4,0.080,1896.784,-6.771
...,...,...,...
516,13.830,1094.966,-7.969
517,13.855,1093.372,-7.357
518,13.880,1091.778,-7.893
519,13.905,1090.184,-7.822


In [5]:
blocks = parse("test2.txt")

for block in blocks:
    print(block["headers"])
    print(block["title"])
    print(block["block_type"])
    display(block["df"])

[]
None
narrative


None

[{'name': 'Sample Number', 'interval': (1, 13)}, {'name': 'Depth to top (mm)', 'interval': (16, 27)}, {'name': '238U (ppb)', 'interval': (34, 38)}, {'name': '232Th (ppt)', 'interval': (51, 56)}, {'name': '230Th/232Th (atomic x 10-6)', 'interval': (69, 83)}, {'name': 'd234U* measured', 'interval': (94, 101)}, {'name': '230Th/238U activity', 'interval': (113, 122)}, {'name': 'Age (yr) uncorrected', 'interval': (133, 143)}, {'name': 'Age (yr) corrected', 'interval': (154, 162)}, {'name': 'd234U initial corrected', 'interval': (173, 185)}, {'name': 'Age (yr BP)* corrected', 'interval': (195, 207)}]
None
header-only


None

[]
Dongge Cave, China (D4)
data


Unnamed: 0,Sample Number,Depth to top (mm),238U (ppb),232Th (ppt),230Th/232Th (atomic x 10-6),d234U* measured,230Th/238U activity,Age (yr) uncorrected,Age (yr) corrected,d234U initial corrected,Age (yr BP)* corrected
0,"Dongge Cave, China (D4)",,,,,,,,,,
1,D4-4 356 �0.5 62 �5,1402.0,,,6700 �600,-26 �1.8,0.06977 �0.00046,8101 �58,8091 �58,-26.6 �1.9,8035 �58
2,D4-9 n 388 �0.1 89 �4,1404.0,,,5000 �200,-23.9 �1.2,0.07009 �0.00014,8120 �19,8108 �21,-24.5 �1.2,8052 �21
3,D4-1 �0.8 �6,1410.0,497.2,468.0,1240 �20,-26.3 �1.5,0.07063 �0.00035,8206 �44,8157 �54,-26.9 �1.6,8101 �54
4,D4-8 n �0.1 �5,1425.0,493.8,191.0,3020 �80,-28.8 �0.5,0.07065 �0.00016,8232 �20,8212 �24,-29.5 �0.5,8156 �24
5,D4-10 n �0.1 �5,1447.0,370.5,,3100 �100,-13.2 �0.7,0.07259 �0.00017,8326 �21,8306 �25,-13.5 �0.7,8250 �25
6,D4-5 333 �0.4 67 �4,1452.0,,,6000 �400,-9.8 �2.0,0.07293 �0.00045,8336 �56,8326 �57,-10.1 �2.0,8270 �57
7,D4-2 �0.7 �6,1462.0,463.5,329.0,1700 �30,-10.7 �1.5,0.07333 �0.00036,8391 �45,8354 �51,-10.9 �1.5,8298 �51
8,D4-6 �0.3 �3,1470.0,305.1,170.0,2150 �40,-18.9 �1.7,0.07262 �0.00038,8379 �48,8350 �51,-19.3 �1.7,8294 �51
9,D4-11 n �0.1 96 �4,1492.0,359.8,,4600 �200,-10.1 �0.7,0.07382 �0.00016,8444 �20,8430 �22,-10.3 �0.7,8374 �22


[]
Dongge Cave, China (DA)
data


Unnamed: 0,Sample Number,Depth to top (mm),238U (ppb),232Th (ppt),230Th/232Th (atomic x 10-6),d234U* measured,230Th/238U activity,Age (yr) uncorrected,Age (yr) corrected,d234U initial corrected,Age (yr BP)* corrected
0,"Dongge Cave, China (DA)",,,,,,,,,,
1,DA5-13 �0.8 �5,858.5,586.3,570.0,1140 �10,-69.9 �1.7,0.06679 �0.00033,8126 �44,8073 �56,-71.5 �1.7,8018 �56
2,DA5-6 �0.7 �10,859.0,545.6,200.0,2900 �200,-72.9 �1.1,0.06594 �0.00048,8046 �61,8025 �63,-74.5 �1.2,7970 �63
3,DA5-5 �0.8 �10,865.1,615.3,590.0,1150 �30,-75.3 �1.2,0.06705 �0.00051,8209 �66,8156 �75,-77.1 �1.2,8101 �75
4,DA-17 n �0.1 �5,865.5,752.3,689.0,1210 �10,-74.8 �0.7,0.06703 �0.00015,8201 �20,8151 �32,-76.5 �0.7,8096 �32
5,DA5-4 �0.8 �10,871.2,516.2,340.0,1710 �60,-75.6 �1.3,0.06747 �0.00071,8265 �91,8229 �94,-77.3 �1.4,8174 �94
6,DA5-7 �0.6 �6,876.5,490.1,329.0,1670 �30,-74 �1.2,0.06797 �0.00039,8314 �50,8277 �56,-75.8 �1.2,8222 �56
7,DA5-8 �0.7 �6,876.5,610.6,394.0,1730 �30,-78 �1.1,0.06756 �0.00041,8300 �53,8264 �58,-79.9 �1.2,8209 �58
8,DA-15 n �0.1 �5,879.5,690.7,538.0,1440 �10,-73.6 �1.3,0.06816 �0.00015,8334 �22,8291 �31,-75.3 �1.3,8236 �31
9,DA5-3 �0.8 �10,880.5,601.8,290.0,2400 �100,-78.2 �1.2,0.06771 �0.00048,8320 �63,8294 �65,-80.1 �1.2,8239 �65


[]
Qunf Cave, Oman (Q5)
data


Unnamed: 0,Sample Number,Depth to top (mm),238U (ppb),232Th (ppt),230Th/232Th (atomic x 10-6),d234U* measured,230Th/238U activity,Age (yr) uncorrected,Age (yr) corrected,d234U initial corrected,Age (yr BP)* corrected
0,"Qunf Cave, Oman (Q5)",,,,,,,,,,
1,Q5-15 n 2 �0.8 �10,,659.6,250.0,2800 �100,-83.1 �1.6,0.06592 �0.00027,8138 �37,8125 �38,-85.1 �1.6,8069 �38
2,Q5-14 n 7 646 �0.7 �10,,,290.0,2500 �100,-75.9 �1.4,0.06692 �0.00026,8197 �36,8183 �36,-77.7 �1.4,8127 �36
3,Q5-19 n �0.7 �10,11.5,739.4,870.0,950 �10,-71 �1.4,0.06763 �0.00013,8242 �21,8205 �28,-72.7 �1.4,8149 �28
4,Q5-6 n �0.1 �4,17.0,492.1,295.0,1840 �20,-79.3 �0.4,0.06695 �0.00015,8234 �19,8215 �21,-81.2 �0.4,8159 �21
5,Q5-10 n 593 �0.9 �10,26.0,,970.0,681 �9,-84.9 �1.8,0.06758 �0.00022,8368 �33,8316 �42,-86.9 �1.8,8260 �42
6,Q5-7 n �0.1 �6,27.0,632.8,530.0,1340 �20,-83.3 �0.6,0.06791 �0.00015,8395 �20,8369 �24,-85.3 �0.6,8313 �24
7,Q5-20 n �0.7 �10,28.0,661.1,200.0,3800 �200,-83.1 �1.4,0.06804 �0.00015,8410 �24,8400 �24,-85.1 �1.4,8344 �24
8,Q5-21 n �0.6 �40,36.5,698.1,3350.0,242 �3,-71.2 �1.5,0.07037 �0.00018,8592 �27,8441 �80,-72.9 �1.5,8385 �80
9,Q5-8 n 703 �0.1 �10,40.5,,1350.0,598 �6,-80.4 �0.4,0.06949 �0.00014,8570 �18,8509 �35,-82.3 �0.4,8453 �35


[]
Paix�o Cave, Brazil (PX5)
data


Unnamed: 0,Sample Number,Depth to top (mm),238U (ppb),232Th (ppt),230Th/232Th (atomic x 10-6),d234U* measured,230Th/238U activity,Age (yr) uncorrected,Age (yr) corrected,d234U initial corrected,Age (yr BP)* corrected
0,"Paix�o Cave, Brazil (PX5)",,,,,,,,,,
1,PX-6 n 1884 �2 �5,91.0,,217.0,24500 �600,1363 �2,0.17128 �0.00028,8150 �17,8149 �17,1394.4 �2.5,8091 �17
2,PX-7 n 1382 �1 �3,94.0,,136.0,28900 �700,1381 �2,0.17287 �0.00028,8162 �16,8161 �16,1413.3 �2.3,8103 �16
3,PX-8 n 1610 �2 �5,97.0,,232.0,19900 �400,1392 �2,0.17435 �0.00029,8194 �17,8192 �17,1425 �2.5,8134 �17
4,PX-9 n 2094 �3 82 �2,100.5,,,74600 �2200,1405 �3,0.17617 �0.00032,8238 �18,8237 �18,1437.9 �2.7,8179 �18
5,PX-11 n 2456 �3 �3,104.0,,110.0,65200 �1700,1408 �3,0.17715 �0.00033,8275 �19,8274 �19,1441.2 �2.6,8216 �19


[]
Hoti Cave, Oman (H14)
data


Unnamed: 0,Sample Number,Depth to top (mm),238U (ppb),232Th (ppt),230Th/232Th (atomic x 10-6),d234U* measured,230Th/238U activity,Age (yr) uncorrected,Age (yr) corrected,d234U initial corrected,Age (yr BP)* corrected
0,"Hoti Cave, Oman (H14)",,,,,,,,,,
1,H14-10 n �0.5 �8,160.9,1696.1,584.0,5300 �100,599.9 �0.4,0.11026 �0.00034,7764 �25,7758 �25,613.2 �0.4,7702 �25
2,H14-9 n �0.5 �8,166.3,1786.9,796.0,4100 �40,596.4 �0.6,0.11066 �0.00032,7812 �23,7804 �24,609.7 �0.6,7748 �24
3,H14-8 n �0.5 �8,201.2,1700.4,285.0,11100 �300,605.9 �0.6,0.11253 �0.00036,7899 �26,7896 �26,619.6 �0.6,7840 �26
4,H14-7 n �0.5 �9,215.9,1896.2,118.0,30000 �2200,602.2 �0.4,0.11268 �0.00034,7929 �25,7928 �25,615.9 �0.4,7872 �25
5,H14-6 n 1616 �0.4 �7,238.3,,222.0,13800 �400,606 �0.5,0.11434 �0.00033,8029 �24,8027 �24,619.9 �0.5,7971 �24
6,H14-15 n �0.1 �6,249.2,1739.4,635.0,5140 �50,588.7 �0.3,0.1138 �0.00021,8063 �15,8056 �16,602.2 �0.3,8000 �15
7,H14-18 n �0.1 �6,272.2,1507.1,114.0,25000 �1300,593.3 �0.5,0.11491 �0.00019,8119 �14,8118 �14,607.1 �0.5,8062 �14
8,H14-5 n 1543 �0.4 �9,275.9,,665.0,4430 �60,578.4 �0.5,0.1156 �0.00036,8269 �27,8261 �27,592 �0.5,8205 �27
9,H14-19 n �0.1 �6,276.0,1151.5,314.0,7000 �130,577.9 �0.4,0.11578 �0.00019,8266 �14,8261 �15,591.6 �0.4,8205 �15


[]
Padre Cave, Brazil (PAD07)
data


Unnamed: 0,Sample Number,Depth to top (mm),238U (ppb),232Th (ppt),230Th/232Th (atomic x 10-6),d234U* measured,230Th/238U activity,Age (yr) uncorrected,Age (yr) corrected,d234U initial corrected,Age (yr BP)* corrected
0,"Padre Cave, Brazil (PAD07)",,,,,,,,,,
1,B2-8 �0.8 �20,131.0,652.9,440.0,3100 �100,905.3 �2.0,0.12761 �0.00066,7514 �41,7503 �41,924.7 �2.1,7448 �41
2,B2-16 n �0.1 �5,166.0,782.9,113.0,14600 �700,865.9 �0.8,0.12802 �0.00023,7704 �15,7701 �15,885 �0.8,7645 �15
3,B2-7 1080 �2 �10,190.0,,290.0,7600 �300,770.2 �2.1,0.12296 �0.00046,7804 �32,7800 �32,787.3 �2.1,7745 �32
4,B2-15 n �0.1 �4,211.0,789.6,104.0,15700 �600,790.6 �0.9,0.12538 �0.00022,7869 �15,7867 �15,808.3 �0.9,7811 �15
5,B2-6-II 979 �1 �10,237.0,,130.0,16500 �1500,879.1 �1.5,0.13355 �0.00056,7988 �36,7986 �36,899.2 �1.6,7931 �36
6,B2-14 n �0.1 �5,256.0,724.5,100.0,15300 �700,837.2 �1.0,0.13273 �0.00020,8126 �14,8124 �14,856.6 �1.0,8068 �14
7,B2-21 n �0.8 �10,267.0,979.3,240.0,8700 �400,785.2 �2.1,0.1296 �0.00022,8168 �17,8164 �17,803.6 �2.1,8108 �17
8,B2-5 856 �1 �10,274.0,,260.0,7300 �400,814.9 �2.1,0.13255 �0.00055,8218 �37,8214 �37,834 �2.1,8159 �37
9,B2-13 n �0.1 �4,279.0,890.4,220.0,7000 �200,809.9 �0.5,0.13207 �0.00018,8211 �12,8207 �12,828.9 �0.5,8152 �12


[]
None
narrative


None

[]
None
narrative


None

[]
None
narrative


None

[]
None
narrative


None

[]
None
narrative


None

[{'name': '1. Dongge Cave, China', 'interval': (1, 21)}, {'name': '(D4)', 'interval': (24, 27)}]
None
header-only


None

[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (16, 18)}, {'name': 'd18O', 'interval': (26, 29)}]
None
complete-tabular


Unnamed: 0,Depth,Age,d18O
0,1395,7992,-8.96
1,1396,7999,-9.2
2,1397,8007,-8.91
3,1398,8014,-8.84
4,1400,8027,-8.91
...,...,...,...
188,1513,8417,-9.07
189,1518,8432,-8.96
190,1523,8448,-9.07
191,1528,8466,-9.41


[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (16, 18)}, {'name': 'd18O', 'interval': (26, 29)}]
2. Dongge Cave, China (DA)
complete-tabular


Unnamed: 0,Depth,Age,d18O
0,Depth,Age,d18O
1,858.5,7990,-8.79
2,858.8,7996,-8.64
3,859,8000,-8.72
4,859.3,8006,-8.65
...,...,...,...
101,882.5,8313,-8.8
102,882.5,8321,-8.88
103,883.5,8328,-8.99
104,884.5,8336,-8.88


[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (16, 18)}, {'name': 'd18O', 'interval': (26, 29)}]
3. Qunf Cave, Oman (Q5)
complete-tabular


Unnamed: 0,Depth,Age,d18O
0,Depth,Age,d18O
1,0,8030,-1.89
2,1,8050,-1.85
3,3,8088,-1.55
4,4.5,8108,-1.8
5,7,8127,-1.86
6,9.5,8138,-1.68
7,11.5,8149,-1.66
8,14.5,8154,-1.59
9,17,8159,-1.58


[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (16, 18)}, {'name': 'd18O', 'interval': (26, 29)}]
4. Hoti Cave, Oman (H14)
complete-tabular


Unnamed: 0,Depth,Age,d18O
0,Depth,Age,d18O
1,153.8,7678,-4.08
2,154.3,7680,-4.08
3,154.8,7682,-4.23
4,155.3,7684,-4.1
...,...,...,...
385,381.3,8552,-5.04
386,382.1,8555,-4.91
387,383,8559,-4.88
388,383.8,8562,-4.77


[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (16, 18)}, {'name': 'd18O', 'interval': (26, 29)}]
5. Paix�o Cave, Brazil (PX5)
complete-tabular


Unnamed: 0,Depth,Age,d18O
0,Depth,Age,d18O
1,90,8087,-5.07
2,90.5,8089,-5.37
3,91,8091,-5.49
4,91.5,8093,-5.57
5,92,8095,-4.79
6,92.5,8097,-4.17
7,93,8099,-4.86
8,93.5,8101,-4.51
9,94,8103,-4.16


[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (16, 18)}, {'name': 'd18O', 'interval': (26, 29)}]
6. Padre Cave, Brazil (PAD07)
complete-tabular


Unnamed: 0,Depth,Age,d18O
0,Depth,Age,d18O
1,199,7782,-6.151
2,200.5,7789,-6.372
3,202,7797,-6.474
4,203.5,7804,-6.387
...,...,...,...
195,407.5,8503,-6.19
196,409.8,8504,-6.21
197,412,8505,-6.43
198,416,8507,-6.45


In [6]:
blocks = parse("test3.txt")

for block in blocks:
    print(block["headers"])
    print(block["title"])
    print(block["block_type"])
    display(block["df"])

[]
None
narrative


None

[]
None
narrative


None

[{'name': 'Sample mid depth', 'interval': (1, 9)}, {'name': 'Sample length', 'interval': (13, 18)}, {'name': 'd18O air (per mil)', 'interval': (21, 29)}, {'name': 'Sample mid gas age (default)', 'interval': (33, 42)}, {'name': 'delta age (default)', 'interval': (45, 53)}, {'name': 'Sample mid gas age (LD min)', 'interval': (56, 65)}, {'name': 'delta age (LD min)', 'interval': (69, 77)}]
None
complete-tabular


Unnamed: 0,Sample mid depth,Sample length,d18O air (per mil),Sample mid gas age (default),delta age (default),Sample mid gas age (LD min),delta age (LD min)
0,1108.637,0.15,-0.062,9297,88,9270,46
1,1113.567,0.13,0.247,9981,131,9980,49
2,1126.287,0.15,0.908,13195,263,13179,70
3,1127.492,0.1,0.859,13735,275,13684,75
4,1128.407,0.05,1.103,14164,273,14080,75
5,1129.802,0.05,1.17,14981,406,14924,76
6,1132.227,0.07,1.111,16836,761,16925,120
7,1132.752,0.05,1.032,17544,758,17658,119


[]
None
narrative


None

[]
None
narrative


None

[{'name': 'Sample mid depth', 'interval': (1, 9)}, {'name': 'Sample length', 'interval': (13, 18)}, {'name': 'CH4 ppbv', 'interval': (24, 27)}, {'name': 'Sample mid gas age (default)', 'interval': (31, 41)}, {'name': 'delta age (default)', 'interval': (44, 52)}, {'name': 'Sample mid gas age (LD min)', 'interval': (55, 64)}, {'name': '// two elements delta age (LD min)', 'interval': (68, 82)}]
None
complete-tabular


Unnamed: 0,Sample mid depth,Sample length,CH4 ppbv,Sample mid gas age (default),delta age (default),Sample mid gas age (LD min),// two elements delta age (LD min)
0,1108.637,0.15,657,9296.4,89,9269.1,47 // N N N N N
1,1111.752,0.1,659,9634.5,128,9634.5,48
2,1113.567,0.13,696,9979.9,132,9979.5,50
3,1115.872,0.1,702,10432.5,137,10431.6,54
4,1118.327,0.13,668,10940.4,137,10936.7,54
5,1118.492,0.1,674,10968.9,143,10968.6,56
6,1121.652,0.1,472,11619.8,192,11620.0,60
7,1122.342,0.1,484,11838.5,193,11834.0,60
8,1123.487,0.05,478,12195.7,204,12188.4,67
9,1124.022,0.1,444,12373.1,202,12360.9,65


[]
None
narrative


None

[]
None
narrative


None

[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age (Default)', 'interval': (13, 21)}, {'name': 'Age (LD min)', 'interval': (27, 34)}, {'name': 'd18O (per mil)', 'interval': (39, 47)}]
None
complete-tabular


Unnamed: 0,Depth,Age (Default),Age (LD min),d18O (per mil)
0,1105.282,8999.4,,-21.27
1,1105.382,9010.6,,-21.36
2,1105.482,9021.9,,-21.39
3,1105.582,9033.1,,-20.94
4,1105.682,9044.4,,-20.91
...,...,...,...,...
427,1133.587,,18956.4,-28.02
428,1133.596,,18968.7,-28.29
429,1133.604,,18981,-28.29
430,1133.613,,18993.4,-28.36


In [8]:
blocks = parse("test4.txt")

for block in blocks:
    print(block["headers"])
    print(block["title"])
    print(block["block_type"])
    display(block["df"])

[]
None
narrative


None

[]
None
narrative


None

[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age*a', 'interval': (11, 15)}, {'name': 'TOC*b', 'interval': (19, 23)}, {'name': 'C:N', 'interval': (29, 31)}, {'name': 'd13C-TOC', 'interval': (35, 42)}, {'name': 'Depth', 'interval': (48, 52)}, {'name': 'Age', 'interval': (60, 62)}, {'name': 'TOC', 'interval': (69, 71)}, {'name': 'C:N', 'interval': (78, 80)}, {'name': 'd13C-TOC', 'interval': (85, 92)}]
None
complete-tabular


Unnamed: 0,Depth,Age*a,TOC*b,C:N,d13C-TOC,Depth.1,Age,TOC,C:N.1,d13C-TOC.1
0,2.5 - - -,0.2,18.1,8.4,-21.34,135.0*c,11.7,,,
1,7.5 - - -,0.7,18.3,8.3,-21.49,[112.0],11.7,,,
2,12.5,1.2,18.4,8.3,-21.55,137.5,12.2,10.7,8.2,-22.34
3,17.5,1.6,19.5,8.3,-21.52,[122.0],12.8,8.9,8.0,-22.75
4,[11.5]*d,1.7,20.0,7.4,-21.01,142.5,13.0,9.0,7.8,-22.96
5,22.5 - - -,2.1,16.7,8.5,-21.43,145.0*c,13.2,,,
6,27.5 - - -,2.6,21.1,8.6,-21.45,[124.0],13.2,,,
7,32.5,3.0,21.0,8.5,-21.5,147.5,13.9,7.9,8.0,-23.75
8,37.5,3.5,20.3,8.7,-21.57,152.5,15.4,7.3,8.8,-24.98
9,42.5,4.0,18.1,8.5,-21.59,[132.0],15.4,10.6,6.8,-23.09


[]
None
narrative


None

[{'name': '2. Table 1A,', 'interval': (1, 12)}, {'name': 'Molecular and carbon isotopic data for cores KH-79-3, L-3 and KH-79-3, C-3.', 'interval': (15, 89)}]
None
header-only


None

[]
None
narrative


None

[{'name': 'Depth', 'interval': (1, 5)}, {'name': 'Age', 'interval': (10, 12)}, {'name': 'LSR', 'interval': (18, 20)}, {'name': 'DBD', 'interval': (28, 30)}, {'name': 'TOC-MAR', 'interval': (34, 40)}, {'name': 'Alk.', 'interval': (44, 47)}, {'name': 'SST', 'interval': (55, 57)}, {'name': 'Dino', 'interval': (63, 66)}, {'name': 'C28', 'interval': (73, 75)}, {'name': 'Phytol', 'interval': (79, 84)}, {'name': 'd13C37', 'interval': (88, 93)}, {'name': 'd13C38', 'interval': (97, 102)}, {'name': 'd13C28', 'interval': (106, 111)}, {'name': 'd13Cdin', 'interval': (116, 123)}]
None
complete-tabular


Unnamed: 0,Depth,Age,LSR,DBD,TOC-MAR,Alk.,SST,Dino,C28,Phytol,d13C37,d13C38,d13C28,d13Cdin
0,2.5 - - - -,0.2,10.7,0.38,74.0,,,1.84,4.14,4.27,-25.1,-24.8,,
1,7.5 - - -,0.7,10.7,0.38,74.0,1.52,19.1,1.74,3.94,3.87,,-24.1,,
2,12.5 - - - -,1.2,10.7,0.4,79.0,,,1.9,3.29,3.56,-21.3,-22.7,,
3,17.5 - - - - - -,1.6,10.7,0.4,84.0,,,1.52,2.99,3.08,,,,
4,[11.5]*a 1.7 - - - -,,10.7,0.4,86.0,,,3.53,5.16,2.05,,,-23.9,-23.2
5,22.5 - - - -,2.1,10.7,0.36,64.0,1.93,18.4,1.21,2.11,2.72,,,,
6,27.5 - - - - - -,2.6,10.7,0.36,81.0,,,1.54,2.79,3.42,,,,
7,32.5 - - - -,3.0,10.7,0.34,76.0,1.32,17.9,1.7,2.53,3.1,,,,
8,37.5 - - - - - -,3.5,10.7,0.34,74.0,,,1.54,2.58,3.02,,,,
9,42.5 - - - -,4.0,10.7,0.36,70.0,,,1.67,2.56,2.53,-23.5,-23.7,,


[]
None
narrative


None

In [25]:
blocks = parse("test5.txt")

for id, block in enumerate(blocks):
    # print(id, block)
    print(block["lines"])
    print(block["block_type"])
    # display(block["df"])

[1, 1, 1, 1, 1, 1, 2]
0 1 0.012987012987012988
[1]
1 1 0.0
[1, 5, 7]
2 1 0.0
[7, 7, 7, 7, 7, 7, 8, 7, 7]
3 7 0.5246913580246914
[1, 5, 7]
4 1 0.0
[7, 7, 7, 7, 7, 7, 7, 7, 7]
5 7 0.5308641975308642
[1, 5, 7]
6 1 0.0
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
7 7 0.5357142857142857
[1, 5, 7]
8 1 0.0
[7, 7]
9 7 0.5555555555555556
[1, 1, 1, 1, 1, 1, 1, 1]
10 1 0.0125
[1, 1, 4, 6]
11 1 0.0
[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
12 6 0.7416666666666667
[{'line_idx': 57, 'text': 'Table A.', 'count_multispace_tokens': 1, 'count_single_tokens': 2, 'count_tab_tokens': 1, 'numeric_multispace_ratio': 0.0, 'numeric_single_ratio': 0.0, 'numeric_tab_ratio': 0.0, 'line_len': 8}, {'line_idx': 58, 'text': 'Results of Inter-depositional environment analysis.  ', 'count_multispace_tokens': 1, 'count_single_tokens': 5, 'count_tab_tokens': 1, 'numeric_multispace_ratio