In [1]:
import re
import pandas as pd
import chardet

# pd.set_option("display.max_colwidth", None)
# pd.set_option("display.max_rows", None)

In [2]:
def get_token_intervals_multi(line):
    """
    Map every token of ``line`` to its inclusive, 1-indexed (start, end) columns.

    A token is any non-empty stretch of text lying between runs of two or more
    whitespace characters; a single whitespace character does not end a token.

    The result is keyed by the token text itself, so when the same text occurs
    several times in the line only one entry is kept, holding the interval of
    the last occurrence.

    Example:
      Given the line: "Age (yr)  Depth (cm)"
      This function returns:
         {"Age (yr)": (1, 8), "Depth (cm)": (11, 20)}
    """
    intervals = {}
    # With a single capturing group, re.split alternates text and delimiter:
    # even indices hold the text between delimiters (possibly empty), odd
    # indices hold the whitespace runs that were matched.
    pieces = re.split(r'(\s{2,})', line)
    offset = 0
    for index, piece in enumerate(pieces):
        width = len(piece)
        if index % 2 == 0 and piece:
            # offset counts the characters consumed so far (0-based), hence
            # the token spans columns offset + 1 .. offset + width.
            intervals[piece] = (offset + 1, offset + width)
        # Tokens and delimiters both advance the running column offset.
        offset += width
    return intervals

def get_token_intervals_multi_2(line):
    """
    For a given line, return a dictionary where each key is a token (a contiguous
    sequence of characters not interrupted by two or more whitespace characters)
    and the value is a tuple (start, end) giving the inclusive, 1-indexed
    positions of the token in the line.

    If a token appears more than once, subsequent occurrences are suffixed with
    a counter ("<token> 2", "<token> 3", ...). Should a generated key already be
    in use (for instance because the line also contains the literal token
    "<token> 2"), the counter keeps increasing until an unused key is found, so
    no interval is ever silently overwritten.

    Args:
      line: The text line to tokenize.

    Returns:
      dict mapping a unique token key to its (start, end) interval, in order of
      appearance.

    Example:
      Given the line:
        "Age (yr)  Depth (cm)  Age (yr)  corrected  corrected  corrected"
      This function returns:
         {
           "Age (yr)": (1, 8),
           "Depth (cm)": (11, 20),
           "Age (yr) 2": (23, 30),
           "corrected": (33, 41),
           "corrected 2": (44, 52),
           "corrected 3": (55, 63)
         }
    """
    tokens = {}
    token_counts = {}  # Highest counter used so far for each raw token text
    pos = 0
    # The capturing group keeps the delimiters in the result so that their
    # length can be accounted for when tracking the column position.
    for part in re.split(r'(\s{2,})', line):
        if part and not re.fullmatch(r'\s{2,}', part):
            count = token_counts.get(part, 0) + 1
            token_key = part if count == 1 else f"{part} {count}"
            # Guard against collisions between a generated key and a literal
            # token (or an earlier generated key) already stored in the result.
            while token_key in tokens:
                count += 1
                token_key = f"{part} {count}"
            token_counts[part] = count
            tokens[token_key] = (pos + 1, pos + len(part))
        # Tokens and delimiters alike advance the current position.
        pos += len(part)
    return tokens

# --- Example usage ---
# The sample repeats "Age (yr)" and "corrected". get_token_intervals_multi keys
# its result by token text, so each repeated token shows up once, carrying the
# interval of its last occurrence.
sample_text = "Age (yr)  Depth (cm)  Age (yr)  corrected  corrected  corrected"
tokens = get_token_intervals_multi(sample_text)
for token in tokens:
    interval = tokens[token]
    print(f"Token: '{token}', Interval: {interval}")


Token: 'Age (yr)', Interval: (23, 30)
Token: 'Depth (cm)', Interval: (11, 20)
Token: 'corrected', Interval: (55, 63)


In [3]:
def compute_interval_overlap(interval1, interval2):
    """
    Measure how much two inclusive, 1-indexed intervals share.

    Args:
      interval1: (start, end) of the first interval.
      interval2: (start, end) of the second interval.

    Returns:
      (raw_overlap, jaccard) where raw_overlap is the number of positions
      covered by both intervals and jaccard is raw_overlap divided by the
      number of positions covered by either one (0 when that count is zero).
    """
    lo1, hi1 = interval1
    lo2, hi2 = interval2

    # Intersection runs from the later start to the earlier end; a negative
    # span means the intervals are disjoint and is clamped to zero.
    shared = max(0, min(hi1, hi2) - max(lo1, lo2) + 1)

    # Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|.
    size1 = hi1 - lo1 + 1
    size2 = hi2 - lo2 + 1
    combined = size1 + size2 - shared

    if not combined:
        return shared, 0
    return shared, shared / combined

In [4]:
def segregate_blocks(lines):
    """
    Split ``lines`` into blocks, using blank (empty or whitespace-only) lines
    as separators.

    Returns a list of blocks; each block is a list of the consecutive non-blank
    lines, unchanged and in their original order. Blank lines are dropped and
    never produce an empty block.
    """
    # The last element is always the block currently being filled.
    grouped = [[]]
    for text in lines:
        if text.strip():
            grouped[-1].append(text)
        elif grouped[-1]:
            # First blank line after some content: start a new block.
            grouped.append([])
    # Discard the trailing placeholder if nothing was added to it.
    if not grouped[-1]:
        grouped.pop()
    return grouped

In [5]:
def _decode_with_detected_encoding(raw_data):
    """Decode ``raw_data`` with the encoding guessed by chardet (UTF-8 if none)."""
    # chardet may report no encoding at all (None), e.g. for empty input;
    # passing None to bytes.decode would raise TypeError, so fall back to UTF-8.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    return raw_data.decode(encoding)


def _make_row(token, interval, line_number, overlaps_with=None, raw=None, jaccard=None):
    """Build one output record; key order fixes the DataFrame column order."""
    return {
        "Token": token,
        "Start Range": interval[0],
        "End Range": interval[1],
        "Overlaps With": overlaps_with,
        "Raw Score": raw,
        "Jaccard Score": jaccard,
        "Line": line_number
    }


def _build_block_rows(line_tokens):
    """Turn the per-line token intervals of one block into output records."""
    header_tokens = line_tokens[0]

    # Header tokens come first, tagged as line 1 with no overlap information.
    rows = [_make_row(token, interval, 1) for token, interval in header_tokens.items()]

    for line_number, token_dict in enumerate(line_tokens[1:], start=2):
        for token, interval in token_dict.items():
            matched = False
            for header_token, header_interval in header_tokens.items():
                raw, jaccard = compute_interval_overlap(interval, header_interval)
                if raw > 0:
                    matched = True
                    # One row per overlapping header token.
                    rows.append(_make_row(token, interval, line_number,
                                          header_token, raw, round(jaccard, 2)))
            if not matched:
                # Keep tokens that align with no header token, with null scores.
                rows.append(_make_row(token, interval, line_number))
    return rows


def parse_intervals_overlaps(file_path):
    """
    Reads the file from file_path, segregates it into blocks (separated by empty lines),
    and for each block computes token intervals (using a multi-space delimiter) and the
    overlaps between tokens in the header (first line) and each data line.

    The file is read as bytes and decoded with the encoding detected by chardet;
    if chardet reports no encoding, UTF-8 is used.

    For every block, the number of blocks / a progress banner is printed and the
    first ``5 * <number of header tokens>`` rows of the block's DataFrame are
    displayed.

    Args:
      file_path: Path of the text file to analyse.

    Returns:
      A list of DataFrames, one per block, with the following columns:
         - Token, Start Range, End Range, Overlaps With, Raw Score, Jaccard Score, Line
      Header tokens are reported with Line = 1 and null "Overlaps With",
      "Raw Score" and "Jaccard Score". A data-line token yields one row per
      header token it overlaps (Jaccard rounded to 2 decimals), or a single row
      with null overlap columns when it overlaps none.
    """
    with open(file_path, 'rb') as file:
        raw_data = file.read()
    file_text = _decode_with_detected_encoding(raw_data)

    lines = file_text.splitlines()
    blocks = segregate_blocks(lines)
    block_dataframes = []

    print(len(blocks))
    for block_idx, block in enumerate(blocks, start=1):
        print(f"--- Processing Block {block_idx} ---")
        # segregate_blocks never yields an empty block, so index 0 (the header)
        # always exists.
        line_tokens = [get_token_intervals_multi_2(line) for line in block]
        header_tokens = line_tokens[0]

        df_block = pd.DataFrame(_build_block_rows(line_tokens))
        block_dataframes.append(df_block)
        # Preview only: roughly the header plus the first few data lines.
        display(df_block.head(len(header_tokens)*5))
        print("\n========================\n")

    return block_dataframes

In [6]:
if __name__ == "__main__":
    # Announce the run, then analyse the sample file; the per-block DataFrames
    # are kept in `dfs` for inspection in later cells.
    print("Displaying analysis of tokens overlaps.\n")
    file_path = "./test_interval.txt"
    dfs = parse_intervals_overlaps(file_path)

Displaying analysis of tokens overlaps.

12
--- Processing Block 1 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Sample Number,1,13,,,,1
1,Depth to top,16,27,,,,1
2,238U,34,37,,,,1
3,232Th,51,55,,,,1
4,230Th/232Th,71,81,,,,1
5,d234U*,95,100,,,,1
6,230Th/238U,113,122,,,,1
7,Age (yr),135,142,,,,1
8,Age (yr) 2,155,162,,,,1
9,d234U initial,173,185,,,,1




--- Processing Block 2 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Sample Number,1,13,,,,1
1,Depth to top,16,27,,,,1
2,238U,34,37,,,,1
3,232Th,51,55,,,,1
4,230Th/232Th,71,81,,,,1
5,d234U*,95,100,,,,1
6,230Th/238U,113,122,,,,1
7,Age (yr),135,142,,,,1
8,Age (yr) 2,155,162,,,,1
9,d234U initial,173,185,,,,1




--- Processing Block 3 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Sample Number,1,13,,,,1
1,Depth to top,16,27,,,,1
2,238U,34,37,,,,1
3,232Th,51,55,,,,1
4,230Th/232Th,71,81,,,,1
5,d234U*,95,100,,,,1
6,230Th/238U,113,122,,,,1
7,Age (yr),135,142,,,,1
8,Age (yr) 2,155,162,,,,1
9,d234U initial,173,185,,,,1




--- Processing Block 4 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Sample Number,1,13,,,,1
1,Depth to top,16,27,,,,1
2,238U,34,37,,,,1
3,232Th,51,55,,,,1
4,230Th/232Th,71,81,,,,1
5,d234U*,95,100,,,,1
6,230Th/238U,113,122,,,,1
7,Age (yr),135,142,,,,1
8,Age (yr) 2,155,162,,,,1
9,d234U initial,173,185,,,,1




--- Processing Block 5 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Sample Number,1,13,,,,1
1,Depth to top,16,27,,,,1
2,238U,34,37,,,,1
3,232Th,51,55,,,,1
4,230Th/232Th,71,81,,,,1
5,d234U*,95,100,,,,1
6,230Th/238U,113,122,,,,1
7,Age (yr),135,142,,,,1
8,Age (yr) 2,155,162,,,,1
9,d234U initial,173,185,,,,1




--- Processing Block 6 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Sample Number,1,13,,,,1
1,Depth to top,16,27,,,,1
2,238U,34,37,,,,1
3,232Th,51,55,,,,1
4,230Th/232Th,71,81,,,,1
5,d234U*,95,100,,,,1
6,230Th/238U,113,122,,,,1
7,Age (yr),135,142,,,,1
8,Age (yr) 2,155,162,,,,1
9,d234U initial,173,185,,,,1




--- Processing Block 7 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Depth,1,5,,,,1
1,Age,16,18,,,,1
2,d18O,26,29,,,,1
3,1395,1,4,Depth,4.0,0.8,2
4,7992,15,18,Age,3.0,0.75,2
5,-8.96,25,29,d18O,4.0,0.8,2
6,1396,1,4,Depth,4.0,0.8,3
7,7999,15,18,Age,3.0,0.75,3
8,-9.2,26,29,d18O,4.0,1.0,3
9,1397,1,4,Depth,4.0,0.8,4




--- Processing Block 8 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Depth,1,5,,,,1
1,Age,16,18,,,,1
2,d18O,26,29,,,,1
3,858.5,1,5,Depth,5.0,1.0,2
4,7990,15,18,Age,3.0,0.75,2
5,-8.79,25,29,d18O,4.0,0.8,2
6,858.8,1,5,Depth,5.0,1.0,3
7,7996,15,18,Age,3.0,0.75,3
8,-8.64,25,29,d18O,4.0,0.8,3
9,859,1,3,Depth,3.0,0.6,4




--- Processing Block 9 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Depth,1,5,,,,1
1,Age,16,18,,,,1
2,d18O,26,29,,,,1
3,0,1,1,Depth,1.0,0.2,2
4,8030,15,18,Age,3.0,0.75,2
5,-1.89,25,29,d18O,4.0,0.8,2
6,1,1,1,Depth,1.0,0.2,3
7,8050,15,18,Age,3.0,0.75,3
8,-1.85,25,29,d18O,4.0,0.8,3
9,3,1,1,Depth,1.0,0.2,4




--- Processing Block 10 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Depth,1,5,,,,1
1,Age,16,18,,,,1
2,d18O,26,29,,,,1
3,153.8,1,5,Depth,5.0,1.0,2
4,7678,15,18,Age,3.0,0.75,2
5,-4.08,25,29,d18O,4.0,0.8,2
6,154.3,1,5,Depth,5.0,1.0,3
7,7680,15,18,Age,3.0,0.75,3
8,-4.08,25,29,d18O,4.0,0.8,3
9,154.8,1,5,Depth,5.0,1.0,4




--- Processing Block 11 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Depth,1,5,,,,1
1,Age,16,18,,,,1
2,d18O,26,29,,,,1
3,90,1,2,Depth,2.0,0.4,2
4,8087,15,18,Age,3.0,0.75,2
5,-5.07,25,29,d18O,4.0,0.8,2
6,90.5,1,4,Depth,4.0,0.8,3
7,8089,15,18,Age,3.0,0.75,3
8,-5.37,25,29,d18O,4.0,0.8,3
9,91,1,2,Depth,2.0,0.4,4




--- Processing Block 12 ---


Unnamed: 0,Token,Start Range,End Range,Overlaps With,Raw Score,Jaccard Score,Line
0,Depth,1,5,,,,1
1,Age,16,18,,,,1
2,d18O,26,29,,,,1
3,199,1,3,Depth,3.0,0.6,2
4,7782,15,18,Age,3.0,0.75,2
5,-6.151,24,29,d18O,4.0,0.67,2
6,200.5,1,5,Depth,5.0,1.0,3
7,7789,15,18,Age,3.0,0.75,3
8,-6.372,24,29,d18O,4.0,0.67,3
9,202,1,3,Depth,3.0,0.6,4






In [7]:
# Number of per-block DataFrames returned by parse_intervals_overlaps.
len(dfs)

12