Show the keys available in an ArXiv-tables dataset entry.

In [None]:
from datasets import load_dataset

# Load the "train" split of the staghado/ArXiv-tables dataset.
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# List the field names exposed by the first record.
first_entry = dataset[0]
print(first_entry.keys())

Print the LaTeX source of the first entry.

In [None]:
from datasets import load_dataset

# Load the "train" split of the staghado/ArXiv-tables dataset.
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Print the LaTeX source of the first record only: the loop is left after
# its first pass.
for i, entry in enumerate(dataset):
    latex_code = entry["latex_content"]
    print(f"[Table {i} LaTeX]", latex_code, sep="\n")
    break

In [None]:
from datasets import load_dataset
import re

# Load the "train" split of the staghado/ArXiv-tables dataset.
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Running maximum of \begin{tabular} occurrences, and every entry that reaches it.
max_count = 0
top_entries = []

for i, entry in enumerate(dataset):
    latex = entry["latex_content"]
    count = len(re.findall(r'\\begin{tabular}', latex))

    # Entries below the current maximum are irrelevant.
    if count < max_count:
        continue

    # A new maximum invalidates everything collected so far.
    if count > max_count:
        max_count, top_entries = count, []

    top_entries.append((i, count, latex))

print(f"Max tabular count: {max_count}")
print("Entries with most tabulars:")
for idx, cnt, code in top_entries:
    # Preview only the first 1000 characters of each winning entry.
    print(f"\n--- Entry {idx} has {cnt} tabulars ---\n", code[:1000], sep="\n")

In [49]:
import re

def _append_to_tabular_colspec(line, addition):
    r"""Return *line* with *addition* appended to its first tabular column spec.

    The column spec is the brace group that follows ``\begin{tabular}``
    (after an optional ``[pos]`` argument). Braces are matched by depth so
    nested groups such as ``p{3cm}`` are kept intact. The line is returned
    unchanged when no complete column spec is found.
    """
    tag = r'\begin{tabular}'
    pos = line.find(tag)
    if pos == -1:
        return line
    pos += len(tag)
    end = len(line)

    while pos < end and line[pos].isspace():
        pos += 1

    # Skip the optional position argument, e.g. \begin{tabular}[t]{ll}.
    if pos < end and line[pos] == '[':
        closing = line.find(']', pos)
        if closing == -1:
            return line
        pos = closing + 1
        while pos < end and line[pos].isspace():
            pos += 1

    if pos >= end or line[pos] != '{':
        return line

    depth = 0
    for idx in range(pos, end):
        char = line[idx]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # idx is the brace that closes the column spec.
                return line[:idx] + addition + line[idx:]
    return line  # unbalanced braces: leave the line alone


def add_column_to_outermost_tabular(latex_code):
    r"""Add an ``l`` column and a `` NEW `` cell to every outermost tabular.

    The input is processed line by line:

    * a line that opens a tabular while no tabular is open gets ``l``
      appended to its column spec;
    * every line of the outermost tabular that contains both ``&`` and
      ``\\`` gets a `` NEW `` cell inserted before its last cell;
    * lines belonging to nested tabulars are left untouched.

    Args:
        latex_code: LaTeX source, possibly spanning several lines.

    Returns:
        The transformed source, with lines joined by ``"\n"`` (a trailing
        newline of the input is therefore not preserved).
    """
    begin_tag = r'\begin{tabular}'
    end_tag = r'\end{tabular}'
    new_lines = []
    nest = 0

    for line in latex_code.splitlines():
        opens = line.count(begin_tag)
        closes = line.count(end_tag)

        # Only a \begin{tabular} met at depth 0 opens an outermost table.
        opened_outer = opens > 0 and nest == 0
        if opened_outer:
            line = _append_to_tabular_colspec(line, 'l')

        # Count both tags so a nested tabular opened and closed on one line
        # does not leave the depth stuck; clamp so a stray \end{tabular}
        # cannot hide the tables that follow it.
        nest = max(nest + opens - closes, 0)

        # A table opened and closed on the same line still counts as a row line.
        in_outer_row = nest == 1 or (opened_outer and nest == 0)

        if in_outer_row and '&' in line and r'\\' in line:
            parts = line.split('&')
            # The last part carries the row terminator, so the new cell goes
            # in front of it.
            parts.insert(-1, ' NEW ')
            line = ' & '.join(parts)

        new_lines.append(line)

    return '\n'.join(new_lines)

In [None]:
import re

def _append_to_tabular_colspec(line, addition):
    r"""Return *line* with *addition* appended to its first tabular column spec.

    The column spec is the brace group that follows ``\begin{tabular}``
    (after an optional ``[pos]`` argument). Braces are matched by depth so
    nested groups such as ``p{3cm}`` are kept intact. The line is returned
    unchanged when no complete column spec is found.
    """
    tag = r'\begin{tabular}'
    pos = line.find(tag)
    if pos == -1:
        return line
    pos += len(tag)
    end = len(line)

    while pos < end and line[pos].isspace():
        pos += 1

    # Skip the optional position argument, e.g. \begin{tabular}[t]{ll}.
    if pos < end and line[pos] == '[':
        closing = line.find(']', pos)
        if closing == -1:
            return line
        pos = closing + 1
        while pos < end and line[pos].isspace():
            pos += 1

    if pos >= end or line[pos] != '{':
        return line

    depth = 0
    for idx in range(pos, end):
        char = line[idx]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                # idx is the brace that closes the column spec.
                return line[:idx] + addition + line[idx:]
    return line  # unbalanced braces: leave the line alone


def add_column_to_outermost_tabular(latex_str, new_column_type="|p{3cm}|", new_cell_value=" NEW "):
    r"""Add one column to every outermost tabular found in *latex_str*.

    The input is processed line by line. For lines that belong to an
    outermost tabular (nested tabulars are left untouched):

    * the opening line gets *new_column_type* appended to its column spec;
    * every ``\multicolumn{n}`` has its span increased by one;
    * lines containing both ``&`` and ``\\`` get *new_cell_value* inserted
      before their last cell.

    Args:
        latex_str: LaTeX source, possibly spanning several lines.
        new_column_type: Text appended to the outermost column spec.
        new_cell_value: Cell text inserted into each data row.

    Returns:
        The transformed source, with lines joined by ``"\n"`` (a trailing
        newline of the input is therefore not preserved).
    """
    begin_tag = r'\begin{tabular}'
    end_tag = r'\end{tabular}'
    new_lines = []
    nest = 0

    for line in latex_str.splitlines():
        opens = line.count(begin_tag)
        closes = line.count(end_tag)

        # Only a \begin{tabular} met at depth 0 opens an outermost table.
        opened_outer = opens > 0 and nest == 0
        if opened_outer:
            line = _append_to_tabular_colspec(line, new_column_type)

        # Count both tags so a nested tabular opened and closed on one line
        # does not leave the depth stuck; clamp so a stray \end{tabular}
        # cannot hide the tables that follow it.
        nest = max(nest + opens - closes, 0)

        # Depth 1 means "directly inside an outermost tabular"; a table that
        # opens and closes on the same line is treated the same way.
        in_outer = nest == 1 or (opened_outer and nest == 0)

        if in_outer:
            # NOTE(review): every \multicolumn on the line is widened, and a
            # cell may be added to the same row below — confirm this is the
            # intended result for rows with partial spans.
            if r'\multicolumn{' in line:
                line = re.sub(
                    r'\\multicolumn\{(\d+)\}',
                    lambda m: f'\\multicolumn{{{int(m.group(1)) + 1}}}',
                    line,
                )

            if '&' in line and r'\\' in line:
                parts = line.split('&')
                # The last part carries the row terminator, so the new cell
                # goes in front of it.
                parts.insert(-1, new_cell_value)
                line = ' & '.join(parts)

        new_lines.append(line)

    return '\n'.join(new_lines)

# Load the "train" split of the staghado/ArXiv-tables dataset.
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Transform the first three records and preview 800 characters of each result.
# Only the modified text is printed; the untouched source stays available in
# `original` for ad-hoc comparison.
for i in range(3):
    original = dataset[i]["latex_content"]
    modified = add_column_to_outermost_tabular(original)
    print(f"\n--- Table {i}: Modified ---\n", modified[:800], sep="\n")

In [None]:
def modify_column_spec(latex_line, addition="p{3cm}|"):
    r"""Append one column to the column spec of the first tabular in *latex_line*.

    The column spec is the brace group that directly follows
    ``\begin{tabular}``. Braces are matched by depth, so specs containing
    nested groups such as ``p{3cm}`` are handled as a whole.

    * If the spec contains ``|c|``, ``|c`` is appended and *addition* is
      not used.
    * Otherwise *addition* is appended.

    Args:
        latex_line: Text expected to contain ``\begin{tabular}{...}``.
        addition: Column spec fragment appended when the spec has no
            ``|c|`` column.

    Returns:
        The text with the extended column spec, or *latex_line* unchanged
        when it has no ``\begin{tabular}{``, the spec is empty, or its
        braces are unbalanced.
    """
    opener = r"\begin{tabular}{"
    found = latex_line.find(opener)
    if found == -1:
        return latex_line  # no tabular environment found

    spec_start = found + len(opener)
    spec_end = None
    depth = 1  # the opening brace of the spec is part of `opener`
    for idx in range(spec_start, len(latex_line)):
        char = latex_line[idx]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                spec_end = idx
                break

    if spec_end is None or spec_end == spec_start:
        return latex_line  # unbalanced braces or empty spec

    colspec = latex_line[spec_start:spec_end]

    # NOTE(review): when the spec already ends with '|', appending '|c'
    # produces '||' (a double rule) — confirm this is wanted.
    suffix = '|c' if '|c|' in colspec else addition
    return latex_line[:spec_end] + suffix + latex_line[spec_end:]

# Run modify_column_spec on a single \begin{tabular} line and display what it returns.
line = r"\begin{tabular}{|c|c|c|}"
group = modify_column_spec(latex_line=line)
print("Second group:", group)

In [None]:
import re

def add_column_to_outermost_tabular(latex_str, new_cell_value=" NEW "):
    r"""Add one column to every outermost tabular found in *latex_str*.

    The input is processed line by line. For lines that belong to an
    outermost tabular (nested tabulars are left untouched):

    * the opening line is passed to ``modify_column_spec`` with
      ``addition="p{3cm}|"`` to extend its column spec;
    * every ``\multicolumn{n}`` has its span increased by one;
    * every ``\cline{a-b}`` has its end column increased by one;
    * lines containing both ``&`` and ``\\`` get *new_cell_value* inserted
      before their last cell.

    Args:
        latex_str: LaTeX source, possibly spanning several lines.
        new_cell_value: Cell text inserted into each data row.

    Returns:
        The transformed source, with lines joined by ``"\n"`` (a trailing
        newline of the input is therefore not preserved).
    """
    begin_tag = r'\begin{tabular}'
    end_tag = r'\end{tabular}'
    new_lines = []
    nest = 0

    for line in latex_str.splitlines():
        opens = line.count(begin_tag)
        closes = line.count(end_tag)

        # Only a \begin{tabular} met at depth 0 opens an outermost table.
        opened_outer = opens > 0 and nest == 0
        if opened_outer:
            line = modify_column_spec(line, addition="p{3cm}|")

        # Count both tags so a nested tabular opened and closed on one line
        # does not leave the depth stuck; clamp so a stray \end{tabular}
        # cannot hide the tables that follow it.
        nest = max(nest + opens - closes, 0)

        # Depth 1 means "directly inside an outermost tabular"; a table that
        # opens and closes on the same line is treated the same way.
        in_outer = nest == 1 or (opened_outer and nest == 0)

        if in_outer:
            # NOTE(review): every \multicolumn / \cline on the line is widened,
            # and a cell may be added to the same row below — confirm this is
            # the intended result for rows with partial spans.
            if r'\multicolumn{' in line:
                line = re.sub(
                    r'\\multicolumn\{(\d+)\}',
                    lambda m: f'\\multicolumn{{{int(m.group(1)) + 1}}}',
                    line,
                )

            if r'\cline{' in line:
                line = re.sub(
                    r'\\cline\{(\d+)-(\d+)\}',
                    lambda m: fr'\cline{{{m.group(1)}-{int(m.group(2))+1}}}',
                    line,
                )

            if '&' in line and r'\\' in line:
                parts = line.split('&')
                # The last part carries the row terminator, so the new cell
                # goes in front of it.
                parts.insert(-1, new_cell_value)
                line = ' & '.join(parts)

        new_lines.append(line)

    return '\n'.join(new_lines)


# Load the "train" split of the staghado/ArXiv-tables dataset.
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# For the first three records, show the source (first 800 characters) followed
# by the transformed version (first 1000 characters).
for i in range(3):
    original = dataset[i]["latex_content"]
    modified = add_column_to_outermost_tabular(original)

    for label, text, limit in (("Original", original, 800), ("Modified", modified, 1000)):
        print(f"\n--- Table {i}: {label} ---\n")
        print(text[:limit])

In [70]:
import re

def modify_latex_column_format(latex):
    r"""Append one column to the column spec of the first tabular in *latex*.

    The column spec is the brace group that directly follows the first
    ``\begin{tabular}``. Braces are matched by depth, so specs containing
    nested groups such as ``p{3cm}`` are extended as a whole instead of
    being cut at their first closing brace.

    * If the spec contains ``c|``, ``|c`` is appended.
    * Otherwise ``|p{3cm}`` is appended.

    Args:
        latex: LaTeX source, possibly spanning several lines.

    Returns:
        The source with the extended column spec, or *latex* unchanged when
        it has no ``\begin{tabular}{``, the spec is empty, or its braces
        are unbalanced.
    """
    opener = r"\begin{tabular}{"
    found = latex.find(opener)
    if found == -1:
        return latex  # no tabular environment found

    spec_start = found + len(opener)
    spec_end = None
    depth = 1  # the opening brace of the spec is part of `opener`
    for idx in range(spec_start, len(latex)):
        char = latex[idx]
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                spec_end = idx
                break

    if spec_end is None or spec_end == spec_start:
        return latex  # unbalanced braces or empty spec

    column_format = latex[spec_start:spec_end]

    # NOTE(review): both suffixes start with '|', so a spec that already ends
    # with '|' gets '||' (a double rule) — confirm this is wanted.
    suffix = '|c' if 'c|' in column_format else '|p{3cm}'
    return latex[:spec_end] + suffix + latex[spec_end:]

In [None]:
# Load the "train" split of the staghado/ArXiv-tables dataset.
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# For the first three records, show the source (first 800 characters) followed
# by the version with the rewritten column format (first 1000 characters).
for i in range(3):
    original = dataset[i]["latex_content"]
    modified = modify_latex_column_format(original)

    print(f"\n--- Table {i}: Original ---\n", original[:800], sep="\n")
    print(f"\n--- Table {i}: Modified ---\n", modified[:1000], sep="\n")