Show the keys of the ArXiv-tables entries

In [46]:
from datasets import load_dataset

# Load the "train" split of the ArXiv-tables dataset from the Hugging Face Hub
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Check available keys in first entry
print(dataset[0].keys())

dict_keys(['id', 'arxiv_id', 'page', 'bounding_box', 'latex_content', 'extracted_content', 'similarity_score', 'table_image', 'page_image'])


Print the LaTeX source of the first entry

In [47]:
from datasets import load_dataset

# Load from Hugging Face
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Show the LaTeX source of the first entry only. Indexing the entry directly
# replaces the former `for ... break` loop, which never ran a second iteration.
i = 0
latex_code = dataset[i]["latex_content"]
print(f"[Table {i} LaTeX]")
print(latex_code)

[Table 0 LaTeX]
\begin{table}[H]
    \centering
    \scriptsize
    \begin{tabular}{|p{3cm}|p{3cm}|p{4cm}|p{3cm}|}
      \hline
      \hline
      \multicolumn{4}{c}{Details of Experiments for the Employed Data Set}\\
      \cline{1-4}
      \emph{Domain} & \emph{Raw Features} & \emph{Response} & \emph{Data Set Cardinality}\\
      \hline
      Australian Credit Scoring & 16 & Desired credit approval of individuals based on characteristics & 690\\\hline
    \end{tabular}
    \caption{\small Data set descriptions for the experiments used to validate the efficacy of the proposed algorithm. We summarize here the domain of the application, the input features to the algorithm, the response variable we wish to predict and the number of examples provided in the data.}
  \end{table}


In [48]:
from datasets import load_dataset
import re

# Load dataset
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Highest tabular count seen so far, and every entry that reaches it
max_count = 0
top_entries = []

# Read only the text column. Iterating over whole rows would also materialise
# the `table_image` / `page_image` fields of every entry, which this scan
# never looks at.
for i, latex in enumerate(dataset["latex_content"]):
    # Plain substring count (same result as the former regex findall).
    # NOTE(review): this also counts tabulars that are commented out with '%'
    # in the LaTeX source -- confirm whether those should be excluded.
    count = latex.count(r'\begin{tabular}')

    if count > max_count:
        # New maximum: restart the list of top entries
        max_count = count
        top_entries = [(i, count, latex)]
    elif count == max_count:
        # Tie with the current maximum: keep this entry as well
        top_entries.append((i, count, latex))

print(f"Max tabular count: {max_count}")
print("Entries with most tabulars:")
for idx, cnt, code in top_entries:
    print(f"\n--- Entry {idx} has {cnt} tabulars ---\n")
    print(code[:1000])  # preview first 1000 characters

Max tabular count: 11
Entries with most tabulars:

--- Entry 1187 has 11 tabulars ---

\begin{table}[h!]
\centering
\caption[Comparison of movement data libraries.]{Comparison of movement data libraries. Packages are predominantly available open source in R and Python and they are compared with regards to their focus, documentation and functionality. While other movement analysis libraries already provide well-maintained and documented code with rich functionality for trajectory analysis, only Trackintel provides robust and flexible methods to aggregate trajectories into locations, trips and tours. \\
(\checkmark / \halfcheckmark / x : available / partially available / not available)}
\resizebox{\textwidth}{!}{
% \begin{tabular}{@{}l|llp{9cm}@{}}
% \toprule
% Library & 
% \rot{Documentation score} & \rot{Coverage}                    & \rot{Open source} \\
% \midrule
% Trackintel &
% \checkmark & X & X \\
% \bottomrule
% \end{tabular}

\begin{tabular}{l|lclllllllllllll}
\toprule
       

In [49]:
import re

def add_column_to_outermost_tabular(latex_code):
    r"""Add one extra column to the outermost ``tabular`` of a LaTeX snippet.

    The snippet is processed line by line:

    * the column specification of each outermost ``\begin{tabular}`` gets an
      extra ``l`` column appended;
    * every row line directly inside that tabular (a line containing an
      unescaped ``&`` and a ``\\`` row terminator) gets a `` NEW `` cell
      inserted before its last cell.

    Rows of nested tabulars and full-line ``%`` comments are left untouched.

    Args:
        latex_code: LaTeX source as a single string.

    Returns:
        The modified LaTeX source, lines joined with ``\n``.
    """
    begin_token = r'\begin{tabular}'
    end_token = r'\end{tabular}'

    def extend_colspec(line):
        # Find the brace group that follows \begin{tabular} (an optional
        # [pos] argument may sit in between) and walk to its matching '}'.
        # A flat regex cannot do this: the first group on the line is
        # `{tabular}` itself, and specs such as p{3cm} nest braces.
        open_idx = line.find('{', line.find(begin_token) + len(begin_token))
        if open_idx == -1:
            return line
        depth = 0
        for idx in range(open_idx, len(line)):
            if line[idx] == '{':
                depth += 1
            elif line[idx] == '}':
                depth -= 1
                if depth == 0:
                    return line[:idx] + 'l' + line[idx:]
        return line  # spec is not closed on this line: leave it untouched

    new_lines = []
    nest = 0

    for line in latex_code.splitlines():
        stripped = line.strip()

        # Commented-out LaTeX must neither be edited nor affect nesting
        if stripped.startswith('%'):
            new_lines.append(line)
            continue

        opens = stripped.count(begin_token)
        closes = stripped.count(end_token)

        # Entering an outermost tabular: add the extra column to its spec
        if opens and nest == 0:
            line = extend_colspec(line)

        # Clamp at zero so an unbalanced \end cannot disable later tables
        nest = max(nest + opens - closes, 0)

        # Modify rows in outermost tabular only
        if nest == 1 and r'\\' in line:
            # Split on column separators only; an escaped \& is literal text
            parts = re.split(r'(?<!\\)&', line)
            if len(parts) > 1:
                parts.insert(-1, ' NEW ')  # insert before last cell
                line = ' & '.join(parts)

        new_lines.append(line)

    return '\n'.join(new_lines)

In [60]:
import re

def add_column_to_outermost_tabular(latex_str, new_column_type="|p{3cm}|", new_cell_value=" NEW "):
    lines = latex_str.splitlines()
    new_lines = []
    nest = 0
    inside_outer_tabular = False

    for line in lines:
        stripped = line.strip()

        match = re.match(r'(\\begin\{tabular\})\{([^}]*)\}', stripped)
        if match:
            nest += 1
            if nest == 1:
                inside_outer_tabular = True
                original_start, colspec = match.groups()
                new_colspec = colspec.strip() + new_column_type
                line = re.sub(r'{([^}]*)}', lambda m: '{' + m.group(1).strip() + '|p{3cm}|}', line, count=1)

        elif r'\end{tabular}' in stripped:
            if nest == 1:
                inside_outer_tabular = False
            nest -= 1
       

        # Modify only inside the outermost tabular
        if inside_outer_tabular:
            # Patch multicolumn if found
            if r'\multicolumn{' in line:
                line = re.sub(r'\\multicolumn\{(\d+)\}', lambda m: f'\\multicolumn{{{int(m.group(1)) + 1}}}', line)

            # Add a new cell to data rows
            if '&' in line and r'\\' in line:
                parts = line.split('&')
                parts.insert(-1, new_cell_value)
                line = ' & '.join(parts)

        new_lines.append(line)

    return '\n'.join(new_lines)

# Load dataset
# (`load_dataset` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.)
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Process the first 3 entries
for i in range(3):
    original = dataset[i]["latex_content"]
    modified = add_column_to_outermost_tabular(original)

    # Uncomment to print the untouched source for side-by-side comparison
    #print(f"\n--- Table {i}: Original ---\n")
    #print(original[:800])
    
    # Preview only the first 800 characters of the rewritten LaTeX
    print(f"\n--- Table {i}: Modified ---\n")
    print(modified[:800])


--- Table 0: Modified ---

\begin{table}[H]
    \centering
    \scriptsize
    \begin{tabular|p{3cm}|}{|p{3cm}|p{3cm}|p{4cm}|p{3cm}|}
      \hline
      \hline
      \multicolumn{5}{c}{Details of Experiments for the Employed Data Set}\\
      \cline{1-4}
      \emph{Domain}  &  \emph{Raw Features}  &  \emph{Response}  &  NEW  &  \emph{Data Set Cardinality}\\
      \hline
      Australian Credit Scoring  &  16  &  Desired credit approval of individuals based on characteristics  &  NEW  &  690\\\hline
    \end{tabular}
    \caption{\small Data set descriptions for the experiments used to validate the efficacy of the proposed algorithm. We summarize here the domain of the application, the input features to the algorithm, the response variable we wish to predict and the number of examples provided in the data.}
  \end{t

--- Table 1: Modified ---

\begin{table}[H]
    \centering
    \scriptsize
    \begin{tabular|p{3cm}|}{|p{3cm}|p{2cm}|p{2cm}|p{2cm}|p{2cm}|}
      \hline
      \hline
   

In [59]:
def modify_column_spec(latex_line, addition="p{3cm}|"):
    r"""Append ``addition`` to the content of the second top-level brace group.

    For a line such as ``\begin{tabular}{|l|l|}`` the first top-level group is
    ``{tabular}`` and the second is the column specification, so ``addition``
    ends up just before the closing brace of the specification. Nested braces
    (e.g. ``p{3cm}``) are skipped by tracking brace depth.

    Args:
        latex_line: A single line of LaTeX source.
        addition: Text inserted before the closing brace of the second group.

    Returns:
        The line with ``addition`` inserted, or the line unchanged when no
        complete second top-level group is found.
    """
    depth = 0
    seen_first_group = False
    content_start = None

    for pos, ch in enumerate(latex_line):
        if ch == '{':
            depth += 1
            if depth != 1:
                continue  # nested brace: only top-level groups are counted
            if not seen_first_group:
                seen_first_group = True
            elif content_start is None:
                content_start = pos + 1  # second group's content begins here
        elif ch == '}':
            if depth == 1 and content_start is not None:
                # Closing brace of the second group: splice the addition in
                return latex_line[:pos] + addition + latex_line[pos:]
            depth -= 1

    # No complete second group on this line
    return latex_line

# Quick check: the column spec is the second brace group on the line, so the
# default addition should be appended inside it.
line = r"\begin{tabular}{|p{3cm}|p{3cm}|p{3cm}|}"
group = modify_column_spec(line)
print("Second group:", group)

Second group: \begin{tabular}{|p{3cm}|p{3cm}|p{3cm}|p{3cm}|}


In [None]:
import re

def add_column_to_outermost_tabular(latex_str, new_cell_value=" NEW "):
    r"""Add one extra ``p{3cm}|`` column to the outermost ``tabular``.

    The snippet is processed line by line:

    * the column specification of each outermost ``\begin{tabular}`` is
      extended via ``modify_column_spec`` (defined elsewhere in this
      notebook) with ``p{3cm}|``;
    * every row line directly inside that tabular (a line containing an
      unescaped ``&`` and a ``\\`` row terminator) gets ``new_cell_value``
      inserted before its last cell;
    * on lines that did not receive a new cell, each ``\multicolumn{N}`` is
      widened to ``N + 1`` so that spanning cells follow the wider table.

    Rows of nested tabulars and full-line ``%`` comments are left untouched.
    ``\cline`` ranges are not adjusted.

    Args:
        latex_str: LaTeX source as a single string.
        new_cell_value: Text inserted as the new cell of each row.

    Returns:
        The modified LaTeX source, lines joined with ``\n``.
    """
    begin_token = r'\begin{tabular}'
    end_token = r'\end{tabular}'
    new_lines = []
    nest = 0

    for line in latex_str.splitlines():
        stripped = line.strip()

        # Commented-out LaTeX must neither be edited nor affect nesting;
        # a stray "% \end{tabular}" would otherwise push the depth below zero.
        if stripped.startswith('%'):
            new_lines.append(line)
            continue

        opens = stripped.count(begin_token)
        closes = stripped.count(end_token)

        # Entering an outermost tabular: extend its column spec. Only the part
        # of the line starting at \begin{tabular} is handed over, so that any
        # brace groups earlier on the line are not mistaken for the spec.
        if opens and nest == 0:
            start = line.find(begin_token)
            line = line[:start] + modify_column_spec(line[start:], addition="p{3cm}|")

        # Clamp at zero so an unbalanced \end cannot disable later tables
        nest = max(nest + opens - closes, 0)

        # Modify only lines directly inside the outermost tabular
        if nest == 1:
            # Split on column separators only; an escaped \& is literal text
            cells = re.split(r'(?<!\\)&', line)
            if len(cells) > 1 and r'\\' in line:
                # Data row: the inserted cell already accounts for the new
                # column, so \multicolumn spans on this row stay as they are.
                cells.insert(-1, new_cell_value)
                line = ' & '.join(cells)
            elif r'\multicolumn{' in line:
                # No cell was inserted on this line: widen the span instead
                line = re.sub(r'\\multicolumn\{(\d+)\}', lambda m: f'\\multicolumn{{{int(m.group(1)) + 1}}}', line)

        new_lines.append(line)

    return '\n'.join(new_lines)

def replace_last_line_with_end_table(latex_str):
    r"""Swap the final line of a LaTeX snippet for a flush-left ``\end{table}``.

    Surrounding whitespace is stripped first, so the line that gets dropped is
    the last non-blank one. The dropped line is not inspected; whatever it
    contained is discarded.

    Args:
        latex_str: LaTeX source as a single string.

    Returns:
        The source without its last line, followed by ``\end{table}``. An
        empty or whitespace-only input yields just ``\end{table}``.
    """
    kept = latex_str.strip().splitlines()[:-1]  # slicing an empty list is safe
    return "\n".join(kept + [r"\end{table}"])

# Load dataset
# (`load_dataset` is not imported in this cell; it must already be in the
# kernel namespace from an earlier cell.)
dataset = load_dataset("staghado/ArXiv-tables", split="train")

# Process the first 3 entries
for i in range(3):
    original = dataset[i]["latex_content"]
    # Add the extra column, then replace the last line with a flush-left \end{table}
    modified = add_column_to_outermost_tabular(original)
    modified = replace_last_line_with_end_table(modified)

    # Uncomment to print the untouched source for side-by-side comparison
    #print(f"\n--- Table {i}: Original ---\n")
    #print(original[:800])
    
    # Preview only the first 800 characters of the rewritten LaTeX
    print(f"\n--- Table {i}: Modified ---\n")
    print(modified[:800])


--- Table 0: Modified ---

\begin{table}[H]
    \centering
    \scriptsize
    \begin{tabular}{|p{3cm}|p{3cm}|p{4cm}|p{3cm}|p{3cm}|}
      \hline
      \hline
      \multicolumn{5}{c}{Details of Experiments for the Employed Data Set}\\
      \cline{1-4}
      \emph{Domain}  &  \emph{Raw Features}  &  \emph{Response}  &  NEW  &  \emph{Data Set Cardinality}\\
      \hline
      Australian Credit Scoring  &  16  &  Desired credit approval of individuals based on characteristics  &  NEW  &  690\\\hline
    \end{tabular}
    \caption{\small Data set descriptions for the experiments used to validate the efficacy of the proposed algorithm. We summarize here the domain of the application, the input features to the algorithm, the response variable we wish to predict and the number of examples provided in the data.}
\end{tabl

--- Table 1: Modified ---

\begin{table}[H]
    \centering
    \scriptsize
    \begin{tabular}{|p{3cm}|p{2cm}|p{2cm}|p{2cm}|p{2cm}|p{3cm}|}
      \hline
      \hline
    