In [1]:
#imports
import pandas as pd
import os, ipdb, re
import random, evaluate
import string
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from datasets import DatasetDict, Dataset, load_dataset
import wandb
import ast
import re, os
import subprocess

In [2]:
import logging

# Notebook-wide logger: only ERROR and above are recorded.
logger = logging.getLogger('my_logger')
logger.setLevel(logging.ERROR)

# Formatter applied to the file handler below.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.getLogger returns the same logger object for the same name, so re-running this
# cell would otherwise attach one more FileHandler each time and write every record
# several times. Reuse the handler that is already attached, if any.
fh = next(
    (handler for handler in logger.handlers if isinstance(handler, logging.FileHandler)),
    None,
)
if fh is None:
    # File handler writing to dataset_creation_logs.log in the current working directory.
    fh = logging.FileHandler('dataset_creation_logs.log')
    logger.addHandler(fh)

# The handler filters at ERROR as well (it does not log debug messages).
fh.setLevel(logging.ERROR)
fh.setFormatter(formatter)


## LaTeX

In [3]:
# arXiv-style paper IDs used to build paths into the sample folder.
id_ = [
    "2105.02723v1",
    "2104.06378v1",
    "2104.04946v1",
    "1912.03330v1",
    "1912.02738v4",
    "1912.01326v3",
    "1912.00998v2",
]

# Path of the .tex file for the first ID in the list.
latex = f"/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/sample/{id_[0]}.tex"
# latex = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/1312.6114v10.tex"

In [6]:
# Matches one top-level section that should be dropped: a \section (or \section*) whose
# title does NOT start with Result(s), Experimentation(s), Experiment(s) or Conclusion,
# together with everything up to the next \section, \end{document}, \bibliography or the
# end of the text. Compiled once here instead of on every call.
# (An earlier approach removed a fixed list of titles such as Introduction / Related work;
# the current pattern keeps an allow-list of title prefixes instead.)
_NON_TDM_SECTION_PATTERN = re.compile(
    r'''
    (                             # Start capturing group
        \\section                 # Match \section
        \*?                       # Match optional *
        \s*                       # Match optional whitespace
        \{                        # Match {
        (?!                       # Start negative lookahead
            Result(s?)               # Negative lookahead for Results
            |                     # Or
            Experimentation(s?)       # Negative lookahead for Experimentation
            |
            Experiment(s?)
            |
            Conclusion
        )                         # End negative lookahead
        [^}]*                     # Match any characters except }
        \}                        # Match }
        .*?                       # Match any characters (non-greedy)
        (?=\\section|\\end\{document\}|\\bibliography|\Z)          # Positive lookahead for next \section, \end{document}, \bibliography or end of string
    )                             # End capturing group
    ''',
    re.DOTALL | re.IGNORECASE | re.VERBOSE
)


def potential_tdms_context(latex_file, output_folder):
    """Write a copy of a LaTeX file that keeps only the result-like sections.

    Every ``\\section`` / ``\\section*`` block whose title does not begin with
    "Result(s)", "Experimentation(s)", "Experiment(s)" or "Conclusion"
    (case-insensitive) is removed, up to the next ``\\section``,
    ``\\end{document}``, ``\\bibliography`` or the end of the file. Text before
    the first ``\\section`` is left untouched.

    The result is written to ``<output_folder>/<name>_summarised.<ext>``, where
    ``<name>`` and ``<ext>`` come from splitting the input file name on its last
    dot. ``output_folder`` is created if it does not exist.

    Args:
        latex_file: Path of the LaTeX file to read. A bare file name (no
            directory part) is accepted.
        output_folder: Directory that receives the summarised file.

    Returns:
        None. If the file name contains no ".", nothing is read or written.
    """
    # os.path.basename also handles paths without any "/" (previously such inputs were
    # silently ignored).
    filename = os.path.basename(latex_file)

    name_parts = filename.rsplit(".", 1)
    if len(name_parts) != 2:
        # No extension to rebuild the output name from: skip the file.
        return
    file_id, file_ext = name_parts

    # Undecodable bytes are dropped rather than aborting the whole file.
    with open(latex_file, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.read()

    # Remove the matched sections.
    content_new = _NON_TDM_SECTION_PATTERN.sub('', content)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(output_folder, exist_ok=True)

    # Write the reduced document next to the other summarised files.
    with open(f"{output_folder}/{file_id}_summarised.{file_ext}", 'w', encoding='utf-8', errors='ignore') as file:
        file.write(content_new)

In [7]:
# source_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_tex"
source_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_links_tex_first_5000"
summarised_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_summarised"

# Run the section filter over every entry of the source folder, with a progress bar.
for file_id in tqdm(os.listdir(source_folder)):
    latex_file = f"{source_folder}/{file_id}"
    potential_tdms_context(latex_file, output_folder=summarised_folder)

100%|██████████| 4737/4737 [03:55<00:00, 20.12it/s]


In [9]:
def pandoc_latex_to_text(latex_file, output_folder, template="../data_proccess/template.plain"):
    """Convert a .tex file to plain text by running pandoc.

    The output is written to ``<output_folder>/<file ID>.txt``, where the file ID
    is the input's base name without its last extension. ``output_folder`` is
    created if it does not exist.

    A non-zero pandoc exit status or a timeout is logged through the module-level
    ``logger`` and otherwise ignored, so a batch loop keeps going.

    Args:
        latex_file: Path of the .tex file to convert.
        output_folder: Directory that receives the .txt file.
        template: Pandoc template passed via ``--template``. The default is
            relative to the current working directory.

    Returns:
        None.

    Raises:
        Errors from launching the process itself (e.g. pandoc not being
        installed) are not caught here and propagate to the caller.
    """
    file_ID = os.path.basename(latex_file).rsplit(".", 1)[0]

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(output_folder, exist_ok=True)

    # logger.warning(f"Processing {file_ID}")
    # Construct the command
    command = [
        "pandoc",
        "--to=plain",
        f"--template={template}",
        "--wrap=none",
        f"{latex_file}",
        "-o",
        f"{output_folder}/{file_ID}.txt",
        "--quiet",
    ]

    try:
        # check=True raises CalledProcessError on a non-zero exit status; stderr is captured
        # so it can be included in the log entry. 120 s matches the "2 minutes" message below.
        subprocess.run(command, stderr=subprocess.PIPE, text=True, timeout=120, check=True)
    except subprocess.CalledProcessError as e:
        error_message = e.stderr
        logger.error(f"File {latex_file} failed with an error: {error_message}")
    except subprocess.TimeoutExpired:
        # Handle the timeout case
        logger.error(f"File {latex_file} processing timed out after 2 minutes")
    

# pandoc --to=plain --template=template.plain --wrap=none "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_tex/{id_[i]}.tex" -o "$arxiv_txt_dir/${file_ID}.txt" --quiet

In [27]:
# !pandoc --to=plain --template=../data_proccess/template.plain --wrap=none "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_tex/1911.08670v2.tex"

In [None]:
# Example usage: convert one paper from the arxiv_tex folder.
id_ = [
    "2105.01288v1",
    "2105.01601v1",
    "2105.01883v1",
    "2105.02184v1",
    "1911.08670v2",
]
i = -1  # index into id_; -1 selects the last entry

# latex_file = f"/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/sample/{id_[i]}.tex"
latex_file = f"/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_tex/{id_[i]}.tex"

sample_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/sample"
pandoc_latex_to_text(latex_file, output_folder=sample_folder)

In [32]:
# Convert one edited/summarised .tex file from sample/edits.
# The index `i` is not set in this cell; it comes from an earlier cell.
id_ = [
    "2105.01288v1_edit",
    "2105.01601v1_edit",
    "2105.01883v1_edit",
    "2105.02184v1_edit",
    "1611.01731v2_summarised",
]
latex_file = f"/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/sample/edits/{id_[i]}.tex"

edits_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/sample/edits"
pandoc_latex_to_text(latex_file, output_folder=edits_folder)

# try:
#     # Code that might raise an exception
#     pandoc_latex_to_text(latex_file, output_folder="/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/sample/edits")
# except Exception as e:
#     # Code that runs if the exception occurs
#     print(f"File {latex_file} filed :(")
#     print(f"An error occurred: {e}")

# Run for all .tex files 

In [31]:
# Scratch check: 120 seconds expressed in minutes.
# NOTE(review): presumably the 120 s pandoc timeout in pandoc_latex_to_text — confirm, or delete this cell.
120/60

2.0

In [5]:
source_tex = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_tex_summarised"
txt_folder = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_txt_summarised"

# Convert every entry of the summarised .tex folder to plain text, with a progress bar.
# for latex_file in os.listdir(f"{source_tex}"):
for latex_file in tqdm(os.listdir(source_tex)):
    pandoc_latex_to_text(f"{source_tex}/{latex_file}", output_folder=txt_folder)

100%|██████████| 4657/4657 [31:08<00:00,  2.49it/s]   


In [10]:
# Convert every entry of the no-leaderboard summarised .tex folder to plain text.
# The path previously started with "//nfs"; the doubled leading slash was a typo that also
# ended up in every logged file path.
source_tex = "/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_summarised"
for latex_file in tqdm(os.listdir(f"{source_tex}")):
# for latex_file in os.listdir(f"{source_tex}"):
    pandoc_latex_to_text(f"{source_tex}/{latex_file}", 
                         output_folder="/nfs/home/kabenamualus/Research/LLLM-LeaderboardLLM/data_proccess/arxiv_no_leaderboard_txt_summarised")

100%|██████████| 4737/4737 [14:13<00:00,  5.55it/s]  


In [27]:
# Count the converted .txt files that contain fewer than 5 blank-line-separated blocks.
file_path = "../data_proccess/arxiv_txt_summarised"  # directory holding the .txt files (loop-invariant)
count = 0
for file in os.listdir(file_path):
    # The handle gets its own name so it no longer overwrites the loop variable `file`.
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(f"{file_path}/{file}", 'r', encoding='utf-8') as handle:
        content = handle.read()

    if len(content.split("\n\n")) < 5 :
    # if len(content.split()) <= 300 and len(content.split()) <= 380:
    # if len(content.split()) <= 250 :
        # print(len(content.split()))
        count += 1
        
    # if 'Title:\t\n\nAbstract:\t\n' == content:
    #     print("delete")
    
print(count)

322


In [None]:
# Count the converted .txt files whose whitespace-separated word count is between 50 and 100 (inclusive).
file_path = "../data_proccess/arxiv_txt_summarised"  # directory holding the .txt files (loop-invariant)
count = 0
for file in os.listdir(file_path):
    # The handle gets its own name so it no longer overwrites the loop variable `file`.
    # Decode explicitly as UTF-8 instead of relying on the locale default.
    with open(f"{file_path}/{file}", 'r', encoding='utf-8') as handle:
        content = handle.read()

    # Split once and reuse the result for both bounds.
    n_words = len(content.split())
    if 50 <= n_words <= 100:
        # print(len(content.split()))
        count += 1
        
    # if 'Title:\t\n\nAbstract:\t\n' == content:
    #     print("delete")
    
print(count)