In [None]:
import glob
import os
import re
from collections import Counter

import pandas as pd
import PyPDF2
import tqdm

# Create master list of papers

In [None]:
def create_paper_master_list(reset=False):
    """Append the arXiv ids of PDFs on disk to the master list file.

    Scans every entry under ``../data/pdfs/`` for ``*.pdf`` files and appends
    the id of each PDF (its file name without ``.pdf``) that is not already a
    line of ``../data/pdfs/arxiv_master_list.txt``. Ids are compared against
    the list as it was when the function started, so an id found in several
    directories during one run is written once per copy; those repeated lines
    are what ``remove_duplicated_papers`` counts to detect duplicates.

    Args:
        reset: If True, empty the master list before scanning so that every
            PDF currently on disk is listed again.
    """
    master_path = "../data/pdfs/arxiv_master_list.txt"

    # Create the file on first use (or truncate it on reset) so the read below
    # cannot fail with FileNotFoundError.
    if reset or not os.path.exists(master_path):
        open(master_path, "w").close()

    with open(master_path, "r") as infile:
        existing_text = infile.read()

    # Compare whole ids. A substring test against the raw file text would
    # treat "2309.0995" as already listed when only "2309.09958v1" is.
    known_ids = set(existing_text.split("\n"))

    to_write = []

    for _dir in glob.glob("../data/pdfs/*"):
        for paper in glob.glob(f"{_dir}/*.pdf"):
            _id = os.path.basename(paper).replace(".pdf", "")
            if _id not in known_ids:
                to_write.append(_id)

    if not to_write:
        return

    with open(master_path, "a") as outfile:
        # The file is written without a trailing newline, so start a new line
        # first; otherwise the first new id is glued onto the last old one.
        if existing_text and not existing_text.endswith("\n"):
            outfile.write("\n")
        outfile.write("\n".join(to_write))

In [None]:
# reset=False keeps the existing master list and only appends to it
create_paper_master_list(reset=False)

# Remove duplicated papers

In [None]:
def remove_duplicated_papers():
    """Delete surplus copies of papers stored in more than one category directory.

    An id is treated as duplicated when it appears on more than one line of
    ``../data/pdfs/arxiv_master_list.txt``. For each such id, the copy in the
    first directory (directories sorted by path) is kept and the ``.pdf`` in
    every other directory under ``../data/pdfs/`` is removed with
    ``os.remove``. Ids that are currently present in at most one directory are
    left untouched.
    """
    with open("../data/pdfs/arxiv_master_list.txt", "r") as infile:
        arxiv_master_list = infile.read()

    # number of times each id is listed in the master list
    listing_counts = Counter(arxiv_master_list.split("\n"))

    # map each arXiv id to the category directories that currently hold it
    locations = {}
    for _dir in sorted(glob.glob("../data/pdfs/*/")):
        for paper in glob.glob(f"{_dir}/*.pdf"):
            arxiv_id = os.path.basename(paper).replace(".pdf", "")
            locations.setdefault(arxiv_id, []).append(_dir)

    # if a paper is duplicated, delete until only one is left
    for doc, count in tqdm.tqdm(listing_counts.items()):
        if count <= 1:
            continue

        # Slice off the copy to keep and delete the rest; this avoids removing
        # items from a list while iterating over it.
        for directory in locations.get(doc, [])[1:]:
            os.remove(os.path.join(directory, f"{doc}.pdf"))

In [None]:
# NOTE: this deletes PDF files on disk (see os.remove in remove_duplicated_papers)
remove_duplicated_papers()

In [None]:
# once duplicated papers have been deleted, reinitialise the master list
# (reset=True empties the file first, so each remaining PDF is listed again)
create_paper_master_list(reset=True)

# Parse PDFs

In [None]:
def parse_pdfs_to_text(cat):
    """Extract the text of every PDF of one category into ``.txt`` files.

    Works in two passes over ``../data/pdfs/<cat>/*.pdf``:

    1. Rename each PDF so that a "." between digits in its file name becomes
       "_" (``2309.09958v1.pdf`` -> ``2309_09958v1.pdf``).
    2. Read each PDF with ``PyPDF2.PdfReader``, join the text of its pages
       with newlines and write it to
       ``../data/txt/full_papers/<cat>/<paper_id>.txt``.

    A PDF that cannot be parsed or written is reported with ``print`` and
    skipped; the remaining PDFs are still processed.

    Args:
        cat: Category name. Any "." is replaced by "_" to form the directory
            name (so "cs.CL" and "cs_CL" address the same directory).
    """
    cat_dir = cat.replace('.', '_')
    pdf_dir = f"../data/pdfs/{cat_dir}"
    out_dir = f"../data/txt/full_papers/{cat_dir}"

    # replace arxiv id "." with "_" in filenames
    for pdf in glob.glob(f"{pdf_dir}/*.pdf"):
        directory, filename = os.path.split(pdf)
        # Only the file name is rewritten; running the substitution on the
        # whole path could also alter a directory component.
        new_filename = re.sub(r"(\d+)\.(\d+)", r"\1_\2", filename)
        if new_filename != filename:
            os.rename(pdf, os.path.join(directory, new_filename))

    # Without the output directory every write below would fail.
    os.makedirs(out_dir, exist_ok=True)

    # need to read new names with "_" instead of "." in arxiv id
    for pdf in tqdm.tqdm(glob.glob(f"{pdf_dir}/*.pdf")):

        with open(pdf, "rb") as infile:

            try:
                parsed_pdf = PyPDF2.PdfReader(infile)

                # `or ""` guards against a page whose extraction yields None
                pages = [page.extract_text() or "" for page in parsed_pdf.pages]
                text = "\n".join(pages)

                # The id runs from the first digit of the file name up to
                # ".pdf"; fall back to the bare file stem if there is no digit.
                filename = os.path.basename(pdf)
                match = re.search(r"(\d+.*)\.pdf", filename)
                paper_id = match.group(1) if match else os.path.splitext(filename)[0]

                # Extracted text routinely contains non-ASCII characters, so
                # do not depend on the locale's default encoding.
                with open(f"{out_dir}/{paper_id}.txt", "w", encoding="utf-8") as outfile:
                    outfile.write(text)

            except Exception as e:
                # best effort: report the failing PDF and continue
                print(f"{pdf}: {e}")

In [None]:
# Category to process. Defined here, before its first use, so the notebook
# runs top to bottom on a fresh kernel.
cat = "cs_CL"

parse_pdfs_to_text(cat)

### Remove papers with 0 byte size

In [None]:
def delete_empty_files(cat):
    """Remove every zero-byte ``.txt`` file of one category's parsed papers.

    Looks at ``../data/txt/full_papers/<cat>/*.txt`` and deletes each file
    whose size on disk is 0 bytes.

    Args:
        cat: Category name; any "." is replaced by "_" to form the directory
            name.
    """
    pattern = f"../data/txt/full_papers/{cat.replace('.', '_')}/*.txt"

    for txt_path in glob.glob(pattern):
        if os.path.getsize(txt_path) != 0:
            continue
        os.remove(txt_path)

In [None]:
# `cat` must already be defined in the kernel before this cell runs
delete_empty_files(cat)

### Extract references and content (without an LLM) and save them to the "cs_XXpapers_aug.jsonl" master file

In [None]:
def _split_content_and_references(paper):
    """Split a paper's text at its references heading into (content, references)."""
    # Try the title-case heading first, then the upper-case one. The split is
    # made at the first occurrence of the heading.
    for heading in ("\nReferences\n", "\nREFERENCES\n"):
        content, found, references = paper.partition(heading)
        if found:
            return content, references
    return paper, "No references found"


def extract_refs_content(cat):
    """Split each parsed paper into content and references and record both.

    For every ``../data/txt/full_papers/<cat>/*.txt`` file:

    * the text after the references heading is written to
      ``../data/txt/references/<cat>/<file>`` (or the literal string
      "No references found" when the paper has no such heading);
    * the text before the heading (the whole paper when there is none) is
      written to ``../data/txt/content/<cat>/<file>``;
    * both strings are stored in the "references" and "content" columns of the
      row of ``../data/<cat>papers.jsonl`` whose ``entry_id`` ends in the same
      arXiv id. Files with no matching row have their name printed.

    The augmented table is finally written to ``../data/<cat>papers_aug.jsonl``.

    Args:
        cat: Category name; any "." is replaced by "_" in every path.
    """
    cat_dir = cat.replace('.', '_')

    arxiv_data = pd.read_json(f"../data/{cat_dir}papers.jsonl", lines=True)
    arxiv_ids = arxiv_data["entry_id"].apply(lambda x: x.split("/")[-1].replace(".", "_"))

    # id -> row label, built once instead of scanning the Series per paper;
    # setdefault keeps the first row if an id occurs more than once.
    row_by_id = {}
    for row_label, arxiv_id in arxiv_ids.items():
        row_by_id.setdefault(arxiv_id, row_label)

    # Make sure both columns exist even if no text file matches any row, so
    # later steps that select on "content" do not fail with KeyError.
    for column in ("references", "content"):
        if column not in arxiv_data.columns:
            arxiv_data[column] = None

    references_dir = f"../data/txt/references/{cat_dir}"
    content_dir = f"../data/txt/content/{cat_dir}"
    os.makedirs(references_dir, exist_ok=True)
    os.makedirs(content_dir, exist_ok=True)

    papers = glob.glob(f"../data/txt/full_papers/{cat_dir}/*.txt")

    for paper_path in tqdm.tqdm(papers):

        with open(paper_path, "r", encoding="utf-8") as infile:
            paper = infile.read()

        content, references = _split_content_and_references(paper)

        # file name including the ".txt" extension
        paper_id = os.path.basename(paper_path)

        # save references and content to .txt files
        with open(f"{references_dir}/{paper_id}", "w", encoding="utf-8") as outfile:
            outfile.write(references)

        with open(f"{content_dir}/{paper_id}", "w", encoding="utf-8") as outfile:
            outfile.write(content)

        # add references and content to the row of the matching paper
        row_label = row_by_id.get(paper_id.replace(".txt", ""))
        if row_label is None:
            # no metadata row for this file: report it and move on
            print(paper_id)
            continue

        arxiv_data.loc[row_label, "references"] = references
        arxiv_data.loc[row_label, "content"] = content

    # force_ascii=False emits raw non-ASCII characters, so write as UTF-8
    with open(f"../data/{cat_dir}papers_aug.jsonl", "w", encoding="utf-8") as f:
        f.write(arxiv_data.to_json(orient='records', lines=True, force_ascii=False))

In [None]:
# `cat` must already be defined in the kernel before this cell runs
extract_refs_content(cat)

# Create a version of cs_XXpapers_aug.jsonl with a valid "content" field (not None), saved as cs_XXpapers_aug_clean.jsonl

In [None]:
cat = "cs_CL"

# Load the augmented records and keep only the rows whose "content" is set
arxiv_data = pd.read_json(f"../data/{cat.replace('.', '_')}papers_aug.jsonl", lines=True)
arxiv_data = arxiv_data.dropna(subset=["content"])

In [None]:
# force_ascii=False emits raw non-ASCII characters; write the file as UTF-8
# explicitly so it does not depend on the locale and matches the encoding
# pd.read_json uses by default.
with open(f"../data/{cat.replace('.', '_')}papers_aug_clean.jsonl", "w", encoding="utf-8") as f:
    f.write(arxiv_data.to_json(orient='records', lines=True, force_ascii=False))

# Check references

In [None]:
# Spot check: load the extracted references of one hard-coded paper id
with open(f"../data/txt/references/{cat.replace('.', '_')}/2309_09958v1.txt", "r") as infile:
    paper = infile.read()

In [None]:
# `paper` holds a file from the references directory; if it does not contain
# "\nReferences\n", split() returns a single element and this is the whole text.
references = paper.split("\nReferences\n")[-1]

In [None]:
# print (rather than a bare expression) so newlines are rendered, not escaped
print(references)