In [1]:
import pandas as pd

# Configuration
metadata_file_path = 'arxiv-metadata-oai-snapshot.json'
output_pickle_path = 'filtered_dataset.pkl'

# Load existing metadata
metadata_df = pd.read_json(metadata_file_path, lines=True)



In [6]:
# # Split the 'categories' column into lists
# metadata_df['categories'] = metadata_df['categories'].str.split()

# # Explode the DataFrame to have one category per row
# exploded_df = metadata_df.explode('categories')

# # Find and print all unique categories
# unique_categories = exploded_df['categories'].unique()
# print("Unique Categories:")
# print(unique_categories)

Unique Categories:
['hep-ph' 'math.CO' 'cs.CG' 'physics.gen-ph' 'math.CA' 'math.FA'
 'cond-mat.mes-hall' 'gr-qc' 'cond-mat.mtrl-sci' 'astro-ph' 'math.NT'
 'math.AG' 'math.AT' 'hep-th' 'math.PR' 'hep-ex' 'nlin.PS'
 'physics.chem-ph' 'q-bio.MN' 'math.NA' 'cond-mat.str-el'
 'cond-mat.stat-mech' 'math.RA' 'physics.optics' 'physics.comp-ph'
 'q-bio.PE' 'q-bio.CB' 'quant-ph' 'q-bio.QM' 'hep-lat' 'nucl-th' 'math.OA'
 'math.QA' 'math-ph' 'math.MP' 'nlin.CD' 'physics.plasm-ph'
 'physics.space-ph' 'nlin.SI' 'cs.IT' 'math.IT' 'cs.NE' 'cs.AI'
 'physics.ed-ph' 'math.DG' 'cond-mat.soft' 'physics.pop-ph' 'cs.DS'
 'math.CV' 'math.DS' 'physics.soc-ph' 'nucl-ex' 'math.RT' 'cond-mat.other'
 'physics.flu-dyn' 'physics.data-an' 'cs.CE' 'cs.MS' 'cs.NA' 'math.GR'
 'cond-mat.supr-con' 'math.AC' 'math.SG' 'cs.CC' 'math.KT' 'math.GT'
 'math.AP' 'physics.class-ph' 'q-bio.OT' 'physics.bio-ph' 'q-bio.BM'
 'nlin.CG' 'cs.DM' 'cs.LO' 'cond-mat.dis-nn' 'math.MG' 'physics.atom-ph'
 'math.SP' 'math.ST' 'stat.TH' 'physic

In [7]:
# Remove part after the dot and get unique high-level categories
unique_high_level_categories = set(category.split('.')[0] for category in unique_categories if pd.notna(category))
print("Unique High-level Categories:")
print(unique_high_level_categories)


Unique High-level Categories:
{'econ', 'q-alg', 'alg-geom', 'nlin', 'q-bio', 'hep-ph', 'atom-ph', 'patt-sol', 'hep-lat', 'mtrl-th', 'solv-int', 'gr-qc', 'quant-ph', 'cs', 'chem-ph', 'comp-gas', 'nucl-th', 'ao-sci', 'chao-dyn', 'math', 'cmp-lg', 'cond-mat', 'astro-ph', 'plasm-ph', 'hep-th', 'stat', 'dg-ga', 'funct-an', 'eess', 'acc-phys', 'hep-ex', 'adap-org', 'nucl-ex', 'bayes-an', 'physics', 'q-fin', 'math-ph', 'supr-con'}


In [2]:

# Function to extract unique primary categories from a single 'categories' string
def get_unique_primary_categories(categories_str):
    # Split on spaces to get individual categories
    categories = categories_str.split(' ')
    # Split on '.' to separate primary and sub-categories, and take the primary part
    primary_categories = {category.split('.')[0] for category in categories}
    return primary_categories

# Apply the function to the 'categories' column
metadata_df['unique_primary_categories'] = metadata_df['categories'].apply(get_unique_primary_categories)

# Filter papers with only one unique_primary_category
single_category_papers = metadata_df[metadata_df['unique_primary_categories'].apply(len) == 1]

# Create a copy of the DataFrame to avoid SettingWithCopyWarning
single_category_papers = single_category_papers.copy()

# Now, to simplify further analysis, convert the set to a string
single_category_papers['unique_primary_category'] = single_category_papers['unique_primary_categories'].apply(lambda x: list(x)[0])

# Count the number of records in each unique_primary_category
category_counts = single_category_papers['unique_primary_category'].value_counts()

# Print the count of records in each unique_primary_category
print(category_counts)

# Select the desired columns
selected_columns_df = single_category_papers[['id', 'title', 'doi', 'categories', 'unique_primary_category']]

# Save the selected columns to a pickle file
selected_columns_df.to_pickle(output_pickle_path)

print(f'Filtered data saved to {output_pickle_path}.')




unique_primary_category
math        392119
cs          331262
astro-ph    254712
cond-mat    246311
physics     109502
hep-ph       79856
quant-ph     65928
hep-th       57766
gr-qc        29569
stat         24573
nucl-th      19028
hep-ex       17286
q-bio        14758
nlin         10294
hep-lat       9991
eess          9989
nucl-ex       7376
q-fin         5950
econ          2052
Name: count, dtype: int64
Filtered data saved to filtered_dataset.pkl.


In [1]:
import pandas as pd
import requests
import fitz
import os
from tqdm import tqdm

# Configuration
input_pickle_path = 'filtered_dataset.pkl'
output_csv_path = 'enriched_dataset.csv'
pdf_folder = 'pdfs'

# Load the DataFrame from the pickle file
metadata_df = pd.read_pickle(input_pickle_path)

# Get unique categories from the DataFrame
unique_categories = metadata_df['unique_primary_category'].unique()

# Ensure the PDF folder exists
os.makedirs(pdf_folder, exist_ok=True)

# Function to download PDF
def download_pdf(paper_id, category):
    pdf_url = f'https://export.arxiv.org/pdf/{paper_id}.pdf'
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(pdf_url, headers=headers)
    if response.status_code == 200:
        paper_id = paper_id.replace('/', '_')
        category_folder = os.path.join(pdf_folder, category)
        os.makedirs(category_folder, exist_ok=True)
        pdf_file_path = os.path.join(category_folder, f'{paper_id}.pdf')
        with open(pdf_file_path, 'wb') as file:
            file.write(response.content)
        return pdf_file_path
    else:
        print(f'Failed to retrieve {paper_id}')
        return None

def convert_pdf_to_text(pdf_file_path):
    try:
        pdf_document = fitz.open(pdf_file_path)
        text = ""
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            text += page.get_text()
        pdf_document.close()
        return text
    except fitz.fitz.FileDataError as e:
        print(f'Failed to convert {pdf_file_path} to text: {e}')
        return None 

# Initialize an empty list to collect data
full_text_data = []

# Main loop to process each category
for category in tqdm(unique_categories, desc="Processing categories"):
    category_papers = metadata_df[metadata_df['unique_primary_category'] == category]
    sampled_papers = category_papers.sample(n=500, random_state=1, replace=False)
    
    for index, row in tqdm(sampled_papers.iterrows(), total=sampled_papers.shape[0], desc=f"Processing papers in {category}", leave=False):
        paper_id = row['id']
        # print(f'Processing {paper_id}...')
        
        pdf_file_path = download_pdf(paper_id, category)
        if pdf_file_path is None:
            continue
        
        paper_text = convert_pdf_to_text(pdf_file_path)
        
        # Delete the PDF file after processing
        os.remove(pdf_file_path)

        enriched_paper_data = row.to_dict()
        enriched_paper_data['full_text'] = paper_text
        full_text_data.append(enriched_paper_data)

# Create a new DataFrame with the enriched data
enriched_data_df = pd.DataFrame(full_text_data)

# Save the enriched data to a new CSV file
enriched_data_df.to_csv(output_csv_path, index=False)

print(f'Data enrichment complete. Enriched dataset saved to {output_csv_path}.')


Processing categories:   0%|          | 0/19 [00:00<?, ?it/s]

Failed to convert pdfs\hep-ph\hep-ph_9301240.pdf to text: cannot open broken document




Failed to convert pdfs\hep-ph\1010.5976.pdf to text: cannot open broken document




Failed to convert pdfs\hep-ph\hep-ph_9302216.pdf to text: cannot open broken document


Processing categories:   5%|▌         | 1/19 [14:28<4:20:34, 868.57s/it]

Failed to convert pdfs\physics\2109.03600.pdf to text: cannot open broken document




Failed to convert pdfs\physics\physics_0001034.pdf to text: cannot open broken document




Failed to convert pdfs\physics\physics_0003032.pdf to text: cannot open broken document


Processing categories:  11%|█         | 2/19 [42:07<6:17:46, 1333.35s/it]

Failed to convert pdfs\math\1703.01743.pdf to text: cannot open broken document




Failed to convert pdfs\math\2309.08677.pdf to text: cannot open broken document




Failed to convert pdfs\math\1501.02916.pdf to text: cannot open broken document




Failed to convert pdfs\math\1601.00956.pdf to text: cannot open broken document




Failed to convert pdfs\math\1209.2063.pdf to text: cannot open broken document


Processing categories:  16%|█▌        | 3/19 [56:22<4:57:18, 1114.91s/it]

Failed to convert pdfs\cond-mat\2202.08110.pdf to text: cannot open broken document




Failed to convert pdfs\cond-mat\1701.08691.pdf to text: cannot open broken document




Failed to convert pdfs\cond-mat\cond-mat_9402069.pdf to text: cannot open broken document




Failed to convert pdfs\cond-mat\2005.01251.pdf to text: cannot open broken document


Processing categories:  21%|██        | 4/19 [1:16:11<4:46:01, 1144.08s/it]

Failed to convert pdfs\gr-qc\gr-qc_9710115.pdf to text: cannot open broken document




Failed to convert pdfs\gr-qc\1208.5824.pdf to text: cannot open broken document




Failed to convert pdfs\gr-qc\1106.5296.pdf to text: cannot open broken document




Failed to convert pdfs\gr-qc\gr-qc_0610157.pdf to text: cannot open broken document




Failed to convert pdfs\gr-qc\gr-qc_9402029.pdf to text: cannot open broken document


Processing categories:  26%|██▋       | 5/19 [1:31:06<4:06:03, 1054.54s/it]

Failed to convert pdfs\astro-ph\astro-ph_9707257.pdf to text: cannot open broken document




Failed to convert pdfs\astro-ph\astro-ph_9301010.pdf to text: cannot open broken document




Failed to convert pdfs\astro-ph\1403.0814.pdf to text: cannot open broken document


Processing categories:  32%|███▏      | 6/19 [1:58:16<4:30:49, 1249.96s/it]

Failed to convert pdfs\hep-th\0707.1382.pdf to text: cannot open broken document




Failed to convert pdfs\hep-th\hep-th_9307070.pdf to text: cannot open broken document




Failed to convert pdfs\hep-th\hep-th_0408153.pdf to text: cannot open broken document




Failed to convert pdfs\hep-th\2309.00420.pdf to text: cannot open broken document




Failed to convert pdfs\hep-th\1102.4058.pdf to text: cannot open broken document


Processing categories:  42%|████▏     | 8/19 [2:29:08<3:18:12, 1081.10s/it]

Failed to convert pdfs\nlin\2307.14812.pdf to text: cannot open broken document




Failed to convert pdfs\nlin\1204.6637.pdf to text: cannot open broken document




Failed to convert pdfs\nlin\1506.07301.pdf to text: cannot open broken document


Processing categories:  47%|████▋     | 9/19 [2:47:35<3:01:31, 1089.18s/it]

Failed to convert pdfs\q-bio\q-bio_0605026.pdf to text: cannot open broken document




Failed to convert pdfs\q-bio\2203.14201.pdf to text: cannot open broken document




Failed to convert pdfs\q-bio\1907.05529.pdf to text: cannot open broken document


Processing categories:  53%|█████▎    | 10/19 [3:17:22<3:15:42, 1304.68s/it]

Failed to convert pdfs\cs\1309.7735.pdf to text: cannot open broken document




Failed to convert pdfs\cs\2310.02845.pdf to text: cannot open broken document


Processing categories:  58%|█████▊    | 11/19 [3:47:49<3:15:17, 1464.71s/it]

Failed to convert pdfs\nucl-th\2104.07421.pdf to text: cannot open broken document




Failed to convert pdfs\nucl-th\nucl-th_0412036.pdf to text: cannot open broken document




Failed to convert pdfs\nucl-th\0911.5705.pdf to text: cannot open broken document


Processing categories:  63%|██████▎   | 12/19 [4:02:51<2:30:53, 1293.40s/it]

Failed to convert pdfs\quant-ph\quant-ph_0202050.pdf to text: cannot open broken document




Failed to convert pdfs\quant-ph\1008.1521.pdf to text: cannot open broken document




Failed to convert pdfs\quant-ph\quant-ph_9911076.pdf to text: cannot open broken document


Processing categories:  74%|███████▎  | 14/19 [4:41:56<1:42:51, 1234.39s/it]

Failed to convert pdfs\hep-lat\hep-lat_9211054.pdf to text: cannot open broken document




Failed to convert pdfs\hep-lat\hep-lat_9211026.pdf to text: cannot open broken document




Failed to convert pdfs\hep-lat\hep-lat_9211064.pdf to text: cannot open broken document


Processing categories:  79%|███████▉  | 15/19 [4:55:41<1:14:03, 1110.95s/it]

Failed to convert pdfs\stat\1901.04312.pdf to text: cannot open broken document




Failed to convert pdfs\stat\2305.08235.pdf to text: cannot open broken document




Failed to convert pdfs\stat\1512.09325.pdf to text: cannot open broken document




Failed to convert pdfs\stat\1910.12925.pdf to text: cannot open broken document




Failed to convert pdfs\stat\2109.09339.pdf to text: cannot open broken document




Failed to convert pdfs\stat\1612.00099.pdf to text: cannot open broken document




Failed to convert pdfs\stat\1501.02469.pdf to text: cannot open broken document




Failed to convert pdfs\stat\1304.0150.pdf to text: cannot open broken document


Processing categories:  84%|████████▍ | 16/19 [5:18:41<59:35, 1191.80s/it]  

Failed to convert pdfs\q-fin\2008.00908.pdf to text: cannot open broken document




Failed to convert pdfs\q-fin\0911.3117.pdf to text: cannot open broken document




Failed to convert pdfs\q-fin\2202.03146.pdf to text: cannot open broken document




Failed to convert pdfs\q-fin\2112.10447.pdf to text: cannot open broken document


Processing categories:  89%|████████▉ | 17/19 [5:37:22<39:00, 1170.44s/it]

Failed to convert pdfs\eess\2309.09859.pdf to text: cannot open broken document




Failed to convert pdfs\eess\2303.01672.pdf to text: cannot open broken document




Failed to convert pdfs\eess\2104.11316.pdf to text: cannot open broken document




Failed to convert pdfs\eess\2309.06909.pdf to text: cannot open broken document


Processing categories:  95%|█████████▍| 18/19 [6:05:18<22:02, 1322.47s/it]

Failed to convert pdfs\econ\2305.11350.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2305.14029.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2202.06921.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2203.15646.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2005.05713.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2112.10542.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2008.10217.pdf to text: cannot open broken document




Failed to convert pdfs\econ\2209.12426.pdf to text: cannot open broken document


Processing categories: 100%|██████████| 19/19 [6:22:41<00:00, 1208.50s/it]


Data enrichment complete. Enriched dataset saved to enriched_dataset.csv.


see the csv

In [2]:
import pandas as pd

# Load the enriched dataset from the CSV file
enriched_data_df = pd.read_csv('enriched_dataset.csv')

# Remove rows where the 'full_text' column is None or only whitespace
cleaned_data_df = enriched_data_df[enriched_data_df['full_text'].apply(lambda x: bool(str(x).strip()))]

# Save the cleaned data to a new CSV file
cleaned_data_df.to_csv('cleaned_dataset_10k.csv', index=False)
cleaned_data_df.head()


Unnamed: 0,id,title,doi,categories,unique_primary_category,full_text
0,hep-ph/0610334,Weak interaction corrections to hadronic top q...,10.1103/PhysRevD.74.113005,hep-ph,hep-ph,arXiv:hep-ph/0610334v2 30 Nov 2006\nPITHA 06/...
1,2104.06416,Next-to-leading non-global logarithms in QCD,,hep-ph,hep-ph,"Prepared for submission to JHEP\nOUTP-21-08P, ..."
2,hep-ph/9606269,$K_L \to \pi^o \nu \overline{\nu}$ in Extended...,10.1103/PhysRevD.54.4393,hep-ph,hep-ph,arXiv:hep-ph/9606269v3 27 Jun 1996\nWM-96-105...
3,hep-ph/9811382,A critical phenomenological study of inclusive...,10.1007/s100529900018,hep-ph,hep-ph,arXiv:hep-ph/9811382v1 18 Nov 1998\nA CRITICA...
4,1304.2781,Progress in the NNPDF global analysis,,hep-ph,hep-ph,arXiv:1304.2781v1 [hep-ph] 9 Apr 2013\nEdinb...


In [3]:
# Group by 'unique_primary_category' and get the count of records in each category
category_counts = cleaned_data_df.groupby('unique_primary_category').size()

# Print the count of records in each category
print(category_counts)

unique_primary_category
astro-ph    500
cond-mat    500
cs          500
econ        500
eess        500
gr-qc       500
hep-ex      500
hep-lat     500
hep-ph      500
hep-th      500
math        500
nlin        500
nucl-ex     500
nucl-th     500
physics     500
q-bio       500
q-fin       500
quant-ph    500
stat        500
dtype: int64


In [4]:
def is_under_250_words(text):
    word_count = len(str(text).split())
    return word_count < 250

# Apply the function to the 'full_text' column
cleaned_data_df['under_250_words'] = cleaned_data_df['full_text'].apply(is_under_250_words)

# Now 'under_250_words' column will have True if the full text is under 250 words, and False otherwise.
# You can view the first few rows to check:
print(cleaned_data_df[['full_text', 'under_250_words']].head())

# If you want to get the count of records that are under 250 words:
count_under_250 = cleaned_data_df['under_250_words'].sum()
print(f'Number of records with full text under 250 words: {count_under_250}')


                                           full_text  under_250_words
0  arXiv:hep-ph/0610334v2  30 Nov 2006\nPITHA 06/...            False
1  Prepared for submission to JHEP\nOUTP-21-08P, ...            False
2  arXiv:hep-ph/9606269v3  27 Jun 1996\nWM-96-105...            False
3  arXiv:hep-ph/9811382v1  18 Nov 1998\nA CRITICA...            False
4  arXiv:1304.2781v1  [hep-ph]  9 Apr 2013\nEdinb...            False
Number of records with full text under 250 words: 70


In [13]:
import pandas as pd

# Load the dataset
df = pd.read_csv('testing/cleaned_dataset.csv')

# Remove rows where full_text is NaN or empty
df = df.dropna(subset=['full_text'])
df = df[df['full_text'].str.strip() != '']

# Calculate the length of each full_text entry
df['full_text_length'] = df['full_text'].apply(len)


# Print the first few rows of the dataset
print("\nFirst Few Rows of the Dataset:")
print(df.head())


# Count the number of unique values in specific columns
print("\nCount of Unique Categories:")
print(df['categories'].nunique())
print(df['unique_primary_category'].nunique())

# Print statistics about the size of the full text
print("\nStatistics about the Size of Full Text:")
print(df['full_text_length'].describe())


First Few Rows of the Dataset:
               id                                              title  \
0  hep-ph/0610334  Weak interaction corrections to hadronic top q...   
1      2104.06416       Next-to-leading non-global logarithms in QCD   
2  hep-ph/9606269  $K_L \to \pi^o \nu \overline{\nu}$ in Extended...   
3  hep-ph/9811382  A critical phenomenological study of inclusive...   
4       1304.2781              Progress in the NNPDF global analysis   

                          doi categories unique_primary_category  \
0  10.1103/PhysRevD.74.113005     hep-ph                  hep-ph   
1                         NaN     hep-ph                  hep-ph   
2    10.1103/PhysRevD.54.4393     hep-ph                  hep-ph   
3       10.1007/s100529900018     hep-ph                  hep-ph   
4                         NaN     hep-ph                  hep-ph   

                                           full_text  full_text_length  
0  arXiv:hep-ph/0610334v2  30 Nov 2006\nPITHA 06/... 