## Generate CSV files of sampled data from our preprocessed data

In [None]:
import pandas as pd

In [None]:
## Read preprocessed data

# -------------------------------------------------------
# This requires having this file (~3GB) in your working directory
# (not included in this repo)
# -------------------------------------------------------
# Load the entire CSV into memory as a single DataFrame. The path is relative,
# so it is resolved against the current working directory. Later cells read
# and modify `df` in place.
df = pd.read_csv("arxiv-main-categories.csv")

In [None]:
## Generate and save `num_sample_csvs` stratified sample CSV files drawn from
## the DataFrame loaded from arxiv-main-categories.csv.
# This takes ~ 10-15 min on a laptop

# How many sample files to write (files are numbered 1..num_sample_csvs).
num_sample_csvs = 3

# Old/small category labels -> the parent category they are folded into.
category_map = {
    "plasm-ph": "physics",
    "atom-ph": "physics",
    "chem-ph": "physics",
    "supr-con": "physics",
    "acc-phys": "physics",
    "cmp-lg": "math",
    "q-alg": "math",
    "alg-geom": "math",
    "dg-ga": "math",
    "funct-an": "math",
    "patt-sol": "math",
    "solv-int": "math",
    "adap-org": "math",
    "chao-dyn": "math",
    "ao-sci": "stat",
    "bayes-an": "stat",
    "mtrl-th": "cond-mat",
    "comp-gas": "cond-mat",
}

# Map smaller categories to their parent categories
# using https://arxiv.org/category_taxonomy
# Series.replace with a dict only rewrites cells whose whole value equals a key.
# NOTE(review): assumes each row's "categories" cell holds exactly one label
# -- confirm against the preprocessing step.
df["categories"] = df["categories"].replace(category_map)

# Number of rows to sample per category
n_per_category = 4500

# Generate `num_sample_csvs` files, each containing `n_per_category` samples
# for each category (20 main categories -> 90 000 entries per csv).
# The range end is `num_sample_csvs + 1` so that files 1..num_sample_csvs are
# all written; a bare `range(1, num_sample_csvs)` would stop one file short.
for i in range(1, num_sample_csvs + 1):
    # Sample every category separately with the file number as the seed, so
    # each file is reproducible and different files draw different rows.
    # Sampling the groups explicitly (instead of groupby().apply()) keeps the
    # "categories" column in the result regardless of pandas version.
    # DataFrame.sample raises ValueError if a category has fewer than
    # `n_per_category` rows, which keeps the output strictly balanced.
    per_category_samples = [
        group.sample(n=n_per_category, random_state=i)
        for _, group in df.groupby("categories")
    ]
    sampled = pd.concat(per_category_samples)

    out_file = f"arxiv-stratified-sample-{i}.csv"
    sampled.to_csv(out_file, index=False)
    print(f"Saved stratified sample {i} to {out_file}")
