In [2]:
################################################
#   I. Preprocessing                         ###
################################################



# 1.0 Importing libraries and data
import pandas as pd

# The input file is JSON Lines (one JSON record per line), hence lines=True
df_raw = pd.read_json(path_or_buf="original_data.json", lines=True)

In [3]:
# 1.1) The primary category is the first whitespace-separated token
#      of the 'categories' string
df_raw["primary_category"] = (
    df_raw["categories"]
    .str.split()
    .str.get(0)
)


# 1.2)  Extract the creation date from the *first* version (v1) 
def get_first_version_date(versions_list):
    """Return the ``"created"`` value of the first entry of ``versions_list``.

    Parameters
    ----------
    versions_list : list
        Expected to be a list of dicts, the first of which describes the
        first version. Any other input (``None``, NaN, a string, an empty
        list, ...) is treated as "no version information".

    Returns
    -------
    object or None
        Whatever is stored under the ``"created"`` key of the first entry,
        or ``None`` when ``versions_list`` is not a non-empty list, when its
        first entry is not a dict, or when that dict has no ``"created"`` key.
    """
    # Check the type before the truthiness: objects such as arrays or pd.NA
    # raise when evaluated as a boolean, and they are not valid input anyway.
    if not isinstance(versions_list, list) or not versions_list:
        return None

    first_version = versions_list[0]
    # A malformed record (e.g. [None] or ["v1"]) must not abort the whole
    # .apply() pass with an AttributeError; treat it as a missing date.
    if not isinstance(first_version, dict):
        return None

    return first_version.get("created")

# Pull the first-version creation string for every row and parse it in one
# step; values that cannot be parsed become NaT because of errors="coerce"
df_raw["created_date"] = pd.to_datetime(
    df_raw["versions"].apply(get_first_version_date),
    errors="coerce",
)

# 1.3) Derive the calendar year from the parsed timestamp
df_raw["year"] = df_raw["created_date"].dt.year



In [4]:
import pyreadr

# Read the R workspace file; the result is indexed by R object name
result = pyreadr.read_r("arxiv_cats.RData")

# Build the category lookup table from the "arxiv_cats" object.
# The three descriptive columns are removed in a single non-mutating call,
# so the frame stored inside `result` is left untouched.
df_cats_map = result["arxiv_cats"].drop(
    columns=["long_description", "short_description", "subfield"]
)

# Add a few rows manually because of the formatting in df_cats_map
# (bare "astro-ph" vs. dotted "astro-ph.CO"): these bare names would
# otherwise find no match in the lookup table.
new_rows = [
    {"category": "astro-ph", "field": "Physics"},
    {"category": "cond-mat", "field": "Physics"},
]

# Convert the list of dicts to a DataFrame
df_manual_rows = pd.DataFrame(new_rows)

# Append the manual rows, then keep exactly one row per category.
# The table is used as the right-hand side of a left merge on "category";
# a repeated key there would silently duplicate rows of the left frame.
df_cats_map = (
    pd.concat([df_cats_map, df_manual_rows], ignore_index=True)
    .drop_duplicates(subset="category", keep="first")
    .reset_index(drop=True)
)



In [5]:

# Left-join the category lookup (df_cats_map) onto df_raw:
# every row of df_raw is kept, matched via primary_category == category
df = df_raw.merge(
    df_cats_map,
    how="left",
    left_on="primary_category",
    right_on="category",
)

In [6]:
# For a leaner, easier-to-read frame, remove the columns listed below
df_final = df.drop(
    labels=[
        "submitter",
        "authors",
        "comments",
        "journal-ref",
        "doi",
        "report-no",
        "license",
        "versions",
        "update_date",
        "authors_parsed",
        "created_date",
    ],
    axis="columns",
)

In [7]:
##########################################
###### exporting df_final to pickle ######
##########################################

# Persist the trimmed frame as a pickle file
df_final.to_pickle(path="df_clean.pkl")
