# 1. Setup and Load Datasets

In [10]:
import pandas as pd
import numpy as np

# Define the paths to the raw data files
arxiv_path = './data/raw/raw_arxiv.csv'
semantic_scholar_path = './data/raw/raw_semantic_scholar.csv'

# Load the datasets into pandas DataFrames
try:
    arxiv_df = pd.read_csv(arxiv_path)
    print(f"Successfully loaded {len(arxiv_df)} records from ArXiv.")
except FileNotFoundError:
    arxiv_df = pd.DataFrame()

try:
    ss_df = pd.read_csv(semantic_scholar_path)
    print(f"Successfully loaded {len(ss_df)} records from Semantic Scholar.")
except FileNotFoundError:
    ss_df = pd.DataFrame()

Successfully loaded 85 records from ArXiv.
Successfully loaded 2043 records from Semantic Scholar.


In [11]:
arxiv_df.head()

Unnamed: 0,id,title,summary,authors,year,source
0,http://arxiv.org/abs/2409.13423v1,Causal Reinforcement Learning for Optimisation...,Autonomous operations of robots in unknown env...,"['Julian Gerald Dcruz', 'Sam Mahoney', 'Jia Yu...",2024-09-20T11:40:51Z,arxiv
1,http://arxiv.org/abs/2402.04869v2,Learning by Doing: An Online Causal Reinforcem...,As a key component to intuitive cognition and ...,"['Ruichu Cai', 'Siyang Huang', 'Jie Qiao', 'We...",2025-04-24T07:58:03Z,arxiv
2,http://arxiv.org/abs/2307.01452v2,Causal Reinforcement Learning: A Survey,Reinforcement learning is an essential paradig...,"['Zhihong Deng', 'Jing Jiang', 'Guodong Long',...",2023-11-21T03:43:15Z,arxiv
3,http://arxiv.org/abs/2412.05783v1,Two-way Deconfounder for Off-policy Evaluation...,This paper studies off-policy evaluation (OPE)...,"['Shuguang Yu', 'Shuxing Fang', 'Ruixin Peng',...",2024-12-08T02:28:58Z,arxiv
4,http://arxiv.org/abs/2302.13240v1,Q-Cogni: An Integrated Causal Reinforcement Le...,"We present Q-Cogni, an algorithmically integra...","['Cris Cunha', 'Wei Liu', 'Tim French', 'Ajmal...",2023-02-26T05:50:26Z,arxiv


In [12]:
ss_df.head()

Unnamed: 0,id,doi,title,summary,authors,citation_count,year,source
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,10.1109/ISIT.2014.6875397,Applications of information Nonanticipative Ra...,The objective of this paper is to further inve...,"['Photios A. Stavrou', 'C. Kourtellaris', 'C. ...",4,2014.0,semantic_scholar
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,10.1609/aaai.v34i10.7134,Explainable Agency in Reinforcement Learning A...,This thesis explores how reinforcement learnin...,['Prashan Madumal'],1,2020.0,semantic_scholar
2,01e9241dbb9eaca99b86468bb079f4b631b71671,10.48550/arXiv.2406.01065,Causal prompting model-based offline reinforce...,Model-based offline Reinforcement Learning (RL...,"['Xuehui Yu', 'Yi Guan', 'Rujia Shen', 'Xin Li...",0,2024.0,semantic_scholar
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,,Learning Causal Overhypotheses through Explora...,Despite recent progress in reinforcement learn...,"['Eliza Kosoy', 'Adrian Liu', 'Jasmine Collins...",9,2022.0,semantic_scholar
4,0348b36927f740b82f51afcd1c35cae8386bc336,10.1109/iv51971.2022.9827374,Segmented Encoding for Sim2Real of RL-based En...,Among the challenges in the recent research of...,"['Seung H. Chung', 'S. Kong', 'S. Cho', 'I. M....",3,2022.0,semantic_scholar


# 2. Standardize Schemas

In [13]:
# --- Standardize ArXiv DataFrame ---
if not arxiv_df.empty:
    # Rename the 'id' column to be more specific
    arxiv_df.rename(columns={'id': 'arxiv_id'}, inplace=True)
    # Add placeholder columns that exist in the Semantic Scholar data
    arxiv_df['doi'] = np.nan
    arxiv_df['citation_count'] = np.nan
    # Year as number
    arxiv_df["year"] = pd.to_datetime(arxiv_df["year"], errors="coerce").dt.year

arxiv_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   arxiv_id        85 non-null     object 
 1   title           85 non-null     object 
 2   summary         85 non-null     object 
 3   authors         85 non-null     object 
 4   year            85 non-null     int32  
 5   source          85 non-null     object 
 6   doi             0 non-null      float64
 7   citation_count  0 non-null      float64
dtypes: float64(2), int32(1), object(5)
memory usage: 5.1+ KB


In [14]:
# --- Standardize Semantic Scholar DataFrame ---
if not ss_df.empty:
    # Rename the 'id' column to avoid confusion
    ss_df.rename(columns={'id': 'paper_id_s2'}, inplace=True)
    # Add a placeholder for the arxiv_id
    ss_df['arxiv_id'] = np.nan
    # Year as number
    ss_df["year"] = pd.to_numeric(ss_df["year"], errors="coerce").astype("Int64")

ss_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2043 entries, 0 to 2042
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   paper_id_s2     2043 non-null   object 
 1   doi             1764 non-null   object 
 2   title           2043 non-null   object 
 3   summary         1888 non-null   object 
 4   authors         2043 non-null   object 
 5   citation_count  2043 non-null   int64  
 6   year            2004 non-null   Int64  
 7   source          2043 non-null   object 
 8   arxiv_id        0 non-null      float64
dtypes: Int64(1), float64(1), int64(1), object(6)
memory usage: 145.8+ KB


# 3. Combine and De-Duplicate

In [16]:
# Combine the two standardized dataframes into one
combined_df = pd.concat([arxiv_df, ss_df], ignore_index=True)
print(f"\nTotal records before de-duplication: {len(combined_df)}")

# --- Robust De-duplication ---
# Clean up the DOI column by removing any leading/trailing whitespace
combined_df['doi'] = combined_df['doi'].str.strip()

# Replace empty strings in 'doi' with NaN to handle them consistently
#combined_df['doi'].replace('', np.nan, inplace=True)
combined_df.loc[combined_df['doi'] == '', 'doi'] = np.nan

# Sort by citation count (descending) to keep the more cited version of a paper
# and by source to have a predictable order
combined_df.sort_values(by=['citation_count', 'source'], ascending=[False, True], inplace=True)

# First pass: Drop duplicates based on DOI for all papers that have one
final_df = combined_df.drop_duplicates(subset='doi', keep='first').copy()

# For entries that had no DOI (NaN), they are not de-duplicated yet.
# Second pass: De-duplicate the remaining entries based on a cleaned title.
# Create a temporary lowercase title for matching.
final_df['title_lower'] = final_df['title'].str.lower().str.strip()
final_df.drop_duplicates(subset='title_lower', keep='first', inplace=True)

# Clean up the temporary column
final_df = final_df.drop(columns=['title_lower'])

print(f"Total unique records after de-duplication: {len(final_df)}")
final_df.head()


Total records before de-duplication: 2128
Total unique records after de-duplication: 1758


Unnamed: 0,arxiv_id,title,summary,authors,year,source,doi,citation_count,paper_id_s2
399,,Decision Transformer: Reinforcement Learning v...,We introduce a framework that abstracts Reinfo...,"['Lili Chen', 'Kevin Lu', 'A. Rajeswaran', 'Ki...",2021,semantic_scholar,,1736.0,c1ad5f9b32d80f1c65d67894e5b8c2fdf0ae4500
1528,,Reinforcement learning for demand response: A ...,A need is identified to further explore reinfo...,"['José R. Vázquez-Canteli', 'Z. Nagy']",2019,semantic_scholar,10.1016/J.APENERGY.2018.11.002,600.0,648ea87fe7f99ca8ea5090cb1ba40242299ef4c4
1395,,Perceptual Learning Directs Auditory Cortical ...,The primary sensory cortex is positioned at a ...,"['D. Polley', 'Elizabeth E Steinberg', 'M. Mer...",2006,semantic_scholar,10.1523/JNEUROSCI.3771-05.2006,567.0,3fdfbf8de976c103ad0ee05cc9112fe8316fa342
328,,Reinforcement Knowledge Graph Reasoning for Ex...,Recent advances in personalized recommendation...,"['Yikun Xian', 'Zuohui Fu', 'S. Muthukrishnan'...",2019,semantic_scholar,10.1145/3331184.3331203,480.0,9a14989424b16a4685c43ffc8057b40157631dd2
1608,,Working-memory capacity protects model-based l...,It is found that stress response attenuates th...,"['A. R. Otto', 'Candace M. Raio', 'Alice Y. Ch...",2013,semantic_scholar,10.1073/pnas.1312011110,432.0,78b89e68ed84ef344f82922ea702a736a5791d0a


In [17]:
# --- Final Data Cleaning ---
# Ensure the summary column is a string
final_df['summary'] = final_df['summary'].astype(str)

# Remove rows where the summary is missing or very short (e.g., less than 20 characters)
initial_count = len(final_df)
final_df = final_df[final_df['summary'].str.len() > 20]
print(f"\nRemoved {initial_count - len(final_df)} records with missing or short summaries.")
print(f"Final corpus size: {len(final_df)}")

# --- Save the Master Corpus ---
master_path = './data/processed/master_corpus.csv'
final_df.to_csv(master_path, index=False)
print(f"\nMaster corpus of {len(final_df)} unique papers saved to {master_path}")

# Display the distribution of papers from each source
print("\nSource Distribution:")
print(final_df['source'].value_counts())


Removed 106 records with missing or short summaries.
Final corpus size: 1652

Master corpus of 1652 unique papers saved to ./data/processed/master_corpus.csv

Source Distribution:
source
semantic_scholar    1652
Name: count, dtype: int64


In [18]:
final_df.info()
final_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1652 entries, 399 to 2125
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   arxiv_id        0 non-null      object 
 1   title           1652 non-null   object 
 2   summary         1652 non-null   object 
 3   authors         1652 non-null   object 
 4   year            1649 non-null   Int64  
 5   source          1652 non-null   object 
 6   doi             1651 non-null   object 
 7   citation_count  1652 non-null   float64
 8   paper_id_s2     1652 non-null   object 
dtypes: Int64(1), float64(1), object(7)
memory usage: 130.7+ KB


Unnamed: 0,arxiv_id,title,summary,authors,year,source,doi,citation_count,paper_id_s2
399,,Decision Transformer: Reinforcement Learning v...,We introduce a framework that abstracts Reinfo...,"['Lili Chen', 'Kevin Lu', 'A. Rajeswaran', 'Ki...",2021,semantic_scholar,,1736.0,c1ad5f9b32d80f1c65d67894e5b8c2fdf0ae4500
1528,,Reinforcement learning for demand response: A ...,A need is identified to further explore reinfo...,"['José R. Vázquez-Canteli', 'Z. Nagy']",2019,semantic_scholar,10.1016/J.APENERGY.2018.11.002,600.0,648ea87fe7f99ca8ea5090cb1ba40242299ef4c4
1395,,Perceptual Learning Directs Auditory Cortical ...,The primary sensory cortex is positioned at a ...,"['D. Polley', 'Elizabeth E Steinberg', 'M. Mer...",2006,semantic_scholar,10.1523/JNEUROSCI.3771-05.2006,567.0,3fdfbf8de976c103ad0ee05cc9112fe8316fa342
328,,Reinforcement Knowledge Graph Reasoning for Ex...,Recent advances in personalized recommendation...,"['Yikun Xian', 'Zuohui Fu', 'S. Muthukrishnan'...",2019,semantic_scholar,10.1145/3331184.3331203,480.0,9a14989424b16a4685c43ffc8057b40157631dd2
1608,,Working-memory capacity protects model-based l...,It is found that stress response attenuates th...,"['A. R. Otto', 'Candace M. Raio', 'Alice Y. Ch...",2013,semantic_scholar,10.1073/pnas.1312011110,432.0,78b89e68ed84ef344f82922ea702a736a5791d0a
