# Dataset Merge
As a first step, we load the two datasets into memory and join them. Additionally we drop some attributes and rows to reduce the overall size of the dataset to a manageable size.

In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from pathlib import Path

## Load Datasets

### Loading and cleaning the arXiv dataset

#### Loading from parquet

In [2]:
arxiv_raw = dd.read_parquet('dataset/arxiv_raw.parquet')
arxiv_raw.head()

Unnamed: 0_level_0,authors,title,comments,journal-ref,categories,abstract,update_date,authors_parsed
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.1007/s00410-013-0898-x,"Denton S. Ebel, Richard O. Sack",Djerfisherite: Nebular Source of Refractory Po...,"18 pages, 5 figures, 4 tables",Contributions to Mineralogy and Petrology 166:...,astro-ph.EP astro-ph.IM physics.geo-ph,Djerfisherite is an important carrier of pot...,2023-08-08,"[['Ebel', 'Denton S.', ''], ['Sack', 'Richard ..."
0.1007/s10035-020-01057-3,"Patrick Richard, Riccardo Artoni, Alexandre Va...",Influence of lateral confinement on granular f...,,"Granular Matter, Springer Verlag, 2020, 22 (4)",cond-mat.soft,The properties of confined granular flows ar...,2020-09-28,"[['Richard', 'Patrick', '', 'IPR'], ['Artoni',..."
0.1007/s10092-022-00484-3,"Rima Khouja (AROMATH), Bernard Mourrain (AROMA...",Newton-Type Methods For Simultaneous Matrix Di...,"Calcolo, Springer Verlag, 2022",,math.NA cs.NA,This paper proposes a Newton-type method to ...,2022-11-07,"[['Khouja', 'Rima', '', 'AROMATH'], ['Mourrain..."
0.1007/s10652-005-0611-3,"Victor S. L'vov, Anna Pomyalov, Vasil Tiberkevich",Simple analytical model for entire turbulent b...,"14 pages, 5 figures, included, Enviromental fl...","Environmental Fluid Mechanics, v. 5, 373-386 (...",nlin.CD,We discuss a simple analytical model of the ...,2007-05-23,"[[""L'vov"", 'Victor S.', ''], ['Pomyalov', 'Ann..."
0.1007/s10659-021-09853-5,"Boris Kolev (LMT), Rodrigue Desmorat (LMT)",An intrinsic geometric formulation of Hyper-el...,,"Journal of Elasticity, Springer Verlag",physics.class-ph,"Isotropic hyper-elasticity, altogether with ...",2021-08-31,"[['Kolev', 'Boris', '', 'LMT'], ['Desmorat', '..."


#### Reloading from scratch

In [2]:
ARXIV_RAW_PATH = Path("dataset/arxiv-metadata-oai-snapshot.json")
arxiv_raw = dd.read_json(
    ARXIV_RAW_PATH,
    lines=True,
    blocksize=1e6,
    dtype={"id": "str", "license": "str"},
)
display(arxiv_raw.head())
display(arxiv_raw.describe())

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[['Balázs', 'C.', ''], ['Berger', 'E. L.', '']..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[['Streinu', 'Ileana', ''], ['Theran', 'Louis'..."
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[['Pan', 'Hongjun', '']]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[['Callan', 'David', '']]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[['Abu-Shammala', 'Wael', ''], ['Torchinsky', ..."


Unnamed: 0_level_0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
,string,string,string,string,string,string,string,string,string,string,string,string,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...,...


**Shrinking the arXiv dataset**
First we drop all entries without a doi since we cannot merge them with the citation data.

Then, we drop the following attributes:
- `id`: We do not need the id since we have the doi
- `versions`: We do not need the version since the version number has no meaning for new publications
- `license`: The license should have no impact on the content of the paper
- `report-no`: The report number should have no impact on the content of the paper
- `submitter`: Submitter is not relevant to the content of the paper

In [3]:
# Remove entries without DOI
arxiv_raw = arxiv_raw.dropna(subset=["doi"])

# Drop attributes
arxiv_raw = arxiv_raw.drop(["versions", "license", "report-no", "id", "submitter"], axis=1)

arxiv_raw = arxiv_raw.set_index("doi")
arxiv_raw.to_parquet("dataset/arxiv_raw.parquet")

In [4]:
arxiv_raw.head()

KeyboardInterrupt: 

### Loading and cleaning the citation dataset

#### Loading from parquet

In [3]:
citation_raw = dd.read_parquet('dataset/citation_raw.parquet')
citation_raw.head()

Unnamed: 0_level_0,citation_count
doi,Unnamed: 1_level_1
10.10.18045/zbefri.2015.2.207,1
10.1000/182,3
10.1000/287,1
10.1000/res#test,1
10.1001,6


#### Reloading dataset from scratch

In [4]:
CITATION_RAW_PATH = Path("dataset/coci-citation-count-2023-01-05.csv")
citation_raw = dd.read_csv(CITATION_RAW_PATH, blocksize=1e6)
citation_raw = citation_raw.rename(columns={"id": "doi"})
citation_raw = citation_raw.set_index("doi")
citation_raw.to_parquet("dataset/citation_raw.parquet")

In [6]:
citation_raw.head()

Unnamed: 0_level_0,citation_count
doi,Unnamed: 1_level_1
10.10.18045/zbefri.2015.2.207,1
10.1000/182,3
10.1000/287,1
10.1000/res#test,1
10.1001,6


### Joining the two datasets

#### Loading from parquet

In [7]:
dataset_raw = dd.read_parquet('dataset/dataset_raw.parquet')
dataset_raw.head()

Unnamed: 0_level_0,authors,title,comments,journal-ref,categories,abstract,update_date,authors_parsed,citation_count
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10.1002/adfm.202001307,"Qinghua Zhao, Wanqi Jie, Tao Wang, Andres Cast...",InSe Schottky diodes based on van der Waals co...,"25 pages, 13 figures, Main text and Supporting...","Advanced Functional Materials, 30, 2001307 (2020)",cond-mat.mes-hall cond-mat.mtrl-sci,Two-dimensional semiconductors are excellent...,2020-07-30,"[['Zhao', 'Qinghua', ''], ['Jie', 'Wanqi', '']...",33
10.1002/cphc.200900857,"Haifeng Ma, Thomas Brugger, Simon Berner, Yun ...",Boron Nitride Nanomesh: A template for Nano-ice,4 figures,"ChemPhysChem 2010, 11, 399",cond-mat.mtrl-sci cond-mat.mes-hall,Using variable temperature scanning tunnelin...,2010-02-04,"[['Ma', 'Haifeng', ''], ['Brugger', 'Thomas', ...",33
10.1002/prop.200710532,"Milovan Vasilic, Marko Vojinovic",Interaction of the Particle with the String in...,"Proceedings of the BW2007 conference, 5 pages","Fortsch.Phys.56:542,2008",gr-qc hep-th,Within the framework of generalized Papapetr...,2015-05-20,"[['Vasilic', 'Milovan', ''], ['Vojinovic', 'Ma...",1
10.1007/978-3-030-30493-5_44,"Itay Mosafi, Eli David, Nathan S. Netanyahu",DeepMimic: Mentor-Student Unlabeled Data Based...,,International Conference on Artificial Neural ...,cs.LG cs.NE stat.ML,"In this paper, we present a deep neural netw...",2019-12-03,"[['Mosafi', 'Itay', ''], ['David', 'Eli', ''],...",0
10.1007/lrr-2015-1,"Vitor Cardoso, Leonardo Gualtieri, Carlos Herd...",Exploring New Physics Frontiers Through Numeri...,"156 pages, 21 figures. Published in Living Rev...",,gr-qc astro-ph.HE hep-ph hep-th,The demand to obtain answers to highly compl...,2015-11-11,"[['Cardoso', 'Vitor', ''], ['Gualtieri', 'Leon...",55


In [8]:
display(dataset_raw.shape[0].compute())

573904

#### Reloading from scratch

In [4]:
display(citation_raw.shape[0].compute())
display(arxiv_raw.shape[0].compute())

77045952

1142334

In [5]:
dataset_raw = arxiv_raw.join(citation_raw, how="inner")

In [6]:
# Save the final dataset
dataset_raw.to_parquet("dataset/dataset_raw.parquet")