In [1]:
import json
import gzip
import pandas as pd
import numpy as np

### load dataset with promising papers

In [2]:
import os.path

# Fetch the Papers-with-Code dump once and cache it next to the notebook.
if not os.path.exists(r'./papers-with-abstracts.json.gz'):
    import requests
    # Two sources are available: the upstream dump and the frozen snapshot
    # used for our experiments. The second assignment wins; comment it out
    # to download the upstream (current) dump instead.
    URL = "https://production-media.paperswithcode.com/about/papers-with-abstracts.json.gz"
    URL = "https://cloudstore.uni-ulm.de/s/CfX2AYANtf6HF6T/download/papers-with-abstracts.json.gz"
    response = requests.get(URL)
    # Fail loudly on an HTTP error instead of caching an error page as the
    # .gz file (which would then be reused on every later run).
    response.raise_for_status()
    with open("./papers-with-abstracts.json.gz", "wb") as out_file:
        out_file.write(response.content)

# Parse the gzip-compressed JSON; the context manager closes the handle.
with gzip.open(r'./papers-with-abstracts.json.gz') as f:
    papersAbstract = json.load(f)

### pre-filter papers (keep only entries with an arXiv id and a conference proceeding)

In [3]:
# Build the frame from the parsed JSON records and report its size.
df = pd.DataFrame(papersAbstract)
print('Papers_with_Code: ', len(df))

# Keep only rows that carry an arXiv id (arXiv papers should have LaTeX sources).
df = df[df['arxiv_id'].notna()]
print('With Laetx: ', len(df))

# Keep only rows that are associated with a conference proceeding.
df = df[df['proceeding'].notna()]
print('Papers with Conference', len(df))

# Show the first remaining row as a quick look at the columns.
df[:1]

Papers_with_Code:  307318
With Laetx:  250324
Papers with Conference 33575


Unnamed: 0,paper_url,arxiv_id,title,abstract,url_abs,url_pdf,proceeding,authors,tasks,date,methods
0,https://paperswithcode.com/paper/dynamic-netwo...,1805.10616,Dynamic Network Model from Partial Observations,Can evolving networks be inferred and modeled ...,http://arxiv.org/abs/1805.10616v4,http://arxiv.org/pdf/1805.10616v4.pdf,NeurIPS 2018 12,"[Elahe Ghalebi, Baharan Mirzasoleiman, Radu Gr...",[],2018-05-27,[]


### statistics about conferences

In [4]:
# Distinct values of the `proceeding` column; each label still contains its
# year (e.g. "NeurIPS 2018 12"), so the same venue shows up once per edition.
unique_conferences = pd.unique(df['proceeding'])
print(len(unique_conferences), 'unique conferences considering years')

def remove_digits(s):
    """Return `s` with every digit character removed and outer whitespace stripped."""
    without_digits = ''.join(ch for ch in s if not ch.isdigit())
    return without_digits.strip()
# Conferences without year: strip the digits from every proceeding label and
# count the distinct remaining names. A set is used instead of pd.unique()
# because passing a plain list to pd.unique() is deprecated in pandas 2.x.
conference_names_without_year = set(map(remove_digits, unique_conferences))
print(len(conference_names_without_year),'unique conferences without year',)

742 unique conferences considering years
422 unique conferences without year


In [5]:
# Fetch the CORE conference ranking table once and cache it next to the notebook.
if not os.path.exists(r'CORE_Conference_Rankings.csv'):
    import requests
    URL = "https://cloudstore.uni-ulm.de/s/tbWedrwZ4bTR34x/download/CORE_Conference_Rankings.csv"
    response = requests.get(URL)
    # Fail loudly on an HTTP error instead of caching an error page as the CSV.
    response.raise_for_status()
    with open("CORE_Conference_Rankings.csv", "wb") as out_file:
        out_file.write(response.content)

# Keep the acronyms of the "good" conferences, i.e. those whose Rank is in the
# list below. Widen the list (e.g. ['A*','A','B']) to include lower ranks.
conferences = (
    pd.read_csv(r'CORE_Conference_Rankings.csv')
    .loc[lambda ranking: ranking['Rank'].isin(['A*']), 'Acronym']
    .unique()
)
print(len(conferences),'good conferences')
#conferences

58 good conferences


### select only papers with relevant conferences

In [6]:
def is_renowend(proceeding,conf):
    """Tell whether `conf` occurs as a whole space-separated token of `proceeding`.

    The comparison is exact and case-sensitive: 'CV' does not match 'ICCV 2019'.
    """
    tokens = proceeding.split(' ')
    return conf in tokens

#short version
#df[df['proceeding'].apply(lambda x: is_renowend(x))]

#also get relevant conferences
# For every selected conference acronym, collect the arXiv ids whose
# proceeding label mentions it and report venues with more than 10 papers.
for conf in conferences:
    mentions_conf = df['proceeding'].apply(lambda label: is_renowend(label, conf))
    papers = sorted(df.loc[mentions_conf, 'arxiv_id'])
    if len(papers) > 10:
        print(conf,' ',len(papers))

NeurIPS   6420
ACL   2619
ECCV   1482
CVPR   6485
ICCV   2460
ICLR   3224
ICML   1584
AAAI   27


In [8]:
def is_renowend(proceeding):
    """Tell whether any acronym in the module-level `conferences` occurs as a
    whole space-separated token of `proceeding`.

    Note: this redefinition replaces the earlier two-argument `is_renowend`.
    """
    #conferences = ['RE']#['CCS']#TODO insert renowend conferences here
    return any(name in proceeding.split(' ') for name in conferences)
# Restrict the frame to papers whose proceeding names a selected conference,
# then display the result.
df = df[df['proceeding'].apply(is_renowend)]
df
#df[df['proceeding'].apply(lambda x: is_renowend(x))]

Unnamed: 0,paper_url,arxiv_id,title,abstract,url_abs,url_pdf,proceeding,authors,tasks,date,methods
0,https://paperswithcode.com/paper/dynamic-netwo...,1805.10616,Dynamic Network Model from Partial Observations,Can evolving networks be inferred and modeled ...,http://arxiv.org/abs/1805.10616v4,http://arxiv.org/pdf/1805.10616v4.pdf,NeurIPS 2018 12,"[Elahe Ghalebi, Baharan Mirzasoleiman, Radu Gr...",[],2018-05-27,[]
1,https://paperswithcode.com/paper/pac-bayes-bou...,1806.06827,PAC-Bayes bounds for stable algorithms with in...,PAC-Bayes bounds have been proposed to get ris...,http://arxiv.org/abs/1806.06827v2,http://arxiv.org/pdf/1806.06827v2.pdf,NeurIPS 2018 12,"[Omar Rivasplata, Emilio Parrado-Hernandez, Jo...",[],2018-06-18,"[{'name': 'SVM', 'full_name': 'Support Vector ..."
3,https://paperswithcode.com/paper/gradient-desc...,1802.06093,Gradient descent with identity initialization ...,We analyze algorithms for approximating a func...,http://arxiv.org/abs/1802.06093v4,http://arxiv.org/pdf/1802.06093v4.pdf,ICML 2018,"[Peter L. Bartlett, David P. Helmbold, Philip ...",[],2018-02-16,[]
11,https://paperswithcode.com/paper/bingan-learni...,1806.06778,BinGAN: Learning Compact Binary Descriptors wi...,"In this paper, we propose a novel regularizati...",http://arxiv.org/abs/1806.06778v5,http://arxiv.org/pdf/1806.06778v5.pdf,NeurIPS 2018 12,"[Maciej Zieba, Piotr Semberecki, Tarek El-Gaal...",[Dimensionality Reduction],2018-06-18,[]
17,https://paperswithcode.com/paper/a-memory-netw...,1805.02838,A Memory Network Approach for Story-based Temp...,We address the problem of story-based temporal...,http://arxiv.org/abs/1805.02838v3,http://arxiv.org/pdf/1805.02838v3.pdf,CVPR 2018,"[Sang-ho Lee, Jinyoung Sung, Youngjae Yu, Gunh...",[Video Summarization],2018-05-08,"[{'name': 'Memory Network', 'full_name': 'Memo..."
...,...,...,...,...,...,...,...,...,...,...,...
305109,https://paperswithcode.com/paper/how-much-more...,2207.01725,How Much More Data Do I Need? Estimating Requi...,Given a small training data set and a learning...,https://arxiv.org/abs/2207.01725v2,https://arxiv.org/pdf/2207.01725v2.pdf,CVPR 2022 1,"[Rafid Mahmood, James Lucas, David Acuna, Daiq...",[Autonomous Driving],2022-07-04,[]
305454,https://paperswithcode.com/paper/finding-falle...,2207.03483,Finding Fallen Objects Via Asynchronous Audio-...,The way an object looks and sounds provide com...,https://arxiv.org/abs/2207.03483v1,https://arxiv.org/pdf/2207.03483v1.pdf,CVPR 2022 1,"[Chuang Gan, Yi Gu, Siyuan Zhou, Jeremy Schwar...","[Imitation Learning, Object Localization]",2022-07-07,[]
306490,https://paperswithcode.com/paper/demystifying-...,2207.03574,Demystifying the Adversarial Robustness of Ran...,Neural networks' lack of robustness against at...,https://arxiv.org/abs/2207.03574v1,https://arxiv.org/pdf/2207.03574v1.pdf,AAAI Workshop AdvML 2022 2,"[Chawin Sitawarin, Zachary Golan-Strieb, David...","[Adversarial Robustness, Autonomous Vehicles]",2022-06-18,"[{'name': 'Softmax', 'full_name': 'Softmax', '..."
306662,https://paperswithcode.com/paper/gradual-domai...,2207.04587,Gradual Domain Adaptation without Indexed Inte...,The effectiveness of unsupervised domain adapt...,https://arxiv.org/abs/2207.04587v1,https://arxiv.org/pdf/2207.04587v1.pdf,NeurIPS 2021 12,"[Hong-You Chen, Wei-Lun Chao]","[Domain Adaptation, Unsupervised Domain Adapta...",2022-07-11,[]


In [9]:
#required to use the URL crawler for manual crawling
#df[['arxiv_id','url_abs']].to_csv(r'papers_to_crawl.csv',index=False)

In [10]:
# Sanity check of the arXiv ids (used to check the crawler's "." replacement):
# total count vs. distinct count, plus the first and last ids in sorted order.
papers = sorted(df['arxiv_id'])
print(len(papers))
print(len(set(papers)))
print(papers[:5])
print(papers[-5:])

24319
24319
['0906.2027', '0909.5457', '1003.0120', '1003.0783', '1006.3316']
['2207.01744', '2207.03483', '2207.03574', '2207.04587', '2207.05801']


### extract papers (source code) from aws dump 

In [11]:
import os
import tarfile

#copy arxiv files from aws-dump to local data
def get_relevant_gz_file(arxivTar,arxiv_id,
                         src_dir='/media/sdd/datasets/arxiv/src/',
                         out_dir='/media/sdd/project-scisen/dataset/raw_files-c'):
    """Extract the source archive of one paper from one tar file of the dump.

    The member looked up is '<prefix>/<arxiv_id>.gz', where <prefix> is the
    part of `arxiv_id` before the first '.'.

    Parameters
    ----------
    arxivTar : str
        File name of the tar archive inside `src_dir`.
    arxiv_id : str
        arXiv identifier such as '2101.00085'.
    src_dir : str, optional
        Directory that holds the tar archives of the dump.
    out_dir : str, optional
        Directory the member is extracted into (sub-folders are kept).

    Returns
    -------
    bool
        True if the member was extracted, False if it is not contained in
        this archive or could not be extracted.

    Raises
    ------
    OSError, tarfile.TarError
        If the tar archive itself cannot be opened.
    """
    archive_path = os.path.join(src_dir, arxivTar)
    member = arxiv_id.split('.')[0]+'/'+arxiv_id+'.gz'

    # The context manager closes the archive on every exit path.
    with tarfile.open(archive_path,"r") as file_obj:
        try:
            file_obj.extract(member,path=out_dir)
        except KeyError:
            # Expected case: the paper is simply not part of this tar file.
            return False
        except (tarfile.TarError, OSError) as err:
            # Stay best-effort, but report the problem instead of hiding it.
            print('Could not extract',member,'from',archive_path,':',err)
            return False
    return True

#get_relevant_gz_file('arXiv_src_2101_001.tar','2101.00085')
#get_relevant_gz_file(test[0],papers[0])

In [12]:
move_files = False #set this flag to True to extract files from aws dump
if move_files:
    # List the dump directory once; the listing was previously rebuilt for
    # every single paper although the loop never adds files to this directory.
    available_tars = os.listdir(r'/media/sdd/datasets/arxiv/src')
    for paper in papers:
        #get tarfiles whose name contains the id prefix (part before the '.')
        id_prefix = paper.split('.')[0]
        tarfiles = [x for x in available_tars if id_prefix in x]
        #simple progress indicator
        print(paper)
        # The paper sits in only one of the candidate archives, so every
        # candidate is tried; misses are handled inside the helper.
        for file in tarfiles:
            get_relevant_gz_file(file,paper)

# The counts below are hard-coded results of a manual run, not computed here.
print('Files that are tar files and no pdf',30717) #find . -type f | wc -l #manually in directory
print('Files that are tar files and no pdf A*',24165) #find . -type f | wc -l #manually in directory

Files that are tar files and no pdf 30717
Files that are tar files and no pdf A* 24165


### extract latex text from source files into a single file

In [13]:
# Directories handed to main.py below: abs_path_origin is listed for input
# entries (note the trailing '/', it is concatenated directly with the entry
# name), abs_path_target is passed as the -o argument.
abs_path_origin ='/media/sdd/project-scisen/dataset/raw_files-c/' 
abs_path_target ='/media/sdd/project-scisen/dataset/texfiles-c'

# Guard flag: leave False to skip the loop; set True to run main.py once per
# entry of abs_path_origin.
extract_files = False
if extract_files:
    for subdir in os.listdir(abs_path_origin):
        #execute Latex extraction script
        # IPython %run magic (only valid inside IPython/Jupyter); the {...}
        # placeholders are filled with the Python variables of the same name.
        # NOTE(review): main.py is not part of this notebook - presumably its
        # `expand` command merges a paper's LaTeX sources into one file; confirm.
        %run main.py -o {abs_path_target} expand -i {abs_path_origin}{subdir}
    
# The number below is a hard-coded literal, not computed in this cell.
print('Files converted to tex files',30505)

Files converted to tex files 30505
