In [61]:
import os
import json
import re
import pandas as pd

# links-between-papers-and-code.json

* contains repository URLs and frameworks

In [74]:
# Load the paper <-> code link dump into a DataFrame (one row per JSON record).
# The encoding is passed explicitly so parsing does not depend on the locale default.
with open('/home/ws/ys8950/dev/data/paperswithcode/data/links-between-papers-and-code.json', encoding='utf-8') as f:
    df_paper_code_links = pd.DataFrame(json.load(f))

In [297]:
# Show the first record to inspect the available columns.
df_paper_code_links.head(1)

Unnamed: 0,paper_url,paper_title,paper_arxiv_id,paper_url_abs,paper_url_pdf,repo_url,is_official,mentioned_in_paper,mentioned_in_github,framework
0,https://paperswithcode.com/paper/automatic-pos...,Automatic Post-Editing of Machine Translation:...,,https://aclanthology.org/D18-1341,https://aclanthology.org/D18-1341.pdf,https://github.com/trangvu/ape-npi,False,False,False,tf


In [302]:
# arXiv ID to repo URL export for npo
# Select rows and columns in a single `.loc` call and add the new column with
# `.assign`, so the result is an independent frame rather than a slice of
# `df_paper_code_links` (assigning a column on a slice triggers
# SettingWithCopyWarning and may silently not stick).
has_arxiv_id = df_paper_code_links['paper_arxiv_id'].notna()
df_arxiv_to_repo = df_paper_code_links.loc[
    has_arxiv_id, ['paper_arxiv_id', 'repo_url']
].assign(
    # File name of the full text: the arXiv ID with every '/' removed
    # (presumably for old-style IDs such as `hep-th/9901001` - confirm) plus '.txt'.
    full_text_fn=lambda links: links['paper_arxiv_id'].str.replace('/', '', regex=False) + '.txt'
)
df_arxiv_to_repo.to_csv('arXiv_fulltext_to_repo.tsv', sep='\t', index=False)

In [75]:
# Summary of the link dump: total rows, rows with an arXiv ID, and rows per framework.
framework_counts = df_paper_code_links['framework'].value_counts()
print(f'papers:\t\t\t{df_paper_code_links.shape[0]:,}')
print(f'papers w/ arXiv ID:\t{df_paper_code_links[df_paper_code_links.paper_arxiv_id.notna()].shape[0]:,}')
print('frameworks:\t\t{}'.format(
    '\n\t\t\t'.join(
        # `Series.iteritems()` was removed in pandas 2.0; `.items()` is the supported spelling.
        ['{}\t({:,})'.format(k, v) for k, v in framework_counts.items()]
    )
))

papers:			125,524
papers w/ arXiv ID:	119,903
frameworks:		none	(54,108)
			pytorch	(44,026)
			tf	(25,012)
			mxnet	(601)
			paddle	(534)
			torch	(505)
			caffe2	(371)
			jax	(367)


# methods.json

* paper-method associations are given in papers-with-abstracts.json

In [82]:
# Load the methods dump into a DataFrame (one row per JSON record).
# The encoding is passed explicitly so parsing does not depend on the locale default.
with open('/home/ws/ys8950/dev/data/paperswithcode/data/methods.json', encoding='utf-8') as f:
    df_methods = pd.DataFrame(json.load(f))

In [102]:
# Number of method records, followed by the first record as a column preview.
print('methods: {:,}'.format(len(df_methods)))
df_methods.head(1)

methods: 1,802


Unnamed: 0,url,name,full_name,description,paper,introduced_year,source_url,source_title,code_snippet_url,collections
0,https://paperswithcode.com/method/densenas-a,DenseNAS-A,DenseNAS-A,**DenseNAS-A** is a mobile convolutional neura...,densely-connected-search-space-for-more,2000,,,https://github.com/JaminFong/DenseNAS/blob/e47...,[{'collection': 'Convolutional Neural Networks...


# datasets.json

In [116]:
# Load the datasets dump into a DataFrame (one row per JSON record).
# The encoding is passed explicitly so parsing does not depend on the locale default.
with open('/home/ws/ys8950/dev/data/paperswithcode/data/datasets.json', encoding='utf-8') as f:
    df_datasets = pd.DataFrame(json.load(f))

In [103]:
# Number of dataset records, followed by the first record as a column preview.
print('datasets: {:,}'.format(len(df_datasets)))
df_datasets.head(1)

datasets: 5,124


Unnamed: 0,url,name,full_name,homepage,description,paper,introduced_date,warning,modalities,tasks,languages,variants,num_papers,data_loaders
0,https://paperswithcode.com/dataset/mnist,MNIST,,http://yann.lecun.com/exdb/mnist/,The **MNIST** database (**Modified National In...,{'title': 'Gradient-based learning applied to ...,,,[Images],"[{'task': 'Image Classification', 'url': 'http...",[],"[USPS-to-MNIST, MNIST-to-USPS, Rotating MNIST,...",4673,[{'url': 'https://huggingface.co/datasets/mnis...


# papers-with-abstracts.json

* contains for each paper
    * arXiv ID
    * methods
    * tasks
* does not contain
    * datasets

In [107]:
# Load the papers-with-abstracts dump into a DataFrame (one row per JSON record).
# The encoding is passed explicitly so parsing does not depend on the locale default.
with open('/home/ws/ys8950/dev/data/paperswithcode/data/papers-with-abstracts.json', encoding='utf-8') as f:
    df_paper_abstracts = pd.DataFrame(json.load(f))

In [214]:
# Show the first record to inspect the available columns.
df_paper_abstracts.head(1)

Unnamed: 0,paper_url,arxiv_id,title,abstract,url_abs,url_pdf,proceeding,authors,tasks,date,methods
0,https://paperswithcode.com/paper/dynamic-netwo...,1805.10616,Dynamic Network Model from Partial Observations,Can evolving networks be inferred and modeled ...,http://arxiv.org/abs/1805.10616v4,http://arxiv.org/pdf/1805.10616v4.pdf,NeurIPS 2018 12,"[Elahe Ghalebi, Baharan Mirzasoleiman, Radu Gr...",[],2018-05-27,[]


In [96]:
# Total number of paper records and how many of them carry an arXiv ID.
total_papers = len(df_paper_abstracts)
papers_with_arxiv_id = int(df_paper_abstracts['arxiv_id'].notna().sum())
print('papers:\t\t\t{:,}'.format(total_papers))
print('papers w/ arXiv ID:\t{:,}'.format(papers_with_arxiv_id))

papers:			260,382
papers w/ arXiv ID:	210,123


# evaluation-tables.json

In [109]:
# Load the evaluation tables dump into a DataFrame (one row per JSON record).
# The encoding is passed explicitly so parsing does not depend on the locale default.
with open('/home/ws/ys8950/dev/data/paperswithcode/data/evaluation-tables.json', encoding='utf-8') as f:
    df_eval_tables = pd.DataFrame(json.load(f))

In [120]:
# Row count of the evaluation tables, followed by the first record as a column preview.
# NOTE(review): the rows look like tasks (there is a `task` column) rather than
# papers, so the "papers" label below may be misleading - confirm before renaming.
print('papers:\t\t\t{:,}'.format(len(df_eval_tables)))
df_eval_tables.head(1)

papers:			1,280


Unnamed: 0,categories,subtasks,source_link,task,description,datasets,synonyms
0,"[Methodology, Natural Language Processing, Com...","[{'categories': ['Methodology', 'Natural Langu...",,Optical Character Recognition,Optical character recognition or optical chara...,[{'dataset_links': [{'url': 'https://paperswit...,[]


‌  
‌  

### what info is where

* arXiv ID → method: `papers-with-abstracts`
* arXiv ID → task: `papers-with-abstracts`
* arXiv ID → dataset: `(not part of the official dump; crawled manually, see "crawled dataset paper associations" below)`


* framework: `links-between-papers-and-code`
* repo URLs: `links-between-papers-and-code`

‌  
‌  
‌  

# Crawled dataset–paper associations

In [121]:
# Load the manually crawled dataset -> papers mapping.
# The encoding is passed explicitly so parsing does not depend on the locale default.
with open('/home/ws/ys8950/dev/data/paperswithcode/data/crawled-dataset-papers.json', encoding='utf-8') as f:
    dataset_papers = json.load(f)
# Structure, as consumed by the reverse-mapping code that iterates each value
# as a list of paper dicts:
# {
#     <dataset-url>:
#         [
#             {
#                 'url': <paper-path/slug>,
#                 ...
#             },
#             ...
#         ],
#     ...
# }

In [163]:
# Invert the crawled mapping: paper slug -> list of dataset slugs.
# URL prefixes are stripped so both sides are keyed by bare slugs.
paper_datasets = {}
for dataset_url, paper_list in dataset_papers.items():
    # The dataset slug is the same for every paper of this dataset.
    dataset_slug = dataset_url.replace('https://paperswithcode.com/dataset/', '')
    for paper in paper_list:
        paper_slug = paper['url'].replace('/paper/', '')
        paper_datasets.setdefault(paper_slug, []).append(dataset_slug)

In [209]:
# extend paper abstracts dataframe

# Index the dataset records by URL once. The previous implementation filtered
# the whole `df_datasets` frame with a boolean mask for every (paper, dataset)
# pair, which is needlessly slow on the full paper list.
dataset_record_by_url = {}
for dataset_record in df_datasets.to_dict(orient='records'):
    # `setdefault` keeps the first record per URL, matching the earlier `.iloc[0]` lookup.
    dataset_record_by_url.setdefault(dataset_record['url'], dataset_record)


def add_dataset_info(row):
    """Attach the full dataset records used by a paper to its row.

    Parameters
    ----------
    row : pandas.Series
        One row of `df_paper_abstracts`; only `row['paper_url']` is read.

    Returns
    -------
    pandas.Series
        The same row with an added `datasets` entry: a list of dataset record
        dicts, empty when no crawled association exists for the paper.

    Raises
    ------
    KeyError
        If a crawled dataset slug has no matching URL in `df_datasets`
        (the key in the error is the dataset URL that could not be found).
    """
    paper_slug = row['paper_url'].replace('https://paperswithcode.com/paper/', '')
    row['datasets'] = [
        # Shallow copy so every paper gets its own dict, as `Series.to_dict()` did before.
        dict(dataset_record_by_url['https://paperswithcode.com/dataset/' + slug])
        for slug in paper_datasets.get(paper_slug, [])
    ]
    return row

df_paper_abstracts_extended = df_paper_abstracts.apply(add_dataset_info, axis=1)

In [212]:
# Coverage statistics: how many papers list tasks, methods and/or datasets.
entity_columns = ['tasks', 'methods', 'datasets']

# One boolean mask per entity column: True where the paper lists at least one entry.
# Computed once and reused for every combination printed below.
non_empty = {
    column: df_paper_abstracts_extended[column].apply(len) > 0
    for column in entity_columns
}

print('papers:\t\t\t\t{:,}'.format(len(df_paper_abstracts_extended)))

# Papers with at least one entry per single entity type.
for column in entity_columns:
    print('papers w/ {}:\t\t{:,}'.format(column, int(non_empty[column].sum())))

# Papers with entries for each ordered pair of distinct entity types
# (both orders are printed, so every pair appears twice).
for first in entity_columns:
    for second in entity_columns:
        if first != second:
            both = non_empty[first] & non_empty[second]
            print('papers w/ {} & {}:\t{:,}'.format(first, second, int(both.sum())))

all_three = non_empty['tasks'] & non_empty['methods'] & non_empty['datasets']
print('papers w/ all three:\t\t{:,}'.format(int(all_three.sum())))

any_of_three = non_empty['tasks'] | non_empty['methods'] | non_empty['datasets']
print('papers w/ any:\t\t\t{:,}'.format(int(any_of_three.sum())))

papers:				260,382
papers w/ tasks:		159,351
papers w/ methods:		42,119
papers w/ datasets:		69,829
papers w/ tasks & methods:	30,854
papers w/ tasks & datasets:	56,717
papers w/ methods & tasks:	30,854
papers w/ methods & datasets:	17,270
papers w/ datasets & tasks:	56,717
papers w/ datasets & methods:	17,270
papers w/ all three:		14,322
papers w/ any:			180,780


In [210]:
# with open('/home/ws/ys8950/dev/data/paperswithcode/data/papers-with-abstracts-extended.json', 'w') as f:
#     df_paper_abstracts_extended.to_json(f, orient='records')

# How to construct relations between entities used in papers?

In [216]:
# Keep only the papers that list at least one task, one method and one dataset.
has_tasks = df_paper_abstracts_extended['tasks'].map(len) > 0
has_methods = df_paper_abstracts_extended['methods'].map(len) > 0
has_datasets = df_paper_abstracts_extended['datasets'].map(len) > 0
haveitall = df_paper_abstracts_extended[has_tasks & has_methods & has_datasets]
haveitall.head(3)

Unnamed: 0,paper_url,arxiv_id,title,abstract,url_abs,url_pdf,proceeding,authors,tasks,date,methods,datasets
15,https://paperswithcode.com/paper/modularity-ma...,1806.06765,Modularity Matters: Learning Invariant Relatio...,We focus on two supervised visual reasoning ta...,http://arxiv.org/abs/1806.06765v1,http://arxiv.org/pdf/1806.06765v1.pdf,,"[Jason Jo, Vikas Verma, Yoshua Bengio]","[Relational Reasoning, Visual Reasoning]",2018-06-18,"[{'name': 'Average Pooling', 'full_name': 'Ave...",[{'url': 'https://paperswithcode.com/dataset/m...
17,https://paperswithcode.com/paper/a-memory-netw...,1805.02838,A Memory Network Approach for Story-based Temp...,We address the problem of story-based temporal...,http://arxiv.org/abs/1805.02838v3,http://arxiv.org/pdf/1805.02838v3.pdf,CVPR 2018,"[Sang-ho Lee, Jinyoung Sung, Youngjae Yu, Gunh...",[Video Summarization],2018-05-08,"[{'name': 'Memory Network', 'full_name': 'Memo...",[{'url': 'https://paperswithcode.com/dataset/i...
30,https://paperswithcode.com/paper/on-enhancing-...,1806.06626,On Enhancing Speech Emotion Recognition using ...,Generative Adversarial Networks (GANs) have ga...,http://arxiv.org/abs/1806.06626v1,http://arxiv.org/pdf/1806.06626v1.pdf,,"[Saurabh Sahu, Rahul Gupta, Carol Espy-Wilson]","[Emotion Recognition, Speech Emotion Recognition]",2018-06-18,"[{'name': 'Convolution', 'full_name': 'Convolu...",[{'url': 'https://paperswithcode.com/dataset/i...


In [225]:
# Take the first fully annotated paper and list its dataset names, tasks and method names.
hasitall = haveitall.iloc[0]
# https://paperswithcode.com/paper/modularity-matters-learning-invariant
dataset_names = [dataset['name'] for dataset in hasitall['datasets']]
print(dataset_names)
print(hasitall['tasks'])
method_names = [method['name'] for method in hasitall['methods']]
print(method_names)

['MNIST', 'SHAPES']
['Relational Reasoning', 'Visual Reasoning']
['Average Pooling', 'ReLU', '1x1 Convolution', 'Batch Normalization', 'Bottleneck Residual Block', 'Global Average Pooling', 'Residual Block', 'Kaiming Initialization', 'Max Pooling', 'Residual Connection', 'Convolution', 'ResNet']


In [227]:
# For each dataset of the example paper, print its name and the tasks it is listed under.
for dataset in hasitall['datasets']:
    print(dataset['name'])
    dataset_task_names = [entry['task'] for entry in dataset['tasks']]
    print(dataset_task_names)
# -> if dataset task in paper's used tasks, create triple (task, performed_using, dataset)

MNIST
['Image Classification', 'Image Generation', 'Text Classification', 'Speech Recognition', 'Domain Adaptation', 'Graph Classification', 'Anomaly Detection', 'Image Clustering', 'Fine-Grained Image Classification', 'Feature Selection', 'Neural Architecture Search', 'Density Estimation', 'Continual Learning', 'Core set discovery', 'Stochastic Optimization', 'Unsupervised Anomaly Detection', 'Adversarial Defense', 'Video Prediction', 'Token Classification', 'Sequence-to-sequence Language Modeling', 'Unsupervised Image Classification', 'Network Pruning', 'Classification with Binary Weight Network', 'Sequential Image Classification', 'Continuously Indexed Domain Adaptation', 'Unsupervised Image-To-Image Translation', 'Hard-label Attack', 'Structured Prediction', 'One-Shot Learning', 'Handwritten Digit Recognition', 'Unsupervised MNIST', 'Rotated MNIST', 'Superpixel Image Classification', 'Summarization', 'NER', 'POS', 'SENTER', 'Iloko Speech Recognition']
SHAPES
['Question Answering', 

In [292]:
# nice result for https://paperswithcode.com/paper/glomo-unsupervisedly-learned-relational
# ('Image Classification', 'performed_using', 'ImageNet')
# ('Question Answering', 'performed_using', 'SQuAD')
# ('Natural Language Inference', 'performed_using', 'SNLI')
# ('Sentiment Analysis', 'performed_using', 'IMDb Movie Reviews')
# - - - - -
# tasks not mached: ['Transfer Learning', 'Word Embeddings']
# datasets not mached: []
# - - - - -


# Build (task, 'performed_using', dataset) triples for one example paper: a
# triple is created whenever a task listed for one of the paper's datasets is
# also one of the paper's own tasks.
PAPER_POSITION = 2  # zero-based position in `haveitall` of the paper to inspect

# Slicing (instead of a manual skip counter plus `break`) selects exactly that
# row, and simply yields nothing if `haveitall` is shorter.
for _, ppr in haveitall.iloc[PAPER_POSITION:PAPER_POSITION + 1].iterrows():
    triples = []
    tasks_matched = set()
    datasets_matched = set()
    debug_result = 'matched:\n'
    paper_tasks = set(ppr.tasks)  # set membership instead of repeated list scans
    for d in ppr.datasets:
        for t in d['tasks']:
            if t['task'] in paper_tasks:
                datasets_matched.add(d['name'])
                tasks_matched.add(t['task'])
                triple = (t['task'], 'performed_using', d['name'])
                triples.append(triple)
                debug_result += '{}\n'.format(triple)
    debug_result += '- - - - -\n'
    # `sorted` makes the report reproducible; the order of `list(set(...))`
    # can change between kernel sessions because string hashing is randomized.
    debug_result += 'tasks not mached: {}\n'.format(
        sorted(paper_tasks - tasks_matched)
    )
    debug_result += 'datasets not mached: {}\n'.format(
        sorted({d['name'] for d in ppr.datasets} - datasets_matched)
    )
    print(debug_result)
    # Idea kept for later: relate every method of the paper to every task.
    # for m in ppr.methods:
    #     for t in ppr.tasks:
    #         triples.append(
    #             (m['name'], 'used_for', t)
    #         )
    #         # print((m['name'], 'used_for', t))

matched:
('Speech Emotion Recognition', 'performed_using', 'IEMOCAP')
- - - - -
tasks not mached: ['Emotion Recognition']
datasets not mached: []

