# Exploring a paper + repository
https://elifesciences.org/articles/67509 (PMID: 34409937)

In [1]:
import re
import os
import mimetypes
import tempfile
import subprocess

from skimage.metrics import structural_similarity as ssim
import requests
import markdown
import lxml.html
import pdf2image
from PIL import Image
import numpy as np

In [2]:
# Fetch the article from the eLife API; its 'body' entry is what the rest of
# the notebook walks as a list of content blocks.
PAPER_URL = 'https://api.elifesciences.org/articles/67509'
response = requests.get(PAPER_URL)
# Stop here on an HTTP error instead of failing later with a confusing
# JSON/KeyError on an error page.
response.raise_for_status()
article = response.json()['body']

In [3]:
# TODO: Only matches main repo, not branches, etc.
# The dot in "github.com" is escaped so that it only matches a literal dot.
_GITHUB_REPO_RE = re.compile(r'https://github\.com/([a-zA-Z0-9-]+)/([a-zA-Z0-9_\-]+)\b')


def find_github_repo(blocks):
    """Return ``(user, repo)`` for the first GitHub repository URL found in *blocks*.

    *blocks* is a list of dicts with a ``'type'`` key. Blocks of type
    ``'paragraph'`` are searched through their ``'text'``; blocks of type
    ``'section'`` are searched recursively through their ``'content'``.
    All other block types (figures, etc.) are ignored.

    Returns ``None`` when no repository URL is found.
    """
    # TODO: assumes only a single repository in the text
    for block in blocks:
        block_type = block['type']
        if block_type == 'paragraph':
            match = _GITHUB_REPO_RE.search(block['text'])
            if match:
                return match.group(1), match.group(2)
        elif block_type == 'section':
            result = find_github_repo(block['content'])
            if result:
                return result
    return None

repo_info = find_github_repo(article)
if repo_info is None:
    # Unpacking None directly would only produce an opaque
    # "cannot unpack non-iterable NoneType" TypeError.
    raise ValueError('No GitHub repository URL found in the article text')
user, repo = repo_info

# Get the filenames from the github repo (no longer necessary, we clone the repo anyway)
def get_repo_filenames(user, repo, branch='master'):
    """List the directories and files of a GitHub repository via the REST API.

    Parameters
    ----------
    user, repo : str
        Owner and name of the repository.
    branch : str, optional
        Branch (or other tree reference) to list. Defaults to ``'master'``,
        which was previously hard-coded.

    Returns
    -------
    (dirnames, filenames)
        ``dirnames`` is a list of directory paths; ``filenames`` is a list of
        ``(directory, basename)`` tuples, one per file.

    Raises
    ------
    requests.HTTPError
        If the API request fails (e.g. unknown repository or branch).
    """
    response = requests.get(f'https://api.github.com/repos/{user}/{repo}/git/trees/{branch}?recursive=1')
    response.raise_for_status()
    tree = response.json()['tree']
    dirnames = [entry['path'] for entry in tree if entry['type'] == 'tree']
    # os.path.split yields the same (dirname, basename) pair in a single call
    filenames = [os.path.split(entry['path']) for entry in tree if entry['type'] == 'blob']
    return dirnames, filenames

In [4]:
# Shallow-clone the repository into a fresh temporary directory.
tmp_dir = tempfile.mkdtemp()
# check=True: raise CalledProcessError if the clone fails, instead of carrying
# on with an empty directory.
subprocess.run(['git', 'clone', '--depth', '1', f'https://github.com/{user}/{repo}'],
               cwd=tmp_dir, check=True)

CompletedProcess(args=['git', 'clone', '--depth', '1', 'https://github.com/vsbuffalo/paradox_variation'], returncode=0)

In [5]:
def calc_hash(filename):  # (no longer used, not working well)
    """Return a hash string for the image stored in *filename*, or None.

    Files ending in ``.pdf`` are converted with ``pdf2image`` and are only
    accepted if they consist of exactly one page; everything else is opened
    with PIL. None is returned when the file cannot be loaded that way.

    The result is the average hash (hash_size=16) followed by the colour hash.
    """
    # NOTE(review): `imagehash` is not among the imports at the top of the
    # notebook, so calling this presumably raises NameError -- confirm.
    if not filename.endswith('.pdf'):
        try:
            picture = Image.open(filename)
        except Exception:
            print('Cannot calculate hash for file', filename)  # e.g. svg files
            return None
    else:
        try:
            pages = pdf2image.convert_from_path(filename)
        except Exception:
            print('PDF conversion failed for file', filename)
            return None
        if len(pages) != 1:  # not a single page, maybe a paper?
            return None
        (picture,) = pages

    average_part = imagehash.average_hash(picture, hash_size=16)
    colour_part = imagehash.colorhash(picture)
    return str(average_part) + str(colour_part)

def get_image(filename):
    """Load *filename* as a single image, or return None if that is not possible.

    Files ending in ``.pdf`` go through ``pdf2image.convert_from_path`` and
    are only accepted when the conversion yields exactly one page. All other
    files are opened with ``PIL.Image.open``. Any failure is reported on
    stdout and results in None.
    """
    if not filename.endswith('.pdf'):
        # Regular image formats are handed straight to PIL
        try:
            return Image.open(filename)
        except Exception:
            print('Cannot open file', filename)  # e.g. svg files
            return None

    try:
        pages = pdf2image.convert_from_path(filename)
    except Exception:
        print('PDF conversion failed for file', filename)
        return None
    # not a single page, maybe a paper?
    return pages[0] if len(pages) == 1 else None

In [6]:
manifest = {}  # The main data structure storing all the information about our files

# Walk the cloned repository and register every file under its path relative
# to the repository root, together with its guessed MIME type.
base_path = os.path.join(tmp_dir, repo)
for root, dirs, files in os.walk(base_path):
    # Prune git's metadata directory in place so os.walk never descends into
    # it. An exact name match keeps directories such as ".github", which a
    # substring test on the path would drop as well.
    dirs[:] = [d for d in dirs if d != '.git']
    for fname in files:
        fullname = os.path.join(root, fname)
        mime_type, _ = mimetypes.guess_type(fname, strict=False)
        manifest[os.path.relpath(fullname, base_path)] = {'type': mime_type, 'references': []}

The filenames in the repository are the basic unit that we'll display in a graph. In the following, the main data structure is a dictionary, the "manifest", with the filename as the key, and a dictionary with additional information as the value. The information dictionary can hold the following entries (`inputs`, `outputs` and `dependencies` are currently only filled in for R scripts):

* `inputs`: list of files that go into this file (e.g. loaded csv files)
* `outputs`: list of files that are produced by this file (e.g. saved figures)
* `dependencies`: other source files used by this file (e.g. via import)
* `type`: the mime type of the file
* `references`: list of references to this file (textual references, not inputs/outputs/dependencies)

Each reference is given as a dictionary. Currently two types of references are provided: references in README files, and references in the publication. A reference can have the following entries:
* `origin`: the name of the README file or a link to the article
* `context`: the surrounding text of the reference
* `section`: the section of the document where the reference is
* `label`: for figures matched against the publication, the figure label (stored instead of `section`)


## Extracting information from the elife article

The text might refer to specific files, and some of the figures in the paper might be present in the repository.

In [7]:
# Find references to filenames and compare figures in elife articles

# Simple tag remover
TAG_RE = re.compile(r'<[^>]+>')
fig_dir = tempfile.mkdtemp()


def _is_comparable_type(mime_type):
    """Return True if *mime_type* is a PDF or any ``image/*`` type."""
    return mime_type is not None and (mime_type == 'application/pdf'
                                      or mime_type.startswith('image/'))


def _scale_to_common_size(repo_image, source_image):
    """Return ``(repo_image, source_image)`` at identical pixel size.

    The larger image (by width) is resized to the size of the smaller one.
    Returns None if the aspect ratios differ by more than 0.01, i.e. the two
    images cannot show the same figure.
    """
    if repo_image.size == source_image.size:
        # No scaling necessary
        return repo_image, source_image
    repo_aspect = repo_image.size[0] / repo_image.size[1]
    source_aspect = source_image.size[0] / source_image.size[1]
    if abs(repo_aspect - source_aspect) > 0.01:
        return None
    if repo_image.size[0] < source_image.size[0]:  # scale to repo_image size
        return repo_image, source_image.resize(repo_image.size)
    # scale to source image size
    return repo_image.resize(source_image.size), source_image


def find_references_elife(blocks, manifest, path=None):
    """Record references from the article to files of the repository.

    Walks *blocks* (dicts with a ``'type'`` key) and appends reference
    dictionaries to ``manifest[filename]['references']`` in place:

    * ``'paragraph'``: every manifest file whose basename (if it contains a
      dot) occurs in the tag-stripped paragraph text gets a reference with
      ``origin``, ``context`` and ``section``.
    * ``'figure'``: every figure asset is downloaded to ``fig_dir`` (once) and
      compared by structural similarity with the PDF/image files of the
      repository; matches get a reference with ``origin``, ``context`` and
      ``label``.
    * ``'section'``: handled recursively, extending *path* by the title.

    Parameters
    ----------
    blocks : list of dict
        Article blocks to inspect.
    manifest : dict
        Maps repository file names to their information dictionaries.
    path : list of str, optional
        Titles of the enclosing sections (used for the ``section`` entry).

    Returns
    -------
    None
    """
    if path is None:
        path = []
    for block in blocks:
        if block['type'] == 'paragraph':
            # A stray `continue` used to sit here and made this whole branch
            # unreachable, so textual references were never recorded.
            text = TAG_RE.sub('', block['text'])
            for filename in manifest:
                basename = os.path.basename(filename)
                if '.' not in basename:  # skip too generic filenames like "description"
                    continue
                if basename in text:
                    manifest[filename]['references'].append({'origin': PAPER_URL,
                                                             'context': text,
                                                             'section': ' → '.join(path)})
        elif block['type'] == 'figure':
            for image_block in block['assets']:
                if 'image' not in image_block:
                    continue
                source = image_block['image']['source']
                source_file = os.path.join(fig_dir, source['filename'])
                if not os.path.exists(source_file):
                    # Download the source image
                    response = requests.get(source['uri'], allow_redirects=True)
                    if not response.ok:
                        # Do not cache an error page under the image's name
                        print('Download failed for', source['uri'])
                        continue
                    with open(source_file, 'wb') as f:
                        f.write(response.content)
                source_image = get_image(source_file)
                if source_image is None:
                    continue
                print(f'Comparing {source_file} to images in the repo')
                # Compare to all known images
                for repo_file, metadata in manifest.items():
                    # NOTE(review): this directory filter is specific to the
                    # repository analysed in this notebook.
                    if 'notebooks/figures' not in repo_file:
                        continue
                    if not _is_comparable_type(metadata['type']):
                        continue
                    repo_image = get_image(os.path.join(tmp_dir, repo, repo_file))
                    if repo_image is None:
                        continue
                    scaled = _scale_to_common_size(repo_image, source_image)
                    if scaled is None:
                        continue
                    scaled_repo_image, scaled_source_image = scaled

                    # Compare with structural similarity. Both images are
                    # brought to RGB first: ssim needs arrays of identical
                    # shape, which images of different modes (e.g. RGBA vs.
                    # RGB) do not give.
                    # NOTE(review): newer scikit-image releases replace
                    # `multichannel=True` by `channel_axis=-1` -- confirm
                    # against the installed version.
                    similarity = ssim(np.asarray(scaled_repo_image.convert('RGB')),
                                      np.asarray(scaled_source_image.convert('RGB')),
                                      multichannel=True)
                    if similarity > 0.9:
                        print(f"Found a match {repo_file} == {image_block['label']}!", repo_file)
                        metadata['references'].append({'origin': PAPER_URL,
                                                       'context': image_block['title'],
                                                       'label': image_block['label']})

        elif block['type'] == 'section':
            find_references_elife(block['content'], manifest, path=path + [block['title']])
        else:
            print('Unknown block type', block['type'])
            continue

In [8]:
# Walk the article body; matches are appended to the 'references' lists in manifest
find_references_elife(article, manifest)

Comparing /tmp/tmp8bidx57x/elife-67509-fig1-v3.jpg to images in the repo
Found a match notebooks/figures/Nc_figure.pdf == Figure 1! notebooks/figures/Nc_figure.pdf
PDF conversion failed for file /tmp/tmpu20l61i5/paradox_variation/notebooks/figures/Rplots.pdf
Comparing /tmp/tmp8bidx57x/elife-67509-fig1-figsupp1-v3.jpg to images in the repo
PDF conversion failed for file /tmp/tmpu20l61i5/paradox_variation/notebooks/figures/Rplots.pdf
Comparing /tmp/tmp8bidx57x/elife-67509-fig1-figsupp2-v3.jpg to images in the repo
PDF conversion failed for file /tmp/tmpu20l61i5/paradox_variation/notebooks/figures/Rplots.pdf
Comparing /tmp/tmp8bidx57x/elife-67509-fig1-figsupp3-v3.jpg to images in the repo
PDF conversion failed for file /tmp/tmpu20l61i5/paradox_variation/notebooks/figures/Rplots.pdf
Comparing /tmp/tmp8bidx57x/elife-67509-fig1-figsupp4-v3.jpg to images in the repo
PDF conversion failed for file /tmp/tmpu20l61i5/paradox_variation/notebooks/figures/Rplots.pdf
Comparing /tmp/tmp8bidx57x/elife-

## Extracting crosslinks in the repo

* README files might refer to specific filenames
* scripts might load some data, write some data, and import/reuse other scripts

In [9]:
# Collect the README files of the repository (matched on their lower-cased basename)
readmes = []
for fname in manifest:
    base = os.path.basename(fname).lower()
    if base in ['readme', 'readme.md', 'readme.txt']:
        readmes.append(fname)
readmes

['README.md', 'data/README.md', 'data/range_maps/README.md']

In [10]:
def _find_filename_in_tree(tree, filename):
    """Return the first element of *tree* that mentions *filename*.

    Elements are visited in ``tree.iter()`` order; an element matches when
    *filename* occurs in its ``text`` or in its ``tail``. Returns None if no
    element matches.
    """
    for node in tree.iter():
        for fragment in (node.text, node.tail):
            if fragment is not None and filename in fragment:
                return node
    return None

def _get_context_paragraph(tree):
    """Return the text of the closest enclosing ``<p>``/``<li>`` of *tree*.

    Climbs from *tree* via ``getparent()`` until an element with tag ``p`` or
    ``li`` is reached (which may be *tree* itself); if there is none, the text
    of the root element is returned instead.
    """
    node = tree
    while node.tag not in ('p', 'li'):
        parent = node.getparent()
        if parent is None:
            break
        node = parent
    return node.text_content()

def _get_headers(tree, headers=None):
    """Collect the texts of the headings that precede *tree* in the document.

    Starting at *tree*, the walk repeatedly moves to the previous sibling or,
    if there is none, to the parent, until the root is reached. The text of
    every ``h1``-``h4`` element met on the way is appended to *headers*
    (a new list if None), so the nearest heading comes first.

    Returns the *headers* list.
    """
    if headers is None:
        headers = []
    node = tree
    while node is not None:
        if node.tag in ('h1', 'h2', 'h3', 'h4'):  #FIXME...
            headers.append(node.text_content())
        previous = node.getprevious()
        # Prefer the preceding sibling; only climb once there is none left
        node = previous if previous is not None else node.getparent()
    return headers
        
    
def find_filename_in_html(text, filename):
    """Locate *filename* in the HTML document *text* and describe where it is.

    Parameters
    ----------
    text : str
        HTML source to search.
    filename : str
        File name to look for in the text of the HTML elements.

    Returns
    -------
    (paragraph, headers)
        ``paragraph`` is the text of the enclosing paragraph/list item and
        ``headers`` the preceding headings joined by `` → ``, outermost first.

    Raises
    ------
    ValueError
        If no single element's text contains *filename*.
    """
    # Parse the argument that was passed in (this used to read the global
    # `html_text`, silently ignoring the `text` parameter).
    parsed = lxml.html.fromstring(text)
    found = _find_filename_in_tree(parsed, filename)
    if found is None:
        # An explicit raise instead of `assert`, which is stripped under -O
        raise ValueError(f'{filename!r} not found in the text of any HTML element')
    # Get context paragraph and possible headers
    paragraph = _get_context_paragraph(found)
    headers = ' → '.join(reversed(_get_headers(found)))
    return paragraph, headers
    
    
# Record, for every file mentioned by its repository path in a README, a
# reference with the README as origin and (if available) the surrounding text.
for readme in readmes:
    # Context manager so the file handle is closed; undecodable bytes are
    # replaced rather than aborting the whole scan.
    with open(os.path.join(tmp_dir, repo, readme), encoding='utf-8', errors='replace') as f:
        readme_text = f.read()
    is_markdown = readme.endswith('.md')
    if is_markdown:
        # Render once per README instead of once per matching filename
        html_text = markdown.markdown(readme_text)
    for filename, metadata in manifest.items():
        if '.' not in filename or filename in readmes:
            continue
        if filename not in readme_text:
            continue
        if is_markdown:
            par, headers = find_filename_in_html(html_text, filename)
            metadata['references'].append({'origin': readme,
                                           'context': par,
                                           'section': headers})
            continue

        # Plain-text README: file names contain regex metacharacters (at
        # least "."), so they must be escaped before being put in a pattern.
        escaped = re.escape(filename)
        # First try a "filename: description" style entry ...
        description = re.search(rf"""["'`]?{escaped}["'`]??(:| is| contains)?(.*\n)""", readme_text)
        if description is None:
            # ... otherwise fall back to the whole line mentioning the file.
            # (The previous fallback pattern started with "?" and raised
            # re.error as soon as it was reached.)
            description = re.search(rf'^.*{escaped}.*$', readme_text, re.MULTILINE)
        reference = {'origin': readme}
        if description:
            reference['context'] = description.group(0)
        metadata['references'].append(reference)

In [11]:
# Quoted file path as accepted by all patterns below (exactly one capture group)
_R_PATH = r"""['"]([\.\/\-_0-9a-zA-Z]+)['"]"""
_R_INPUT_PATTERNS = (
    # readr (read_csv, read_tsv, ...) and base R (read.csv, read.table, ...)
    re.compile(r"read[_.](?:csv|tsv|table|delim)\(\s*" + _R_PATH + r"[^)]*\)"),
    re.compile(r"load\(\s*" + _R_PATH + r"\s*\)"),
)
_R_OUTPUT_PATTERNS = (
    # write_csv(x, "f") / write.csv(x, file = "f")
    re.compile(r"write[_.](?:csv|tsv|table|delim)\([^,]+,\s*(?:file\s*=\s*)?" + _R_PATH + r"[^)]*\)"),
    re.compile(r"pdf\(\s*" + _R_PATH + r"[^)]*\)"),
    re.compile(r"save\(.*,\s*file\s*=\s*" + _R_PATH + r"[^)]*\)"),
)
_R_DEPENDENCY_PATTERNS = (
    re.compile(r"source\(\s*" + _R_PATH + r"\s*\)"),
)


def input_output_r(content):
    """Extract the files an R script reads, writes and sources.

    *content* is the source code of the script as a string. Only calls with a
    literal, quoted file name are recognised.

    Returns a tuple ``(inputs, outputs, dependencies)`` of sets of file names
    exactly as written in the script:

    * inputs: ``read_csv``/``read_tsv``/``read_table``/``read_delim`` (also
      with ``.`` instead of ``_``) and ``load``
    * outputs: the corresponding ``write_*``/``write.*`` functions, ``pdf``
      and ``save(..., file=...)``
    * dependencies: ``source``
    """
    # FIXME: more robust parsing
    def _collect(patterns):
        found = set()
        for pattern in patterns:
            found.update(pattern.findall(content))
        return found

    return (_collect(_R_INPUT_PATTERNS),
            _collect(_R_OUTPUT_PATTERNS),
            _collect(_R_DEPENDENCY_PATTERNS))

In [12]:
def normalize_paths(filenames, reference_name):
    """Resolve *filenames* relative to the directory of *reference_name*.

    Each name is joined to the directory part of *reference_name* and then
    normalised with ``os.path.normpath``. Returns the results as a set.
    """
    base_dir = os.path.dirname(reference_name)
    resolved = set()
    for name in filenames:
        resolved.add(os.path.normpath(os.path.join(base_dir, name)))
    return resolved

In [13]:
# Extract inputs, outputs and sourced scripts for every R script in the manifest
for filename, metadata in manifest.items():
    # R scripts are commonly named *.R as well as *.r
    if not filename.lower().endswith('.r'):
        continue

    with open(os.path.join(tmp_dir, repo, filename), 'r', encoding='utf-8', errors='replace') as f:
        lines = f.readlines()
    # Drop lines that consist only of a comment
    cleaned_lines = [line for line in lines if not line.strip().startswith('#')]
    # readlines() keeps each line's newline, so join without a separator
    content = ''.join(cleaned_lines)
    inputs, outputs, dependencies = input_output_r(content)
    # Paths in a script are interpreted relative to the script's own directory
    metadata['inputs'] = normalize_paths(inputs, filename)
    metadata['outputs'] = normalize_paths(outputs, filename)
    metadata['dependencies'] = normalize_paths(dependencies, filename)

In [14]:
import pprint
# Dump the complete manifest for inspection
pprint.pprint(manifest)

{'R/node_height.r': {'dependencies': set(),
                     'inputs': set(),
                     'outputs': set(),
                     'references': [],
                     'type': None},
 'R/range_funcs.r': {'dependencies': set(),
                     'inputs': set(),
                     'outputs': set(),
                     'references': [],
                     'type': None},
 'R/utilities.r': {'dependencies': set(),
                   'inputs': set(),
                   'outputs': set(),
                   'references': [],
                   'type': None},
 'README.md': {'references': [], 'type': 'text/markdown'},
 'data/Dataset_S1.txt': {'references': [], 'type': 'text/plain'},
 'data/Makefile': {'references': [], 'type': None},
 'data/RAW_dataset_Jan_06_2012.xlsx': {'references': [{'context': 'RAW_dataset_Jan_06_2012.xls: '
                                                                  'this was '
                                                                  'do

                                                    'type': 'application/pdf'},
 'data/total_range_plots/Necora_puber.pdf': {'references': [],
                                             'type': 'application/pdf'},
 'data/total_range_plots/Nelumbo_lutea.pdf': {'references': [],
                                              'type': 'application/pdf'},
 'data/total_range_plots/Nelumbo_nucifera.pdf': {'references': [],
                                                 'type': 'application/pdf'},
 'data/total_range_plots/Neurospora_crassa.pdf': {'references': [],
                                                  'type': 'application/pdf'},
 'data/total_range_plots/Nicotiana_langsdorffii.pdf': {'references': [],
                                                       'type': 'application/pdf'},
 'data/total_range_plots/Nicotiana_tabacum.pdf': {'references': [],
                                                  'type': 'application/pdf'},
 'data/total_range_plots/Nilaparvata_lugens.pdf': {'re

                                                   'references': [],
                                                   'type': None},
 'notebooks/figures/diversity_range_bodymass.pdf': {'references': [],
                                                    'type': 'application/pdf'},
 'notebooks/figures/diversity_range_bodymass.r': {'dependencies': {'R/utilities.r',
                                                                   'notebooks/figures/color_scheme.r'},
                                                  'inputs': {'data/diversity_range_bodymass_chains.Rdata',
                                                             'data/main_datasets.Rdata'},
                                                  'outputs': {'notebooks/figures/diversity_range_bodymass.pdf'},
                                                  'references': [],
                                                  'type': None},
 'notebooks/figures/fecundity_body_size.pdf': {'references': [],
                   