In [30]:
# Download spaces on the Janelia wiki and save to files on disk
# Based on documentation at https://atlassian-python-api.readthedocs.io/confluence.html#get-page-info

import os
from atlassian import Confluence

confluence_url = "https://wikis.janelia.org"
confluence_pat = os.environ.get('CONFLUENCE_TOKEN')
confluence = Confluence(url=confluence_url, token=confluence_pat)

space = 'SCSW'
title = 'Using Nextflow'
page_id = confluence.get_page_id(space, title)

print(page_id)

64061586


In [81]:
def get_page(page_id):
    page = confluence.get_page_by_id(page_id, expand="body.view,metadata.labels,ancestors", status=None, version=None)
    return parse_page(page)

def parse_page(page):
    page_id = int(page['id'])
    path = page['_links']['webui']
    title = page['title']
    body = page['body']['view']['value']
    labels = [l['name'] for l in page['metadata']['labels']['results']]
    ancestors = [p['title'] for p in page['ancestors']]
    return page_id,path,title,body,labels,ancestors

def get_link(path):
    return "%s%s" % (confluence_url, path)
    
page_id,path,title,body,labels,ancestors = get_page(page_id)
print("ID:", page_id)
print("Path:", path)
print("Link:", get_link(path))
print("Title:",title)
print("Labels:",labels)
print("Ancestors:",ancestors)
print(body)

ID: 30746766
Path: /display/SCSW/SCA+Java+and+Object-Oriented+Programming
Link: https://wikis.janelia.org/display/SCSW/SCA+Java+and+Object-Oriented+Programming
Title: SCA Java and Object-Oriented Programming
Labels: []
Ancestors: ['Scientific Computing Software', 'Education', 'SCA Curriculum']
<h2 id="SCAJavaandObjectOrientedProgramming-Introduction">Introduction</h2><p>This unit covers Java and Object-Oriented programming. Its &quot;un&quot;-structure has been shamelessly copied from the <a href="/display/SCSW/SCA+Python+curriculum">SCA Python curriculum</a>.</p><p>The idea is to mix self-guided education (online tutorials &amp; courses, books) with some practical projects, all with the assistance of an experienced developer from SciComp to act as guide/resource/tutor.</p><p>Suggested plan:</p><ul><li>initial orientation with guide; talk about what to tackle, what projects to try, how to get set up, etc.</li><li>if you have zero Java experience:<ul><li>install Java Development Kit and

In [69]:
page = confluence.get_page_by_title("ScientificComputing", "Scientific Computing Server - e06u05", start=None, limit=None, expand="metadata.labels,ancestors")
#page

In [87]:
#confluence.get_all_pages_from_space(space, start=0, limit=100, status=None, expand="body.view,metadata.labels", content_type='page')

spaces = ['SCSW','SCS','ScientificComputing']
limit = 50
num_pages = 0

for space in spaces:

    start = 0
    pages = []
    while True:
        pages_iter = confluence.get_all_pages_from_space(space, start=start, limit=limit, expand="body.view,metadata.labels,ancestors")

        if len(pages_iter) == 0:
            break

        start += len(pages_iter)

        for page in pages_iter:
            page_id,path,title,body,labels,ancestors = parse_page(page)
            
            # Skip archived pages
            if "ARCHIVE_SCSW" in ancestors: continue
            
            filepath = "./data/wiki/%s/%d" % (space,page_id)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
        
            with open(filepath, mode="wt") as f:
                f.write(get_link(path)+"\n")
                f.write(" / ".join(ancestors)+"\n")
                f.write(title+"\n")
                f.write(", ".join(labels)+"\n")
                f.write(body)
                num_pages += 1
        
        # no more to fetch
        if len(pages_iter) < limit:
            break

    print('Found %d pages in %s' % (num_pages,space))


Found 183 pages in SCSW
Found 238 pages in SCS
Found 520 pages in ScientificComputing
