
# WDA -> Floop Script


## The three files in play here


WDAfloopscrape.json - the main json data scrape.

labels.json - the labels for the scrape. Some classifications carry duplicate labels; these are resolved below.

data.json - the data behind the floop graphic. We're trying to amend this JSON with info taken from WDA.



In [44]:
import json

# Where a classification has several candidate labels we settle on the best
# single one for now. The combinations listed here are the only duplicated
# labels that occur in the scrape.
with open("labels.json", "r") as labels_file:
    labels = json.load(labels_file)

# (duplicated label list as it appears in labels.json, label we keep)
_label_choices = [
    (['Type', 'Geography', 'Area'], 'Area'),
    (['Year', 'Time', 'Month'], 'Time'),
    (['Household characteristics', 'Household Characteristics'], 'Household Characteristics'),
    (['Flow of Travel', 'Type'], 'Flow of Travel'),
    (['Seasonal Adjustment', 'SA / NSA'], 'Seasonal Adjustment'),
    (['Component', 'Standard Industrial Trade Classification (SITC)'],
     'Standard Industrial Trade Classification (SITC)'),
]

for code, candidates in labels.items():
    for duplicates, chosen in _label_choices:
        if candidates == duplicates:
            # Only existing keys are reassigned, so the dict size is unchanged
            # and iterating while assigning is safe.
            labels[code] = chosen
            break


## What the scrape looks like, just one sample (the ASHE07E dataset)

In [45]:
# import json
from pprint import pprint

# Load the whole scrape, then show a single dataset entry as a sample.
with open("WDAfloopscrape.json", "r") as main_file:
    scrape = json.load(main_file)

sample_id = 'ASHE07E'
pprint({sample_id: scrape[sample_id]})


{'ASHE07E': {'context': 'Economic',
             'name': 'Earnings by place of work',
             'topics': ['CL_0000695',
                        'CL_0000700',
                        'CL_0000699',
                        'CL_0000693',
                        'CL_0000712',
                        'CL_0000696',
                        'CL_0000635',
                        '2013WARDH'],
             'url': '<p>Not implemented yet</p>'}}


## Make a Dict of resources already defined

In [46]:

# Read the current floop data and index the topics it already defines.
with open("data.json", "r") as data_file:
    floopdata = json.load(data_file)

floop_entries = floopdata['data']

# Keys of the resource map that look like topic URIs.
resources = [uri for uri in floop_entries if '/topic/' in uri]

# name -> id for every one of those entries that is typed as a topic.
existingResources = {
    floop_entries[uri]['name']: floop_entries[uri]['id']
    for uri in resources
    if floop_entries[uri]['type'] == 'topic'
}

# So this is what's already defined, as a dict.
pprint(existingResources)
        

{'acorn': 'http://ons.floop.org.uk/resource/topic/51',
 'age': 'http://ons.floop.org.uk/resource/topic/32',
 'age band': 'http://ons.floop.org.uk/resource/topic/3',
 'age group': 'http://ons.floop.org.uk/resource/topic/28',
 'broad industry sector': 'http://ons.floop.org.uk/resource/topic/26',
 'continent': 'http://ons.floop.org.uk/resource/topic/58',
 'country': 'http://ons.floop.org.uk/resource/topic/59',
 'country of birth': 'http://ons.floop.org.uk/resource/topic/11',
 'department': 'http://ons.floop.org.uk/resource/topic/57',
 'dependent children': 'http://ons.floop.org.uk/resource/topic/14',
 'disability': 'http://ons.floop.org.uk/resource/topic/12',
 'do conditions cause difficulty': 'http://ons.floop.org.uk/resource/topic/30',
 'employed/self-employed': 'http://ons.floop.org.uk/resource/topic/48',
 'employment size band': 'http://ons.floop.org.uk/resource/topic/46',
 'ethnicity': 'http://ons.floop.org.uk/resource/topic/9',
 'eu/non-eu': 'http://ons.floop.org.uk/resource/topic/4

## A dataset as represented in the data.json file on floop

## Modding the data.json from floop

assumptions:
we'll start the new datasets at 155, that seems to be the next unused number

we'll start topics at 61 for the same reason

we'll use 24 to represent ONS for the same reason.

TODO - we still have to define organisation 24 as ONS somehow. We also need to calculate these starting numbers rather than hard-code them, so that the whole thing becomes repeatable.


We need to define the datasets one at a time, only adding a new resource if we can't match the ONS topic to a pre-existing resource.

In [47]:

# TODO - should be calculated not stated
datasetCount = 155
topicsCount = 61
orgCount = 24

_TOPIC_BASE = 'http://ons.floop.org.uk/resource/topic/'
_DATASET_BASE = 'http://ons.floop.org.uk/resource/dataset/'
_ORG_BASE = 'http://ons.floop.org.uk/resource/organisation/'

# We build the amendments as a whole separate dict, then append to data.json later.
with open("WDAfloopscrape.json", "r") as scrape_file:
    scrape = json.load(scrape_file)

# Clean the "(Txxxx)" suffix out of the label of every topic the scrape uses.
# This mutates the shared `labels` dict in place.
for dataset_id in scrape:
    for t in scrape[dataset_id]['topics']:
        if '(T' in labels[t]:
            labels[t] = labels[t].split('(T')[0].strip()

# Build a bigger data.json: one entry per scraped dataset, plus one entry for
# every topic that is not already a floop resource.
ammendments = {}
for dataset_id in scrape:

    # URIs of already-defined topics this dataset uses, and the topics we have
    # to mint for it (label -> new URI).
    defined = []
    definitionsToDo = {}

    for topic in scrape[dataset_id]['topics']:
        label = labels[topic].lower()

        if label in existingResources:
            uri = existingResources[label]
            # Two topic codes can share one label; list the resource once.
            if uri not in defined:
                defined.append(uri)
        elif label not in definitionsToDo:
            # Mint a URI only the first time a new label is seen, so a
            # repeated label does not burn a topic number that is never defined.
            definitionsToDo[label] = _TOPIC_BASE + str(topicsCount)
            topicsCount += 1

    # The dataset depends on the pre-existing topics first, then the new ones.
    depends = defined + list(definitionsToDo.values())

    # Build the dataset dict.
    # NOTE(review): 'type' carries the organisation URI here, whereas topic
    # entries use the literal "topic" - confirm this is what floop expects.
    dset = _DATASET_BASE + str(datasetCount)
    ammendments[dset] = {
        'type': _ORG_BASE + str(orgCount),
        'name': scrape[dataset_id]['name'],
        'id': dataset_id,
        'docslink': "https://www.ons.gov.uk/",
        'depends': depends,
        'dependedOnBy': [],
    }
    datasetCount += 1

    # Build the dicts for our newly minted resources, shaped like the existing
    # topic entries in data.json, e.g.
    #   http://ons.floop.org.uk/resource/topic/2: {
    #       type: "topic",
    #       name: "gender",
    #       id: "http://ons.floop.org.uk/resource/topic/2",
    #       depends: [ ],
    #       dependedOnBy: [ ]
    #   }
    for reslabel, uri in definitionsToDo.items():
        ammendments[uri] = {
            'type': 'topic',
            'name': reslabel,
            'id': uri,
            'depends': [],
            'dependedOnBy': [],
        }

    # Register the new topics so later datasets reuse them instead of minting
    # duplicates.
    existingResources.update(definitionsToDo)

# Export to json file
with open("justTheAmmendments.json", "w") as outfile:
    json.dump(ammendments, outfile)

        

## Preview The Json we're going to append onto data.json

In [48]:
# Preview the amendments dict (new dataset and topic entries keyed by resource URI)
pprint(ammendments)

{'http://ons.floop.org.uk/resource/dataset/1000': {'dependedOnBy': [],
                                                   'depends': ['http://ons.floop.org.uk/resource/topic/70',
                                                               'http://ons.floop.org.uk/resource/topic/66',
                                                               'http://ons.floop.org.uk/resource/topic/67'],
                                                   'docslink': 'https://www.ons.gov.uk/',
                                                   'id': 'LC6118EW 2',
                                                   'name': 'Industry by sex',
                                                   'type': 'http://ons.floop.org.uk/resource/organisation/24'},
 'http://ons.floop.org.uk/resource/dataset/1001': {'dependedOnBy': [],
                                                   'depends': ['http://ons.floop.org.uk/resource/topic/98',
                                                               'http://ons

## Make the combined file

In [49]:

# Build the combined JSON: the original data.json plus our amendments.
with open("data.json", "r") as main_file:
    final = json.load(main_file)

# data.json keeps its resources in the map under the top-level 'data' key
# (that is where the existing topics are read from), so the new entries must be
# merged into that map. Updating `final` itself would leave them as stray
# top-level keys next to 'data'.
final['data'].update(ammendments)

# Export to our final hacked json file
with open("FINAL_WDAhackedintofloopdata.json", "w") as outfile:
    json.dump(final, outfile)
    