In [52]:
import json
from pathlib import Path
import os

import pandas as pd
import s3fs


def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    return pd.read_csv(s3.open(file_path, mode='rb'))

current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')
kv_data_dir = results_dir.joinpath('kvdb')
kv_data_dir.mkdir(parents=True, exist_ok=True)

people_json = kv_data_dir.joinpath('people.json')
visited_json = kv_data_dir.joinpath('visited.json')
sites_json = kv_data_dir.joinpath('sites.json')
measurements_json = kv_data_dir.joinpath('measurements.json')

In [53]:
class KVDB(object):
    def __init__(self, db_path):
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    def _load_db(self):
        if self._db_path.exists():
            with open(self._db_path) as f:
                self._db = json.load(f)

    def get_value(self, key):
        return self._db.get(key)

    def set_value(self, key, value):
        self._db[key] = value

    def save(self):
        with open(self._db_path, 'w') as f:
            json.dump(self._db, f, indent=2)

In [54]:
def create_sites_kvdb():
    db = KVDB(sites_json)
    df = pd.read_csv('data/external/tidynomicon/site.csv')
    for site_id, group_df in df.groupby('site_id'):
        db.set_value(site_id, group_df.to_dict(orient='records')[0])
    db.save()


def create_people_kvdb():
    db = KVDB(people_json)
    df = pd.read_csv('data/external/tidynomicon/person.csv')
    for person_id, group_df in df.groupby('person_id'):
        db.set_value(person_id, group_df.to_dict(orient='records')[0])
    db.save()


def create_visits_kvdb():
    db = KVDB(visited_json)
    df = pd.read_csv('data/external/tidynomicon/visited.csv')
    for composite_id,group_df in df.groupby(['visit_id','site_id']):
        db.set_value(f"{composite_id[0]}:{composite_id[1]}:",group_df.to_dict(orient='records')[0])   
    db.save()


def create_measurements_kvdb():
    db = KVDB(measurements_json)
    df = pd.read_csv('data/external/tidynomicon/measurements.csv')
    for composite_id,group_df in df.groupby(['visit_id','person_id','quantity']):
        db.set_value(f"{composite_id[0]}:{composite_id[1]}:{composite_id[2]}",group_df.to_dict(orient='records')[0]) 
    db.save()

In [55]:
create_sites_kvdb()
create_people_kvdb()
create_visits_kvdb()
create_measurements_kvdb()

In [56]:
with open("results/kvdb/measurements.json",'r') as j:
    j = json.load(j)
    print(json.dumps(j, indent=4))

{
    "619:dyer:rad": {
        "visit_id": 619,
        "person_id": "dyer",
        "quantity": "rad",
        "reading": 9.82
    },
    "619:dyer:sal": {
        "visit_id": 619,
        "person_id": "dyer",
        "quantity": "sal",
        "reading": 0.13
    },
    "622:dyer:rad": {
        "visit_id": 622,
        "person_id": "dyer",
        "quantity": "rad",
        "reading": 7.8
    },
    "622:dyer:sal": {
        "visit_id": 622,
        "person_id": "dyer",
        "quantity": "sal",
        "reading": 0.09
    },
    "734:lake:sal": {
        "visit_id": 734,
        "person_id": "lake",
        "quantity": "sal",
        "reading": 0.05
    },
    "734:pb:rad": {
        "visit_id": 734,
        "person_id": "pb",
        "quantity": "rad",
        "reading": 8.41
    },
    "734:pb:temp": {
        "visit_id": 734,
        "person_id": "pb",
        "quantity": "temp",
        "reading": -21.5
    },
    "735:pb:rad": {
        "visit_id": 735,
        "person_id": 

In [57]:
with open("results/kvdb/visited.json",'r') as j:
    j = json.load(j)
    print(json.dumps(j, indent=4))

{
    "619:DR-1:": {
        "visit_id": 619,
        "site_id": "DR-1",
        "visit_date": "1927-02-08"
    },
    "622:DR-1:": {
        "visit_id": 622,
        "site_id": "DR-1",
        "visit_date": "1927-02-10"
    },
    "734:DR-3:": {
        "visit_id": 734,
        "site_id": "DR-3",
        "visit_date": "1930-01-07"
    },
    "735:DR-3:": {
        "visit_id": 735,
        "site_id": "DR-3",
        "visit_date": "1930-01-12"
    },
    "751:DR-3:": {
        "visit_id": 751,
        "site_id": "DR-3",
        "visit_date": "1930-02-26"
    },
    "752:DR-3:": {
        "visit_id": 752,
        "site_id": "DR-3",
        "visit_date": NaN
    },
    "837:MSK-4:": {
        "visit_id": 837,
        "site_id": "MSK-4",
        "visit_date": "1932-01-14"
    },
    "844:DR-1:": {
        "visit_id": 844,
        "site_id": "DR-1",
        "visit_date": "1932-03-22"
    }
}


In [58]:
with open("results/kvdb/people.json",'r') as j:
    j = json.load(j)
    print(json.dumps(j, indent=4))

{
    "danforth": {
        "person_id": "danforth",
        "personal_name": "Frank",
        "family_name": "Danforth"
    },
    "dyer": {
        "person_id": "dyer",
        "personal_name": "William",
        "family_name": "Dyer"
    },
    "lake": {
        "person_id": "lake",
        "personal_name": "Anderson",
        "family_name": "Lake"
    },
    "pb": {
        "person_id": "pb",
        "personal_name": "Frank",
        "family_name": "Pabodie"
    },
    "roe": {
        "person_id": "roe",
        "personal_name": "Valentina",
        "family_name": "Roerich"
    }
}


In [59]:
with open("results/kvdb/sites.json",'r') as j:
    j = json.load(j)
    print(json.dumps(j, indent=4))

{
    "DR-1": {
        "site_id": "DR-1",
        "latitude": -49.85,
        "longitude": -128.57
    },
    "DR-3": {
        "site_id": "DR-3",
        "latitude": -47.15,
        "longitude": -126.72
    },
    "MSK-4": {
        "site_id": "MSK-4",
        "latitude": -48.87,
        "longitude": -123.4
    }
}
