In [3]:
!pip install s3fs

Collecting s3fs
  Downloading s3fs-2022.1.0-py3-none-any.whl (25 kB)
Collecting aiohttp<=4
  Downloading aiohttp-3.8.1-cp36-cp36m-win_amd64.whl (551 kB)
Collecting aiobotocore~=2.1.0
  Downloading aiobotocore-2.1.2.tar.gz (58 kB)
Collecting fsspec==2022.01.0
  Downloading fsspec-2022.1.0-py3-none-any.whl (133 kB)
Collecting idna-ssl>=1.0; python_version < "3.7"
  Downloading idna-ssl-1.1.0.tar.gz (3.4 kB)
Collecting charset-normalizer<3.0,>=2.0
  Downloading charset_normalizer-2.1.0-py3-none-any.whl (39 kB)
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting asynctest==0.13.0; python_version < "3.8"
  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp36-cp36m-win_amd64.whl (83 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-5.2.0-cp36-cp36m-win_amd64.whl (45 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.7.2-cp36-cp36m-win_amd64.whl (121 kB)
Collect

In [4]:
import json
from pathlib import Path
import os

import pandas as pd
import s3fs


def read_cluster_csv(file_path, endpoint_url='https://storage.budsc.midwest-datascience.com'):
    s3 = s3fs.S3FileSystem(
        anon=True,
        client_kwargs={
            'endpoint_url': endpoint_url
        }
    )
    return pd.read_csv(s3.open(file_path, mode='rb'))

current_dir = Path(os.getcwd()).absolute()
results_dir = current_dir.joinpath('results')
kv_data_dir = results_dir.joinpath('kvdb')
kv_data_dir.mkdir(parents=True, exist_ok=True)

people_json = kv_data_dir.joinpath('people.json')
visited_json = kv_data_dir.joinpath('visited.json')
sites_json = kv_data_dir.joinpath('sites.json')
measurements_json = kv_data_dir.joinpath('measurements.json')

In [5]:
class KVDB(object):
    def __init__(self, db_path):
        self._db_path = Path(db_path)
        self._db = {}
        self._load_db()

    def _load_db(self):
        if self._db_path.exists():
            with open(self._db_path) as f:
                self._db = json.load(f)

    def get_value(self, key):
        return self._db.get(key)

    def set_value(self, key, value):
        self._db[key] = value

    def save(self):
        with open(self._db_path, 'w') as f:
            json.dump(self._db, f, indent=2)

In [12]:
def create_sites_kvdb():
    db = KVDB(sites_json)
    df = pd.read_csv('C:/Users/taylo/OneDrive/Documents/dsc650/data/external/tidynomicon/site.csv')
    for site_id, group_df in df.groupby('site_id'):
        db.set_value(site_id, group_df.to_dict(orient='records')[0])
    db.save()


def create_people_kvdb():
    db = KVDB(people_json)
    ## TODO: Implement code
    df = pd.read_csv('C:/Users/taylo/OneDrive/Documents/dsc650/data/external/tidynomicon/person.csv')
    for person_id, group_df in df.groupby('person_id'):
        db.set_value(person_id, group_df.to_dict(orient='records')[0])
    db.save()


def create_visits_kvdb():
    db = KVDB(visited_json)
    ## TODO: Implement code
    df = pd.read_csv('C:/Users/taylo/OneDrive/Documents/dsc650/data/external/tidynomicon/visited.csv')
    key = str(str(df['visit_id']) + str(df['site_id']))
    value = df['visit_date']
    for key, value in df.iterrows():
        db.set_value(key, value)
    db.save()


def create_measurements_kvdb():
    db = KVDB(measurements_json)
    ## TODO: Implement code
    df = pd.read_csv('C:/Users/taylo/OneDrive/Documents/dsc650/data/external/tidynomicon/measurements.csv')
    key = str(str(df['visit_id']) + str(df['person_id']) + str(df['quantity']))
    value = df['reading']
    for key, value in df.iterrows():
        db.set_value(key, value)
    db.save()

In [10]:
visits = pd.read_csv('C:/Users/taylo/OneDrive/Documents/dsc650/data/external/tidynomicon/visited.csv')
visits.head()

Unnamed: 0,visit_id,site_id,visit_date
0,619,DR-1,1927-02-08
1,622,DR-1,1927-02-10
2,734,DR-3,1930-01-07
3,735,DR-3,1930-01-12
4,751,DR-3,1930-02-26


In [11]:
m = pd.read_csv('C:/Users/taylo/OneDrive/Documents/dsc650/data/external/tidynomicon/measurements.csv')
m.head()

Unnamed: 0,visit_id,person_id,quantity,reading
0,619,dyer,rad,9.82
1,619,dyer,sal,0.13
2,622,dyer,rad,7.8
3,622,dyer,sal,0.09
4,734,pb,rad,8.41


In [13]:
create_sites_kvdb()
create_people_kvdb()
create_visits_kvdb()
create_measurements_kvdb()

TypeError: Object of type 'Series' is not JSON serializable