# Simple Linked Example

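Generates two linked random datasets, `users` and `assessments`, with `randomdataset`. Shared state is used to recall the unique user IDs between datasets, so that every assessment's `user_id` refers to an `id` in `users`. The resulting CSV files are then imported into an ExeTera HDF5 dataset.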

In [8]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel, not whichever pip is first on the shell PATH.
%pip install randomdataset exetera

Collecting randomdataset
  Using cached RandomDataset-0.1.4-py3-none-any.whl (14 kB)
Installing collected packages: randomdataset
Successfully installed randomdataset-0.1.4


In [1]:
%%writefile user_assessments.yaml

# Schema passed to randomdataset's generate_dataset: a list of two CSV
# generators, one per table. The two tables are linked through the shared
# state named "user_ids".

# Table 1: users (10 rows).
- typename: randomdataset.generators.CSVGenerator
  num_lines: 10
  dataset:
    name: users
    typename: randomdataset.Dataset
    fields:
    - name: id
      typename: randomdataset.UIDFieldGen
      # Presumably stores the generated ids under this name so another
      # dataset can reuse them (see source_state_name in the assessments
      # table) -- confirm against the randomdataset documentation.
      shared_state_name: user_ids
    - name: FirstName
      typename: randomdataset.AlphaNameGen
    - name: LastName
      typename: randomdataset.AlphaNameGen
      # Presumably switches the generator from first names to surnames.
      is_first_name: False
    - name: bmi
      typename: randomdataset.IntFieldGen
      vmin: 20
      vmax: 40
    - name: has_diabetes
      typename: randomdataset.IntFieldGen
      # NOTE(review): the generated users.csv only contains 0 and 1 for this
      # column, so vmax looks exclusive -- confirm.
      vmin: 0
      vmax: 2
    - name: height_cm
      typename: randomdataset.IntFieldGen
      vmin: 100
      vmax: 200
    - name: year_of_birth
      typename: randomdataset.IntFieldGen
      vmin: 1920
      vmax: 2010
    
# Table 2: assessments (30 rows), each row pointing at a user via user_id.
- typename: randomdataset.generators.CSVGenerator
  num_lines: 30
  dataset:
    name: assessments
    typename: randomdataset.Dataset
    fields:
    # Assessment's own id; it declares no shared_state_name.
    - name: id
      typename: randomdataset.UIDFieldGen
    - name: date
      typename: randomdataset.DateTimeFieldGen
      # The generated CSV shows timestamps such as
      # "2022-01-31 13:10:19.470142+00:00".
      as_string: True
    # Link column: values are taken from the "user_ids" shared state declared
    # on users.id, so every user_id should match an existing users.id.
    - name: user_id
      typename: randomdataset.SharedDataGen
      source_state_name: user_ids
      field_type: int
    - name: abdominal_pain
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: brain_fog
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: loss_of_smell
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: tested_covid_positive
      typename: randomdataset.IntFieldGen
      # NOTE(review): the generated rows shown in this notebook contain 0, 1
      # and 2 for this column -- vmax looks exclusive, confirm.
      vmin: 0
      vmax: 3
    - name: temperature_f
      typename: randomdataset.FloatFieldGen
      vmin: 95
      vmax: 110
    

Overwriting user_assessments.yaml


In [3]:
import os
import sys
import randomdataset

In [4]:
# Generate every dataset described by the YAML schema into the current directory.
# `.callback` is invoked directly with the schema path and the output directory
# (presumably the plain function behind a CLI command object -- confirm against
# the randomdataset package).
schema_path = "user_assessments.yaml"
output_dir = "."
randomdataset.application.generate_dataset.callback(schema_path, output_dir)

Schema: 'user_assessments.yaml'
Output: '.'
Generating dataset 'users'
Generating dataset 'assessments'


In [5]:
# Show the generated users table. Reading the file in Python instead of
# shelling out to `cat` keeps the cell usable where no POSIX shell is available.
with open("users.csv") as users_csv:
    # end="" prints the file content as-is, without an extra trailing newline.
    print(users_csv.read(), end="")

id,FirstName,LastName,bmi,has_diabetes,height_cm,year_of_birth
0,"Xavier","Unknown",26,0,193,1934
1,"Peggy","Nemo",39,0,164,1982
2,"Kylie","Bar",39,0,111,1941
3,"Mallory","Anon",34,0,171,2009
4,"Kylie","Anon",38,0,167,1949
5,"Peggy","Thunk",23,1,197,1926
6,"Uriel","Blargs",37,0,175,1961
7,"Laura","Bar",38,0,174,2005
8,"Alice","Unknown",25,0,128,1990
9,"Grace","Anon",20,1,156,1940


In [6]:
# Show the generated assessments table. Reading the file in Python instead of
# shelling out to `cat` keeps the cell usable where no POSIX shell is available.
with open("assessments.csv") as assessments_csv:
    # end="" prints the file content as-is, without an extra trailing newline.
    print(assessments_csv.read(), end="")

id,date,user_id,abdominal_pain,brain_fog,loss_of_smell,tested_covid_positive,temperature_f
0,2022-01-31 13:10:19.470142+00:00,0,0,1,0,2,107.49023514725474
1,2021-11-12 04:17:56.782553+00:00,1,0,1,1,0,105.6942570124724
2,2022-01-01 14:31:02.775222+00:00,2,1,0,0,0,96.54598657012015
3,2021-06-11 19:21:16.500351+00:00,3,0,1,1,0,105.10060277211116
4,2022-01-24 17:25:45.546875+00:00,4,1,0,1,2,104.78580411362383
5,2021-05-19 19:28:55.347014+00:00,5,1,1,0,0,99.1889494912643
6,2021-05-11 00:53:23.349521+00:00,6,0,1,1,2,109.46725811260491
7,2021-07-10 11:37:36.951190+00:00,7,0,1,0,0,95.14845661460399
8,2021-11-12 16:39:51.889504+00:00,8,1,0,0,0,97.73890279974026
9,2022-02-11 04:39:26.788180+00:00,9,1,1,1,0,98.02984864593004
10,2021-06-06 21:30:19.920080+00:00,0,1,1,0,1,104.13524032621973
11,2022-02-08 03:18:02.527770+00:00,1,1,1,1,1,105.399991356878
12,2021-02-20 14:45:30.235950+00:00,2,1,1,1,2,105.81507610779525
13,2021-03-25 15:01:35.580187+00:00,3,0,0,0,2,103.39740560144419
14,2021-06-25 12:4

In [9]:
%%writefile user_assessments.json

{
  "exetera": {"version": "1.0.0"},
  "schema": {
    "users": {
      "primary_keys": ["id"],
      "fields": {
        "id": {"field_type": "fixed_string", "length": 32},
        "FirstName": {"field_type": "string"},
        "LastName": {"field_type": "string"},
        "bmi": {"field_type": "numeric", "value_type": "int32"},
        "has_diabetes": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {"": 0, "False": 1, "True": 2}
          }
        },
        "height_cm": {"field_type": "numeric", "value_type": "int32"},
        "year_of_birth": {"field_type": "numeric", "value_type": "int32"}
      }
    },
    "assessments": {
      "primary_keys": ["id"],
      "foreign_keys": {
        "user_id_key": {"space": "users", "key": "id"}
      },
      "fields": {
        "id": {"field_type": "fixed_string", "length": 32},
        "date": {"field_type": "datetime"},
        "user_id": {"field_type": "fixed_string", "length": 32},
        "abdominal_pain": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {"": 0, "False": 1, "True": 2}
          }
        },
        "brain_fog": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {"": 0, "False": 1, "True": 2}
          }
        },
        "loss_of_smell": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {"": 0, "False": 1, "True": 2}
          }
        },
        "tested_covid_positive": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {"": 0, "waiting": 1, "no": 2, "yes": 3}
          }
        },
        "temperature_f": {"field_type": "numeric", "value_type": "float32"}
      }
    }
  }
}

Overwriting user_assessments.json


In [5]:
# Import the generated CSV files into an HDF5 dataset with
# importer.import_with_schema, driven by the JSON schema file.

from datetime import datetime, timezone

import exetera

from exetera.io import importer
from exetera.core import session

# Schema table name -> CSV file holding that table's rows.
csv_sources = {"users": "users.csv", "assessments": "assessments.csv"}
# Timestamp handed to the importer: the current UTC time rendered as a string.
import_timestamp = str(datetime.now(timezone.utc))

with session.Session() as exetera_session:
    importer.import_with_schema(
        session=exetera_session,
        timestamp=import_timestamp,
        dataset_alias="UserAssessments",
        dataset_filename="user_assessments.hdf5",
        schema_file="user_assessments.json",
        files=csv_sources,
        overwrite=True,  # replace the output file if it already exists -- presumably; confirm in ExeTera docs
    )

read_file_using_fast_csv_reader: 1 chunks, 10 accumulated_written_rows parsed in 0.0049591064453125s
completed in 0.00712895393371582 seconds
Total time 0.007568359375s
read_file_using_fast_csv_reader: 1 chunks, 30 accumulated_written_rows parsed in 0.0041615962982177734s
completed in 0.006209611892700195 seconds
Total time 0.006479978561401367s


In [4]:
%%bash
# Import csv to hdf5 through command line.
# The %%bash cell magic has to be the first line of the cell; IPython does not
# recognise a cell magic that follows another line, even a comment.

exetera import -w -s user_assessments.json -i "users:users.csv, assessments:assessments.csv" -o user_assessments.hdf5
ls -lh

total 259M
-rw-rw-r-- 1 jd21 jd21  23K Feb 18 15:46 advanced_operations.ipynb
-rw-rw-r-- 1 jd21 jd21 2.0K Feb 18 15:44 assessments.csv
-rw-rw-r-- 1 jd21 jd21  24K Feb 18 15:46 basic_concept.ipynb
-rw-rw-r-- 1 jd21 jd21 253M Feb 18 15:24 dataset.hdf5
drwxrwxr-x 3 jd21 jd21 4.0K Feb 18 14:39 example-css
-rw-rw-r-- 1 jd21 jd21 1.7K Feb 18 15:24 exeteraschema.json
-rw-rw-r-- 1 jd21 jd21 6.0M Jan 25 16:17 name_gender_dataset.hdf5
-rw-rw-r-- 1 jd21 jd21  676 Jan 25 16:08 name_gender_dataset_schema.json
-rw-rw-r-- 1 jd21 jd21 6.5K Feb 18 15:23 names_dataset.ipynb
-rw-rw-r-- 1 jd21 jd21  137 Feb 18 15:20 README.md
-rw-rw-r-- 1 jd21 jd21  18K Feb 18 15:47 simple_linked_dataset.ipynb
-rw-rw-r-- 1 jd21 jd21 1.8K Feb 18 15:25 temp.hdf5
-rw-rw-r-- 1 jd21 jd21 2.6K Feb 18 15:45 user_assessments.json
-rw-rw-r-- 1 jd21 jd21 1.7K Feb 18 15:44 user_assessments.yaml
drwxrwxr-x 2 jd21 jd21 4.0K Feb 18 14:36 users_assessments
-rw-rw-r-- 1 jd21 jd21  383 Feb 18 15:44 users.csv


bash: line 3: exetera: command not found
