# Simple Linked Example

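Uses shared state so that the unique IDs generated for one dataset (`users`) can be recalled and reused as foreign keys (`user_id`) in a second dataset (`assessments`).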

In [1]:
%%writefile user_assessments.yaml

# Generator 1: writes the `users` dataset as CSV with 10 rows.
- typename: randomdataset.generators.CSVGenerator
  num_lines: 10
  dataset:
    name: users
    typename: randomdataset.Dataset
    fields:
    # Unique user ID. The generated values are also published under the shared
    # state name `user_ids` so that another dataset can refer back to them.
    - name: id
      typename: randomdataset.UIDFieldGen
      shared_state_name: user_ids
    - name: FirstName
      typename: randomdataset.AlphaNameGen
    # Same name generator as FirstName, switched out of first-name mode
    # (presumably yields surnames).
    - name: LastName
      typename: randomdataset.AlphaNameGen
      is_first_name: False
    - name: bmi
      typename: randomdataset.IntFieldGen
      vmin: 20
      vmax: 40
    # NOTE(review): the sample users.csv only contains 0 and 1 in this column,
    # so vmax looks like an exclusive upper bound - confirm against IntFieldGen.
    - name: has_diabetes
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: height_cm
      typename: randomdataset.IntFieldGen
      vmin: 100
      vmax: 200
    - name: year_of_birth
      typename: randomdataset.IntFieldGen
      vmin: 1920
      vmax: 2010
    
# Generator 2: writes the `assessments` dataset as CSV with 30 rows.
- typename: randomdataset.generators.CSVGenerator
  num_lines: 30
  dataset:
    name: assessments
    typename: randomdataset.Dataset
    fields:
    # The assessment's own unique ID (not shared with any other dataset).
    - name: id
      typename: randomdataset.UIDFieldGen
    # Date/time value written out in string form.
    - name: date
      typename: randomdataset.DateTimeFieldGen
      as_string: True
    # Link back to `users`: values are read from the `user_ids` shared state
    # that the users dataset's `id` field publishes.
    # NOTE(review): presumably requires the users generator to run first - confirm.
    - name: user_id
      typename: randomdataset.SharedDataGen
      source_state_name: user_ids
      field_type: int
    # Symptom flags. NOTE(review): the sample assessments.csv only shows 0 and 1
    # for the vmax: 2 fields (and 0-2 for vmax: 3), so vmax looks exclusive - confirm.
    - name: abdominal_pain
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: brain_fog
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: loss_of_smell
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: tested_covid_positive
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 3
    # Floating-point body temperature, bounded by vmin/vmax.
    - name: temperature_f
      typename: randomdataset.FloatFieldGen
      vmin: 95
      vmax: 110
    

Overwriting user_assessments.yaml


In [1]:
import os
import sys

# Put the parent directory on the import path so the `randomdataset` import
# below resolves - presumably the package lives one level above this notebook.
sys.path.append(os.path.abspath(".."))

import randomdataset

In [4]:
# Generate every dataset described by the YAML schema, writing the results
# into the current directory.
# NOTE(review): `.callback` looks like the plain function behind a CLI command
# object (click-style) - confirm against randomdataset.application.
schema_path = "user_assessments.yaml"
output_dir = "."
randomdataset.application.generate_dataset.callback(schema_path, output_dir)

Schema: 'user_assessments.yaml'
Output: '.'
Generating dataset 'users'
Generating dataset 'assessments'


In [5]:
# Print the generated users table to check the columns and row count.
!cat users.csv

id,FirstName,LastName,bmi,has_diabetes,height_cm,year_of_birth
0,"Quin","Blargs",20,0,189,2003
1,"Peggy","Unknown",36,1,121,1952
2,"Grace","Bar",23,1,135,1947
3,"Zoe","Unknown",26,0,196,2003
4,"Kylie","Thunk",28,1,104,1985
5,"Ivan","Thunk",27,1,152,1944
6,"Judy","Anon",28,1,178,1951
7,"Judy","Anon",21,1,193,2005
8,"Yan","Blargs",32,0,155,1947
9,"Dan","Nemo",23,0,179,1961


In [6]:
# Print the generated assessments table; its user_id column should only
# contain IDs that also appear in users.csv.
!cat assessments.csv

id,date,user_id,abdominal_pain,brain_fog,loss_of_smell,tested_covid_positive,temperature_f
0,2022-01-31 13:10:19.470142+00:00,0,0,1,0,2,107.49023514725474
1,2021-11-12 04:17:56.782553+00:00,1,0,1,1,0,105.6942570124724
2,2022-01-01 14:31:02.775222+00:00,2,1,0,0,0,96.54598657012015
3,2021-06-11 19:21:16.500351+00:00,3,0,1,1,0,105.10060277211116
4,2022-01-24 17:25:45.546875+00:00,4,1,0,1,2,104.78580411362383
5,2021-05-19 19:28:55.347014+00:00,5,1,1,0,0,99.1889494912643
6,2021-05-11 00:53:23.349521+00:00,6,0,1,1,2,109.46725811260491
7,2021-07-10 11:37:36.951190+00:00,7,0,1,0,0,95.14845661460399
8,2021-11-12 16:39:51.889504+00:00,8,1,0,0,0,97.73890279974026
9,2022-02-11 04:39:26.788180+00:00,9,1,1,1,0,98.02984864593004
10,2021-06-06 21:30:19.920080+00:00,0,1,1,0,1,104.13524032621973
11,2022-02-08 03:18:02.527770+00:00,1,1,1,1,1,105.399991356878
12,2021-02-20 14:45:30.235950+00:00,2,1,1,1,2,105.81507610779525
13,2021-03-25 15:01:35.580187+00:00,3,0,0,0,2,103.39740560144419
14,2021-06-25 12:4

In [5]:
%%writefile user_assessments.json

{
  "exetera": {
    "version": "1.0.0"
  },
  "schema": {
    "users": {
      "primary_keys": [
        "id"
      ],
      "fields": {
        "id": {
          "field_type": "fixed_string",
          "length": 32
        },
        "FirstName": {
          "field_type": "string"
        },
        "LastName": {
          "field_type": "string"
        },
        "bmi": {
          "field_type": "numeric",
          "value_type": "int32"
        },
        "has_diabetes": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "height_cm": {
          "field_type": "numeric",
          "value_type": "int32"
        },
        "year_of_birth": {
          "field_type": "numeric",
          "value_type": "int32"
        }
      }
    },
    "assessments": {
      "primary_keys": [
        "id"
      ],
      "foreign_keys": {
        "user_id_key": {
          "space": "users",
          "key": "id"
        }
      },
      "fields": {
        "id": {
          "field_type": "fixed_string",
          "length": 32
        },
        "date": {
          "field_type": "datetime"
        },
        "user_id": {
          "field_type": "fixed_string",
          "length": 32
        },
        "abdominal_pain": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "brain_fog": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "loss_of_smell": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "tested_covid_positive": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "waiting": 1,
              "no": 2,
              "yes": 3
            }
          }
        },
        "temperature_f": {
          "field_type": "numeric",
          "value_type": "float32"
        }
      }
    }
  }
}

Overwriting user_assessments.json


In [1]:
import sys, os
# Put the ExeTera checkout at this relative path on the import path so the
# `exetera` import below resolves.
# NOTE(review): assumes this directory layout exists on the machine running
# the notebook - confirm.
sys.path.append(os.path.abspath("../../exetera/ExeTera"))
import exetera

In [7]:
import exetera

from exetera.io import importer
from exetera.core import session
from datetime import datetime, timezone

# Import both generated CSV files into one HDF5 dataset, using the JSON schema
# written earlier in the notebook.
with session.Session() as exetera_session:
    # Timestamp passed to the importer: the current UTC time, as a string.
    import_time = str(datetime.now(timezone.utc))
    # Maps each schema table name to the CSV file supplying its rows.
    csv_sources = {"users": "users.csv", "assessments": "assessments.csv"}
    importer.import_with_schema(
        session=exetera_session,
        timestamp=import_time,
        dataset_alias="UserAssessments",
        dataset_filename="user_assessments.hdf5",
        schema_file="user_assessments.json",
        files=csv_sources,
        overwrite=True,
    )

read_file_using_fast_csv_reader: 1 chunks, 10 accumulated_written_rows parsed in 0.8907837867736816s
completed in 0.8990933895111084 seconds
Total time 0.8994894027709961s
read_file_using_fast_csv_reader: 1 chunks, 30 accumulated_written_rows parsed in 0.12972235679626465s
completed in 0.13634133338928223 seconds
Total time 0.1364729404449463s


In [None]:
%%bash

# Remove any output left by a previous run (-f: no error if it is missing).
rm -f user_assessments.hdf5
# Command-line form of the import. Judging by the arguments: -s takes the JSON
# schema, -i the table:file pairs to import, and -o the HDF5 file to write.
# NOTE(review): -w presumably permits overwriting the output, like
# overwrite=True in the Python API call - confirm with `exetera import --help`.
exetera import -w -s user_assessments.json -i "users:users.csv, assessments:assessments.csv" -o user_assessments.hdf5
# List the directory with human-readable sizes to inspect the files produced.
ls -lh