# Simple Linked Example

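This example uses shared state to reuse the unique IDs generated for one dataset (`users`) as the `user_id` values of a second dataset (`assessments`), so that the two tables are linked.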

In [8]:
# Create the directory that the schema files and generated CSVs below are written into.
# `-p` makes this a no-op when the directory already exists, so the cell can be re-run.
!mkdir -p users_assessments

In [9]:
%%writefile users_assessments/user_assessments.yaml

# Generation schema for two linked datasets. The cell magic on the first line
# writes everything below it (these comments included) to the YAML file.

# Dataset 1: `users`. `num_lines: 10` corresponds to the 10 data rows in the
# users.csv shown later in this notebook.
- typename: randomdataset.generators.CSVGenerator
  num_lines: 10
  dataset:
    name: users
    typename: randomdataset.Dataset
    fields:
    # The ids generated here are kept in shared state under the name `user_ids`
    # so that the `assessments` dataset below can refer back to them.
    - name: id
      typename: randomdataset.UIDFieldGen
      shared_state_name: user_ids
    - name: FirstName
      typename: randomdataset.AlphaNameGen
    # Same generator as FirstName, with first-name mode switched off
    # (the generated column contains surname-like values).
    - name: LastName
      typename: randomdataset.AlphaNameGen
      is_first_name: False
    - name: bmi
      typename: randomdataset.IntFieldGen
      vmin: 20
      vmax: 40
    # NOTE(review): the generated users.csv only contains 0 and 1 in this column,
    # which suggests `vmax` is an exclusive upper bound -- confirm against IntFieldGen.
    # NOTE(review): the ExeTera JSON schema later in this notebook declares this
    # column as categorical keyed by ""/"False"/"True" rather than "0"/"1" --
    # confirm the import maps the generated values as intended.
    - name: has_diabetes
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: height_cm
      typename: randomdataset.IntFieldGen
      vmin: 100
      vmax: 200
    - name: year_of_birth
      typename: randomdataset.IntFieldGen
      vmin: 1920
      vmax: 2010
    
# Dataset 2: `assessments`, 30 rows, each pointing at one of the users above.
- typename: randomdataset.generators.CSVGenerator
  num_lines: 30
  dataset:
    name: assessments
    typename: randomdataset.Dataset
    fields:
    # This id has no `shared_state_name`; only the users' ids are shared.
    - name: id
      typename: randomdataset.UIDFieldGen
    - name: date
      typename: randomdataset.DateTimeFieldGen
      as_string: True
    # Filled from the values stored under `user_ids` by the `users` dataset, which
    # is what links the two tables.
    # `field_type: int` is presumably the type the recalled values are emitted as -- confirm.
    - name: user_id
      typename: randomdataset.SharedDataGen
      source_state_name: user_ids
      field_type: int
    # NOTE(review): the three symptom flags below and `tested_covid_positive` are
    # generated as integers, while the JSON schema later in this notebook maps
    # string keys ("False"/"True", "waiting"/"no"/"yes") -- confirm they line up.
    - name: abdominal_pain
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: brain_fog
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: loss_of_smell
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 2
    - name: tested_covid_positive
      typename: randomdataset.IntFieldGen
      vmin: 0
      vmax: 3
    # Body temperature in degrees Fahrenheit (per the `_f` suffix), as a float.
    - name: temperature_f
      typename: randomdataset.FloatFieldGen
      vmin: 95
      vmax: 110
    

Writing users_assessments/user_assessments.yaml


In [1]:
import os
import sys

# Put the parent directory on the module search path before importing
# `randomdataset` (presumably the package lives one level above this
# notebook -- confirm). The membership check keeps the cell idempotent:
# re-running it does not append the same entry to sys.path again.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import randomdataset

In [10]:
# Run the generators described by the YAML schema written above. The first
# argument is the schema file and the second is the output directory; the cell
# output echoes them as "Schema:" and "Output:".
# NOTE(review): `.callback` is invoked directly instead of going through a
# command line -- presumably `generate_dataset` is a CLI command object; confirm.
randomdataset.application.generate_dataset.callback(
    "users_assessments/user_assessments.yaml",
    "users_assessments",
)

Schema: 'users_assessments/user_assessments.yaml'
Output: 'users_assessments'
Generating dataset 'users'
Generating dataset 'assessments'


In [11]:
# Show the generated users table; its `id` column holds the values recorded as `user_ids`.
!cat users_assessments/users.csv

id,FirstName,LastName,bmi,has_diabetes,height_cm,year_of_birth
0,"Grace","None",39,1,130,1967
1,"Carol","Nobody",38,0,119,1975
2,"Wendy","Random",28,0,128,1926
3,"Mallory","Nobody",25,0,117,1944
4,"Xavier","Unknown",29,1,190,1974
5,"Olivia","Thunk",26,0,107,2004
6,"Xavier","Anon",30,0,175,1973
7,"Xavier","Null",37,0,140,1963
8,"Ivan","Bloggs",37,0,134,1999
9,"Trudy","Bar",28,0,116,1929


In [12]:
# Show the generated assessments table; its `user_id` column is filled from the ids
# recorded for the users table, which is the link between the two files.
!cat users_assessments/assessments.csv

id,date,user_id,abdominal_pain,brain_fog,loss_of_smell,tested_covid_positive,temperature_f
0,2021-10-24 11:45:43.677374+00:00,0,0,1,1,2,103.22149054047082
1,2021-12-16 13:09:58.380573+00:00,1,0,1,0,0,100.62518751030662
2,2021-08-05 17:51:30.943546+00:00,2,0,0,1,0,105.18487884609749
3,2021-04-09 14:47:54.599226+00:00,3,1,0,1,0,96.4302053852154
4,2021-09-29 00:15:42.142405+00:00,4,1,1,1,0,109.63616106818489
5,2021-04-24 09:53:44.215726+00:00,5,1,0,0,1,107.69840121429907
6,2021-11-13 07:35:32.840341+00:00,6,0,0,0,1,97.00309019318361
7,2022-02-14 00:08:04.885913+00:00,7,1,0,0,1,95.22598358524823
8,2022-02-07 15:36:57.841132+00:00,8,0,0,0,2,95.48740949212532
9,2021-02-21 01:48:38.675272+00:00,9,0,1,1,0,106.27664175133276
10,2021-08-05 00:06:12.343504+00:00,0,0,1,1,0,103.07544677653925
11,2021-11-07 21:52:41.868990+00:00,1,1,0,0,2,102.81942527899108
12,2021-05-20 14:49:01.700189+00:00,2,0,0,0,2,103.25591242165508
13,2021-09-28 03:13:05.410689+00:00,3,0,1,1,1,98.99925665317788
14,2022-01-21 1

In [13]:
%%writefile users_assessments/user_assessments.json

{
  "exetera": {
    "version": "1.0.0"
  },
  "schema": {
    "users": {
      "primary_keys": [
        "id"
      ],
      "fields": {
        "id": {
          "field_type": "fixed_string",
          "length": 32
        },
        "FirstName": {
          "field_type": "string"
        },
        "LastName": {
          "field_type": "string"
        },
        "bmi": {
          "field_type": "numeric",
          "value_type": "int32"
        },
        "has_diabetes": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "height_cm": {
          "field_type": "numeric",
          "value_type": "int32"
        },
        "year_of_birth": {
          "field_type": "numeric",
          "value_type": "int32"
        }
      }
    },
    "assessments": {
      "primary_keys": [
        "id"
      ],
      "foreign_keys": {
        "user_id_key": {
          "space": "users",
          "key": "id"
        }
      },
      "fields": {
        "id": {
          "field_type": "fixed_string",
          "length": 32
        },
        "date": {
          "field_type": "datetime"
        },
        "user_id": {
          "field_type": "fixed_string",
          "length": 32
        },
        "abdominal_pain": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "brain_fog": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "loss_of_smell": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "False": 1,
              "True": 2
            }
          }
        },
        "tested_covid_positive": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "": 0,
              "waiting": 1,
              "no": 2,
              "yes": 3
            }
          }
        },
        "temperature_f": {
          "field_type": "numeric",
          "value_type": "float32"
        }
      }
    }
  }
}

Writing users_assessments/user_assessments.json


In [14]:
from datetime import datetime, timezone

import exetera
from exetera.core import session
from exetera.io import importer

# Import both generated CSV files into a single HDF5 dataset, using the JSON
# schema written in the previous cell to decide how each column is stored.
# Command-line form noted by the original author:
# exetera import -w -s user_assessments.json -i "users:users.csv, assessments:assessments.csv" -o user_assessments.hdf5
with session.Session() as s:
    importer.import_with_schema(
        session=s,
        # Import time, taken as the current time in UTC.
        timestamp=str(datetime.now(timezone.utc)),
        dataset_alias="UserAssessments",
        # Relative path with no directory part, unlike the input files below.
        dataset_filename="user_assessments.hdf5",
        schema_file="users_assessments/user_assessments.json",
        # Keys match the table names declared under "schema" in the JSON file.
        files={
            "users": "users_assessments/users.csv",
            "assessments": "users_assessments/assessments.csv",
        },
        # Presumably replaces an existing output file so the cell can be re-run -- confirm.
        overwrite=True,
    )

read_file_using_fast_csv_reader: 1 chunks, 10 accumulated_written_rows parsed in 0.010171890258789062s
completed in 0.018576860427856445 seconds
Total time 0.01882028579711914s
read_file_using_fast_csv_reader: 1 chunks, 30 accumulated_written_rows parsed in 0.006245851516723633s
completed in 0.01291799545288086 seconds
Total time 0.013108015060424805s
