# Prepare synthetic data to define census of wards in the tower flow report
It is going to be much easier to develop if you have realistic synthetic data.
Here we take a SQL query that generates a single tabular output.
We run that query against the live identifiable data once.
We then use the [Synthetic Data Vault](https://sdv.dev/SDV/index.html) to prepare a synthetic model of those data.
The code below serves as a vignette for that process but will need adjusting to match the exact contents of the original query.

More complex examples that include multiple tables with joins and dependencies are also possible.

This notebook should be run interactively, and only once.

In [1]:
import os
from pathlib import Path
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Construct the PostgreSQL connection.
# Host and credentials are read from the environment so no secrets are stored in the notebook.
uds_host = os.getenv('EMAP_DB_HOST')
uds_user = os.getenv('EMAP_DB_USER')
uds_passwd = os.getenv('EMAP_DB_PASSWORD')

# Fail fast with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text 'None'.
_unset = [
    name
    for name, value in (
        ('EMAP_DB_HOST', uds_host),
        ('EMAP_DB_USER', uds_user),
        ('EMAP_DB_PASSWORD', uds_passwd),
    )
    if value is None
]
if _unset:
    raise RuntimeError(f"Missing environment variable(s): {', '.join(_unset)}")

# Percent-encode the user name and password so characters such as '@', ':' or '/'
# are not mistaken for URL delimiters. The environment values must therefore be
# the raw (not already percent-encoded) credentials.
dsn = f'postgresql://{quote_plus(uds_user)}:{quote_plus(uds_passwd)}@{uds_host}:5432/uds'
emapdb_engine = create_engine(dsn)

In [3]:
from wards import wards

In [4]:
# Preview the entries of `wards` from index 49 onward (the slice the next cell labels as WMS-only)
wards[49:]

['WMS W01 CRITICAL CARE', 'WMS W02 SHORT STAY', 'WMS W03 WARD', 'WMS W04 WARD']

In [5]:
# Read the SQL file into the query string 'q' and choose the query parameters
q = Path('beds.sql').read_text()
# Query every ward; for quick testing narrow this to wards[49:] (WMS only)
_wards = wards[:]
# Example single-location filter (not passed to the query in the next cell, which sends an empty list)
_locations = ['T06C^T06C BY08^BY08-36']

In [6]:
# A single ward given as a bare string must become a one-element list;
# list('WMS W03 WARD') would instead split the name into individual characters.
_wards = [_wards] if isinstance(_wards, str) else _wards
# Run the query once against the live database; the empty 'locations' list is passed as-is.
df = pd.read_sql_query(sql=q, con=emapdb_engine, params={'wards': _wards, 'locations': []})
# NOTE: at this point `df` still holds live identifiable data, so clear this
# cell's output before saving or sharing the notebook.
df.head()

Unnamed: 0,location_id,department,location_string,ovl_admission,ovl_hv_id,open_visits_n,cvl_admission,cvl_discharge,cvl_hv_id,ovl_ghost,occupied,modified_at,patient_class,encounter,mrn,lastname,firstname,date_of_birth
0,332136309,UCH T08S ARCU,10201000174^null^null,,,,,,,0,0,2022-07-18 21:30:58.087946+00:00,,,,,,
1,332136340,UCH T08S ARCU,10201000174^T08SARCU SR40^SR40-40,,,,2022-03-31 02:27:00+01:00,2022-04-01 11:01:00+01:00,517922209.0,0,0,2022-07-18 21:30:58.087946+00:00,,,,,,
2,332136317,UCH T08S ARCU,10201000174^T08SARCU SR41^SR41-41,,,,2022-03-18 16:08:00+00:00,2022-04-01 11:01:00+01:00,513178157.0,0,0,2022-07-18 21:30:58.087946+00:00,,,,,,
3,332136333,UCH T08S ARCU,10201000174^T08SARCU SR42^SR42-42,2022-02-03 00:15:00+00:00,380450349.0,1.0,2022-03-31 15:36:00+01:00,2022-04-01 11:01:00+01:00,316691497.0,1,0,2022-07-18 21:30:58.087946+00:00,INPATIENT,1033341786.0,21220982.0,GAAL,ALI,1954-12-17
4,332136326,UCH T08S ARCU,10201000174^T08SARCU SR43^SR43-43,,,,2022-03-26 15:02:00+00:00,2022-04-01 11:06:00+01:00,517600411.0,0,0,2022-07-18 21:30:58.087946+00:00,,,,,,


## Fake Personally Identifiable Information

In [7]:
from faker import Faker

# Seed Faker's shared random generator so that re-running the notebook
# regenerates the same fake identities instead of a different set each time.
Faker.seed(2022)
fake = Faker()

In [8]:
# Overwrite each identifying column with freshly generated fake values.
# One generator per column; the columns are processed in the order listed here.
fakers_by_column = {
    'encounter': lambda: int(fake.numerify('10########')),
    'mrn': lambda: int(fake.numerify('40######')),
    'lastname': lambda: fake.last_name().upper(),
    'firstname': lambda: fake.first_name().upper(),
    'date_of_birth': lambda: fake.date_of_birth(),
    'cvl_hv_id': lambda: fake.random_number(digits=6, fix_len=True),
    'ovl_hv_id': lambda: fake.random_number(digits=6, fix_len=True),
}
for column, make_fake in fakers_by_column.items():
    # na_action='ignore' skips missing values, so they stay missing.
    # The original value is discarded; the default argument binds this column's generator.
    df[column] = df[column].map(
        lambda _value, make_fake=make_fake: make_fake(),
        na_action='ignore',
    )

In [9]:
# Inspect an example: display the first row of `df` as a column-by-column listing
df.iloc[0]

location_id                               332136309
department                            UCH T08S ARCU
location_string               10201000174^null^null
ovl_admission                                  None
ovl_hv_id                                       NaN
open_visits_n                                   NaN
cvl_admission                                  None
cvl_discharge                                  None
cvl_hv_id                                       NaN
ovl_ghost                                         0
occupied                                          0
modified_at        2022-07-18 21:30:58.087946+00:00
patient_class                                  None
encounter                                       NaN
mrn                                             NaN
lastname                                       None
firstname                                      None
date_of_birth                                  None
Name: 0, dtype: object

### Save the synthetic data via sqlite


In [10]:
# Write `df` to the 'beds' table of a local SQLite file (beds.db),
# replacing the table if it already exists and omitting the DataFrame index.
engine_sqlite = create_engine('sqlite:///beds.db')
con = engine_sqlite.connect()  # left open: the next cell reads back through this connection
df.to_sql(
    name='beds',
    con=con,
    index=False,
    if_exists='replace',
)

1310

In [11]:
# Read the 'beds' table back through the same connection to check what was written
pd.read_sql('beds',  con=con)

Unnamed: 0,location_id,department,location_string,ovl_admission,ovl_hv_id,open_visits_n,cvl_admission,cvl_discharge,cvl_hv_id,ovl_ghost,occupied,modified_at,patient_class,encounter,mrn,lastname,firstname,date_of_birth
0,332136309,UCH T08S ARCU,10201000174^null^null,NaT,,,NaT,NaT,,0,0,2022-07-18 21:30:58.087946,,,,,,NaT
1,332136340,UCH T08S ARCU,10201000174^T08SARCU SR40^SR40-40,NaT,,,2022-03-31 02:27:00,2022-04-01 11:01:00,461317.0,0,0,2022-07-18 21:30:58.087946,,,,,,NaT
2,332136317,UCH T08S ARCU,10201000174^T08SARCU SR41^SR41-41,NaT,,,2022-03-18 16:08:00,2022-04-01 11:01:00,648038.0,0,0,2022-07-18 21:30:58.087946,,,,,,NaT
3,332136333,UCH T08S ARCU,10201000174^T08SARCU SR42^SR42-42,2022-02-03 00:15:00,830711.0,1.0,2022-03-31 15:36:00,2022-04-01 11:01:00,939191.0,1,0,2022-07-18 21:30:58.087946,INPATIENT,1.035432e+09,40725580.0,COLLINS,KEVIN,1913-12-29
4,332136326,UCH T08S ARCU,10201000174^T08SARCU SR43^SR43-43,NaT,,,2022-03-26 15:02:00,2022-04-01 11:06:00,941394.0,0,0,2022-07-18 21:30:58.087946,,,,,,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,740719,WMS W03 WARD,WSU3^W03W SR01^SR01-304,2022-07-17 15:28:00,844084.0,1.0,2022-07-10 17:42:00,2022-07-17 11:09:00,228616.0,0,1,2022-07-18 21:30:58.087946,INPATIENT,1.093675e+09,40710285.0,KELLY,TAMMY,1913-08-08
1306,44380,WMS W03 WARD,WSU3^W03W SR02^SR02-308,2022-07-13 12:56:00,303583.0,2.0,2022-07-04 11:57:00,2022-07-12 15:34:00,895955.0,0,1,2022-07-18 21:30:58.087946,INPATIENT,1.047211e+09,40448047.0,BAKER,JAMES,1914-09-30
1307,23207,WMS W03 WARD,WSU3^W03W SR03^SR03-311,2022-07-05 19:27:00,141708.0,1.0,2022-07-01 21:34:00,2022-07-05 19:06:00,713642.0,0,1,2022-07-18 21:30:58.087946,INPATIENT,1.098822e+09,40360621.0,KANE,KATRINA,1944-01-23
1308,429038,WMS W03 WARD,WSU3^W03W SR04^SR04-312,2022-07-14 19:35:00,464233.0,1.0,2022-07-13 10:11:00,2022-07-14 13:37:00,190470.0,0,1,2022-07-18 21:30:58.087946,INPATIENT,1.074042e+09,40542123.0,PARKER,COURTNEY,1935-02-08
