In [1]:
import random
import time

import pandas as pd
from datasets import load_dataset

import numpy as np
import pandas as pd
import pyodbc
import sklearn

import matplotlib.pyplot as plt

2021-11-30 19:34:17.218243: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-30 19:34:17.218257: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Generating a Synthetic Dataset
This notebook goes over how a synthetic dataset was generated.   
The data from Region Halland contains very sensitive patient information and can, therefore, not be included in a notebook. To showcase what has been done and why, we decided to create a synthetic dataset that follows the same general structure and distribution as the original dataset. 

All values in the dataset are generated radomly to either follow the distribution of the original dataset, as in the case for the dates (`Inpatient_Admissiondatetime`, `Inpatient_Departure`, `omvantDT`), patient gender (`Patient_Gender`), patient age (`Patient_Age`), the number of people having a fall injury (`Class_2016`).   

Other values, such as the journal text for each patient (`omvtext_concat`), the medical codes (`sokkod`), patient IDs (`Patient_ID`) are generated to appear similar to the original data and serve the same function. The journal text for each patient, for instance, are free text from the Amazon review dataset with a classification task. We used this dataset because:  
1. The dataset is in a free-text form with large and short sentences
2. It is a classification task with known labels
3. There are more than enough entries in the Amazon review dataset to replace the text needed for 2M patient journal entries. Even though it is not in Swedish and is not about medical text, it should serve a useful dataset for showcasing our methodology

# Dataset
The original and the synthetic dataset has the following columns:
- omv_pk: str
- Patient_ID: str
- Inpatient_Admissiondatetime: datetime str
- Inpatient_Departure: datetime str
- omvantDT: str
- sokkod: str
- omvtext_concat: text
- Class_2016: float
- Patient_Gender: char
- Patient_Age: float

# Randomize dates
To create randomized hospital admission dates between 2015 and 2020 according to how Swedish medical hospitals note down the time
  
code from https://stackoverflow.com/questions/553303/generate-a-random-date-between-two-other-dates

In [2]:
%%time

def convert_to_strtime(t, time_format='%Y-%m-%d %H:%M'):
    return time.strftime(time_format, time.localtime(t))


def generate_patient_visits(start, N, time_format='%Y-%m-%d %H:%M'):
    """Generates N visits after a given start date in assending order (Y-m-d HH:mm)
    
    :start str:
        Start time in YY-mm-dd HH:mm format
    :N int: 
        Number of journal entries to generate for the patient
        
    returns: [start_time, journal_entry_times, end_time]
    """
    times = []
    times_in_hours = []
    j_entry = time.mktime(time.strptime(start, time_format))
    
    for i in range(N):
        # how many hours from previous entry
        entry_next = random.choices(np.arange(1, 18), k=1)[0] * 3600 
        j_entry += entry_next
        times.append(convert_to_strtime(j_entry))
        times_in_hours.append(j_entry)
    
    # when the patient leaves the hospital
    end = random.choices(np.arange(3, 12), k=1)[0] * 3600 
    end += times_in_hours[-1]
    end = convert_to_strtime(end)
    
    end = [end]*N
    start = [start]*N
    return start, times, end


def get_random_start_datetime() -> str:
    """Generate random year, month, day, hour.
    
    Returns datetime formated string
    
    >>> # calculate total time 
    >>> timedate = get_random_start_datetime()
    >>> time.mktime(time.strptime(timedate, '%Y-%m-%d %H:%M'))
    """
    year = random.choices(np.arange(2010, 2018), k=1)[0]
    month = random.choices(np.arange(1, 13), k=1)[0]
    day = random.choices(np.arange(1, 31), k=1)[0]
    hour = random.choices(np.arange(1, 24), k=1)[0]
    if month == 2 and day != 28:
        day = day % 28 
    
    return  f"{year}-{month}-{day} {hour}:00" 

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 5.96 µs


In [3]:
# TODO test if works and then replace the below duplicate calls

In [4]:
population = np.arange(50, 300)
weights=(population[::-1] % 5)
N = random.choices(population, weights=weights, k=1)[0]

"""Most patients have 1-25 journal entries."""
acc_patients_count = 0
limit = 500_000 # how many journals in total to be generated with 1-15 journals
    


def create_entries_up_to_limit(population, weights=None):
    """Checks how many journal entries are created. Fills up to a limit of transformed/syntehtic samples."""
    
    global acc_patients_count, limit
    global Patient_ID, Patient_journals
    global pat_arrived, pat_j_entry, pat_departure

    
    while acc_patients_count < limit:
        N = random.choices(population, weights=weights, k=1)[0]
        acc_patients_count += N
        num_unique_patients += 1
        
        Patient_ID.append(num_unique_patients)
        Patient_journals.append((num_unique_patients, N))
        
        start = get_random_start_datetime()
        start, journal_entry, end = generate_patient_visits(start, N) 
        
        pat_arrived.append(start)
        pat_j_entry.append(journal_entry)
        pat_departure.append(end)

In [5]:
"""
times = [
    "Inpatient_Admissiondatetime": start, 
    "Inpatient_Departure": end,
    "omvantDT": journal_entry,
]
"""

max_entities = 2_351_348
num_falls = 302
num_2016 = 190_705

num_unique_patients = 0
Patient_ID = []
Patient_journals = [] # (Patient_ID, Num_entries)

pat_arrived = []
pat_j_entry = []
pat_departure = []


"""Most patients have 1-25 journal entries."""
acc_patients_count = 0
limit = 500_000 # how many journals in total to be generated with 1-15 journals
while acc_patients_count < limit:
    N = random.choices(np.arange(1, 25), weights=(np.arange(1, 25) % 5), k=1)[0]
    acc_patients_count += N
    num_unique_patients += 1
        
    start = get_random_start_datetime()
    start, journal_entry, end = generate_patient_visits(start, N) 
    
    pat_arrived.append(start)
    pat_j_entry.append(journal_entry)
    pat_departure.append(end)
    
    Patient_ID.append(num_unique_patients)
    Patient_journals.append((num_unique_patients, N))
    # TODO extract and generate times in loop
    

"""...then some have between 20-300 journal entries."""
limit = 1_500_000 
while acc_patients_count < limit:
    N = random.choices(np.arange(50, 300), weights=(np.arange(50, 300)[::-1] % 5), k=1)[0]
    acc_patients_count += N
    num_unique_patients += 1
        
    start = get_random_start_datetime()
    start, journal_entry, end = generate_patient_visits(start, N) 
    
    pat_arrived.append(start)
    pat_j_entry.append(journal_entry)
    pat_departure.append(end)
    
    Patient_ID.append(num_unique_patients)
    Patient_journals.append((num_unique_patients, N))
    # TODO extract and generate times in loop
    
    
"""...Far fewer have between 300-600 journal entries."""
limit = max_entities - 600
while acc_patients_count < limit:
    N = random.choices(np.arange(300, 600), k=1)[0]
    acc_patients_count += N
    num_unique_patients += 1
        
    start = get_random_start_datetime()
    start, journal_entry, end = generate_patient_visits(start, N) 
    
    pat_arrived.append(start)
    pat_j_entry.append(journal_entry)
    pat_departure.append(end)
    
    Patient_ID.append(num_unique_patients)
    Patient_journals.append((num_unique_patients, N))
    # TODO extract and generate times in loop


"""
To ensure that we have the same number of total journal entries as the orignal dataset, 
We create some more patients, which have only one journal entry.
"""
limit = max_entities
while acc_patients_count < limit:
    N = random.choices(np.arange(1, 2), k=1)[0]
    acc_patients_count += N
    num_unique_patients += 1
        
    start = get_random_start_datetime()
    start, journal_entry, end = generate_patient_visits(start, N) 
    
    pat_arrived.append(start)
    pat_j_entry.append(journal_entry)
    pat_departure.append(end)
    
    Patient_ID.append(num_unique_patients)
    Patient_journals.append((num_unique_patients, N))
    # TODO extract and generate times in loop
    

Inpatient_Admissiondatetime = pat_arrived
omvantDT = pat_j_entry
Inpatient_Departure = pat_departure


# On average how many entries/notes does each patient have during their stay
total_visits = 0
for k, v in Patient_journals:
    total_visits += v
    
avg_num_journal_entries = total_visits/len(Patient_journals)
print(f"The average patient had {avg_num_journal_entries} visits")

The average patient had 50.400788802435 visits


## Add free-text journal notes
Based on the Amazon Review dataset  
Reviews are in fre-text form in English  

- Positive reviews (4+ stars) are the most plentiful, and therefor assigned to all patients
- Negative reviews (1 stars) occur more seldom, and is assigned to patients that have had a fall injury

In [6]:
%%time

d = load_dataset('amazon_us_reviews', 'Books_v1_00')

# Score 5 - simulates NOT having a fall injury
# Score 1 - simulates having a fall injury
score = {
    1: [], 
    2: [],
    3: [],
    4: [],
    5: [],
}

i = 0
for review in d['train']:
    if len(score[5]) < max_entities:
        r = review['star_rating']
        if r == 1 or r == 5 or r == 4:
            if r == 4:
                r = 5
            
            text = review['review_body']
            num_words = len(text.split(" "))
            if num_words >= 5:
                score[r].append(review['review_body'])

    i += 1


np.random.shuffle(score[5])
np.random.shuffle(score[1])

score[1][:3]
#pp.pprint(score)

Reusing dataset amazon_us_reviews (/home/markussagen/.cache/huggingface/datasets/amazon_us_reviews/Books_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 17min 4s, sys: 1.99 s, total: 17min 6s
Wall time: 17min 17s


['Politics of Character Design. Most people will have heard about all the psychological ideas in this book, whether or not you believe some of it depends on your experience and level of intelligence. I didn\'t read the whole thing, it\'s not that interesting, but I skimmed through more than half of it. The majority of the book is really about how for many people their inherited politics will distort their interpretation of characters, you shouldn\'t need a book to figure this out and if you know the principle then the specifics are easy to figure out on a case by case basis. Various conclusions can be inferred from this predicate, which don\'t need to be written about here, but in short my advice is don\'t be a politician. The only thing useful is that there is a pretty clear admission by the Japanese regional guys that they think African types, Polynesians, and very Asian looking people are ugly, it\'s very difficult to find this type of statement from these types of crypto-white supr

# sokkod 
Generate random sokkods from English dictionary
https://stackoverflow.com/a/18835426

In [7]:
import random
import requests


max_entities = 2_351_348
num_falls = 302
num_2016 = 190_705

word_site = "https://www.mit.edu/~ecprice/wordlist.10000"
response = requests.get(word_site)
WORDS = response.content.splitlines()

words = []
for w in WORDS:
    if len(w)> 4:
        words.append(w)


sokkod = []
omv_pk = []
s = random.choices(words, k=100)

for pat_id, entries in Patient_journals:
    r = random.choices(np.arange(77, 3319))[0] # random numbers only
    for _ in range(entries):
        sokord = random.choices(s, k=1)[0].decode("utf-8")
        omv_text = f'{pat_id*r % 91781}_{sokord}'
        
        sokkod.append(sokord)
        omv_pk.append(omv_text)

In [8]:
# Generate ages from a predefined dist
age = pd.read_csv('../data/age_dist.csv', index_col=0)
age = list(age['0'])
Patient_Age = []
for i in range(len(age)):
    Patient_Age.append(float(age[i])) # since the age from database was floats

Patient_Age = np.array(Patient_Age)
np.random.shuffle(Patient_Age)
Patient_Age = Patient_Age[:len(Patient_ID)]
Patient_Age = list(Patient_Age)

  mask |= (ar1 == a)


#### Assign which patient has fallen
The risk for a patient to have a fall injury is related to their age and gender. 
Since it is known how many patients have had a fall injury during 2016, those numbers are also assigned here

In [43]:
idx = 0
omvtext_concat = []
Class_2016 = []
Patient_IDs_from_2016 = []

def is_from_2016(lst):
    return [True if len(lst[0].split("2016"))>1 else False][0]

# Set all default to have no fall injury
for pat_id, num_entries in Patient_journals:
    texts   = []
    labels  = []
    is_2016 = is_from_2016(pat_arrived[pat_id-1])
    for i in range(num_entries):
        texts.append(score[5][idx])
        if is_2016:
            labels.append(float(0))
            Patient_IDs_from_2016.append(pat_id)
        else:
            labels.append(np.nan)
        idx += 1
    
    omvtext_concat.append(texts)
    Class_2016.append(labels)


ids_with_falls = random.choices(Patient_IDs_from_2016, k=302)

In [44]:
# randomly choose to replace one of the entries with a fall injury
# Label those examples
for i, idx in enumerate(ids_with_falls):
    entries = omvtext_concat[idx]
    idx_replace = random.choices(np.arange(len(entries)), k=1)[0]
    omvtext_concat[idx][idx_replace] = score[1][i]
    Class_2016[idx][idx_replace] = float(1)
    
# Also create some patients with fall injuries but not labeled
ids_unlabeled_with_fall = random.choices(np.arange(len(Patient_ID)), k=5500)
for i, idx in enumerate(ids_unlabeled_with_fall):
    entries = omvtext_concat[idx]
    idx_replace = random.choices(np.arange(len(entries)), k=1)[0]
    omvtext_concat[idx][idx_replace] = score[1][i]

# Also select some random 

In [45]:
# Generate Patient Gender. Males are more likely to fall
Patient_Gender = random.choices(("M", "F"), weights=(48, 52), k=len(Patient_ID))
Patient_Gender = np.array(Patient_Gender)
Patient_Gender[ids_with_falls] = random.choices(("M", "F"), weights=(69, 31), k=302)
Patient_Gender = list(Patient_Gender)

# Bringing it all toghether
We take all categories of the patient and, if the data is in a list of list, it is flattened. All categories are assembled to a Pandas DataFrame and saved as a csv.

In [46]:
import itertools

def flatten(list_of_lists):
    return list(itertools.chain(*list_of_lists))

def extend(lst):
    global Patient_journals
    extended_list = []
    for pat_id, entries in Patient_journals:
        idx = pat_id-1
        values = [lst[idx]]*entries # replicate for all entries
        extended_list.append(values)
    return extended_list
            
_omv_pk = omv_pk
_sokkod = sokkod

_Inpatient_Admissiondatetime = flatten(Inpatient_Admissiondatetime)
_Inpatient_Departure = flatten(Inpatient_Departure)
_omvantDT = flatten(omvantDT)
_omvtext_concat = flatten(omvtext_concat)
_Class_2016 = flatten(Class_2016)

_Patient_Age = extend(Patient_Age)
_Patient_Age = flatten(_Patient_Age)
_Patient_Gender = extend(Patient_Gender)
_Patient_Gender = flatten(_Patient_Gender)
_Patient_ID = extend(Patient_ID)
_Patient_ID = flatten(_Patient_ID)

In [47]:
# Remove the label on some of the patients for the 2016 dataset
# Since the real dataset had some data points from 2016 with unknown label

num_unlabeled    = 0
known_fall_injuries = [v for v in _Class_2016 if v == 0]
total_labeled_examples = 172_250
total_to_relabel = len(known_fall_injuries) - total_labeled_examples

for i, _ in enumerate(_Class_2016):
    if _Class_2016[i] == 0:
        if random.random() > 0.4 and num_unlabeled < total_to_relabel:
            _Class_2016[i] = np.nan
            num_unlabeled += 1

### Update format for all notes related to times
Noticed that the hours start with a leading 0 if the hour of the day is before 10AM     
And since generating the dataset took quite some time, the dataset is modified to be in the correct format afterwards instead

In [14]:
# def convert strings to correct format
def convert_datetime_str_format(datetime_str_list):
    """Converts list of Datetime Strings to expected format.
    
    >>> convert_datetime_str_format(['YYYY-MM-DD H:MM'])
    >>> returns ['YYYY-MM-DD HH:MM:SS:000']
    """
    converted_datetimes_list = []
    for time_str in datetime_str_list:
        year_month_day, hour_minutes = time_str.split(" ")
        hour, minutes = hour_minutes.split(":")

        # If has no leading 0 or before 10AM
        if len(hour) != 2:
            hour = '0'+hour

        converted_datetimes_list.append(year_month_day + " " + hour + ":00:00.000")
    return converted_datetimes_list


# Convert the entries with datetime strings
_Inpatient_Admissiondatetime = convert_datetime_str_format(_Inpatient_Admissiondatetime)
_Inpatient_Departure         = convert_datetime_str_format(_Inpatient_Departure)
_omvantDT                    = convert_datetime_str_format(_omvantDT)

## Create the dataset

In [52]:
# Create the dataset
d = {
    "omv_pk": _omv_pk,
    "Patient_ID": _Patient_ID,
    "Inpatient_Admissiondatetime": _Inpatient_Admissiondatetime,
    "Inpatient_Departure": _Inpatient_Departure,
    "omvantDT": _omvantDT,
    "sokkod": _sokkod,
    "omvtext_concat": _omvtext_concat,
    "Class_2016": _Class_2016,
    "Patient_Gender": _Patient_Gender,
    "Patient_Age": _Patient_Age,
}

df = pd.DataFrame(d)
df.to_csv("../data/synthethetic_data_medical.csv", index=False)

## Load the saved dataset

In [53]:
df = pd.read_csv("../data/synthethetic_data_medical.csv")
df.head()

Unnamed: 0,omv_pk,Patient_ID,Inpatient_Admissiondatetime,Inpatient_Departure,omvantDT,sokkod,omvtext_concat,Class_2016,Patient_Gender,Patient_Age
0,1082_establishment,1,2010-5-4 6:00,2010-05-05 23:00,2010-05-04 16:00,establishment,"This is a wonderful, wonderful book.",,M,88.0
1,1082_specified,1,2010-5-4 6:00,2010-05-05 23:00,2010-05-04 19:00,specified,I enjoyed the book.<br /><br />Thanks,,M,88.0
2,1082_living,1,2010-5-4 6:00,2010-05-05 23:00,2010-05-04 23:00,living,A wonderful time capsule to Havana's glory days.,,M,88.0
3,1082_bookmarks,1,2010-5-4 6:00,2010-05-05 23:00,2010-05-05 12:00,bookmarks,"The book is by a Rabbi, but is for people who ...",,M,88.0
4,4398_helicopter,2,2011-8-17 10:00,2011-08-19 15:00,2011-08-18 00:00,helicopter,Very important and powerfully documented book ...,,F,78.0
