# Creating the fake database





Each database has a <code>schema</code>. This represents the structure of the whole database. It shows the relationships between database entities (i.e. tables) and defines any constraints placed on the data. This can also be represented as a diagram. To represent this I have used <a href="https://dbdiagram.io/d" target="_blank">dbdiagram.io</a>. It can be seen the relation of each table and by which fields.



In [None]:
# schema here

In [71]:
# Populating Patient_data dataframe
# 2000 patients with ID number, Name, NHS number, Age, Gender, Ethnicity and postcode.

# Import libraries
import pandas as pd
from pandas import DataFrame
import numpy as np
import random
import datetime
from datetime import timedelta
import names
from faker.providers.person.en import Provider

size = 2000

In [2]:
# Let’s create some functions to randomly generate our data 

def random_id(size):
    id_patient = random.sample(range(100000), size)
    return id_patient

In [3]:
def random_NHS_number(size):
    NHS_numbers = random.sample(range(100000000,999999999),size)
    return NHS_numbers

In [4]:
def random_names(name_type, size):
    """
    Generate n-length ndarray of person names.
    name_type: a string, either first_names or last_names
    """
    names = getattr(Provider, name_type)
    return np.random.choice(names, size=size)

In [5]:
def random_genders(size, p=None):
    """Generate n-length ndarray of genders."""
    if not p:
        # Equal probability of gender
        p = (0.5, 0.5)
    gender = ("M", "F")
    return np.random.choice(gender, size=size, p=p)

In [6]:
def random_Ethnicity(size, p=None):
    """Generate n-length ndarray of genders."""
    if not p:
        # 5 groups with different probability
        p = (0.49, 0.10, 0.11, 0.01, 0.29)
    Ethnicity = ("White British", "Black British people", "British Indians", "White Gypsy or Irish Traveller", "Other White")
    return np.random.choice(Ethnicity, size=size, p=p)

In [7]:
def random_Postcode(size, p=None):
    """
    A real Faker's UK postcode generation can be found here
    https://github.com/joke2k/faker/blob/07ca4ede54c26554fdb5c7a4f55432cb0498d338/faker/providers/address/en_GB/__init__.py
    However, for this small fake database there is no need to populate it with real UK ppostcode.
    Instead of this, only a few fake postcodes are generate manually"""
    if not p:
        # 10 postcodes, same p
        p = (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,0.1)
    Postcode = ("SO15 5FL", "SP01 10MA", "BE01 5SA", "CB19 5US", "SO15 10FL", "LO12 8HG", "WE1 7YG", "SO01 7JH", "SP2 8BJ", "SP3 8BJ")
    return np.random.choice(Postcode, size=size, p=p)

In [8]:
def random_Hospital_names(size, p=None):
    """
    3 Fake hospitals names with similar probability
    """
    if not p:
        # 10 postcodes, same p
        p = (0.3, 0.3, 0.4)
    Hospital_names = ("Robin Hood Hospital", "Alfred Hitchcock Hospital", "Chris Martin Hospital")
    return np.random.choice(Hospital_names, size=size, p=p)

In [9]:
def random_dates(start, end, size):
    """
    Generate random dates within range between start and end.    
    Adapted from: https://stackoverflow.com/a/50668285
    """
    # Unix timestamp is in nanoseconds by default, so divide it by
    # 24*60*60*10**9 to convert to days.
    divide_by = 24 * 60 * 60 * 10**9
    start_u = start.value // divide_by
    end_u = end.value // divide_by
    return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="D")

## patient_data_df

In [76]:
# Empty df with headers
patients_admitted = pd.DataFrame(columns=['ID', 'NHS_Number','Full_Name','Gender', 'Birthdate', 'Ethnicity', 'Postcode'])

patients_admitted['ID'] = random_id(size) 
patients_admitted['NHS_Number'] = random_NHS_number(size)
patients_admitted['first_names'] = random_names('first_names', size)
patients_admitted['last_names'] = random_names('last_names', size)
patients_admitted['Full_Name'] = patients_admitted['first_names']  + ' ' + patients_admitted['last_names']
del patients_admitted['first_names']
del patients_admitted['last_names']
patients_admitted['Gender'] = random_genders(size)
patients_admitted['Birthdate'] = random_dates(start=pd.to_datetime('1900-01-01'), end=pd.to_datetime('2008-01-01'), 
                                              size=size)
patients_admitted['Ethnicity'] = random_Ethnicity(size)
patients_admitted['Postcode'] = random_Postcode(size)
patients_admitted

Unnamed: 0,ID,NHS_Number,Full_Name,Gender,Birthdate,Ethnicity,Postcode
0,9134,693410245,Berlin Schinner,F,1903-08-19,White British,CB19 5US
1,2592,700062626,Leatrice Gusikowski,M,1908-03-23,White British,SP01 10MA
2,62882,763102227,Coletta Bernhard,F,1983-11-16,White British,BE01 5SA
3,16761,905363822,Rory Cremin,F,1915-02-01,White British,SP2 8BJ
4,95870,808415529,Freeman Williamson,F,1978-11-12,White British,SP01 10MA
...,...,...,...,...,...,...,...
1995,58015,825039663,Michelina Batz,F,2002-05-21,White British,SO01 7JH
1996,65269,808423282,Coolidge Friesen,M,1959-11-06,Other White,SO01 7JH
1997,84454,421790006,Gaylen Heaney,M,1995-03-06,White British,SP3 8BJ
1998,19341,151564944,Mat Mosciski,M,1925-07-14,Other White,WE1 7YG


In [None]:
# To save this as CSV

#patients_admitted.to_csv('patient_data_df.csv')

## covid_19_admission_df

In [77]:
# Populating COVID-19 admission dataframe
# Admission of COVID patients in three hispitales of the same TRUST for one month period
# This table contain the 5% of the patients found in the patient_data_df
patients_admitted_length = int(len(patients_admitted)*0.05)
# ID from patient_data links with  Patient_admitted_id from covid_19_admission

ID = patients_admitted['ID'].tolist() # ID to list to select the 5% of the values of the ID column

ID2df = (random.choices(ID, k=patients_admitted_length))

# Empty df with headers
covid_19_admission_df = pd.DataFrame(columns=['Patient_admitted_id', 'Date', 'Hospital_name','Hospital_Postcode' ])

covid_19_admission_df['Patient_admitted_id'] = ID2df
#Date from 1 January 2021 to 31 January 2021
covid_19_admission_df['Date'] = random_dates(start=pd.to_datetime('2021-01-01'), 
                                             end=pd.to_datetime('2021-01-31'), size=patients_admitted_length)
covid_19_admission_df['Hospital_name'] = random_Hospital_names(patients_admitted_length)

# To link same hospital names with same postcodes
for idx, value in covid_19_admission_df.iloc[:,2].iteritems():
    if 'Chris Martin Hospital' in value:
        covid_19_admission_df.loc[idx, ['Hospital_Postcode']] = "SO15 5FL"
    if 'Robin Hood Hospital' in value:
        covid_19_admission_df.loc[idx, ['Hospital_Postcode']] = "LO12 8HG"
    if 'Alfred Hitchcock Hospital' in value:
        covid_19_admission_df.loc[idx, ['Hospital_Postcode']] = "BE01 5SA"
covid_19_admission_df

Unnamed: 0,Patient_admitted_id,Date,Hospital_name,Hospital_Postcode
0,68829,2021-01-26,Robin Hood Hospital,LO12 8HG
1,70299,2021-01-02,Alfred Hitchcock Hospital,BE01 5SA
2,96976,2021-01-17,Chris Martin Hospital,SO15 5FL
3,50168,2021-01-09,Alfred Hitchcock Hospital,BE01 5SA
4,68635,2021-01-28,Chris Martin Hospital,SO15 5FL
...,...,...,...,...
95,5865,2021-01-23,Alfred Hitchcock Hospital,BE01 5SA
96,3431,2021-01-28,Alfred Hitchcock Hospital,BE01 5SA
97,37077,2021-01-19,Alfred Hitchcock Hospital,BE01 5SA
98,19087,2021-01-13,Alfred Hitchcock Hospital,BE01 5SA


In [None]:
# To save this as CSV
#covid_19_admission_df.to_csv('covid_19_admission_df.csv')

## covid_19_death_df

Similar that covid_19_admission_df in which a small proportion of the admitted patients died a few days after admission.


In [89]:

# Crete a dict with ID number and date of admission
id2date_admission = pd.Series(covid_19_admission_df.Date.values,covid_19_admission_df.Patient_admitted_id.values).to_dict()

# Select the 5% of the total number of items  covid_19_admission_df
covid_19_admission_length = int(len(covid_19_admission_df)*0.05)

# Convert these items in a list
#entry_list = list(id2date_admission.items())

#Take the 5% of total number of items 
random_entry = random.sample(list(id2date_admission.items()), k=covid_19_admission_length)

# Populate a new df with the patients who are going to die and their date of admission
covid_19_death_df = DataFrame (random_entry,columns=['Patient_admitted_id','Date_admission'])

# Now, we suppose they will die a few random days later, between the 3rd and the 20th day after admission for example.
covid_19_death_df["Death_dates"] = covid_19_death_df["Date_admission"] + timedelta(days=random.randint(3, 20))

# we need the hospital where they were admitted and they died
covid_19_death_df = covid_19_admission_df.merge(covid_19_death_df, on="Patient_admitted_id")

# Some deletion of the columns I dont need
covid_19_death_df = covid_19_death_df.drop(['Date', 'Hospital_Postcode', 'Date_admission'], axis = 1)

# I dont need the date_admission column
#
covid_19_death_df


Unnamed: 0,Patient_admitted_id,Hospital_name,Death_dates
0,77532,Robin Hood Hospital,2021-01-24
1,77532,Robin Hood Hospital,2021-01-24
2,15182,Alfred Hitchcock Hospital,2021-02-05
3,45892,Alfred Hitchcock Hospital,2021-02-08
4,27838,Chris Martin Hospital,2021-02-14
5,63184,Robin Hood Hospital,2021-02-04


## Hospital_features_df
Some characteristics of the hospitals of this fake database

In [None]:

data = {'Hospital_name':  ['Chris Martin Hospital', 'Alfred Hitchcock Hospital','Robin Hood Hospital'],
        'Hospital_location': ['SO15 5FL', 'BE01 5SA','LO12 8HG'],
        'Number_of_beds' : [100,200,150],
        'Number_of_staff' : [300,600,400],
        'Number_of_ITU_Beds' : [10,20,15]
        
        }
Hospital_features_df = pd.DataFrame(data, columns=['Hospital_name', 'Hospital_location', 
                                                   'Number_of_beds','Number_of_staff', 'Number_of_ITU_Beds' ])


In [None]:
#Hospital_features_df.to_csv('Hospital_features_df.csv')
Hospital_features_df

## Covid_19_epidemiology_df
Some Covid-19 data reported by the Trust from the 3 hospitals involved

In [None]:
date_count = Covid_19_epidemiology_df.groupby('Date').count()

date_count