# Creating the fake database
****





Each database has a <code>schema</code>. This represents the structure of the whole database. It shows the relationships between database entities (i.e. tables) and defines any constraints placed on the data. This can also be represented as a diagram. To represent the fake database I want to create in this notebook I have used <a href="https://dbdiagram.io/d" target="_blank">dbdiagram.io</a>. It can be seen the relation of each table and by which fields.


![Image of Yaktocat](https://raw.githubusercontent.com/Manuel-DominguezCBG/SQL2Dashboard/main/Covid-19/Images/screenshot-aca1dabf.jpg)




## 1. Populate with fake people the tables that will be used to create the database

In [1]:
# Populating Patient_data dataframe
# 2000 patients with ID number, Name, NHS number, Age, Gender, Ethnicity and postcode.

# Import libraries
import pandas as pd
from pandas import DataFrame
import numpy as np
import random
import datetime
from datetime import timedelta
import names                                     # pip install names
from faker.providers.person.en import Provider   # pip install faker
import sqlite3


size = 2000

In [2]:
# Let’s create some functions to randomly generate our data 

def random_id(size):
    id_patient = random.sample(range(100000), size)
    return id_patient

In [3]:
def random_NHS_number(size):
    NHS_numbers = random.sample(range(100000000,999999999),size)
    return NHS_numbers

In [4]:
def random_names(name_type, size):
    """
    Generate n-length ndarray of person names.
    name_type: a string, either first_names or last_names
    """
    names = getattr(Provider, name_type)
    return np.random.choice(names, size=size)

In [5]:
def random_genders(size, p=None):
    """Generate n-length ndarray of genders."""
    if not p:
        # Equal probability of gender
        p = (0.5, 0.5)
    gender = ("M", "F")
    return np.random.choice(gender, size=size, p=p)

In [6]:
def random_Ethnicity(size, p=None):
    """Generate n-length ndarray of genders."""
    if not p:
        # 5 groups with different probability
        p = (0.49, 0.10, 0.11, 0.01, 0.29)
    Ethnicity = ("White British", "Black British people", "British Indians", "White Gypsy or Irish Traveller", "Other White")
    return np.random.choice(Ethnicity, size=size, p=p)

In [7]:
def random_Postcode(size, p=None):
    """
    A real Faker's UK postcode generation can be found here
    https://github.com/joke2k/faker/blob/07ca4ede54c26554fdb5c7a4f55432cb0498d338/faker/providers/address/en_GB/__init__.py
    However, for this small fake database there is no need to populate it with real UK ppostcode.
    Instead of this, only a few fake postcodes are generate manually"""
    if not p:
        # 10 postcodes, same p
        p = (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,0.1)
    Postcode = ("SO15 5FL", "SP01 10MA", "BE01 5SA", "CB19 5US", "SO15 10FL", "LO12 8HG", "WE1 7YG", "SO01 7JH", "SP2 8BJ", "SP3 8BJ")
    return np.random.choice(Postcode, size=size, p=p)

In [8]:
def random_Hospital_names(size, p=None):
    """
    3 Fake hospitals names with similar probability
    """
    if not p:
        # 10 postcodes, same p
        p = (0.3, 0.3, 0.4)
    Hospital_names = ("Robin Hood Hospital", "Alfred Hitchcock Hospital", "Chris Martin Hospital")
    return np.random.choice(Hospital_names, size=size, p=p)

In [9]:
def random_dates(start, end, size):
    """
    Generate random dates within range between start and end.    
    Adapted from: https://stackoverflow.com/a/50668285
    """
    # Unix timestamp is in nanoseconds by default, so divide it by
    # 24*60*60*10**9 to convert to days.
    divide_by = 24 * 60 * 60 * 10**9
    start_u = start.value // divide_by
    end_u = end.value // divide_by
    return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="D")

## patient_data_df

In [10]:
# Empty df with headers
patients_admitted = pd.DataFrame(columns=['ID', 'NHS_Number','Full_Name','Gender', 'Birthdate', 'Ethnicity', 'Postcode'])

patients_admitted['ID'] = random_id(size) 
patients_admitted['NHS_Number'] = random_NHS_number(size)
patients_admitted['first_names'] = random_names('first_names', size)
patients_admitted['last_names'] = random_names('last_names', size)
patients_admitted['Full_Name'] = patients_admitted['first_names']  + ' ' + patients_admitted['last_names']
del patients_admitted['first_names']
del patients_admitted['last_names']
patients_admitted['Gender'] = random_genders(size)
patients_admitted['Birthdate'] = random_dates(start=pd.to_datetime('1900-01-01'), end=pd.to_datetime('2008-01-01'), 
                                              size=size)
patients_admitted['Ethnicity'] = random_Ethnicity(size)
patients_admitted['Postcode'] = random_Postcode(size)
patients_admitted

Unnamed: 0,ID,NHS_Number,Full_Name,Gender,Birthdate,Ethnicity,Postcode
0,80076,477821126,Drew Grady,F,1942-06-26,White British,SO15 10FL
1,97222,822880200,Gwyn Kuphal,F,1907-10-27,British Indians,SO15 5FL
2,34336,344791895,Leontine Konopelski,F,1987-03-18,Other White,SP2 8BJ
3,61045,799690897,Roxanne Nitzsche,F,1964-11-25,Black British people,SO15 5FL
4,14716,282551991,Anabelle Thompson,F,1957-10-02,White British,SO01 7JH
...,...,...,...,...,...,...,...
1995,98552,289715999,Lalla Wolff,F,1987-08-21,White British,WE1 7YG
1996,31808,484477826,Fate Oberbrunner,F,1917-01-23,British Indians,LO12 8HG
1997,17860,181305759,Ashton Maggio,F,1979-12-21,White British,SP3 8BJ
1998,31487,650943056,Tracie Bechtelar,F,1938-04-12,White British,SO15 10FL


In [11]:
# To save this as CSV

#patients_admitted.to_csv('./patient_data_df.csv')

## covid_19_admission_df

In [12]:
# Populating COVID-19 admission dataframe

# Empty df
covid_19_admission_df = pd.DataFrame(columns=['Patient_admitted_id', 'Date', 'Hospital_ID' ])


# Admission of COVID patients in three hospitals of the same TRUST for one month period

# Populate the Pt_admited_id with the ID of people from patients_admitted
# This table contain the 5% of the patients found in the patient_data_df
patients_admitted_length = int(len(patients_admitted)*0.05)
ID = patients_admitted['ID'].tolist() # ID to list to select the 5% of the values of the ID column
ID2df = (random.choices(ID, k=patients_admitted_length))
covid_19_admission_df['Patient_admitted_id'] = ID2df

#Date from 1 January 2021 to 31 January 2021
covid_19_admission_df['Date'] = random_dates(start=pd.to_datetime('2021-01-01'), 
                                             end=pd.to_datetime('2021-01-31'), size=patients_admitted_length)



# So far, each hospital will get ramdon number of patients

The_hospitals_list = [214321,224323,3234234]
covid_19_admission_df['Hospital_ID'] = np.random.choice(list(The_hospitals_list), len(covid_19_admission_df))

covid_19_admission_df

Unnamed: 0,Patient_admitted_id,Date,Hospital_ID
0,68469,2021-01-19,224323
1,77475,2021-01-12,3234234
2,42524,2021-01-30,214321
3,700,2021-01-26,224323
4,84468,2021-01-03,224323
...,...,...,...
95,42747,2021-01-06,3234234
96,56492,2021-01-18,3234234
97,96181,2021-01-01,214321
98,99994,2021-01-06,3234234


In [13]:
# To save this as CSV
#covid_19_admission_df.to_csv('./covid_19_admission_df.csv')

## covid_19_death_df

Similar that covid_19_admission_df in which a small proportion of the admitted patients died a few days after admission.


In [14]:

# Crete a dict with ID number and date of admission
id2date_admission = pd.Series(covid_19_admission_df.Date.values,covid_19_admission_df.Patient_admitted_id.values).to_dict()

# Select the 5% of the total number of items  covid_19_admission_df
covid_19_admission_length = int(len(covid_19_admission_df)*0.05)

# Convert these items in a list
#entry_list = list(id2date_admission.items())

#Take the 5% of total number of items 
random_entry = random.sample(list(id2date_admission.items()), k=covid_19_admission_length)

# Populate a new df with the patients who are going to die and their date of admission
covid_19_death_df = DataFrame (random_entry,columns=['Patient_admitted_id','Date_admission'])

# Now, we suppose they will die a few random days later, between the 3rd and the 20th day after admission for example.
covid_19_death_df["Death_dates"] = covid_19_death_df["Date_admission"] + timedelta(days=random.randint(3, 20))

# we need the hospital where they were admitted and they died
covid_19_death_df = covid_19_admission_df.merge(covid_19_death_df, on="Patient_admitted_id")

# Some deletion of the columns I dont need
covid_19_death_df = covid_19_death_df.drop(['Date',  'Date_admission'], axis = 1)

# I dont need the date_admission column
#
covid_19_death_df


Unnamed: 0,Patient_admitted_id,Hospital_ID,Death_dates
0,1232,224323,2021-01-31
1,57517,3234234,2021-02-07
2,6056,3234234,2021-02-08
3,26508,214321,2021-01-22
4,12999,3234234,2021-02-05


## Hospital_features_df
Some characteristics of the hospitals of this fake database

In [15]:

data = {'Hospital_ID': [214321,224323,3234234],
        'Hospital_name':  ['Chris Martin Hospital', 'Alfred Hitchcock Hospital','Robin Hood Hospital'],
        'Hospital_location': ['SO15 5FL', 'BE01 5SA','LO12 8HG'],
        'Number_of_beds' : [100,200,150],
        'Number_of_staff' : [300,600,400],
        'Number_of_ITU_Beds' : [10,20,15]
        
        }
Hospital_features_df = pd.DataFrame(data, columns=['Hospital_ID','Hospital_name', 'Hospital_location', 
                                                   'Number_of_beds','Number_of_staff', 'Number_of_ITU_Beds' ])
Hospital_features_df

Unnamed: 0,Hospital_ID,Hospital_name,Hospital_location,Number_of_beds,Number_of_staff,Number_of_ITU_Beds
0,214321,Chris Martin Hospital,SO15 5FL,100,300,10
1,224323,Alfred Hitchcock Hospital,BE01 5SA,200,600,20
2,3234234,Robin Hood Hospital,LO12 8HG,150,400,15


In [16]:
#Hospital_features_df.to_csv('./Hospital_features_df.csv')


## Once all tables have been done, we can continue creating of the database.

In [17]:
# Create a database connection and cursor to execute queries.
conn = sqlite3.connect('./fake_db.sqlite')
c = conn.cursor()

In [18]:
### Load CSV file into sqlite table

#   1. patient_data_df

c.execute('''DROP TABLE IF EXISTS patient_data''')
c.execute(''' CREATE TABLE patient_data (ID, NHS_Number,Full_Name,Gender, Birthdate, Ethnicity, Postcode)''')
patients_admitted.to_sql('patient_data', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM patient_data''').fetchall()

[(80076,
  477821126,
  'Drew Grady',
  'F',
  '1942-06-26 00:00:00',
  'White British',
  'SO15 10FL'),
 (97222,
  822880200,
  'Gwyn Kuphal',
  'F',
  '1907-10-27 00:00:00',
  'British Indians',
  'SO15 5FL'),
 (34336,
  344791895,
  'Leontine Konopelski',
  'F',
  '1987-03-18 00:00:00',
  'Other White',
  'SP2 8BJ'),
 (61045,
  799690897,
  'Roxanne Nitzsche',
  'F',
  '1964-11-25 00:00:00',
  'Black British people',
  'SO15 5FL'),
 (14716,
  282551991,
  'Anabelle Thompson',
  'F',
  '1957-10-02 00:00:00',
  'White British',
  'SO01 7JH'),
 (16040,
  610582604,
  'Ruffin Johnston',
  'F',
  '1951-09-15 00:00:00',
  'White British',
  'WE1 7YG'),
 (73926,
  622631094,
  'Jackie Raynor',
  'F',
  '1918-01-05 00:00:00',
  'White British',
  'BE01 5SA'),
 (37176,
  483913232,
  'Cher Hermann',
  'M',
  '1979-09-10 00:00:00',
  'White British',
  'SP2 8BJ'),
 (5610,
  326495586,
  'Wess Wintheiser',
  'M',
  '1945-07-31 00:00:00',
  'Other White',
  'SO15 10FL'),
 (79638,
  164650307,
 

In [19]:
#   2. covid_19_admission_df

c.execute('''DROP TABLE IF EXISTS covid_19_admission''')
c.execute(''' CREATE TABLE covid_19_admission (Patient_admitted_id, Date, Hospital_ID)''')
covid_19_admission_df.to_sql('covid_19_admission', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM covid_19_admission''').fetchall()

[(68469, '2021-01-19 00:00:00', 224323),
 (77475, '2021-01-12 00:00:00', 3234234),
 (42524, '2021-01-30 00:00:00', 214321),
 (700, '2021-01-26 00:00:00', 224323),
 (84468, '2021-01-03 00:00:00', 224323),
 (37847, '2021-01-28 00:00:00', 224323),
 (83005, '2021-01-21 00:00:00', 224323),
 (74160, '2021-01-18 00:00:00', 224323),
 (78333, '2021-01-11 00:00:00', 3234234),
 (27126, '2021-01-21 00:00:00', 214321),
 (79190, '2021-01-19 00:00:00', 3234234),
 (515, '2021-01-09 00:00:00', 3234234),
 (17509, '2021-01-04 00:00:00', 214321),
 (77760, '2021-01-27 00:00:00', 214321),
 (63264, '2021-01-01 00:00:00', 214321),
 (50143, '2021-01-17 00:00:00', 3234234),
 (28546, '2021-01-15 00:00:00', 3234234),
 (73277, '2021-01-11 00:00:00', 3234234),
 (87645, '2021-01-06 00:00:00', 3234234),
 (73277, '2021-01-18 00:00:00', 3234234),
 (86096, '2021-01-21 00:00:00', 224323),
 (52827, '2021-01-04 00:00:00', 224323),
 (72156, '2021-01-23 00:00:00', 214321),
 (97294, '2021-01-10 00:00:00', 3234234),
 (81562, '

In [20]:
#   3. covid_19_death_df

c.execute('''DROP TABLE IF EXISTS covid_19_death''')
c.execute(''' CREATE TABLE covid_19_death (Patient_admitted_id, Hospital_ID, Death_dates)''')
covid_19_death_df.to_sql('covid_19_death', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM covid_19_death''').fetchall()

[(1232, 224323, '2021-01-31 00:00:00'),
 (57517, 3234234, '2021-02-07 00:00:00'),
 (6056, 3234234, '2021-02-08 00:00:00'),
 (26508, 214321, '2021-01-22 00:00:00'),
 (12999, 3234234, '2021-02-05 00:00:00')]

In [21]:
#   4. Hospital_features_df

c.execute('''DROP TABLE IF EXISTS Hospital_features''')
c.execute(''' CREATE TABLE Hospital_features ( Hospital_ID, Hospital_name, Hospital_location, Number_of_beds, Number_of_staff, Number_of_ITU_Beds)''')
Hospital_features_df.to_sql('Hospital_features', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM Hospital_features''').fetchall()

[(214321, 'Chris Martin Hospital', 'SO15 5FL', 100, 300, 10),
 (224323, 'Alfred Hitchcock Hospital', 'BE01 5SA', 200, 600, 20),
 (3234234, 'Robin Hood Hospital', 'LO12 8HG', 150, 400, 15)]

## Database created. Now we can manipulate this data. Let's start doing some quieries

In [25]:
# The most basic thing is to select and show one of the tables of the database
c.execute("SELECT * FROM Hospital_features;").fetchall()


[(214321, 'Chris Martin Hospital', 'SO15 5FL', 100, 300, 10),
 (224323, 'Alfred Hitchcock Hospital', 'BE01 5SA', 200, 600, 20),
 (3234234, 'Robin Hood Hospital', 'LO12 8HG', 150, 400, 15)]

In [26]:
# Now, select one column from a particular table
c.execute("SELECT Patient_admitted_id FROM covid_19_death;").fetchall()

[(1232,), (57517,), (6056,), (26508,), (12999,)]

In [27]:
# Let's see what error I get if I select a column that doesnt exist in a table
c.execute("SELECT Patient_admitted_id FROM Hospital_features;").fetchall()

OperationalError: no such column: Patient_admitted_id

In [28]:
# Can I select more than one column?
c.execute("SELECT Patient_admitted_id,Hospital_ID FROM covid_19_death;").fetchall()

[(1232, 224323),
 (57517, 3234234),
 (6056, 3234234),
 (26508, 214321),
 (12999, 3234234)]

In [30]:
# Could I insert a new patient in the patient_data table??
# Let's first count how many patient I have before and after insert a new patient

c.execute("SELECT COUNT(ID) from patient_data ;").fetchall()


[(2000,)]

In [31]:
c.execute("INSERT INTO  patient_data VALUES ('00001','000001', 'Manuel Dominguez','M','1990-04-07','Spanish', 'SO15 10FL');").fetchall()

[]

In [32]:
c.execute("SELECT COUNT(ID) from patient_data ;").fetchall()


[(2001,)]

In [39]:
# Can I update a value??
c.execute("UPDATE patient_data SET 'ID'= '88888' WHERE Full_Name = 'Manuel Dominguez';").fetchall()


[]

In [40]:
c.execute("select * FROM patient_data WHERE Full_name = 'Manuel Dominguez';").fetchall()


[('88888',
  '000001',
  'Manuel Dominguez',
  'M',
  '1990-04-07',
  'Spanish',
  'SO15 10FL')]

## Covid_19_Total_cases_df
This data frame collect the Covid-19 cases. This is like the one provided by the UK Goverment.

In a more real scenario, this data frame would collect data from many covid_19_admission_df from different Truts, hospital, etc.

In [None]:
Covid_19_Total_cases_df = covid_19_admission_df.groupby('Date').size().to_frame('Cases').reset_index()
Covid_19_Total_cases_df['Cumulative'] = Covid_19_Total_cases_df['Cases'].cumsum()
Covid_19_Total_cases_df

## Covid_19_Total_deaths_df
As the previous one but now with deaths people

In [None]:
Covid_19_Total_deaths_df = covid_19_death_df.groupby('Death_dates').size().to_frame('Cases').reset_index()
Covid_19_Total_deaths_df['Cumulative'] = Covid_19_Total_deaths_df['Cases'].cumsum()
Covid_19_Total_deaths_df

In [None]:
#covid_19_admission_df['Hospital_name'] = random_Hospital_names(patients_admitted_length)

# To link same hospital names with same postcodes
#for idx, value in covid_19_admission_df.iloc[:,2].iteritems():
#    if 'Chris Martin Hospital' in value:
#        covid_19_admission_df.loc[idx, ['Hospital_Postcode']] = "SO15 5FL"
#    if 'Robin Hood Hospital' in value:
#        covid_19_admission_df.loc[idx, ['Hospital_Postcode']] = "LO12 8HG"
#    if 'Alfred Hitchcock Hospital' in value:
#        covid_19_admission_df.loc[idx, ['Hospital_Postcode']] = "BE01 5SA"

Patient_admitted_id 	Date 	Hospital_ID
0 	66497 	2021-01-27 	214321

In [None]:

data = {'Patient_admitted_id': [214321,224323,3234234,23423],
        'Date':  ['2021-01-27', '2021-01-27','2021-01-27','2021-01-20'],
        'Hospital_ID': ['1', '2','3','2'],
        }
X = pd.DataFrame(data, columns=['Patient_admitted_id','Date', 'Hospital_ID' ])
X

## Once all tables have been done, we can continue creating of the database.