# Creating the fake database
****

<a id="top"></a>

<b>Table of contents</b><br>

1. [Creating fake population](#population)
2. [Populating the database](#Database)

In this notebook I am going to create a few CSV files with fake data. Then I will use this data to create tables in a new database. The grafic representation (schema) of the database I want to create can be seen below



![Image of Yaktocat](https://raw.githubusercontent.com/Manuel-DominguezCBG/SQL2Dashboard/main/Covid-19/Images/screenshot-aca1dabf.jpg)




<a id="population"></a>
## 1. Creating fake people 

In [1]:
# Populating Patient_data dataframe
# 2000 patients with ID number, Name, NHS number, Age, Gender, Ethnicity and postcode.

# Import libraries
import pandas as pd
from pandas import DataFrame
import numpy as np
import random
import datetime
from datetime import timedelta
import names                                     # pip install names
from faker.providers.person.en import Provider   # pip install faker
import sqlite3


size = 2000

In [2]:
# Let’s create some functions to randomly generate our data 

def random_id(size):
    id_patient = random.sample(range(100000000), size)
    return id_patient

In [3]:
def random_NHS_number(size):
    NHS_numbers = random.sample(range(100000000,999999999),size)
    return NHS_numbers

In [4]:
def random_names(name_type, size):
    """
    Generate n-length ndarray of person names.
    name_type: a string, either first_names or last_names
    """
    names = getattr(Provider, name_type)
    return np.random.choice(names, size=size)

In [5]:
def random_genders(size, p=None):
    """Generate n-length ndarray of genders."""
    if not p:
        # Equal probability of gender
        p = (0.5, 0.5)
    gender = ("M", "F")
    return np.random.choice(gender, size=size, p=p)

In [6]:
def random_Ethnicity(size, p=None):
    """Generate n-length ndarray of genders."""
    if not p:
        # 5 groups with different probability
        p = (0.49, 0.10, 0.11, 0.01, 0.29)
    Ethnicity = ("White British", "Black British people", "British Indians", "White Gypsy or Irish Traveller", "Other White")
    return np.random.choice(Ethnicity, size=size, p=p)

In [7]:
def random_Postcode(size, p=None):
    """
    A real Faker's UK postcode generation can be found here
    https://github.com/joke2k/faker/blob/07ca4ede54c26554fdb5c7a4f55432cb0498d338/faker/providers/address/en_GB/__init__.py
    However, for this small fake database there is no need to populate it with real UK ppostcode.
    Instead of this, only a few fake postcodes are generate manually"""
    if not p:
        # 10 postcodes, same p
        p = (0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,0.1)
    Postcode = ("SO15 5FL", "SP01 10MA", "BE01 5SA", "CB19 5US", "SO15 10FL", "LO12 8HG", "WE1 7YG", "SO01 7JH", "SP2 8BJ", "SP3 8BJ")
    return np.random.choice(Postcode, size=size, p=p)

In [8]:
def random_Hospital_names(size, p=None):
    """
    3 Fake hospitals names with similar probability
    """
    if not p:
        # 10 postcodes, same p
        p = (0.3, 0.3, 0.4)
    Hospital_names = ("Robin Hood Hospital", "Alfred Hitchcock Hospital", "Chris Martin Hospital")
    return np.random.choice(Hospital_names, size=size, p=p)

In [9]:
def random_dates(start, end, size):
    """
    Generate random dates within range between start and end.    
    Adapted from: https://stackoverflow.com/a/50668285
    """
    # Unix timestamp is in nanoseconds by default, so divide it by
    # 24*60*60*10**9 to convert to days.
    divide_by = 24 * 60 * 60 * 10**9
    start_u = start.value // divide_by
    end_u = end.value // divide_by
    return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="D")

## patient_data_df

In [10]:
# Empty df with headers
patients_admitted = pd.DataFrame(columns=['ID', 'NHS_Number','Full_Name','Gender', 'Birthdate', 'Ethnicity', 'Postcode'])

patients_admitted['ID'] = random_id(size) 
patients_admitted['NHS_Number'] = random_NHS_number(size)
patients_admitted['first_names'] = random_names('first_names', size)
patients_admitted['last_names'] = random_names('last_names', size)
patients_admitted['Full_Name'] = patients_admitted['first_names']  + ' ' + patients_admitted['last_names']
del patients_admitted['first_names']
del patients_admitted['last_names']
patients_admitted['Gender'] = random_genders(size)
patients_admitted['Birthdate'] = random_dates(start=pd.to_datetime('1900-01-01'), end=pd.to_datetime('2008-01-01'), 
                                              size=size)
patients_admitted['Ethnicity'] = random_Ethnicity(size)
patients_admitted['Postcode'] = random_Postcode(size)
patients_admitted

Unnamed: 0,ID,NHS_Number,Full_Name,Gender,Birthdate,Ethnicity,Postcode
0,45169425,284823266,Osborn Senger,M,1965-03-08,British Indians,SP3 8BJ
1,44543143,916138477,Darwin Corwin,F,1921-04-16,White British,CB19 5US
2,23119565,569332256,Brandin Terry,M,1916-04-07,Other White,SP2 8BJ
3,31457816,415488712,Gerda Greenholt,F,1980-07-23,White British,BE01 5SA
4,13522872,908102658,Mindi Hilpert,F,1969-01-17,Other White,WE1 7YG
...,...,...,...,...,...,...,...
1995,86959687,702722630,Mendy Runolfsson,F,1951-07-14,Other White,BE01 5SA
1996,34958158,630923112,Gayle Schaden,F,1923-06-22,British Indians,WE1 7YG
1997,73271058,365447792,Wash Kub,F,1974-06-14,White British,SO15 5FL
1998,68523233,825099011,Chaim Morissette,M,1988-08-11,White British,SO15 5FL


In [11]:
# To save this as CSV

#patients_admitted.to_csv('./patient_data_df.csv')

## covid_19_admission_df

In [12]:
# Populating COVID-19 admission dataframe

# Empty df
covid_19_admission_df = pd.DataFrame(columns=['Patient_admitted_id', 'Date', 'Hospital_ID' ])


# Admission of COVID patients in three hospitals of the same TRUST for one month period

# Populate the Pt_admited_id with the ID of people from patients_admitted
# This table contain the 5% of the patients found in the patient_data_df
patients_admitted_length = int(len(patients_admitted)*0.05)
ID = patients_admitted['ID'].tolist() # ID to list to select the 5% of the values of the ID column
ID2df = (random.choices(ID, k=patients_admitted_length))
covid_19_admission_df['Patient_admitted_id'] = ID2df

#Date from 1 January 2021 to 31 January 2021
covid_19_admission_df['Date'] = random_dates(start=pd.to_datetime('2021-01-01'), 
                                             end=pd.to_datetime('2021-01-31'), size=patients_admitted_length)



# So far, each hospital will get ramdon number of patients

The_hospitals_list = [214321,224323,3234234]
covid_19_admission_df['Hospital_ID'] = np.random.choice(list(The_hospitals_list), len(covid_19_admission_df))

covid_19_admission_df

Unnamed: 0,Patient_admitted_id,Date,Hospital_ID
0,26880783,2021-01-13,3234234
1,71941151,2021-01-30,3234234
2,74376208,2021-01-30,3234234
3,7493509,2021-01-03,224323
4,99797479,2021-01-27,214321
...,...,...,...
95,48627040,2021-01-13,224323
96,88536011,2021-01-23,214321
97,38556569,2021-01-25,214321
98,66679051,2021-01-17,3234234


In [13]:
# To save this as CSV
#covid_19_admission_df.to_csv('./covid_19_admission_df.csv')

## covid_19_death_df

Similar that covid_19_admission_df in which a small proportion of the admitted patients died a few days after admission.


In [14]:

# Crete a dict with ID number and date of admission
id2date_admission = pd.Series(covid_19_admission_df.Date.values,covid_19_admission_df.Patient_admitted_id.values).to_dict()

# Select the 5% of the total number of items  covid_19_admission_df
covid_19_admission_length = int(len(covid_19_admission_df)*0.05)

# Convert these items in a list
#entry_list = list(id2date_admission.items())

#Take the 5% of total number of items 
random_entry = random.sample(list(id2date_admission.items()), k=covid_19_admission_length)

# Populate a new df with the patients who are going to die and their date of admission
covid_19_death_df = DataFrame (random_entry,columns=['Patient_admitted_id','Date_admission'])

# Now, we suppose they will die a few random days later, between the 3rd and the 20th day after admission for example.
covid_19_death_df["Death_dates"] = covid_19_death_df["Date_admission"] + timedelta(days=random.randint(3, 20))

# we need the hospital where they were admitted and they died
covid_19_death_df = covid_19_admission_df.merge(covid_19_death_df, on="Patient_admitted_id")

# Some deletion of the columns I dont need
covid_19_death_df = covid_19_death_df.drop(['Date',  'Date_admission'], axis = 1)

# I dont need the date_admission column
#
covid_19_death_df


Unnamed: 0,Patient_admitted_id,Hospital_ID,Death_dates
0,14581069,3234234,2021-02-07
1,73274526,3234234,2021-02-01
2,73274526,3234234,2021-02-01
3,73968829,224323,2021-01-20
4,69874155,214321,2021-01-27
5,43369835,224323,2021-01-24


## Hospital_features_df
Some characteristics of the hospitals of this fake database

In [15]:
data = {'Hospital_ID': [214321,224323,3234234],
        'Hospital_name':  ['Chris Martin Hospital', 'Alfred Hitchcock Hospital','Robin Hood Hospital'],
        'Hospital_location': ['SO15 5FL', 'BE01 5SA','LO12 8HG'],
        'Number_of_beds' : [100,200,150],
        'Number_of_staff' : [300,600,400],
        'Number_of_ITU_Beds' : [10,20,15]}
Hospital_features_df = pd.DataFrame(data, columns=['Hospital_ID','Hospital_name', 'Hospital_location', 
                                                   'Number_of_beds','Number_of_staff', 'Number_of_ITU_Beds' ])
Hospital_features_df

Unnamed: 0,Hospital_ID,Hospital_name,Hospital_location,Number_of_beds,Number_of_staff,Number_of_ITU_Beds
0,214321,Chris Martin Hospital,SO15 5FL,100,300,10
1,224323,Alfred Hitchcock Hospital,BE01 5SA,200,600,20
2,3234234,Robin Hood Hospital,LO12 8HG,150,400,15


In [16]:
#Hospital_features_df.to_csv('./Hospital_features_df.csv')


<a id="Database"></a>
## Once all CSV have been done, we can use them to create the database.

In [17]:
# Create a database connection and cursor to execute queries.
conn = sqlite3.connect('./fake_db.db') # This create an empty database in the current directory
c = conn.cursor()

In [18]:
#   1. patient_data_df

### Add and empty table and load patient_data_df into sql table

c.execute('''DROP TABLE IF EXISTS patient_data''')
c.execute(''' CREATE TABLE patient_data (ID NOT NULL,
NHS_Number,
Full_Name,
Gender,
Birthdate,
Ethnicity, 
Postcode,
PRIMARY KEY (ID),
FOREIGN KEY (ID) REFERENCES  covid_19_admission (Patient_admitted_id) ON DELETE CASCADE)''')

# patient_data[ID] ---> covid_19_admission[Patient_admitted_id]
patients_admitted.to_sql('patient_data', conn, if_exists='append', index = False) #LOAD
c.execute('''SELECT * FROM patient_data''').fetchall() #SHOW

[(45169425,
  284823266,
  'Osborn Senger',
  'M',
  '1965-03-08 00:00:00',
  'British Indians',
  'SP3 8BJ'),
 (44543143,
  916138477,
  'Darwin Corwin',
  'F',
  '1921-04-16 00:00:00',
  'White British',
  'CB19 5US'),
 (23119565,
  569332256,
  'Brandin Terry',
  'M',
  '1916-04-07 00:00:00',
  'Other White',
  'SP2 8BJ'),
 (31457816,
  415488712,
  'Gerda Greenholt',
  'F',
  '1980-07-23 00:00:00',
  'White British',
  'BE01 5SA'),
 (13522872,
  908102658,
  'Mindi Hilpert',
  'F',
  '1969-01-17 00:00:00',
  'Other White',
  'WE1 7YG'),
 (61404313,
  818683468,
  'Darron Daugherty',
  'M',
  '1941-03-12 00:00:00',
  'Other White',
  'SP2 8BJ'),
 (15925900,
  716056415,
  'Jayla Watsica',
  'F',
  '1910-12-22 00:00:00',
  'Black British people',
  'SO15 5FL'),
 (54570905,
  356800246,
  'Cheryll Ankunding',
  'M',
  '1912-09-14 00:00:00',
  'White British',
  'SP3 8BJ'),
 (68322272,
  863122748,
  'Cristal Thiel',
  'M',
  '2007-06-16 00:00:00',
  'White British',
  'SP01 10MA'),
 (

In [22]:
#   2. covid_19_admission_df

c.execute('''DROP TABLE IF EXISTS covid_19_admission''')
c.execute(''' CREATE TABLE covid_19_admission (Patient_admitted_id NOT NULL,
Date,
Hospital_ID,
PRIMARY KEY (Patient_admitted_id),
FOREIGN KEY (Hospital_ID) REFERENCES  Hospital_features (Hospital_ID) ON DELETE CASCADE,
FOREIGN KEY (Patient_admitted_id) REFERENCES covid_19_death (Patient_admitted_id) ON DELETE CASCADE)''')
covid_19_admission_df.to_sql('covid_19_admission', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM covid_19_admission''').fetchall()

[(26880783, '2021-01-13 00:00:00', 3234234),
 (71941151, '2021-01-30 00:00:00', 3234234),
 (74376208, '2021-01-30 00:00:00', 3234234),
 (7493509, '2021-01-03 00:00:00', 224323),
 (99797479, '2021-01-27 00:00:00', 214321),
 (14126079, '2021-01-28 00:00:00', 224323),
 (28244362, '2021-01-15 00:00:00', 3234234),
 (36147239, '2021-01-29 00:00:00', 214321),
 (58060415, '2021-01-28 00:00:00', 214321),
 (39029102, '2021-01-14 00:00:00', 3234234),
 (50780661, '2021-01-17 00:00:00', 224323),
 (45629523, '2021-01-23 00:00:00', 224323),
 (65923283, '2021-01-07 00:00:00', 214321),
 (54567891, '2021-01-05 00:00:00', 3234234),
 (34814235, '2021-01-29 00:00:00', 214321),
 (38038655, '2021-01-15 00:00:00', 214321),
 (31580440, '2021-01-10 00:00:00', 224323),
 (26002007, '2021-01-09 00:00:00', 214321),
 (76159917, '2021-01-13 00:00:00', 3234234),
 (16836588, '2021-01-27 00:00:00', 3234234),
 (44888776, '2021-01-13 00:00:00', 224323),
 (17229206, '2021-01-30 00:00:00', 224323),
 (90953217, '2021-01-29 0

In [20]:
#   3. covid_19_death_df

c.execute('''DROP TABLE IF EXISTS covid_19_death''')
c.execute(''' CREATE TABLE covid_19_death (Patient_admitted_id NOT NULL,
Hospital_ID,
Death_dates,
PRIMARY KEY (Patient_admitted_id),
FOREIGN KEY (Hospital_ID) REFERENCES  Hospital_features (Hospital_ID) ON DELETE CASCADE)''')
covid_19_death_df.to_sql('covid_19_death', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM covid_19_death''').fetchall()

IntegrityError: UNIQUE constraint failed: covid_19_death.Patient_admitted_id

In [21]:
#   4. Hospital_features_df

c.execute('''DROP TABLE IF EXISTS Hospital_features''')
c.execute(''' CREATE TABLE Hospital_features ( Hospital_ID NOT NULL PRIMARY KEY,
Hospital_name,
Hospital_location,
Number_of_beds,
Number_of_staff,
Number_of_ITU_Beds
PRIMARY KEY (Patient_admitted_id),)''')
Hospital_features_df.to_sql('Hospital_features', conn, if_exists='append', index = False)
c.execute('''SELECT * FROM Hospital_features''').fetchall()

OperationalError: near "(": syntax error

## Database created. 

In [None]:
# Let's ensure everything is ok
c.execute("PRAGMA table_info(patient_data);").fetchall()
# id 	name 	type 	notnull 	dflt_value 	pk

In [None]:
c.execute("PRAGMA table_info(covid_19_admission);").fetchall()

In [None]:
c.execute("PRAGMA table_info(covid_19_death);").fetchall()

In [None]:
c.execute("PRAGMA table_info(Hospital_features);").fetchall()

### Notebook details
<br>
<i>Notebook created by <strong>Manuel Dominguez</strong> 

Creation date: May 2021<br>


//// -- Tables and References

// Creating tables
Table Patient_data {
  id int [pk, increment] // auto-increment
  Full_name varchar
  NHS_number int
  Birthdate int
  Gender varchar
  Ethnicity varchar
  Postcode varchar
}

Table Covid_19_admission {
  Patients_admitted_id int [ref: > Patient_data.id]  // inline relationship (many-to-one)
  Date_adm int
  Hospital_ID varchar [ref: > Hospital_features.Hospital_ID]
  Indexes {
    (Patients_admitted_id) [pk]
  }
}



Table Hospital_features {
 Hospital_ID varchar
 Hospital_name varchar 
 Hospital_location varchar
 Number_of_beds varchar
 Number_of_ITU_beds varchar
 

}



Table covid_19_death {
 Patients_admitted_id  int [ref: > Covid_19_admission.Patients_admitted_id]
 Hospital_ID varchar [ref: > Hospital_features.Hospital_ID]
 Death_dates varchar 
 Indexes {
    (Patients_admitted_id) [pk]
  }
 

}