### `Context`
the data represents summary information of the patient treatment in each health facility licensed by CDPH. It includes discharge disposition, expected payer, preferred language spoken, age groups, race groups, sex, principal diagnosis groups, principal procedure groups, and principal external cause of injury/morbidity groups.

## `1- Imports and Definitions`

In [1]:
import random
import datetime
from datetime import timedelta
import itertools
import uuid

import numpy as np
import pandas as pd

from faker import Faker

In [2]:
hdf_path = "Data/statistical_data.xlsx" 
hospital_df1 = pd.read_excel(io=hdf_path, sheet_name='Data')

In [3]:
all_cols = hospital_df1.columns.to_list()

# id_cols = ['oshpd_id2', 'oshpd_id']
general_cols = ['oshpd_id2', 'oshpd_id', 'FACILITY_NAME', 'COUNTY_NAME', 'control_type_desc', 'DBA_ADDRESS1',\
                'DBA_CITY', 'DBA_ZIP_CODE', 'MSSA_NAME', 'MSSA_DESIGNATION', 'license_type', \
                'CONTROL_TYPE_CATEGORY_DESC', 'CONGRESSIONAL_DISTRICT_NUM','ASSEMBLY_DIST', 'SENATE_DIST']

diagnosis_cols = list(filter(lambda i: i.startswith("Dx"), all_cols))
injury_cause_cols = list(filter(lambda i: i.startswith("EC"), all_cols))
medicine_cols = list(filter(lambda i: i.startswith("Medicine"), all_cols))
surgery_cols = list(filter(lambda i: i.startswith("Surgery"), all_cols))
other_procedure_cols = ['Path_Lab', 'Radiology', 'Blank_Invalid', 'other_proc', 'Eval_Management', 'Anesthesia']

age_group_cols = ["Age_Under_1", "Age_01_09", "Age_10_19", "Age_20_29", "Age_30_39", "Age_40_49", "Age_50_59",\
                  "Age_60_69", "Age_70_79", "Age_80_", "Age_Unknown"]
race_group_cols = ["White", "Black", "Hispanic","Asian_Pacific_Islander", "American_Indian_Alaska_Native",\
              "Other_Race", "Unknown_Race"]
sex_cols = ["Sex_Male", "Sex_Female", "Sex_Unknown_Invalid"]
language_cols = ["Chinese", "English", "Spanish", "Tagalog", "Vietnamese", "All_Other"]


disposition_cols = ["Routine", "Acute_Care", "SN_IC_Care", "Residential_Care_Facility", "Prison_Jail",\
    "Against_Medical_Advice", "Died", "Hospice_Care", "Childrens_or_Cancer_Center","Critical_Access_Hospital",\
    "Psychiatric_Care", "Home_Health_Service", "Inpatient_Rehab_Care", "Not_Defined_Elsewhere", "Other_Unknown"]


payer_source_cols = ["Medicare", "Medi_Cal", "Private_Coverage", "Workers_Comp", "Self_Pay", "Other_Payer",\
                     "Unknown_Payer"]


In [4]:
hospital_df1[general_cols].nunique()

oshpd_id2                     388
oshpd_id                      388
FACILITY_NAME                 387
COUNTY_NAME                    56
control_type_desc               8
DBA_ADDRESS1                  387
DBA_CITY                      240
DBA_ZIP_CODE                  322
MSSA_NAME                     257
MSSA_DESIGNATION                3
license_type                    2
CONTROL_TYPE_CATEGORY_DESC      3
CONGRESSIONAL_DISTRICT_NUM     53
ASSEMBLY_DIST                  77
SENATE_DIST                    40
dtype: int64

## `2- Exploration`

In [5]:
age_group_cols = ["Age_Under_1", "Age_01_09", "Age_10_19", "Age_20_29", "Age_30_39", "Age_40_49", "Age_50_59",\
                  "Age_60_69", "Age_70_79", "Age_80_", "Age_Unknown"]
race_group_cols = ["White", "Black", "Hispanic","Asian_Pacific_Islander", "American_Indian_Alaska_Native",\
              "Other_Race", "Unknown_Race"]
sex_cols = ["Sex_Male", "Sex_Female", "Sex_Unknown_Invalid"]
language_cols = ["Chinese", "English", "Spanish", "Tagalog", "Vietnamese", "All_Other"]


disposition_cols = ["Routine", "Acute_Care", "SN_IC_Care", "Residential_Care_Facility", "Prison_Jail",\
    "Against_Medical_Advice", "Died", "Hospice_Care", "Childrens_or_Cancer_Center","Critical_Access_Hospital",\
    "Psychiatric_Care", "Home_Health_Service", "Inpatient_Rehab_Care", "Not_Defined_Elsewhere", "Other_Unknown"]


payer_source_cols = ["Medicare", "Medi_Cal", "Private_Coverage", "Workers_Comp", "Self_Pay", "Other_Payer",\
                     "Unknown_Payer"]


In [6]:
hospital_df1

Unnamed: 0,oshpd_id2,FACILITY_NAME,COUNTY_NAME,control_type_desc,DBA_ADDRESS1,DBA_CITY,DBA_ZIP_CODE,MSSA_NAME,MSSA_DESIGNATION,oshpd_id,...,Chinese,Vietnamese,Tagalog,Hcount,Ccount,CONTROL_TYPE_CATEGORY_DESC,CONGRESSIONAL_DISTRICT_NUM,ASSEMBLY_DIST,SENATE_DIST,datayear
0,306244032,ADVANCED ENDOSCOPY CENTER,MERCED,Investor - Corporation,386 W OLIVE AVE,MERCED,95348,Merced Central and North/Merced Southeast,Urban,244032,...,,,,0,1,Investor Owned,16,21,12,2014
1,106164029,ADVENTIST MEDICAL CENTER,KINGS,Nonprofit - Corporation,115 MALL DRIVE,HANFORD,93230,Armona/Hanford/Lemoore,Urban,164029,...,1.0,3.0,1.0,1,0,Nonprofit,21,32,14,2014
2,106100797,ADVENTIST MEDICAL CENTER - REEDLEY,FRESNO,Nonprofit - Corporation,372 WEST CYPRESS AVENUE,REEDLEY,93654,Orange Cove/Parlier/Reedley/Squaw Valley/Tivy ...,Rural,100797,...,,,,1,0,Nonprofit,22,31,14,2014
3,306304093,AESTHETICARE OUTPATIENT SURGERY CENTER,ORANGE,Investor - Corporation,30260 RANCHO VIEJO RD,SAN JUAN CAPISTRANO,92675,Coto de Caza/Las Flores/Mission Viejo Northwes...,Urban,304093,...,,,1.0,0,1,Investor Owned,49,73,36,2014
4,106301098,AHMC ANAHEIM REGIONAL MEDICAL CENTER,ORANGE,Investor - Partnership,1111 WEST LA PALMA AVENUE,ANAHEIM,92801,Anaheim Central,Urban,301098,...,26.0,31.0,32.0,1,0,Investor Owned,46,69,29,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,106301566,WESTERN MEDICAL CENTER - SANTA ANA,ORANGE,Investor - Corporation,1001 NORTH TUSTIN AVENUE,SANTA ANA,92705,City of Orange South/Santa Ana North/Tustin Fo...,Urban,301566,...,63.0,25.0,,1,0,Investor Owned,46,69,34,2014
384,106301188,WESTERN MEDICAL CENTER ANAHEIM,ORANGE,Investor - Corporation,1025 SOUTH ANAHEIM BLVD.,ANAHEIM,92805,Anaheim Central,Urban,301188,...,,1.0,1.0,1,0,Investor Owned,46,69,34,2014
385,106190878,WHITE MEMORIAL MEDICAL CENTER,LOS ANGELES,Nonprofit - Corporation,1720 CESAR E. CHAVEZ AVENUE,LOS ANGELES,90033,Boyle Heights Northwest/Chinatown/Downtown Nor...,Urban,190878,...,4.0,6.0,1.0,1,0,Nonprofit,34,53,24,2014
386,106190883,WHITTIER HOSPITAL MEDICAL CENTER,LOS ANGELES,Investor - Corporation,9080 COLIMA ROAD,WHITTIER,90605,La Habra Heights/Whittier,Urban,190883,...,697.0,57.0,1.0,1,0,Investor Owned,38,57,32,2014


In [7]:
hospital_df1.isnull().sum()

oshpd_id2                     0
FACILITY_NAME                 0
COUNTY_NAME                   0
control_type_desc             0
DBA_ADDRESS1                  0
                             ..
CONTROL_TYPE_CATEGORY_DESC    0
CONGRESSIONAL_DISTRICT_NUM    0
ASSEMBLY_DIST                 0
SENATE_DIST                   0
datayear                      0
Length: 137, dtype: int64

In [8]:
hospital_df1[general_cols + diagnosis_cols + injury_cause_cols + medicine_cols + surgery_cols].head(2)

Unnamed: 0,oshpd_id2,oshpd_id,FACILITY_NAME,COUNTY_NAME,control_type_desc,DBA_ADDRESS1,DBA_CITY,DBA_ZIP_CODE,MSSA_NAME,MSSA_DESIGNATION,...,Surgery_Hemic_Lymphatic,Surgery_Integumentary,Surgery_Maternity,Surgery_Musculoskeletal,Surgery_Nervous,Surgery_Respiratory,Surgery_Urinary,Surgery_Auditory,Surgery_Endocrine,Surgery_General_Other
0,306244032,244032,ADVANCED ENDOSCOPY CENTER,MERCED,Investor - Corporation,386 W OLIVE AVE,MERCED,95348,Merced Central and North/Merced Southeast,Urban,...,,,,,,,,,,
1,106164029,164029,ADVENTIST MEDICAL CENTER,KINGS,Nonprofit - Corporation,115 MALL DRIVE,HANFORD,93230,Armona/Hanford/Lemoore,Urban,...,17.0,575.0,67.0,1079.0,296.0,211.0,193.0,92.0,58.0,


## `3- OLTP Data Modeling`

In [9]:
all_cols = hospital_df1.columns.to_list()

# id_cols = ['oshpd_id2', 'oshpd_id']
general_cols = ['oshpd_id2', 'oshpd_id', 'FACILITY_NAME', 'COUNTY_NAME', 'control_type_desc', 'DBA_ADDRESS1',\
                'DBA_CITY', 'DBA_ZIP_CODE', 'MSSA_NAME', 'MSSA_DESIGNATION', 'license_type', \
                'CONTROL_TYPE_CATEGORY_DESC', 'CONGRESSIONAL_DISTRICT_NUM','ASSEMBLY_DIST', 'SENATE_DIST']

diagnosis_cols = list(filter(lambda i: i.startswith("Dx"), all_cols))
injury_cause_cols = list(filter(lambda i: i.startswith("EC"), all_cols))
medicine_cols = list(filter(lambda i: i.startswith("Medicine"), all_cols))
surgery_cols = list(filter(lambda i: i.startswith("Surgery"), all_cols))
other_procedure_cols = ['Path_Lab', 'Radiology', 'Blank_Invalid', 'other_proc', 'Eval_Management', 'Anesthesia']


age_group_cols = ["Age_Under_1", "Age_01_09", "Age_10_19", "Age_20_29", "Age_30_39", "Age_40_49", "Age_50_59",\
                  "Age_60_69", "Age_70_79", "Age_80_", "Age_Unknown"]
race_group_cols = ["White", "Black", "Hispanic","Asian_Pacific_Islander", "American_Indian_Alaska_Native",\
              "Other_Race", "Unknown_Race"]
sex_cols = ["Sex_Male", "Sex_Female", "Sex_Unknown_Invalid"]
language_cols = ["Chinese", "English", "Spanish", "Tagalog", "Vietnamese", "All_Other"]
disposition_cols = ["Routine", "Acute_Care", "SN_IC_Care", "Residential_Care_Facility", "Prison_Jail",\
    "Against_Medical_Advice", "Died", "Hospice_Care", "Childrens_or_Cancer_Center","Critical_Access_Hospital",\
    "Psychiatric_Care", "Home_Health_Service", "Inpatient_Rehab_Care", "Not_Defined_Elsewhere", "Other_Unknown"]
payer_source_cols = ["Medicare", "Medi_Cal", "Private_Coverage", "Workers_Comp", "Self_Pay", "Other_Payer",\
                     "Unknown_Payer"]


In [10]:
def get_col_data2d(col_entry, col_list, data_i):
    col_id = p_characteristics_columns.index("ch_id")
    return list(data_i[:, col_id])

### 3.1.1 patient characteristics

In [11]:
import itertools

characteristic_data = []
for i, combination in enumerate(itertools.product(sex_cols, race_group_cols, age_group_cols,\
                                     language_cols, payer_source_cols, disposition_cols)):
    characteristic_data.append((i,) + combination)
    
characteristic_data = np.array(characteristic_data)

In [12]:
p_characteristics_columns = ['ch_id', 'sex', 'race_group', 'age_group', 'language', 'payer_source', 'disposition']

# p_characteristics_df = pd.DataFrame(characteristic_combs, columns=p_characteristics_columns)
# del characteristic_combs

In [13]:
patient_ch_ids = get_col_data2d("ch_id", p_characteristics_columns, characteristic_data)

### 3.1.2 patient procedures

In [14]:
general_procedures_cols = diagnosis_cols + injury_cause_cols + surgery_cols + medicine_cols + other_procedure_cols

In [15]:
general_procedures = {}

In [16]:
def get_gp_id(txt):
    txt = txt.split("_")
    txt = list(map(lambda x_i: x_i[0], txt))
    txt = "".join(txt)
    txt = txt.upper()
    name_pt1 = ""
    for i in range(5):
        unique_filename = str(uuid.uuid4()).replace("-", "")
        id_char_i = random.randrange(0, len(unique_filename))
        name_pt1 += unique_filename[id_char_i]
    
    final_id_name = (txt + name_pt1).rjust(15).replace(" ", "X")
    return final_id_name

general_procedures['id'] = list(map(get_gp_id, general_procedures_cols))
general_procedures['name'] = general_procedures_cols

In [17]:
general_procedures['description'] = [
    "Pregnancy is the time during which one or more offspring develops (gestates) inside a woman's uterus (womb). A multiple pregnancy involves more than one offspring, such as with twins. Pregnancy usually occurs by sexual intercourse, but can also occur through assisted reproductive technology procedures. The diagnosis of pregnancy requires a multifaceted approach using 3 main diagnostic tools. These are history and physical examination, laboratory evaluation, and ultrasonography.",
    "Circulatory system diseases affect your heart and blood vessels and make it harder for blood to flow throughout your body. Some conditions have symptoms, but others are silent. Common symptoms include chest pain, edema, heart palpitations and shortness of breath. these diseases can be diagnosed through the Cardiovascular diagnostic procedures and screening tests can provide a wealth of information about the electrical activity of the heart, heartbeat rhythm, how well blood is pumping through the heart’s chambers and valves, how easily blood is flowing through the coronary arteries to the heart muscle, and whether there are tumors or abnormalities in the structure of the cardiovascular system.",
    "comprise a wide range of abnormalities of body structure or function that are present at birth and are of prenatal origin. For efficiency and practicality, the focus is commonly on major structural anomalies. These are defined as structural changes that have significant medical, social or cosmetic consequences for the affected individual, and typically require medical intervention. early detection of these congenital anomalies is vital and can be achieved through fetal ultrasonography. Studies have proven that antenatal ultrasound can successfully diagnose fetal abnormalities in many cases and therefore aid in counseling of parents and planning for early intervention.",
    "A digestive disease is any health problem that occurs in the digestive tract. Conditions may range from mild to serious. Some common problems include heartburn, cancer, irritable bowel syndrome, and lactose intolerance. Tests for digestive problems can include colonoscopy, upper GI endoscopy, capsule endoscopy, endoscopic retrograde cholangiopancreatography (ERCP), and endoscopic ultrasound.",
    "The hormones created and released by the glands in your body’s endocrine system control nearly all the processes in your body. These chemicals help coordinate your body’s functions, from metabolism to growth and development, emotions, mood, sexual function and even sleep. tests and tools are used to diagnose and evaluate endocrine disorders: CT scan, Dual-energy X-ray absorptiometry (DXA), Nuclear medicine studies, Parathyroid ultrasound, Post-thyroidectomy ultrasound, Thyroglobulin stimulation studies, Thyroid ultrasound, Ultrasound-guided fine needle aspiration.",
    "Genitourinary is a word that refers to the urinary and genital organs.The parts of the body that play a role in reproduction, getting rid of waste products in the form of urine, or both. for diagnosis, your doctor will begin by taking a detailed medical history to determine whether you experienced any events, such as an accident or fall, that may have injured your genitourinary tract. A physical examination will then be performed to check for any symptoms associated with injuries to the genitourinary tract.",
    "Infectious diseases are disorders caused by organisms — such as bacteria, viruses, fungi or parasites. Many organisms live in and on our bodies. They're normally harmless or even helpful. But under certain conditions, some organisms may cause disease. Your healthcare provider usually diagnoses infectious diseases using one or more lab tests. Your provider can look for signs of disease by: Swabbing your nose or throat. Getting blood, pee (urine), poop (stool) or spit (saliva) samples.",
    "An injury is damage to your body. It is a general term that refers to harm caused by accidents, falls, hits, weapons, and more. Drugs affect the way your body and mind function; they can change how you feel, think and behave. People take drugs for different reasons and in different ways.",
    "the performance of the locomotor system, comprising intact muscles, bones, joints and adjacent connective tissues. Laboratory tests are often helpful in making the diagnosis of a musculoskeletal disorder. For example, the erythrocyte sedimentation rate (ESR) is a test that measures the rate at which red blood cells settle to the bottom of a test tube containing blood.",
    
    "An abnormal mass of tissue that forms when cells grow and divide more than they should or do not die when they should. Neoplasms may be benign (not cancer) or malignant (cancer). Diagnosis includes both confirming the neoplastic disease and determining whether the neoplasms are benign or malignant.",
    "The sensory system is the portion of the nervous system responsible for processing input from the environment. some diagnostic tests for nervous system disorders: CT scan, Electroencephalogram (EEG), MRI, and Electrodiagnostic tests.",
    "",
    "the network of organs and tissues that help you breathe. It includes your airways, lungs and blood vessels. for diagnosis, the doctor looks at how much breath you have and how fast you breathe, identifying shallow breathing and heavy breathing. It is one of the best and most common lung function tests. oximetry: a test involving wearing a finger probe overnight to monitor your oxygen levels and heart rate.",
    "Skin diseases include all conditions that irritate, clog or damage your skin, as well as skin cancer. You may inherit a skin condition or develop a skin disease. Doctors can identify many skin disorders simply by looking at the skin. A full skin examination includes examination of the scalp, nails, and mucous membranes. Sometimes the doctor uses a hand-held lens or a dermatoscope (which includes a magnifying lens and a built-in light) to better see the areas of concern.",
    "a physical or mental feature which is regarded as indicating a condition of disease, particularly such a feature that is apparent to the patient.",
    
    "Hematology is the study of blood and blood-forming organs, including the diagnosis, treatment, and prevention of diseases of the blood, bone marrow, and immunologic, hemostatic, and vascular systems.",
    "is a significant complication of pregnancy and the postpartum period. These disorders include depression, anxiety disorders, and postpartum psychosis, which usually manifests as bipolar disorder. A medical professional determines a diagnosis by interviewing you about your history of symptoms.",
    "Neurosis and Psychosis are different types of mental disorders. Neurosis is a mild mental disorder NOT arising from organic diseases – instead, it can occur from stress, depression or anxiety. Psychosis is a major personality disorder characterised by mental and emotional disruptions. There's no test to positively diagnose psychosis. However, your GP will ask about your symptoms and possible causes. For example, they may ask you: whether you're taking any medicines. Neurosis is not currently diagnosed by healthcare professionals. Psychologists and psychiatrists now place symptoms that resemble those in neurosis within the category of depressive disorders or anxiety.",
    "",
    "the emergence of a baby or other young from the body of its mother; the start of life as a physically separate being. for diangnosis, First trimester screening is a combination of tests completed between weeks 11 and 13 of pregnancy. It is used to look for certain birth defects related to the baby's heart or chromosomal disorders, such as Down syndrome. This screen includes a maternal blood test and an ultrasound.",
    "Fall that occurs due to extrinsic environmental risk factors or hazards: spills on the floor, clutter, tubing/ cords on the floor, etc., or errors in judgment, such as not paying attention.",
    "Problems with concentration, arousal, attention, slowness. Difficulty with speech, language comprehension, word finding. Memory loss. Fatigue, sleep disturbances, other sleep disorders.",
    "an unfortunate situation that happens because of bad luck.",
    "",
    "",
    "type of suffocation induced by the submersion of the mouth and nose in a liquid. Most instances of fatal drowning occur alone or in situations where others present are either unaware of the victim's situation or unable to offer assistance.",
    "an unknown injury",
    "A side effect is usually regarded as an undesirable secondary effect which occurs in addition to the desired therapeutic effect of a drug or medication. Side effects may vary for each individual depending on the person's disease state, age, weight, gender, ethnicity and general health.",
    "injuries caused by other individuals",
    "injuries caused from the surrounding enivronment",
    "injuries caused by tranportation vehicles",
    "injuries caused by own individual",
    "injuries caused by train and other metro transporters",
    "a person unintentionally poisoning themselves and includes accidental drug overdose.",
    "an incident involving smoke, heat, and. flames causing property damage to multiple building fixtures or fittings.",
    "Common combat injuries include second and third degree burns, broken bones, shrapnel wounds, brain injuries, spinal cord injuries, nerve damage, and paralysis.",
    "also referred to as cardiac surgery or heart surgery, describes any surgical procedure that involves the heart, or the blood vessels that carry blood to and from the heart.",
    "Gastrointestinal surgery is a treatment for diseases of the parts of the body involved in digestion. This includes the esophagus (ee-sof-uh-gus), stomach, small intestine, large intestine, and rectum. It also includes the liver, gallbladder, and pancreas.",
    "used to treat a variety of conditions, including cataracts, glaucoma, detached retinas, retinal tears, diabetic retinopathy, and nearsightedness or farsightedness.",
    "Genital surgery can include hysterectomy, salpingectomy, oophorectomy, metoidioplasty, vaginectomy, scrotoplasty, and implantation of prostheses.",
    "hemic and lymphatic services involve surgical services and procedures of thespleen, lymph nodes, and lymphatic channels.", 
    "Generally, integumentary procedures include incision, biopsy, removal, paring/curettement, shaving, destruction (multiple methodology), excision, repair, adjacent tissue rearrangement, grafts, flaps, and specialized services such as burn management and Mohs micrographic surgery.",
    "Surgery performed on a pregnant woman for conditions associated with pregnancy, labor, or the puerperium.",
    "refers to a number of surgical procedures that aim to improve, manage, or treat disorders, diseases, injuries, or congenital conditions of the musculoskeletal system.",
    "refers to a number of surgical procedures for the nervous system. It is the medical specialty concerned with the diagnosis and treatment of of patients with injury to, or diseases/disorders of the brain, spinal cord and spinal column, and peripheral nerves within all parts of the body.",
    "surgery done to repair or remove lung tissue. There are many common lung surgeries, including: Biopsy of an unknown growth in or around the lung. Lobectomy, to remove one or more lobes of a lung. Lung transplant.",
    "a procedure on your urinary bladder — the organ below your kidneys but above your urethra that houses your pee. The type of bladder surgery depends on what's being treated. Conditions that may need surgery include: Bladder cancer. Cystocele (fallen bladder/bladder prolapse).",
    "using Cochlear implants can improve hearing in people with severe hearing loss who are no longer helped by using hearing aids. Cochlear implants can improve their communication and quality of life. Cochlear implants may be placed in one ear",
    "Endocrine surgery treats endocrine disorders by removing part or all of a diseased gland. You may need surgery for thyroid cancer, adrenal gland tumors or an overactive gland.",
    "The branch of surgery that covers the main areas of surgical treatment. General surgeons treat diseases of the abdomen, breast, head and neck, blood vessels, and digestive tract. They also manage care of patients who have been injured or who have deformities or other conditions that need surgery.",
    "includes: Anticoagulants, Antiplatelet medications, Angiotensin-converting enzyme (ACE) inhibitors, Angiotensin receptor blockers (ARBs)Beta blockers, Calcium channel blockers, and Digoxin.Diuretics.",
    "when fluids, medication, or blood is given through a catheter directly into a vein. A medical professional does this by using a needle to guide the catheter into a vein. The needle is then removed leaving the plastic catheter behind in the vein.",
    "Chemotherapy is most often given as an infusion into a vein (intravenously). The drugs can be given by inserting a tube with a needle into a vein in your arm or into a device in a vein in your chest. Chemotherapy pills. Some chemotherapy drugs can be taken in pill or capsule form.",
    "a medical specialty that helps people regain body functions they lost due to medical conditions or injury",
    "a branch of medicine that diagnoses and treats issues of the lung, like asthma, chronic obstructive pulmonary disease (COPD), pneumonia, tuberculosis, lung cancer, lung fibrosis, pulmonary hypertension, etc.",
    "Neuromuscular medicine is a subspecialty of neurology and physiatry that focuses the diagnosis and management of neuromuscular diseases. The field encompasses issues related to both diagnosis and management of these conditions, including rehabilitation interventions to optimize the quality of life of individuals with these conditions.",
    "specialized field of medicine that focuses on the health of the eye. It includes the anatomy, physiology and diseases that may affect the eye.",
    "Otorhinolaryngology is a surgical subspeciality within medicine that deals with the surgical and medical management of conditions of the head and neck.",
    "Gastroenterology (from the Greek gastḗr- “belly”, -énteron “intestine”, and -logía \"study of\") is the branch of medicine focused on the digestive system and its disorders.[1] The digestive system consists of the gastrointestinal tract, sometimes referred to as the GI tract, which includes the esophagus, stomach, small intestine and large intestine as well as the accessory organs of digestion which includes the pancreas, gallbladder, and liver.",
    "includes: Erythropoietin, Iron, Active Vitamin D, Phosphorus binders, B-complex Vitamin & folic acid, Topical creams & antihistamines, Vitamin E.",
    "",
    "rehydrate by consuming fluids that contain electrolytes, such as sports drinks or oral rehydration solutions. There are also foods available that have a high water content, such as fruits and vegetables. These will also help with rehydration.",
    "medicines to help you relax (a sedative) and to block pain (an anesthetic) during a medical or dental procedure.",
    "Psychiatric medication includes all drugs which can be prescribed to: treat different types of mental health problem. reduce the symptoms of mental health problems.",
    "a Laboratory where tests are carried out on clinical specimens to obtain information about the health of a PATIENT to aid in diagnosis, treatment, and prevention of disease.",
    "uses imaging technology to diagnose and treat disease.",
    "",
    "",
    "are cognitive (as opposed to procedural) services in which a physician or other qualified healthcare professional diagnoses and treats illness or injury.",
    "the use of medicines to prevent pain during surgery and other procedures."
]

In [18]:
general_procedures['type'] = [
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "diagnosis",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "injury",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "surgery",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "medicine",
    "other",
    "other",
    "other",
    "other",
    "other",
    "other",
]

In [19]:
general_procedures['price'] = \
[
    "15",
    "1400",
    "410",
    "2750",
    "235",
    "605",
    "140",
    "80",
    "250",
    "350",
    "600",
    "100",
    "90",
    "150",
    "30",
    "130",
    "300",
    "150",
    "0",
    "3000",
    "55",
    "3000",
    "1800",
    "0",
    "1000",
    "200",
    "0",
    "450",
    "8000",
    "1100",
    "16000",
    "2500",
    "21000",
    "925",
    "2000",
    "40000",
    "25000",
    "23000",
    "4000",
    "14000",
    "40000",
    "8000",
    "13000",
    "7800",
    "60000",
    "35000",
    "22500",
    "35000",
    "12500",
    "2500",
    "115",
    "200",
    "500",
    "230",
    "4735",
    "750",
    "2000",
    "460",
    "380",
    "600",
    "300",
    "35",
    "700",
    "600",
    "0",
    "0",
    "0",
    "0",
    "0",
    "0"
]

In [20]:
p_procedures_cols = list(general_procedures.keys())
p_procedures_data = np.array(list(general_procedures.values())).T

# get_col_data2d('name', p_procedures_cols, p_procedures_data)

patient_procedures_df = pd.DataFrame.from_dict(general_procedures)

In [21]:
p_procedures_cols

['id', 'name', 'description', 'type', 'price']

In [22]:
patient_procedures_ids = get_col_data2d('name', p_procedures_cols, p_procedures_data)
patient_procedures_ids[:5]

['XXXXXXXDAP88462',
 'XXXXXXXXDCf1728',
 'XXXXXXXDCA5da58',
 'XXXXXXXXDDab650',
 'XXXXXXXDEM30b02']

### 3.1.3 Locations

In [23]:
location_cols = ['DBA_CITY', 'COUNTY_NAME', 'DBA_ADDRESS1', 'DBA_ZIP_CODE', 'CONGRESSIONAL_DISTRICT_NUM', 'SENATE_DIST', 'ASSEMBLY_DIST']

In [24]:
locations = hospital_df1[location_cols]
locations = locations[~locations.duplicated()]

In [25]:
locations['location_id'] = locations.index

In [26]:
locations = locations[['location_id'] + location_cols]

In [27]:
locations

Unnamed: 0,location_id,DBA_CITY,COUNTY_NAME,DBA_ADDRESS1,DBA_ZIP_CODE,CONGRESSIONAL_DISTRICT_NUM,SENATE_DIST,ASSEMBLY_DIST
0,0,MERCED,MERCED,386 W OLIVE AVE,95348,16,12,21
1,1,HANFORD,KINGS,115 MALL DRIVE,93230,21,14,32
2,2,REEDLEY,FRESNO,372 WEST CYPRESS AVENUE,93654,22,14,31
3,3,SAN JUAN CAPISTRANO,ORANGE,30260 RANCHO VIEJO RD,92675,49,36,73
4,4,ANAHEIM,ORANGE,1111 WEST LA PALMA AVENUE,92801,46,29,69
...,...,...,...,...,...,...,...,...
383,383,SANTA ANA,ORANGE,1001 NORTH TUSTIN AVENUE,92705,46,34,69
384,384,ANAHEIM,ORANGE,1025 SOUTH ANAHEIM BLVD.,92805,46,34,69
385,385,LOS ANGELES,LOS ANGELES,1720 CESAR E. CHAVEZ AVENUE,90033,34,24,53
386,386,WHITTIER,LOS ANGELES,9080 COLIMA ROAD,90605,38,32,57


In [28]:
locations_data = locations.to_numpy()
locations_cols = locations.columns.to_list()
locations_data

array([[0, 'MERCED', 'MERCED', ..., 16, 12, 21],
       [1, 'HANFORD', 'KINGS', ..., 21, 14, 32],
       [2, 'REEDLEY', 'FRESNO', ..., 22, 14, 31],
       ...,
       [385, 'LOS ANGELES', 'LOS ANGELES', ..., 34, 24, 53],
       [386, 'WHITTIER', 'LOS ANGELES', ..., 38, 32, 57],
       [387, 'WOODLAND', 'YOLO', ..., 3, 3, 4]], dtype=object)

### 3.1.4 Listed Hospitals

In [29]:
cols = ['oshpd_id2', 'oshpd_id', 'FACILITY_NAME', 'control_type_desc', 'CONTROL_TYPE_CATEGORY_DESC',\
 'MSSA_NAME', 'MSSA_DESIGNATION', 'license_type']

In [30]:
hospital_locations = hospital_df1[location_cols]

h_indices = []
for hid, hdata in hospital_locations.iterrows():
    h_city, h_county, h_address, h_zipcode = hdata['DBA_CITY'], hdata['COUNTY_NAME'], hdata['DBA_ADDRESS1'], \
        hdata['DBA_ZIP_CODE']
    h_cdn, h_ad, h_sd = hdata['CONGRESSIONAL_DISTRICT_NUM'], hdata['ASSEMBLY_DIST'], hdata['SENATE_DIST']
    
    location_entry = locations[(locations['DBA_CITY'] == h_city) & \
                    (locations['COUNTY_NAME'] == h_county) & \
                    (locations['DBA_ADDRESS1'] == h_address) & \
                    (locations['DBA_ZIP_CODE'] == h_zipcode) & \
                    (locations['CONGRESSIONAL_DISTRICT_NUM'] == h_cdn) & \
                    (locations['ASSEMBLY_DIST'] == h_ad) & \
                    (locations['SENATE_DIST'] == h_sd)]
    location_entry_index = location_entry.index[0]
    h_indices.append(location_entry_index)


In [31]:
hopitals_listed = hospital_df1[cols]
hopitals_listed['location_id'] = h_indices

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hopitals_listed['location_id'] = h_indices


In [32]:
hospital_locations_cols = ['oshpd_id2', 'location_id']
hospital_locations_data =  np.array(list(hopitals_listed.groupby(['oshpd_id2', 'location_id']).indices.keys()))
hospital_locations_data

array([[106010735,         5],
       [106010739,         8],
       [106010776,        34],
       [106010846,       112],
       [106010856,       129],
       [106010858,       127],
       [106010937,         9],
       [106010967,       331],
       [106010987,       379],
       [106013619,       280],
       [106014050,       374],
       [106014132,       125],
       [106014233,        71],
       [106014326,       130],
       [106014337,       142],
       [106034002,       341],
       [106040802,        24],
       [106040875,        82],
       [106040937,       228],
       [106040962,        79],
       [106050932,       187],
       [106060870,        50],
       [106070904,        66],
       [106070924,        57],
       [106070934,       346],
       [106070988,       122],
       [106070990,       149],
       [106071018,       121],
       [106074017,       282],
       [106074097,       154],
       [106084001,       344],
       [106090793,        19],
       [

In [33]:
hospitals_listed_data = hopitals_listed.drop(columns=['location_id']).to_numpy()
hospitals_listed_cols = hopitals_listed.drop(columns=['location_id']).columns.to_list()
hospitals_listed_data

array([[306244032, 244032, 'ADVANCED ENDOSCOPY CENTER', ...,
        'Merced Central and North/Merced Southeast', 'Urban', 'Clinic'],
       [106164029, 164029, 'ADVENTIST MEDICAL CENTER', ...,
        'Armona/Hanford/Lemoore', 'Urban', 'Hospital'],
       [106100797, 100797, 'ADVENTIST MEDICAL CENTER - REEDLEY', ...,
        'Orange Cove/Parlier/Reedley/Squaw Valley/Tivy Valley/Wonder Valley',
        'Rural', 'Hospital'],
       ...,
       [106190878, 190878, 'WHITE MEMORIAL MEDICAL CENTER', ...,
        'Boyle Heights Northwest/Chinatown/Downtown Northwest/Little Tokyo/Westlake',
        'Urban', 'Hospital'],
       [106190883, 190883, 'WHITTIER HOSPITAL MEDICAL CENTER', ...,
        'La Habra Heights/Whittier', 'Urban', 'Hospital'],
       [106571086, 571086, 'WOODLAND MEMORIAL HOSPITAL', ..., 'Woodland',
        'Urban', 'Hospital']], dtype=object)

In [34]:
hospitals_listed_data

array([[306244032, 244032, 'ADVANCED ENDOSCOPY CENTER', ...,
        'Merced Central and North/Merced Southeast', 'Urban', 'Clinic'],
       [106164029, 164029, 'ADVENTIST MEDICAL CENTER', ...,
        'Armona/Hanford/Lemoore', 'Urban', 'Hospital'],
       [106100797, 100797, 'ADVENTIST MEDICAL CENTER - REEDLEY', ...,
        'Orange Cove/Parlier/Reedley/Squaw Valley/Tivy Valley/Wonder Valley',
        'Rural', 'Hospital'],
       ...,
       [106190878, 190878, 'WHITE MEMORIAL MEDICAL CENTER', ...,
        'Boyle Heights Northwest/Chinatown/Downtown Northwest/Little Tokyo/Westlake',
        'Urban', 'Hospital'],
       [106190883, 190883, 'WHITTIER HOSPITAL MEDICAL CENTER', ...,
        'La Habra Heights/Whittier', 'Urban', 'Hospital'],
       [106571086, 571086, 'WOODLAND MEMORIAL HOSPITAL', ..., 'Woodland',
        'Urban', 'Hospital']], dtype=object)

In [35]:
hospitals_listed_cols

['oshpd_id2',
 'oshpd_id',
 'FACILITY_NAME',
 'control_type_desc',
 'CONTROL_TYPE_CATEGORY_DESC',
 'MSSA_NAME',
 'MSSA_DESIGNATION',
 'license_type']

### 3.1.5 registering patients 

making 20000 entries

In [36]:
fake = Faker()
num_patients = 200000

# USE
hospital_ids = hospital_df1['oshpd_id2'].unique()

In [37]:
# Eg. generating patient info
patient_name,  patient_ssn, patient_dob, patient_ccn, patient_address, patient_state, patient_city = \
    fake.name(), fake.ssn(), fake.date_of_birth(), fake.credit_card_number(), fake.address(), fake.state(), \
        fake.city()
patient_id =  f"{''.join(list(map(lambda x: x[0], patient_name.split())))}-"+ str(fake.random_number(digits=20))

print(patient_id, patient_name, patient_address, patient_ssn, patient_dob, patient_ccn, patient_state, patient_city)
print()

# Eg. generating hospital id
patient_hid = hospital_ids[random.randrange(0, len(hospital_ids))]
print(patient_hid)

SB-55294782609245978465 Sherry Boyle 51639 Williams Club
Hooperside, DC 19789 813-49-5782 1933-11-23 36596444606227 Connecticut Michaelside

106190431


In [38]:
patient_cols = ["patient_id", "patient_name", "patient_chid", "patient_hid", "social_security_num",
                "date_of_birth", "credit_card_number", "address", "state", "city"]

In [39]:
patients = []

for patient_i in range(num_patients):
    patient_name,  patient_ssn, patient_dob, patient_ccn, patient_address, patient_state, patient_city = \
    fake.name(), fake.ssn(), fake.date_of_birth(), fake.credit_card_number(), fake.address(), fake.state(), \
        fake.city()
    patient_id =  f"{''.join(list(map(lambda x: x[0], patient_name.split())))}-" + \
                                                str(fake.random_number(digits=20))
    patient_hid = str(hospital_ids[random.randrange(0, len(hospital_ids))])
    patient_chid = str(patient_ch_ids[random.randrange(0, len(patient_ch_ids))])
    
    patient_i_data = [patient_id, patient_name, patient_chid, patient_hid, patient_ssn, patient_dob, 
                       patient_ccn, patient_address, patient_state, patient_city]
    patients.append(patient_i_data)

In [40]:
patients = np.array(patients)
np.shape(patients)

patients_ids = patients[:, 0]
print("patient ids (length vs unique): {} vs {}".format(patients_ids.shape[0], np.unique(patients_ids).shape[0]))

patient ids (length vs unique): 200000 vs 200000


In [41]:
# pd.DataFrame(patients, columns=patient_cols)

In [42]:
p_data = patients
p_cols = patient_cols
print(p_cols)

['patient_id', 'patient_name', 'patient_chid', 'patient_hid', 'social_security_num', 'date_of_birth', 'credit_card_number', 'address', 'state', 'city']


In [43]:
p_data[0]

array(['RW-26573665020013247958', 'Ryan Weaver', '96731', '106190756',
       '369-90-5108', datetime.date(1925, 9, 27), '6011756163113617',
       '08916 Victoria Squares\nEast Timothyport, TN 01387',
       'Massachusetts', 'West Ana'], dtype=object)

### 3.1.6 employees

In [44]:
sections = ["A", "B", "C"]

In [45]:
role = ["Physician", "Nurse", "Accountant", "Other"]
role_frac = [0.65, 0.15, 0.03, 0.17]

In [46]:
# salary feature generation
def generate_salary(role):
    if role == "Physician":
        return round(160000 + random.random() * 200000, 2)
    elif role == "Nurse":
        return round(75000 + random.random() * 50000, 2)
    elif role == "Accountant":
        return round(85000 + random.random() * 35000, 2)
    elif role == "Other":
        return round(50000 + random.random() * 100000, 2)
    else:
        return -1



In [47]:
datetime.date.today()

datetime.date(2023, 3, 21)

In [48]:
emp_dob_end_date =datetime.date.today() - timedelta(days=10000)
emp_dob_start_date = emp_dob_end_date - timedelta(days=10000)
dob_date = emp_dob_start_date + (emp_dob_end_date - emp_dob_start_date) *  random.random()
dob_date

datetime.date(1994, 11, 13)

In [49]:
def get_emp_date():
    emp_end_date = datetime.datetime.now() - timedelta(days=1000)
    emp_start_date = emp_end_date - timedelta(days=3000)
    employment_date = emp_start_date + (emp_end_date - emp_start_date) * random.random()
    return employment_date
get_emp_date()

datetime.datetime(2018, 9, 19, 20, 26, 56, 328264)

In [50]:
job_role_specialties = {"Physician": ["Acute Care Surgeon", "Addiction specialist", "Anesthesiologist", "Audiologist", 
                   "Cardiologist", "Cardiovascular Surgeon", "Critical Care Specialist", "Dermatologist", "Dermatopathologist",
                   "Diagnosing Physician", "Electrodiagnostic Physician", "Endocrinologist", "Endocrinologist Surgeon",
                   "Gastroenterologist", "Gastrointestinal Surgeon", "Genetic Specialist", "Gynecologic Oncologist",
                   "Gynecologist", "Hand Surgeon", "Hematologist", "Infectious Disease Specialist", "Internist",
                   "Interventional Cardiologist", "Lipidologist", "Microbiologist", "Neonatologist", "Nephrologist",
                   "Neurologist", "Neuromuscular Disease Specialist", "Neurosurgeon", "Obstetrician", "Oculoplastic Surgeon",
                   "Oncologist", "Ophthalmologist", "Orthopedic Surgeon", "Otolaryngologist", "Palliative Care Specialist",
                   "Pathologist", "Pediatric Cardiologist", "Pediatric Gastroenterologist", "Pediatrician", "Physiatrist",
                   "Physical medicine consultant", "Plastic Surgeon", "Psychiatrist", "Psychologist", "Pulmonologist",
                   "Radiation Oncologist", "Radiologist", "Reconstructive Surgeon", "Reproductive Endocrinologist",
                   "Rheumatologist", "Surgeon", "Surgical Oncologist", "Technologist", "Thoracic Surgeon",
                   "Transplant Pulmonologist", "Urologist", "Vascular surgeon", "radiologist", "rehabilitation consultant"],
                 "Nurse": ["Nurse"],
                 "Other": ["Clinical Informaticist", "Healthcare Professional", "Medicine Specialist",
                           "Sleep Medicine Specialist"],
                 "Accountant": ["Accountant"]
}

In [51]:
procedure_doctors = {
    "Dx_All_Pregnancies": ["Reproductive Endocrinologist",
                           "Gynecologist",
                           "Gynecologic Oncologist",
                           "Obstetrician",
                           "Nurse"],
    "Dx_Circulatory": ["Vascular surgeon",
                       "Hematologist",
                       "Interventional Cardiologist",
                       "Radiologist",
                       "Dermatologist",
                       "Nurse"],
    "Dx_Congenital_anomalies": ["Genetic Specialist",
                                "Pediatric Cardiologist",
                                "Interventional Cardiologist",
                                "Cardiologist",
                                "Pediatric Cardiologist",
                                "Nurse"],
    "Dx_Digestive": ["Gastroenterologist",
                     "Pediatric Gastroenterologist",
                     "Internist",
                     "Nurse"],
    "Dx_Endocrine_Metabolism": ["Endocrinologist",
                                "Internist",
                                "Lipidologist",
                                "Nurse"],
    "Dx_Genitourinary": ["Urologist",
                         "Internist",
                         "Nurse"],
    "Dx_Infections": ["Internist",
                      "Infectious Disease Specialist",
                      "Microbiologist",
                      "Nurse"],
    "Dx_Injuries_Drugs_Complications": ["Psychiatrist",
                                        "Psychologist",
                                        "Addiction specialist", 
                                        "Palliative Care Specialist",
                                        "Hand Surgeon",
                                        "Orthopedic Surgeon",
                                        "Nurse"],
    "Dx_Musculoskeletal": ["Rheumatologist",
                           "rehabilitation consultant",
                           "Endocrinologist",
                           "Radiologist",
                           "Physical medicine consultant",
                           "Nurse"],
    "Dx_Neoplasms" : ["Radiation Oncologist",
                      "Physiatrist",
                      "Neurosurgeon",
                      "Anesthesiologist",
                      "Surgical Oncologist",
                      "Oncologist",
                      "Nurse"],
    "Dx_Nervous_Sensory_Systems": ["Neurologist",
                                   "Neuromuscular Disease Specialist",
                                   "Electrodiagnostic Physician",
                                   "Ophthalmologist",
                                   "Nurse"],
    "Dx_Other_Reasons" : ["Internist",
                          "Medicine Specialist",
                          "Clinical Informaticist",
                          "Nurse"],
    "Dx_Respiratory" : ["Pulmonologist",
                        "Critical Care Specialist",
                        "Transplant Pulmonologist",
                        "Sleep Medicine Specialist",
                        "Nurse"],
    "Dx_Skin_disorders" : ["Dermatologist",
                           "Pediatrician",
                           "Dermatopathologist",
                           "Nurse"],
    "Dx_Symptoms" : ["radiologist",
                     "Medicine Specialist",
                     "Clinical Informaticist",
                     "Nurse"],
    "Dx_Blood_Bloodforming_organs" : ["Pediatrician",
                                      "Hematologist",
                                      "Internist",
                                      "Nephrologist",
                                      "Cardiologist",
                                      "Nurse"],
    "Dx_Perinatal_disorders" : ["Neonatologist",
                                "Pediatrician",
                                "Gynecologist",
                                "Nurse"],
    "Dx_Psychoses_Neuroses" : ["Psychologist",
                               "Psychiatrist"],
    "Dx_Invalid" : [], # might be any doctor <- section
    "Dx_Births" : ["Neonatologist", "Pediatrician", "Nurse"],
    "EC_Accidental_Falls": ["Palliative Care Specialist",
                            "Dermatologist",
                            "Internist",
                            "Nurse"],
    "EC_Late_Effects_of_Injury": ["Clinical Informaticist",
                                  "Hand Surgeon", 
                                  "Dermatologist",
                                  "Nurse"],
    "EC_Misadventures_Complication": ["Medicine Specialist", 
                                      "Clinical Informaticist",
                                      "Nurse"],
    "EC_None": ["Medicine Specialist", 
                "Clinical Informaticist",
                "Nurse"],
    "EC_Other_Accidents": ["Medicine Specialist", 
                           "Clinical Informaticist",
                           "Internist",
                           "Palliative Care Specialist", 
                           "Hand Surgeon",
                           "Dermatologist",
                           "Nurse"],
    "EC_Submersion_Suffocation_Foreig": ["Medicine Specialist", 
                                         "Clinical Informaticist",
                                         "Internist",
                                         "Pulmonologist",
                                         "Nurse"],
    "EC_Undetermined": ["Medicine Specialist", 
                        "Clinical Informaticist",
                        "Internist",
                        "Palliative Care Specialist", 
                        "Hand Surgeon",
                        "Dermatologist",
                        "Nurse"],
    "EC_Adverse_Effects_of_Drugs": ["Medicine Specialist", 
                                    "Clinical Informaticist",
                                    "Psychiatrist",
                                    "Psychologist",
                                    "Addiction specialist",
                                    "Nurse"],
    
    "EC_Inflicted_by_Others": ["Medicine Specialist", 
                               "Clinical Informaticist",
                               "Dermatologist",
                               "Internist",
                               "Nurse"],
    "EC_Natural_Environment": ["Medicine Specialist", 
                               "Clinical Informaticist",
                               "Dermatologist",
                               "Internist",
                               "Nurse"],
    "EC_Other_Vehicle_Transport": ["Medicine Specialist", 
                                   "Clinical Informaticist",
                                   "Dermatologist",
                                   "Internist",
                                   "Nurse"],
    "EC_Self_Inflicted": ["Medicine Specialist", 
                          "Clinical Informaticist",
                          "Dermatologist",
                          "Psychiatrist",
                          "Psychologist",
                          "Nurse"],
    "EC_Rail_Motor_Vehicle": ["Medicine Specialist", 
                              "Clinical Informaticist",
                              "Dermatologist",
                              "Internist",
                              "Nurse"],
    "EC_Accidental_Poisoning": ["Medicine Specialist", 
                                "Clinical Informaticist",
                                "Internist",
                                "Infectious Disease Specialist",
                                "Nurse"],
    "EC_Fire_Accidents": ["Medicine Specialist", 
                          "Clinical Informaticist",
                          "Dermatologist",
                          "Internist",
                          "Acute Care Surgeon",
                          "Nurse"],
    "EC_War": ["Medicine Specialist", 
               "Clinical Informaticist",
               "Dermatologist",
               "Internist",
               "Acute Care Surgeon",
               "Nurse"],
    "Surgery_Cardivascular": ["Cardiovascular Surgeon",
                              "Thoracic Surgeon"],
    "Surgery_Digestive": ["Gastrointestinal Surgeon",
                          ],
    "Surgery_Eye_Ocular": ["Oculoplastic Surgeon"],
    "Surgery_Genital_Reproductive": ["Gynecologist",
                                     "Urologist"],
    "Surgery_Hemic_Lymphatic": ["Plastic Surgeon", 
                                "Reconstructive Surgeon"],
      "Surgery_Integumentary": ["Dermatologist", "Hand Surgeon"],
          "Surgery_Maternity": ["Obstetrician"],
    "Surgery_Musculoskeletal": ["Orthopedic Surgeon"],
            "Surgery_Nervous": ["Neurosurgeon"],
        "Surgery_Respiratory": ["Thoracic Surgeon"],
            "Surgery_Urinary": ["Urologist"],
           "Surgery_Auditory": ["Audiologist"],
          "Surgery_Endocrine": ["Endocrinologist Surgeon"],
      "Surgery_General_Other": ["Surgeon"],
             "Medicine_Cardiovascular": ["Cardiologist", "Nurse"], 
         "Medicine_Injection_Infusion": ["Medicine Specialist", "Nurse"], 
                "Medicine_Chemo_Admin": ["Oncologist", "Nurse"], 
             "Medicine_Physical_Rehab": ["Medicine Specialist", "Psychologist", "Psychiatrist", "Nurse"], 
                  "Medicine_Pulmonary": ["Pulmonologist", "Nurse"], 
    "Medicine_Neurology_Neuromuscular": ["Neurologist", "Nurse"], 
              "Medicine_Ophthalmology": ["Medicine Specialist", "Ophthalmologist", "Nurse"], 
       "Medicine_Otorhinolaryngologic": ["Otolaryngologist", "Nurse"], 
           "Medicine_Gastroenterology": ["Gastroenterologist", "Nurse"], 
                   "Medicine_Dialysis": ["Nephrologist", "Nurse"], 
                 "Medicine_Other": ["Nurse"],
             "Medicine_Hydration": ["Medicine Specialist", "Nurse"],
    "Medicine_Sedation_Conscious": ["Anesthesiologist"],
                 "Medicine_Psych": ["Psychologist", "Psychiatrist", "Medicine Specialist"],
                       "Path_Lab": ["Pathologist", "Healthcare Professional"],
                      "Radiology": ["Radiologist", "Technologist", "Nurse"],
                  "Blank_Invalid": ["Other Staff"],
                     "other_proc": ["Other Staff"],
                "Eval_Management": ["Diagnosing Physician", "Healthcare Professional"],
                     "Anesthesia": ["Anesthesiologist"]
}

In [52]:
procedure_sections = {
    "Dx_All_Pregnancies": ["Gynecology"],
    "Dx_Circulatory": ["Surgery", "Cardiology"],
    "Dx_Congenital_anomalies": ["Cardiology", "Surgery"],
    "Dx_Digestive": ["Gastroenterology", "Surgery", "Infections"],
    "Dx_Endocrine_Metabolism": ["Endocrinology"],
    "Dx_Genitourinary": ["Urology"],
    "Dx_Infections": ["Infections"],
    "Dx_Injuries_Drugs_Complications": ["Psychiatry and Psychology", "Surgery"],
    "Dx_Musculoskeletal": ["Orthopedics", "Surgery"],
    "Dx_Neoplasms" : ["Neurology", "Oncology", "Radiology"],
    "Dx_Nervous_Sensory_Systems": ["Neurology", "Surgery"],
    "Dx_Other_Reasons": ["Internal Medicine", "Cardiology"],
    "Dx_Respiratory" : ["Pulmonology", "Surgery"],
    "Dx_Skin_disorders" : ["Dermatology"],
    "Dx_Symptoms" : ["Internal Medicine", "Radiology"],
    "Dx_Blood_Bloodforming_organs" : ["Hematology", "Nephrology", "Cardiology"],
    "Dx_Perinatal_disorders" : ["Gynecology", "Neonatology"],
    "Dx_Psychoses_Neuroses" : ["Psychiatry and Psychology"],
    "Dx_Invalid" : ["Nurse"], 
    "Dx_Births" : ["Neonatology"],
    "EC_Accidental_Falls": ["Emergency", "Dermatology"],
    "EC_Late_Effects_of_Injury": ["Emergency", "Surgery", "Dermatology"],
    "EC_Misadventures_Complication": ["Emergency"],
    "EC_None": ["Emergency"],
    "EC_Other_Accidents": ["Emergency", "Surgery", "Dermatology"],
    "EC_Submersion_Suffocation_Foreig": ["Emergency", "Surgery", "Dermatology", "Pulmonology"],
    "EC_Undetermined": ["Emergency", "Surgery", "Dermatology"],
    "EC_Adverse_Effects_of_Drugs": ["Emergency", "Psychiatry and Psychology", "Surgery"],
    "EC_Inflicted_by_Others": ["Emergency", "Surgery", "Dermatology"],
    "EC_Natural_Environment": ["Emergency", "Surgery", "Dermatology"],
    "EC_Other_Vehicle_Transport": ["Emergency", "Surgery", "Dermatology"],
    "EC_Self_Inflicted": ["Emergency", "Surgery", "Dermatology", "Psychiatry and Psychology"],
    "EC_Rail_Motor_Vehicle": ["Emergency", "Surgery", "Dermatology"],
    "EC_Accidental_Poisoning": ["Emergency", "Surgery", "Dermatology", "Infections"],
    "EC_Fire_Accidents": ["Emergency", "Surgery", "Dermatology"],
    "EC_War": ["Emergency", "Surgery", "Dermatology"],
    "Surgery_Cardivascular": ["Surgery"],
    "Surgery_Digestive": ["Surgery"],
    "Surgery_Eye_Ocular": ["Surgery"],
    "Surgery_Genital_Reproductive": ["Surgery"],
     "Surgery_Hemic_Lymphatic": ["Surgery"],
      "Surgery_Integumentary": ["Surgery"],
          "Surgery_Maternity": ["Surgery"],
    "Surgery_Musculoskeletal": ["Surgery"],
            "Surgery_Nervous": ["Surgery"],
        "Surgery_Respiratory": ["Surgery"],
            "Surgery_Urinary": ["Surgery"],
           "Surgery_Auditory": ["Surgery"],
          "Surgery_Endocrine": ["Surgery"],
      "Surgery_General_Other": ["Surgery"],
         "Medicine_Cardiovascular": ["Pharmacy"], 
         "Medicine_Injection_Infusion": ["Pharmacy"], 
                "Medicine_Chemo_Admin": ["Pharmacy"], 
             "Medicine_Physical_Rehab": ["Pharmacy"], 
                  "Medicine_Pulmonary": ["Pharmacy"], 
    "Medicine_Neurology_Neuromuscular": ["Pharmacy"], 
              "Medicine_Ophthalmology": ["Pharmacy"], 
       "Medicine_Otorhinolaryngologic": ["Pharmacy"], 
           "Medicine_Gastroenterology": ["Pharmacy"], 
                   "Medicine_Dialysis": ["Pharmacy"], 
                 "Medicine_Other": ["Pharmacy"],
             "Medicine_Hydration": ["Pharmacy"],
    "Medicine_Sedation_Conscious": ["Pharmacy"],
                 "Medicine_Psych": ["Pharmacy"],
                       "Path_Lab": ["Pathology"],
                      "Radiology": ["Radiology"],
                  "Blank_Invalid": ["Other"],
                     "other_proc": ["Other"],
                "Eval_Management": ["Management"],
                     "Anesthesia": ["Anesthesia"]
    
}

In [53]:
KEY_section_diseases = {}
for disease_i, section_i_l in procedure_sections.items():
    for section_i_i in section_i_l:
        if section_i_i in KEY_section_diseases.keys():
            KEY_section_diseases[section_i_i].append(disease_i)
        else:
            KEY_section_diseases[section_i_i] = []
            KEY_section_diseases[section_i_i].append(disease_i)
    

In [54]:
job_section_l = {}
for defining_key in procedure_doctors.keys():
    defining_doctor_l = procedure_doctors[defining_key]
    for defining_doctor_i in defining_doctor_l:
        if defining_doctor_i in job_section_l.keys():
            job_section_l[defining_doctor_i].extend(procedure_sections[defining_key])
        else:
            job_section_l[defining_doctor_i] = []
            job_section_l[defining_doctor_i].extend(procedure_sections[defining_key])
            
job_section_l["Accountant"] = ["Management", "Other"]

In [55]:
flat_list = [item for sublist in list(procedure_doctors.values()) for item in sublist]
unique_job = np.unique(np.array(flat_list))
unique_job[:3]

array(['Acute Care Surgeon', 'Addiction specialist', 'Anesthesiologist'],
      dtype='<U32')

In [56]:
flat_list2 = [item for sublist in list(procedure_sections.values()) for item in sublist]
roles_distribution = np.array(flat_list2)
roles_distribution

array(['Gynecology', 'Surgery', 'Cardiology', 'Cardiology', 'Surgery',
       'Gastroenterology', 'Surgery', 'Infections', 'Endocrinology',
       'Urology', 'Infections', 'Psychiatry and Psychology', 'Surgery',
       'Orthopedics', 'Surgery', 'Neurology', 'Oncology', 'Radiology',
       'Neurology', 'Surgery', 'Internal Medicine', 'Cardiology',
       'Pulmonology', 'Surgery', 'Dermatology', 'Internal Medicine',
       'Radiology', 'Hematology', 'Nephrology', 'Cardiology',
       'Gynecology', 'Neonatology', 'Psychiatry and Psychology', 'Nurse',
       'Neonatology', 'Emergency', 'Dermatology', 'Emergency', 'Surgery',
       'Dermatology', 'Emergency', 'Emergency', 'Emergency', 'Surgery',
       'Dermatology', 'Emergency', 'Surgery', 'Dermatology',
       'Pulmonology', 'Emergency', 'Surgery', 'Dermatology', 'Emergency',
       'Psychiatry and Psychology', 'Surgery', 'Emergency', 'Surgery',
       'Dermatology', 'Emergency', 'Surgery', 'Dermatology', 'Emergency',
       'Surgery', 'D

In [57]:
iter_time = 100000
role_times = list(map(lambda x: int(x * iter_time), role_frac))
employee_data = []
employee_cols = ["emp_id", "emp_name", "emp_dob", "emp_date_entry", "emp_hid", "building", "emp_salary", 
                 "section", "specialty"]

for role_id, role_count in enumerate(role_times):
    emp_role = role[role_id]
    print("role: {}".format(emp_role))
    print("iter {} times".format(role_count))
    for i in range(role_count):
        emp_name, emp_date = fake.name(), get_emp_date()
        emp_dob_end_date =datetime.date.today() - timedelta(days=10000)
        emp_dob_start_date = emp_dob_end_date - timedelta(days=10000)
        emp_dob = emp_dob_start_date + (emp_dob_end_date - emp_dob_start_date) *  random.random()
        
        emp_hid = str(hospital_ids[random.randrange(0, len(hospital_ids))])
        building = str(sections[random.randrange(0, len(sections))])
        emp_salary = generate_salary(emp_role)
        
        job_role_specialty_pool = job_role_specialties[emp_role]
        job_role_specialty = str(job_role_specialty_pool[random.randrange(0, len(job_role_specialty_pool))])
        
        
        job_section_pool = job_section_l[job_role_specialty]
        job_section = str(job_section_pool[random.randrange(0, len(job_section_pool))])
        
        emp_id_pt1 = "".join(list(map(lambda x2: x2[0].upper(), job_section.split())))   
        emp_id_pt2 = "".join(list(map(lambda x: x[-1].upper(), emp_name.split())))   
        emp_id = emp_id_pt1 + "-" + emp_id_pt2 + "-" + str(fake.random_number(digits=15))
        employee_data.append((emp_id, emp_name, emp_dob, emp_date, emp_hid, building, emp_salary,
                              job_section, job_role_specialty))

role: Physician
iter 65000 times
role: Nurse
iter 15000 times
role: Accountant
iter 3000 times
role: Other
iter 17000 times


In [58]:
employee_df = pd.DataFrame(employee_data, columns=employee_cols)

In [59]:
employee_data = np.array(employee_data)

#### `Links: `
- mayo clinic
- statistical study reports from https://www.usnews.com/
- https://www.cdc.gov/ncbddd/birthdefects/families-pcp.html


### 3.1.6 patient visits

In [60]:
import time

In [61]:
# Definition
general_procedures_cols = diagnosis_cols + injury_cause_cols + surgery_cols + medicine_cols + other_procedure_cols

pid_index_col = 0
hid_index_col = 3

end_date = datetime.datetime.now()
start_date = end_date - timedelta(days=1000)

In [62]:
# Eg. generating patient id
begin_time = time.time()
visit_pid = patients_ids[random.randrange(0, len(patients_ids))]

# Eg. generating hospital id
hid_matches = patients[np.in1d(patients[:, pid_index_col], [visit_pid])][:, hid_index_col]
visit_hid = hid_matches[random.randrange(0, len(hid_matches))]

# Eg. generating procedure id
visit_ppid = patient_procedures_ids[random.randrange(0, len(patient_procedures_ids))]

# Eg. generating visit date
visit_random_date = start_date + (end_date - start_date) * random.random()

# Eg. generating visit id
visit_id =  str(visit_random_date.year) + str(visit_random_date.month) + "-" + visit_pid[:2] + "-" + str(fake.random_number(digits=15))

emp_entry = employee_df[(employee_df["emp_hid"]==visit_hid)].sample(1)
emp_section = emp_entry.iloc[0]["section"]
procedure_by = visit_empid = emp_entry.iloc[0]["emp_id"]
procdure_pool = KEY_section_diseases[emp_section]
procedure_selected = procdure_pool[random.randrange(0, len(procdure_pool))]
visit_procedure = patient_procedures_df[patient_procedures_df["name"]==procedure_selected].iloc[0]
visit_procedure_price = float(visit_procedure['price'])
visit_price = str(random.randrange(int(visit_procedure_price - (visit_procedure_price * 0.25)) + 10, 
                 int(visit_procedure_price + (visit_procedure_price * 0.60)) + 25))
visit_ppid = str(visit_procedure['id'])
# visit_payment = 
# visit_procedure_name = p_procedures_data[patient_procedures_ids.index(visit_ppid), 1]
# visit_procedure_section_l = procedure_sections[visit_procedure_name]
# visit_procedure_section = str(visit_procedure_section_l[random.randrange(0, len(visit_procedure_section_l))])
# visit_emp_entry = employee_df[(employee_df["section"]==visit_procedure_section) &
#                               (employee_df["emp_hid"]==visit_hid)].sample(1)
# procedure_by = visit_empid = visit_emp_entry.iloc[0]["emp_id"]
print("Time: {} ms".format((time.time() - begin_time) * 1000))

visit_pid, visit_hid, visit_ppid, visit_id, visit_random_date, procedure_by, visit_price

Time: 33.000946044921875 ms


('MR-95886813195605708338',
 '106190758',
 'XXXXXXXDEM30b02',
 '20227-MR-169413567179618',
 datetime.datetime(2022, 7, 16, 16, 4, 14, 811751),
 'E-NZ-673693735601568',
 '203')

In [63]:
cols_list = ['visit_id', 'visit_pid', 'visit_ppid', 'visit_hid', 'visit_date',"procedure_by", "rating", "payment"]

In [64]:
def create_patient_list(d_subset):
    patient_l = []
    
    d_subset = d_subset[~d_subset.isna()]
    d_subset_idx = d_subset.index
    for pid, patient_group_count in enumerate(d_subset):
        patient_group_count = int(patient_group_count)
        patient_l = patient_l + ([d_subset_idx[pid]] * patient_group_count)
    return patient_l

def extend_l(x): 
    x[1].extend(visit_data_padded[x[0]]) 
    return x[1]

visits = [[] for i in range(len(cols_list))]
for i, data_entry in enumerate(list(hospital_df1.iterrows())):
    print("\n\niteration: {}".format(i+1))
    _, data_info = data_entry
    
    patients_diagnosis = create_patient_list(data_info[diagnosis_cols])
    patients_injury = create_patient_list(data_info[injury_cause_cols])
    patients_medicine = create_patient_list(data_info[medicine_cols])
    patients_surgery = create_patient_list(data_info[surgery_cols])

    max_padding = max(len(patients_diagnosis), len(patients_injury), len(patients_medicine), len(patients_surgery))
    print("{}: {}".format("max padding", max_padding))
    
    # random hospital id
    hid_matches = patients[np.in1d(patients[:, pid_index_col], [visit_pid])][:, hid_index_col]
    visit_hid = hid_matches[random.randrange(0, len(hid_matches))]
    visit_hid_l = [visit_hid] * max_padding

    # random date
    visit_dates = []
    visit_pids = []
    visit_ppids = []
    visit_ids = []
    ratings = []
    procedure_bys = []
    visit_prices = []
    
    for i in range(max_padding):
        visit_random_date = start_date + (end_date - start_date) * random.random()
        visit_id =  str(visit_random_date.year) + str(visit_random_date.month) + \
                    "-" + visit_pid[:2] + "-" + str(fake.random_number(digits=15))
        visit_pid = patients_ids[random.randrange(0, len(patients_ids))]
        rating = str(round(random.random() * 10))
        
        emp_entry = employee_df[(employee_df["emp_hid"]==visit_hid)].sample(1)
        emp_section = emp_entry.iloc[0]["section"]
        procedure_by = visit_empid = emp_entry.iloc[0]["emp_id"]
        
        procdure_pool = KEY_section_diseases[emp_section]
        procedure_selected = procdure_pool[random.randrange(0, len(procdure_pool))]
        visit_procedure = patient_procedures_df[patient_procedures_df["name"]==procedure_selected].iloc[0]
        visit_procedure_price = float(visit_procedure['price'])
        visit_price = str(random.randrange(int(visit_procedure_price - (visit_procedure_price * 0.25)) + 10, 
                                           int(visit_procedure_price + (visit_procedure_price * 0.60)) + 25))
        visit_ppid = str(visit_procedure['id'])
        
        visit_dates.append(visit_random_date)
        visit_pids.append(visit_pid)
        visit_ppids.append(visit_ppid)
        visit_ids.append(visit_id)
        ratings.append(rating)
        procedure_bys.append(procedure_by)
        visit_prices.append(visit_price)
    
    pad_token = None
    visit_data = [visit_ids, visit_pids, visit_ppids, visit_hid_l, visit_dates, procedure_bys, ratings, visit_prices]
    visit_data_padded = list((zip(*itertools.zip_longest(*visit_data, fillvalue=pad_token))))
    
    # accumalate
    visits = list(map(extend_l, list(enumerate(visits))))



iteration: 1
max padding: 1166


iteration: 2
max padding: 9859


iteration: 3
max padding: 161


iteration: 4
max padding: 231


iteration: 5
max padding: 6196


iteration: 6
max padding: 1152


iteration: 7
max padding: 2236


iteration: 8
max padding: 1085


iteration: 9
max padding: 5371


iteration: 10
max padding: 5118


iteration: 11
max padding: 4943


iteration: 12
max padding: 60


iteration: 13
max padding: 11768


iteration: 14
max padding: 5691


iteration: 15
max padding: 1944


iteration: 16
max padding: 6955


iteration: 17
max padding: 455


iteration: 18
max padding: 1380


iteration: 19
max padding: 692


iteration: 20
max padding: 3333


iteration: 21
max padding: 927


iteration: 22
max padding: 249


iteration: 23
max padding: 543


iteration: 24
max padding: 4683


iteration: 25
max padding: 546


iteration: 26
max padding: 5634


iteration: 27
max padding: 2959


iteration: 28
max padding: 18060


iteration: 29
max padding: 1719


iteration: 30
max padding: 26



iteration: 240
max padding: 717


iteration: 241
max padding: 2028


iteration: 242
max padding: 1396


iteration: 243
max padding: 3940


iteration: 244
max padding: 2637


iteration: 245
max padding: 5453


iteration: 246
max padding: 519


iteration: 247
max padding: 2466


iteration: 248
max padding: 12166


iteration: 249
max padding: 19564


iteration: 250
max padding: 3004


iteration: 251
max padding: 12


iteration: 252
max padding: 645


iteration: 253
max padding: 5211


iteration: 254
max padding: 2716


iteration: 255
max padding: 9723


iteration: 256
max padding: 6472


iteration: 257
max padding: 9232


iteration: 258
max padding: 6895


iteration: 259
max padding: 7336


iteration: 260
max padding: 16694


iteration: 261
max padding: 5824


iteration: 262
max padding: 3008


iteration: 263
max padding: 1840


iteration: 264
max padding: 3803


iteration: 265
max padding: 5307


iteration: 266
max padding: 1415


iteration: 267
max padding: 8057


iteration: 268
max p

In [79]:
p_visits_cols_list = cols_list
p_visits_data = np.array(visits).T

# get_col_data2d('visit_id', p_visits_cols_list, p_visits_data)
# pd.DataFrame(np.array(visits).T, columns=cols_list) 

In [66]:
p_visits_data 

array([['202010-MR-1855751994799', 'CL-98267006157427147031',
        'XXXXXXXXSI039b9', ..., 'S-DT-994510495633642', '3', '7955'],
       ['20219-CL-577400136921889', 'BP-21367138352960100711',
        'XXXXXXXXXR2c93b', ..., 'R-NZ-552223394592002', '1', '14'],
       ['20207-BP-566752738500882', 'JH-50537217973229584615',
        'XXXXXXXXBI6e8f3', ..., 'O-YL-476851985127005', '3', '13'],
       ...,
       ['20226-JD-799641437917152', 'JA-41506860240024888396',
        'XXXXXXXXDNa9556', ..., 'O-YN-917410210647245', '5', '290'],
       ['20215-JA-667758630936977', 'PW-12682552012746528375',
        'XXXXXXXMSC50296', ..., 'P-IR-296672278801373', '2', '688'],
       ['20213-PW-339735773467000', 'SJ-64519950825135586939',
        'XXXXXXXXDDab650', ..., 'G-YZ-638409846360997', '3', '2615']],
      dtype=object)

In [67]:
p_visits_cols_list

['visit_id',
 'visit_pid',
 'visit_ppid',
 'visit_hid',
 'visit_date',
 'procedure_by',
 'rating',
 'payment']

## `4- Storing in database`

In [68]:
import psycopg2
from psycopg2.extras import execute_values

In [69]:
user = "postgres"           # postgres is default postgresql superuser
password = "<PASSWORD>"     # please provide password for postgres user

conn = psycopg2.connect(
    host="localhost",
    database="hospital_oltp_db",
    user=user,
    password=password)

cur = conn.cursor()

### 4.1 create tables

In [70]:
sql_create_table_PatientProcedures = \
"""
CREATE TABLE IF NOT EXISTS PatientProcedures (
    pr_id VARCHAR(20) PRIMARY KEY,
    name VARCHAR(150) NOT NULL,
    description TEXT,
    type VARCHAR(50),
    price NUMERIC
);
"""

sql_create_table_PatientCharacteristics = \
"""
CREATE TABLE IF NOT EXISTS PatientCharacteristics (
    ch_id INT PRIMARY KEY,
    sex VARCHAR(50) NOT NULL,
    race_group VARCHAR(50),
    age_group VARCHAR(50) NOT NULL,
    language VARCHAR(50) NOT NULL,
    payer_source VARCHAR(50) NOT NULL,
    disposition VARCHAR(150) NOT NULL
);
"""

sql_create_table_Locations = \
"""
CREATE TABLE IF NOT EXISTS Locations (
    location_id INT PRIMARY KEY,
    city varchar(150) NOT NULL,
    county VARCHAR(250),
    address VARCHAR(500) NOT NULL,
    zip_code VARCHAR(15),
    congressional_district_num INT,
    senate_district_num INT,
    assembly_district_num INT
);
"""

sql_create_table_HealthFacilities = \
"""
CREATE TABLE IF NOT EXISTS HealthFacilities (
    h_id INT PRIMARY KEY,
    oshpd_id INT,
    name VARCHAR(250), 
    control_type_desc VARCHAR(150) NOT NULL,
    control_type_category_desc VARCHAR(100) NOT NULL,
    mssa_name VARCHAR(200),
    mssa_designation VARCHAR(50),
    license_type VARCHAR(50) NOT NULL
);
"""

sql_create_table_HealthFacilityLocations = \
"""
CREATE TABLE IF NOT EXISTS HealthFacilityLocations (
    h_id int NOT NULL,
    location_id int NOT NULL,
    
    PRIMARY KEY (h_id, location_id),
    FOREIGN KEY (h_id)
      REFERENCES HealthFacilities (h_id),
    FOREIGN KEY (location_id)
      REFERENCES Locations (location_id)
);
"""


sql_create_table_Patients = \
"""
CREATE TABLE IF NOT EXISTS Patients (
    p_id varchar(30) PRIMARY KEY,
    name varchar(150) NOT NULL,
    p_chid int,
    p_hid int NOT NULL,
    social_security_num char(11),
    date_of_birth date NOT NULL,
    credit_card_number varchar(30),
    address varchar(500),
    state varchar(100),
    city varchar(150),
    
    FOREIGN KEY (p_chid)
      REFERENCES PatientCharacteristics (ch_id),
    FOREIGN KEY (p_hid)
      REFERENCES HealthFacilities (h_id)
);
"""

sql_create_table_Employees = \
"""
CREATE TABLE IF NOT EXISTS Employees (
    emp_id varchar(30) PRIMARY KEY,
    emp_name varchar(150) NOT NULL,
    emp_dob DATE,
    emp_date_entry DATE NOT NULL,
    emp_hid INT NOT NULL,
    building char(1) NOT NULL,
    emp_salary Numeric,
    section varchar(100)  NOT NULL,
    specialty varchar(300) NOT NULL,
    
    FOREIGN KEY (emp_hid)
      REFERENCES HealthFacilities (h_id)
);
"""

sql_create_table_PatientVisits = \
"""
CREATE TABLE IF NOT EXISTS PatientVisits (
    visit_id varchar(35) PRIMARY KEY,
    visit_pid varchar(30) NOT NULL,
    procedure varchar(20),
    visit_hid int NOT NULL,
    visit_date date,
    procedure_by varchar(30),
    rating int,
    payment Numeric,
    
    FOREIGN KEY (visit_pid)
      REFERENCES Patients (p_id),
    FOREIGN KEY (procedure)
      REFERENCES PatientProcedures (pr_id),
    FOREIGN KEY (visit_hid)
      REFERENCES HealthFacilities (h_id),
    FOREIGN KEY (procedure_by)
      REFERENCES Employees (emp_id)
);
"""

### 4.2 insert data

In [71]:
print("INSERT - PatientProcedures")
cur.execute(sql_create_table_PatientProcedures)
execute_values(cur, 'INSERT INTO PatientProcedures VALUES %s', p_procedures_data.tolist())

print("INSERT - PatientCharacteristics")
cur.execute(sql_create_table_PatientCharacteristics)
execute_values(cur, 'INSERT INTO PatientCharacteristics VALUES %s', characteristic_data.tolist())

print("INSERT - Locations")
cur.execute(sql_create_table_Locations)
execute_values(cur, 'INSERT INTO Locations VALUES %s', locations_data.tolist())

print("INSERT - HealthFacilities")
cur.execute(sql_create_table_HealthFacilities)
execute_values(cur, 'INSERT INTO HealthFacilities VALUES %s', hospitals_listed_data.tolist())

print("INSERT - HealthFacilityLocations")
cur.execute(sql_create_table_HealthFacilityLocations)
execute_values(cur, 'INSERT INTO HealthFacilityLocations VALUES %s', hospital_locations_data.tolist())

print("INSERT - Patients")
cur.execute(sql_create_table_Patients)
execute_values(cur, 'INSERT INTO Patients VALUES %s', p_data.tolist())

print("INSERT - Employees")
cur.execute(sql_create_table_Employees)
execute_values(cur, 'INSERT INTO Employees VALUES %s', employee_data.tolist())

print("INSERT - PatientVisits")
cur.execute(sql_create_table_PatientVisits)
execute_values(cur, 'INSERT INTO PatientVisits VALUES %s', p_visits_data.tolist())

INSERT - PatientProcedures
INSERT - PatientCharacteristics
INSERT - Locations
INSERT - HealthFacilities
INSERT - HealthFacilityLocations
INSERT - Patients
INSERT - Employees
INSERT - PatientVisits


### 4.3 Triggers

In [72]:
sql_audit_tables_creation = \
"""
CREATE TABLE IF NOT EXISTS RemovedHealthFacilities (
   id INT GENERATED ALWAYS AS IDENTITY,
   h_id INT,
   h_name VARCHAR(250),
   removed_on TIMESTAMP(6) NOT NULL
);

CREATE TABLE IF NOT EXISTS RemovedPatientProcedures (
   id INT GENERATED ALWAYS AS IDENTITY,
   pr_id VARCHAR(20),
   pr_name VARCHAR(150),
   removed_on TIMESTAMP(6) NOT NULL
);

CREATE TABLE IF NOT EXISTS patient_visit_reschedule (
   id INT GENERATED ALWAYS AS IDENTITY,
   patient_id varchar(30) NOT NULL,
   last_visit_date DATE,
   new_visit_date DATE,
   changed_on TIMESTAMP(6) NOT NULL
);
"""

sql_trigger_record_removed_HF = \
"""
CREATE OR REPLACE FUNCTION add_removed_health_facility()
  RETURNS TRIGGER 
  AS
$$
BEGIN
    INSERT INTO RemovedHealthFacilities(h_id, h_name, removed_on)
    VALUES(OLD.h_id, OLD.name, now());

    RETURN NEW;
END;
$$ LANGUAGE PLPGSQL;

CREATE TRIGGER remove_health_facility
  BEFORE DELETE
  ON HealthFacilities
  FOR EACH ROW
  EXECUTE PROCEDURE add_removed_health_facility();
"""

sql_trigger_record_removed_PP = \
"""
CREATE OR REPLACE FUNCTION add_removed_patient_procedure()
  RETURNS TRIGGER 
  AS
$$
BEGIN
    INSERT INTO RemovedPatientProcedures(pr_id, pr_name, removed_on)
    VALUES(OLD.pr_id, OLD.name, now());

    RETURN NEW;
END;
$$ LANGUAGE PLPGSQL;

CREATE TRIGGER remove_patient_procedure
  BEFORE DELETE
  ON PatientProcedures
  FOR EACH ROW
  EXECUTE PROCEDURE add_removed_patient_procedure();
"""

sql_trigger_visit_audit = \
"""
CREATE OR REPLACE FUNCTION log_schedule_change()
  RETURNS TRIGGER 
  AS
$$
BEGIN
    IF NEW.visit_date <> OLD.visit_date THEN
         INSERT INTO patient_visit_reschedule(patient_id, last_visit_date,
                                                new_visit_date, changed_on)
         VALUES(OLD.visit_id, OLD.visit_date, NEW.visit_date, now());
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE PLPGSQL;

CREATE TRIGGER schedule_change
  AFTER UPDATE
  ON PatientVisits
  FOR EACH ROW
  EXECUTE PROCEDURE log_schedule_change();
"""

In [73]:
cur.execute(sql_audit_tables_creation)
cur.execute(sql_trigger_record_removed_HF)
cur.execute(sql_trigger_record_removed_PP)
cur.execute(sql_trigger_visit_audit)

### 4.4 Views

In [74]:
sql_view_facility_general = \
"""
CREATE OR REPLACE VIEW FacilityGeneralView
AS 
SELECT hf.name as health_facility,
       hf.control_type_desc,
       hf.license_type,
       l.city,
       l.county,
       count(p.p_id) as num_patient
FROM Patients p
INNER JOIN HealthFacilities hf ON p.p_hid=hf.h_id
INNER JOIN HealthFacilityLocations hfl ON hf.h_id=hfl.h_id
INNER JOIN Locations l ON hfl.location_id=l.location_id
GROUP BY hf.h_id, hfl.location_id, l.location_id;
"""

sql_view_facility_perfomance = \
"""
CREATE OR REPLACE VIEW FacilityPerformanceView
AS 
SELECT hf.name, 
       count(DISTINCT p.p_id) as registered_patient_num, 
       ROUND(avg(DISTINCT pv.payment), 2) as average_payment, 
       ROUND(avg(DISTINCT pv.rating), 1) as average_rating
FROM HealthFacilities hf
INNER JOIN PatientVisits pv ON hf.h_id=pv.visit_hid
INNER JOIN Patients p on hf.h_id=p.p_hid
WHERE visit_date IS NOT NULL
GROUP BY hf.h_id;
"""

sql_view_patient_visit_info = \
"""
CREATE OR REPLACE VIEW PatientVisitInfoView
AS 
SELECT p.name as patient, 
       pv.visit_date as visited_on, 
       hf.name as facility_visited,
       pp.name as procedure_conducted,
       e.section as procedure_section,
       pp.price as procedure_cost,
       pv.payment as treatment_cost,
       pv.rating as visit_rating
FROM PatientVisits pv
INNER JOIN Patients p ON pv.visit_pid=p.p_id
INNER JOIN HealthFacilities hf ON pv.visit_hid=hf.h_id
INNER JOIN PatientProcedures pp ON pv.procedure=pp.pr_id
INNER JOIN Employees e ON pv.procedure_by=e.emp_id;
"""

In [75]:
cur.execute(sql_view_facility_general)
cur.execute(sql_view_facility_perfomance)
cur.execute(sql_view_patient_visit_info)

### 4.5 Indices

In [76]:
sql_indices_creation = \
"""
CREATE INDEX IF NOT EXISTS idx_HF_name ON HealthFacilities(name);
CREATE INDEX IF NOT EXISTS idx_PP_name ON PatientProcedures(name);
CREATE INDEX IF NOT EXISTS idx_E_name ON Employees(emp_name);
CREATE INDEX IF NOT EXISTS idx_HF_name ON Patients(name);
CREATE INDEX IF NOT EXISTS idx_PV_date ON PatientVisits(visit_date);
"""

In [77]:
cur.execute(sql_indices_creation)

In [78]:
print("DONE!")
cur.close()
conn.commit()

DONE!
