In [1]:
import pandas as pd

df = pd.read_csv(r"C:\Users\Admin\Documents\GitHub\Healthcare-data-warehouse\diabetic_data.csv")
df.head()


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [2]:
df.shape

(101766, 50)

In [3]:
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [5]:
df.replace('?', None, inplace=True)
df.isna().sum()

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

## 1. Prepare dim_patient in pandas

In [6]:
import pandas as pd

# Assuming df is already loaded and you've done df.replace('?', None, inplace=True)

dim_patient_df = df[['patient_nbr', 'gender', 'age', 'race']].copy()

# Rename age -> age_band to match our dim_patient table
dim_patient_df.rename(columns={'age': 'age_band'}, inplace=True)

# Handle missing race (NaN) by setting it to 'Unknown'
dim_patient_df['race'] = dim_patient_df['race'].fillna('Unknown')

# Drop duplicates so each patient_nbr appears only once
dim_patient_df = dim_patient_df.drop_duplicates(subset=['patient_nbr'])

dim_patient_df.head()
dim_patient_df.shape

(71518, 4)

### Installing connection libraries to Postgre

In [8]:
# pip install psycopg2-binary sqlalchemy

### Create a connection engine to PostgreSQL

In [10]:
from sqlalchemy import create_engine
engine = create_engine("postgresql+psycopg2://postgres:6932@localhost:5432/healthcare_dw")

### Load dim_patient into PostgreSQL

In [13]:
dim_patient_df.to_sql(
    'dim_patient',       # table name in PostgreSQL
    con=engine,
    if_exists='append',   # append so it fills the existing table
    index=False           # don't write the DataFrame index
)

518

## 2. Build dim_admission in pandas

In [15]:
dim_admission_df = df[[
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "payer_code",
    "medical_specialty"
]].copy()

# Keep only distinct combinations
dim_admission_df = dim_admission_df.drop_duplicates()

dim_admission_df.head()
dim_admission_df.shape

(5802, 5)

### Load dim_admission into PostgreSQL

In [17]:
dim_admission_df.to_sql(
    'dim_admission',
    con=engine,
    if_exists='append',
    index=False
)

802

## 3. Build dim_diagnosis in pandas

In [20]:
# Combine all diagnosis columns into one Series
diag_all = pd.concat([
    df['diag_1'],
    df['diag_2'],
    df['diag_3']
], axis=0)

# Drop missing values
diag_all = diag_all.dropna()

# Drop duplicates
diag_all = diag_all.drop_duplicates()

# Build dim_diagnosis dataframe
dim_diagnosis_df = pd.DataFrame({
    "diagnosis_code": diag_all
})

dim_diagnosis_df.head()
dim_diagnosis_df.shape

(915, 1)

### Load dim_diagnosis into PostgreSQL

In [21]:
dim_diagnosis_df.to_sql(
    'dim_diagnosis',
    con=engine,
    if_exists='append',
    index=False
)

915

##  4. Prepare the fact DataFrame in pandas

In [23]:
fact_df = df[[
    "encounter_id",
    "patient_nbr",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "diag_1",
    "diag_2",
    "diag_3",
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    "readmitted",
    "change",
    "diabetesMed"
]].copy()

### Create derived field readmitted_30d_flag

In [24]:
fact_df['readmitted_30d_flag'] = fact_df['readmitted'].apply(lambda x: True if x == '<30' else False)

### Add source_system

In [25]:
fact_df['source_system'] = 'CSV'

## 5. Assign foreign keys

### 5.1 Load dim_patient from Postgres

In [26]:
dim_patient = pd.read_sql("SELECT patient_key, patient_nbr FROM dim_patient", con=engine)

Merge onto fact:

In [27]:
fact_df = fact_df.merge(dim_patient, on="patient_nbr", how="left")

We now have patient_key in the fact.

### 5.2 Load dim_admission from Postgres

In [28]:
dim_admission = pd.read_sql("""
    SELECT admission_dim_key,
           admission_type_id,
           discharge_disposition_id,
           admission_source_id,
           payer_code,
           medical_specialty
    FROM dim_admission
""", con=engine)

Merge

In [29]:
fact_df = fact_df.merge(
    dim_admission[[
        "admission_dim_key",
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id"
    ]],
    on=["admission_type_id", "discharge_disposition_id", "admission_source_id"],
    how="left"
)

Now the fact has admission_dim_key.

### 5.3 Load dim_diagnosis from Postgres

In [30]:
dim_diagnosis = pd.read_sql("SELECT diagnosis_key, diagnosis_code FROM dim_diagnosis", con=engine)

Rename to make merging clearer:

In [31]:
diag_map = dim_diagnosis.rename(columns={"diagnosis_code": "diag_1"})
fact_df = fact_df.merge(diag_map[["diagnosis_key", "diag_1"]], on="diag_1", how="left")
fact_df.rename(columns={"diagnosis_key": "primary_diagnosis_key"}, inplace=True)

Repeat for diag_2:

In [32]:
diag_map2 = dim_diagnosis.rename(columns={"diagnosis_code": "diag_2"})
fact_df = fact_df.merge(diag_map2[["diagnosis_key", "diag_2"]], on="diag_2", how="left")
fact_df.rename(columns={"diagnosis_key": "secondary_diagnosis_key"}, inplace=True)

Repeat for diag_3:

In [33]:
diag_map3 = dim_diagnosis.rename(columns={"diagnosis_code": "diag_3"})
fact_df = fact_df.merge(diag_map3[["diagnosis_key", "diag_3"]], on="diag_3", how="left")
fact_df.rename(columns={"diagnosis_key": "tertiary_diagnosis_key"}, inplace=True)

## Drop unused columns (clean the fact DataFrame)

In [35]:
fact_df = fact_df[[
    "encounter_id",
    "patient_key",
    "admission_dim_key",
    "primary_diagnosis_key",
    "secondary_diagnosis_key",
    "tertiary_diagnosis_key",
    "time_in_hospital",
    "num_lab_procedures",
    "num_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    "readmitted",
    "readmitted_30d_flag",
    "change",
    "diabetesMed",
    "source_system"
]]

## Load into PostgreSQL
- The fact table has 100k+ rows, so we load in chunks to avoid errors.

In [37]:
fact_df.rename(columns={'readmitted': 'readmitted_raw'}, inplace=True)

In [38]:
fact_df.columns

Index(['encounter_id', 'patient_key', 'admission_dim_key',
       'primary_diagnosis_key', 'secondary_diagnosis_key',
       'tertiary_diagnosis_key', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses',
       'readmitted_raw', 'readmitted_30d_flag', 'change', 'diabetesMed',
       'source_system'],
      dtype='object')

In [40]:
fact_df.rename(columns={'diabetesMed': 'diabetesmed'}, inplace=True)
fact_df.columns

Index(['encounter_id', 'patient_key', 'admission_dim_key',
       'primary_diagnosis_key', 'secondary_diagnosis_key',
       'tertiary_diagnosis_key', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses',
       'readmitted_raw', 'readmitted_30d_flag', 'change', 'diabetesmed',
       'source_system'],
      dtype='object')

In [44]:
fact_df['encounter_id'].nunique(), fact_df.shape[0]

(101766, 15734810)

In [45]:
# Keep the first occurrence of each encounter_id, drop the rest
fact_df = fact_df.drop_duplicates(subset=['encounter_id'])

# Sanity check: now they should match
fact_df['encounter_id'].nunique(), fact_df.shape[0]

(101766, 101766)

In [46]:
fact_df.to_sql(
    'fact_hospital_admission',
    con=engine,
    if_exists='append',
    index=False,
    chunksize=5000
)

20766

## Reading the MySQL table in Python

#### Install MySQL connector

In [48]:
# pip install mysql-connector-python

#### Create a MySQL engine and read the data

In [51]:
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import pandas as pd

raw_password = "Hermi@tata123" 
encoded_password = quote_plus(raw_password)

mysql_engine = create_engine(
    f"mysql+mysqlconnector://root:{encoded_password}@localhost:3306/hospital_source"
)

patient_contact_df = pd.read_sql("SELECT * FROM patient_contact", con=mysql_engine)
patient_contact_df.head(), patient_contact_df.shape


(   patient_nbr          phone         city   country
 0     23468526  +250788444555       Kigali    Rwanda
 1     23974596  +254701222333      Mombasa     Kenya
 2     24267006  +255711333444       Arusha  Tanzania
 3     80751879  +251911223344  Addis Ababa  Ethiopia
 4    115014015  +254700111222      Nairobi     Kenya,
 (5, 4))

In [52]:
pg_engine = create_engine(
    "postgresql+psycopg2://postgres:6932@localhost:5432/healthcare_dw"
)

In [53]:
patient_contact_df.to_sql(
    'dim_patient_contact',
    con=pg_engine,     
    if_exists='append',
    index=False
)

5

## Call the REST Countries API and build a DataFrame

In [56]:
import requests
url = "https://restcountries.com/v3.1/all"
data = requests.get(url).json()

print(type(data))
print(data)

<class 'dict'>
{'message': "'fields' query not specified", 'status': 400}


In [57]:
import requests
import pandas as pd

# ✅ New URL with required 'fields' parameter
url = "https://restcountries.com/v2/all?fields=name,alpha2Code,region,subregion,population"
data = requests.get(url).json()

print(type(data), len(data))   # should be: <class 'list'>, ~250 countries

rows = []
for c in data:
    rows.append({
        "country_name": c.get("name"),
        "iso2_code": c.get("alpha2Code"),
        "region": c.get("region"),
        "subregion": c.get("subregion"),
        "population": c.get("population"),
        "source_system": "RESTCountriesAPI"
    })

country_df = pd.DataFrame(rows)
country_df.head(), country_df.shape

<class 'list'> 250


(     country_name iso2_code   region        subregion  population  \
 0     Afghanistan        AF     Asia    Southern Asia    40218234   
 1   Åland Islands        AX   Europe  Northern Europe       28875   
 2         Albania        AL   Europe  Southern Europe     2837743   
 3         Algeria        DZ   Africa  Northern Africa    44700000   
 4  American Samoa        AS  Oceania        Polynesia       55197   
 
       source_system  
 0  RESTCountriesAPI  
 1  RESTCountriesAPI  
 2  RESTCountriesAPI  
 3  RESTCountriesAPI  
 4  RESTCountriesAPI  ,
 (250, 6))

In [63]:
# Save CSV
country_df.to_csv(r"C:\Users\Admin\Documents\GitHub\Healthcare-data-warehouse\source_data\countries_dim.csv", index=False)

#### Filter to only countires in my DW

In [58]:
# Distinct countries in dim_patient_contact
dim_contact_df = pd.read_sql("SELECT DISTINCT country FROM dim_patient_contact", con=pg_engine)
countries_needed = dim_contact_df['country'].dropna().unique().tolist()
countries_needed

['Kenya', 'Rwanda', 'Ethiopia', 'Tanzania']

In [59]:
country_filtered_df = country_df[country_df['country_name'].isin(countries_needed)]
country_filtered_df

Unnamed: 0,country_name,iso2_code,region,subregion,population,source_system
72,Ethiopia,ET,Africa,Eastern Africa,114963583,RESTCountriesAPI
118,Kenya,KE,Africa,Eastern Africa,53771300,RESTCountriesAPI
186,Rwanda,RW,Africa,Eastern Africa,12952209,RESTCountriesAPI


In [60]:
country_filtered_df.to_sql(
    'dim_country',
    con=pg_engine,
    if_exists='append',
    index=False
)

3

In [62]:
country_df.shape

(250, 6)

In [61]:
country_filtered_df.shape

(3, 6)

In [64]:
patient_contact_df.to_csv(
    r"C:\Users\Admin\Documents\GitHub\Healthcare-data-warehouse\source_data\patient_contact.csv",
    index=False
)