## Import Required Libraries
We import all the libraries needed for data cleaning and preprocessing.

In [21]:
# The primary library for data manipulation and analysis, used for creating and working with DataFrames.
import pandas as pd

# The fundamental library for numerical computing in Python, often used for array operations and mathematical functions.
import numpy as np

# A scikit-learn class used for handling missing values (e.g., filling with mean, median, or constant).
from sklearn.impute import SimpleImputer

## Cleaning Diagnoses Dataset
Steps performed:
1. Load dataset
2. Remove duplicates
3. Fill missing values for diagnosis code & description
4. Remove invalid codes (e.g., "XXX")
5. Standardize text formatting
6. Enforce one-to-one code consistency per diagnosis
7. Convert data types
8. Export cleaned file

In [14]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("diagnoses.csv")

# Remove duplicate rows
df = df.drop_duplicates()

# Impute missing diagnosis code and description with each column's most
# frequent value. fit_transform returns an (n, 1) array, so it is flattened
# before being assigned back to a single column.
df["diagnosis_code"] = SimpleImputer(strategy="most_frequent") \
    .fit_transform(df[["diagnosis_code"]]).ravel()

df["description"] = SimpleImputer(strategy="most_frequent") \
    .fit_transform(df[["description"]]).ravel()

# Remove invalid code entries. An explicit copy is taken so the column
# assignments below write to this frame and not to a filtered view.
df = df[df["diagnosis_code"] != "XXX"].copy()

# Format description text
df["description"] = df["description"].str.strip().str.title()
# Expand the "Unk" abbreviation only where it stands as a whole word.
# A plain substring replace would also rewrite an already-correct "Unknown"
# into "Unknownnown", which would then escape the 'Unknown' handling below.
df["description"] = df["description"].str.replace(r"\bUnk\b", "Unknown", regex=True)


def _most_common(values):
    """Return the first mode of ``values``, or NaN when there is no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Determine primary suggested code: the most common code seen per description
mode_mapping = df.groupby("description")["diagnosis_code"].agg(_most_common).to_dict()

primary_descriptions = [desc for desc in mode_mapping if desc != 'Unknown']

all_codes = set(df['diagnosis_code'].unique())
used_codes = set()
consistent_map_final = {}

# Greedily give every description its own code: keep the suggested code when
# it is still free, otherwise fall back to an unused existing code, and only
# invent a new "C<number>" code when every existing code is already taken.
for desc in primary_descriptions:
    suggested_code = mode_mapping[desc]

    if suggested_code not in used_codes:
        consistent_map_final[desc] = suggested_code
        used_codes.add(suggested_code)
    else:
        available_codes = sorted(all_codes - used_codes, key=str)

        if available_codes:
            new_code = available_codes[0]
        else:
            new_code = "C999"
            while new_code in used_codes:
                new_code = f"C{int(new_code[1:]) + 1}"

        consistent_map_final[desc] = new_code
        used_codes.add(new_code)

# Handle Unknown: it keeps its own most common code and is deliberately left
# out of the uniqueness bookkeeping above.
if 'Unknown' in mode_mapping:
    consistent_map_final['Unknown'] = mode_mapping['Unknown']

# Apply mapping so every row of a description carries the same code
df["diagnosis_code"] = df["description"].map(consistent_map_final)

# Convert to proper data types
df = df.astype({
    "diagnosis_id": "int",
    "diagnosis_code": "str",
    "description": "str"
})

print("Diagnoses Cleaned Dataset:\n", df)

# Save cleaned file
df.to_csv("diagnoses_cleaned.csv", index=False)


Diagnoses Cleaned Dataset:
      diagnosis_id diagnosis_code   description
0               1           D004      Covid-19
1               2           D003  Hypertension
2               3           D005           Flu
3               4           D004      Covid-19
4               5           D004      Covid-19
..            ...            ...           ...
115           116           D001        Asthma
116           117           D001        Asthma
117           118           D005           Flu
118           119           D005           Flu
119           120           D004      Covid-19

[118 rows x 3 columns]


## Cleaning Patients Dataset
Steps performed:
1. Load dataset
2. Fill missing first names and clean up last-name formatting
3. Fix age errors and impute missing values
4. Normalize gender values
5. Remove duplicate patient records
6. Export cleaned file

In [16]:
# Load dataset
df = pd.read_csv("patients.csv")

# Fix missing & incorrectly formatted names
df['first_name'] = df['first_name'].fillna("Unknown")
df['last_name'] = df['last_name'].str.rstrip("#").str.title()

# Blank out invalid ages (negative or unrealistic) so they are imputed below.
# Series.mask is used rather than assigning None through .loc, which would
# write an incompatible value into the column if it was read as integers.
invalid_age = (df['age'] < 0) | (df['age'] > 120)
df['age'] = df['age'].mask(invalid_age)

# Impute missing age with mean (flattened, since fit_transform returns (n, 1))
df['age'] = SimpleImputer(strategy='mean').fit_transform(df[['age']]).ravel()
df['age'] = df['age'].astype(int)

# Normalize gender values. The lookup happens after upper-casing, so the
# spelled-out forms must be listed too; otherwise "male" / "Female" would be
# left behind as "MALE" / "FEMALE".
df['gender'] = df['gender'].str.strip().str.upper().replace({
    'M': 'Male',
    'MALE': 'Male',
    'F': 'Female',
    'FEMALE': 'Female',
})

# Remove duplicate patients (the first record per patient_id is kept)
df = df.drop_duplicates(subset=['patient_id'])

print("Patients Cleaned Dataset:\n", df)

# Save cleaned file
df.to_csv("patients_cleaned.csv", index=False)

Patients Cleaned Dataset:
      patient_id first_name last_name  gender  age
0             1      Alice    Wilson  Female   52
1             2    Charlie     Smith    Male   93
2             3       Jane   Johnson  Female   15
3             4      Ethan     Smith  Female   72
4             5       Jane    Miller  Female   61
..          ...        ...       ...     ...  ...
115         116      Ethan     Smith  Female   81
116         117      Alice     Smith  Female   49
117         118      Diana     Brown    Male   35
118         119      Fiona     Brown  Female   35
119         120      Fiona    Wilson    Male   33

[120 rows x 5 columns]


## Cleaning Treatments Dataset
Steps performed:
1. Load dataset and remove duplicates
2. Fix missing doctor names and costs
3. Replace invalid entries
4. Clean text formatting
5. Convert dates properly
6. Handle negative/extreme treatment costs
7. Convert data types
8. Export cleaned file

In [18]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("treatments.csv")

# Remove duplicates
df = df.drop_duplicates().copy()

# Upper bound applied to treatment costs; larger values are capped to it.
MAX_TREATMENT_COST = 200000

# Fix missing doctor names & treatment costs.
# Negative costs are invalid, so they are turned into missing values BEFORE
# the median is computed; otherwise they would drag the fill value down.
df["doctor_name"] = SimpleImputer(strategy="most_frequent") \
    .fit_transform(df[["doctor_name"]]).ravel()
df["treatment_cost"] = df["treatment_cost"].mask(df["treatment_cost"] < 0)
df["treatment_cost"] = SimpleImputer(strategy="median") \
    .fit_transform(df[["treatment_cost"]]).ravel()

# Replace invalid text entries.
# "???" is matched literally (regex=False) instead of through an escaped
# pattern in a non-raw string, which is an invalid escape sequence.
df.loc[df["doctor_name"].str.contains("???", regex=False, na=False), "doctor_name"] = "Unknown Doctor"
df.loc[df["department"] == "UnknownDept", "department"] = "Unknown Department"

# Standardize text formatting
df["doctor_name"] = df["doctor_name"].str.strip().str.title()
df["department"] = df["department"].str.strip().str.title()

# Convert date values (unparseable dates become NaT)
df["admission_date"] = pd.to_datetime(df["admission_date"], errors="coerce")

# Cap too-high treatment costs
df["treatment_cost"] = df["treatment_cost"].clip(upper=MAX_TREATMENT_COST)

# Convert to proper data types
df = df.astype({
    "admission_id": "int",
    "patient_id": "int",
    "diagnosis_id": "int",
    "department": "str",
    "doctor_name": "str"
})

print("Treatments Cleaned Dataset:\n", df)

# Save cleaned file
df.to_csv("treatments_cleaned.csv", index=False)

Treatments Cleaned Dataset:
      admission_id  patient_id doctor_name          department  diagnosis_id  \
0               1          50   Dr. Evans            Oncology             1   
1               2          61   Dr. Evans           Neurology            78   
2               3          51   Dr. Adams           Neurology           115   
3               4          19   Dr. Adams  Unknown Department            47   
4               5          21   Dr. Evans          Pediatrics           114   
..            ...         ...         ...                 ...           ...   
115           116          64   Dr. Adams          Cardiology            74   
116           117          98   Dr. Evans             General            30   
117           118          59   Dr. Evans             General           120   
118           119          56   Dr. Clark          Pediatrics            59   
119           120          59   Dr. Adams           Neurology           110   

    admission_date  tr

## SQL Script Execution for ETL

This step runs the `script.sql` file which contains all SQL statements for:
1. Creating OLTP tables  
2.  Loading CSV datasets  
3.  Creating Data Warehouse dimension & fact tables  
4.  Transforming and inserting data into the DW

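Execution is done using a database transaction:
- If all commands succeed → Commit to MySQL
- If any command fails → Roll back the uncommitted changes

Note: MySQL implicitly commits DDL statements such as `CREATE TABLE`, so a rollback only undoes data changes (e.g. `INSERT`) made since the last implicit commit.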

A log file named `etl_sql_execution.log` is generated to store execution details.

In [20]:
import mysql.connector
import logging
import os

# Logger Setup
logging.basicConfig(
    filename="etl_sql_execution.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

SQL_FILE = "script.sql"

# Check if SQL script exists
if not os.path.exists(SQL_FILE):
    raise FileNotFoundError(f"{SQL_FILE} not found!")

# Bind both handles up front so the except/finally blocks below never hit an
# unbound name (or a stale object from an earlier run) when connect() fails.
connection = None
cursor = None

try:
    # Connect to MySQL. Settings can be overridden through environment
    # variables; the defaults are the values this notebook used originally.
    connection = mysql.connector.connect(
        host=os.environ.get("MYSQL_HOST", "localhost"),
        user=os.environ.get("MYSQL_USER", "root"),
        password=os.environ.get("MYSQL_PASSWORD", ""),
        database=os.environ.get("MYSQL_DATABASE", "healthcare_dw"),
        allow_local_infile=True
    )

    cursor = connection.cursor()

    # Load SQL script
    with open(SQL_FILE, "r") as file:
        sql_script = file.read()

    logging.info("SQL script loaded successfully.")

    # Execute SQL script statement by statement.
    # NOTE: this is a naive split on ';' - a semicolon inside a string literal
    # or a comment in the script would cut a statement in half.
    # NOTE: MySQL implicitly commits DDL (e.g. CREATE TABLE), so the rollback
    # below can only undo data changes that have not been committed yet.
    statements = sql_script.split(';')
    executed_count = 0

    for statement in statements:
        stmt = statement.strip()
        if stmt:
            cursor.execute(stmt)
            executed_count += 1
            logging.info(f"Executed statement: {stmt[:50]}...")

    connection.commit()
    logging.info(f"Script executed successfully. ({executed_count} statements)")

    print("SQL script executed successfully!")

except mysql.connector.Error as err:
    # Only roll back when a connection was actually established, and never
    # let a failing rollback hide the original error.
    if connection is not None:
        try:
            connection.rollback()
        except mysql.connector.Error as rollback_err:
            logging.error(f"Rollback failed: {rollback_err}")
    logging.error(f"Error occurred: {err}")
    print(f"Execution failed. Rolled back changes: {err}")

finally:
    if cursor is not None:
        cursor.close()
    if connection is not None:
        connection.close()
    logging.info("Database connection closed.")

SQL script executed successfully!
