In [1]:
import pandas as pd
import os


In [2]:
# Create the output folder up front; exist_ok=True makes re-running this cell
# a no-op when the folder is already there.
os.makedirs(name="data_warehouse", exist_ok=True)

### 1. Ingestion

In [13]:
# Folder holding the raw input files, relative to the notebook's working
# directory.
RAW_DATA_DIR = "raw data"

# Build each path with os.path.join instead of a hard-coded backslash: on
# Linux/macOS a backslash is an ordinary filename character, not a directory
# separator, so the Windows-style literal would fail to find the files there.
patients_df = pd.read_csv(os.path.join(RAW_DATA_DIR, "patients_data_with_doctor.csv"))
doctors_df = pd.read_csv(os.path.join(RAW_DATA_DIR, "doctors_info.csv"))
feedback_df = pd.read_json(os.path.join(RAW_DATA_DIR, "patient_feedback.json"))

# Print the feedback columns as a quick check of what the JSON file provides.
print(feedback_df.columns)

Index(['patient_id', 'treatment_id', 'patient_feedback_score', 'review_date'], dtype='object')


### 2. Cleansing

In [14]:

def _numeric_or_zero(values):
    """Convert a column to numbers, replacing unparseable or missing entries with 0."""
    return pd.to_numeric(values, errors='coerce').fillna(0)


# Total charges = treatment cost + room cost, where a cost that is missing or
# not numeric contributes 0 to the sum.
patients_df['total_charges'] = _numeric_or_zero(patients_df['treatment_cost']).add(
    _numeric_or_zero(patients_df['room_cost'])
)


In [15]:
# Parse the date column of each frame into datetimes. errors='coerce' turns
# any value that cannot be parsed into NaT instead of raising.
for frame, date_column in (
    (patients_df, 'treatment_date'),
    (feedback_df, 'review_date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column], errors='coerce')




### 3. Transformation

In [16]:

# Merge patients with doctor info. The left join keeps every patient row,
# including rows whose doctor_id has no match in doctors_df.
merged_df = patients_df.merge(doctors_df, on='doctor_id', how='left')

# Keep latest feedback per patient.
# review_date is parsed with errors='coerce', so unparseable dates are NaT.
# sort_values places NaT last by default, which would let keep='last' pick an
# undated review over a dated one; na_position='first' prevents that.
# A stable sort keeps the existing row order among reviews sharing a date, so
# the surviving row is the same on every run.
feedback_df = feedback_df.sort_values(
    'review_date', na_position='first', kind='mergesort'
).drop_duplicates(subset=['patient_id'], keep='last')

# Merge feedback into patient-doctor data.
# NOTE(review): feedback also carries treatment_id, but it is reduced to one
# row per patient and joined on patient_id only, so the same review attaches
# to every row of that patient. Confirm this is intended.
merged_df = merged_df.merge(feedback_df, on='patient_id', how='left')

# Remove invalid rows: keep only rows with strictly positive total charges and
# a non-missing doctor_id.
merged_df = merged_df[(merged_df['total_charges'] > 0) & (merged_df['doctor_id'].notna())]


### 4. Loading to warehouse

In [17]:

# Destination file for the merged, filtered dataset.
processed_path = "data_warehouse/processed_patient_data.csv"

# index=False leaves the DataFrame index out of the CSV, so the file holds
# only the data columns.
merged_df.to_csv(path_or_buf=processed_path, index=False)

print(f"✅ Processed data saved to {processed_path}")


✅ Processed data saved to data_warehouse/processed_patient_data.csv
