In [3]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os

# Setup path and env.
# NOTE(review): assumes the kernel's working directory sits directly under the
# project root (so that its parent is the root) - confirm for your launch setup.
project_root = Path.cwd().parent
# Guarded so that re-running this cell does not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))
env_path = project_root / 'config' / '.env'
# A missing .env file is not an error for load_dotenv, so surface it on stderr;
# otherwise the first symptom would be a connection failure further down.
# Only a warning: the settings may legitimately come from the real environment.
if not env_path.is_file():
    print(f"WARNING: env file not found at {env_path}", file=sys.stderr)
load_dotenv(env_path)

# Imported here rather than at the top because it relies on the sys.path entry
# added above.
from src.database import DatabaseConnection

# Load both tables: the transformed one under validation and the original one
# it is compared against.
db = DatabaseConnection()
df_transformed = pd.read_sql("SELECT * FROM hospital_readmissions_transformed", db.engine)
df_original = pd.read_sql("SELECT * FROM hospital_readmissions", db.engine)

print("="*80)
print("TRANSFORMATION VALIDATION REPORT")
print("="*80)

TRANSFORMATION VALIDATION REPORT


In [4]:
# 1. Row count verification: passes when both tables have the same number of rows
n_rows_original = len(df_original)
n_rows_transformed = len(df_transformed)

print("\n1. ROW COUNT")
print(f"   Original: {n_rows_original} rows")
print(f"   Transformed: {n_rows_transformed} rows")
if n_rows_original == n_rows_transformed:
    print("   ✓ PASS")
else:
    print("   ✗ FAIL")


1. ROW COUNT
   Original: 25000 rows
   Transformed: 25000 rows
   ✓ PASS


In [5]:
# 2. Age transformation
# The verdict is computed from the column's dtype instead of being printed
# unconditionally, so a failed conversion shows up as FAIL.
age_column = df_transformed['age_numeric']
age_is_numeric = pd.api.types.is_numeric_dtype(age_column)

print("\n2. AGE CONVERSION")
print(f"   age_numeric type: {age_column.dtype}")
print(f"   Unique values: {age_column.nunique()}")
print(f"   Range: {age_column.min()}-{age_column.max()}")
if age_is_numeric:
    print("   ✓ PASS - Age successfully converted to numeric")
else:
    print("   ✗ FAIL - age_numeric does not have a numeric dtype")



2. AGE CONVERSION
   age_numeric type: int64
   Unique values: 6
   Range: 45-95
   ✓ PASS - Age successfully converted to numeric


In [6]:
# 3. Test result standardization
# PASS is reported only when every value in both encoded columns is 0, 1 or 2.
# isin() is False for NaN, so nulls count as a failure here.
glucose_encoded = df_transformed['glucose_test_encoded']
a1c_encoded = df_transformed['A1Ctest_encoded']
tests_encoded_ok = (
    glucose_encoded.isin([0, 1, 2]).all()
    and a1c_encoded.isin([0, 1, 2]).all()
)

print("\n3. TEST RESULT STANDARDIZATION")
print(f"   glucose_test_encoded unique: {glucose_encoded.unique()}")
print(f"   A1Ctest_encoded unique: {a1c_encoded.unique()}")
if tests_encoded_ok:
    print("   ✓ PASS - Tests encoded as 0/1/2")
else:
    print("   ✗ FAIL - Encoded test columns contain values outside 0/1/2")


3. TEST RESULT STANDARDIZATION
   glucose_test_encoded unique: [0 1 2]
   A1Ctest_encoded unique: [0 1 2]
   ✓ PASS - Tests encoded as 0/1/2


In [7]:
# 4. Yes/no to binary
# Each column is listed with its distinct values; the verdict is PASS only when
# all of them contain nothing but 0 and 1 (NaN counts as a violation).
binary_columns = ['change_binary', 'diabetes_med_binary', 'readmitted_binary']

print("\n4. YES/NO TO BINARY")
non_binary_columns = []
for binary_col in binary_columns:
    print(f"   {binary_col}: {df_transformed[binary_col].unique()}")
    if not df_transformed[binary_col].isin([0, 1]).all():
        non_binary_columns.append(binary_col)

if not non_binary_columns:
    print("   ✓ PASS - All yes/no columns converted to 0/1")
else:
    print(f"   ✗ FAIL - Non-binary values in: {', '.join(non_binary_columns)}")


4. YES/NO TO BINARY
   change_binary: [0 1]
   diabetes_med_binary: [1 0]
   readmitted_binary: [0 1]
   ✓ PASS - All yes/no columns converted to 0/1


In [8]:
# 5. Missing specialty handling
# PASS requires both conditions this cell reports on: the flag column holds
# only 0/1, and the literal 'Missing' label no longer appears in
# medical_specialty.
specialty_flag = df_transformed['specialty_is_missing']
specialty = df_transformed['medical_specialty']
missing_label_present = 'Missing' in specialty.values
flag_is_binary = specialty_flag.isin([0, 1]).all()

print("\n5. MISSING SPECIALTY HANDLING")
print(f"   specialty_is_missing unique: {specialty_flag.unique()}")
print(f"   Medical specialties without 'Missing': {specialty.nunique()}")
print(f"   'Missing' in specialty: {missing_label_present}")
if flag_is_binary and not missing_label_present:
    print("   ✓ PASS - Missing specialty flag created")
elif not flag_is_binary:
    print("   ✗ FAIL - specialty_is_missing contains values other than 0/1")
else:
    print("   ✗ FAIL - 'Missing' is still present in medical_specialty")


5. MISSING SPECIALTY HANDLING
   specialty_is_missing unique: [1 0]
   Medical specialties without 'Missing': 7
   'Missing' in specialty: False
   ✓ PASS - Missing specialty flag created


In [9]:
# 6. Derived features
# Prints one summary statistic per derived column. A missing column raises
# KeyError on access, so the final PASS line is only reached when all four exist.
print("\n6. DERIVED FEATURES")

visits_mean = df_transformed['total_previous_visits'].mean()
print(f"   total_previous_visits - Mean: {visits_mean:.2f}")

inpatient_counts = df_transformed['has_inpatient_history'].value_counts().to_dict()
print(f"   has_inpatient_history - Distribution: {inpatient_counts}")

high_medication_counts = df_transformed['high_medication_count'].value_counts().to_dict()
print(f"   high_medication_count - Distribution: {high_medication_counts}")

age_group_levels = df_transformed['age_group'].nunique()
print(f"   age_group - Unique: {age_group_levels}")

print("   ✓ PASS - All derived features created")


6. DERIVED FEATURES
   total_previous_visits - Mean: 1.17
   has_inpatient_history - Distribution: {0: 16537, 1: 8463}
   high_medication_count - Distribution: {0: 13206, 1: 11794}
   age_group - Unique: 6
   ✓ PASS - All derived features created


In [10]:
# 7. New columns summary
# Lists, in sorted order, every column present in the transformed table but
# absent from the original one.
print("\n7. NEW COLUMNS CREATED")
original_column_names = set(df_original.columns)
new_cols = {name for name in df_transformed.columns if name not in original_column_names}
print(f"   Total new columns: {len(new_cols)}")
for col in sorted(new_cols):
    print("   - " + str(col))


7. NEW COLUMNS CREATED
   Total new columns: 38
   - A1Ctest_encoded
   - age_group
   - age_numeric
   - change_binary
   - diabetes_med_binary
   - diag_1_Diabetes
   - diag_1_Digestive
   - diag_1_Injury
   - diag_1_Missing
   - diag_1_Musculoskeletal
   - diag_1_Other
   - diag_1_Respiratory
   - diag_2_Diabetes
   - diag_2_Digestive
   - diag_2_Injury
   - diag_2_Missing
   - diag_2_Musculoskeletal
   - diag_2_Other
   - diag_2_Respiratory
   - diag_3_Diabetes
   - diag_3_Digestive
   - diag_3_Injury
   - diag_3_Missing
   - diag_3_Musculoskeletal
   - diag_3_Other
   - diag_3_Respiratory
   - glucose_test_encoded
   - has_inpatient_history
   - high_medication_count
   - medical_specialty_Emergency/Trauma
   - medical_specialty_Family/GeneralPractice
   - medical_specialty_InternalMedicine
   - medical_specialty_Other
   - medical_specialty_Surgery
   - medical_specialty_Unknown
   - readmitted_binary
   - specialty_is_missing
   - total_previous_visits


In [11]:
# 8. Data quality checks
# The verdict now depends on the computed counts instead of being printed
# unconditionally.
null_total = df_transformed.isnull().sum().sum()   # nulls summed over all columns
duplicate_total = df_transformed.duplicated().sum()  # fully duplicated rows

print("\n8. DATA QUALITY CHECKS")
print(f"   Null values: {null_total}")
print(f"   Duplicate rows: {duplicate_total}")
if null_total == 0 and duplicate_total == 0:
    print("   ✓ PASS - No null values or duplicates")
else:
    print("   ✗ FAIL - Null values or duplicate rows found")


8. DATA QUALITY CHECKS
   Null values: 0
   Duplicate rows: 0
   ✓ PASS - No null values or duplicates


In [12]:
# 9. Target variable check
# Reports the class balance of readmitted_binary and passes only when every
# value is 0 or 1.
target = df_transformed['readmitted_binary']
n_rows = len(df_transformed)
n_not_readmitted = (target == 0).sum()
n_readmitted = (target == 1).sum()
target_ok = bool(target.isin([0, 1]).all())

print("\n9. TARGET VARIABLE CHECK")
print("   readmitted_binary distribution:")
print(f"   - 0 (Not readmitted): {n_not_readmitted} ({n_not_readmitted/n_rows*100:.2f}%)")
print(f"   - 1 (Readmitted): {n_readmitted} ({n_readmitted/n_rows*100:.2f}%)")
if target_ok:
    print("   ✓ PASS - Target variable properly encoded")
else:
    print("   ✗ FAIL - readmitted_binary contains values other than 0/1")


def _only_contains(column, allowed):
    """Return True when every value of df_transformed[column] is in `allowed`."""
    return bool(df_transformed[column].isin(allowed).all())


# Final banner. Every check is re-evaluated here from the two frames so that the
# closing verdict reflects the data and does not depend on state from other
# cells. The criteria follow the PASS messages of the numbered sections.
validation_results = {
    "1. row count": len(df_original) == n_rows,
    "2. age conversion": bool(pd.api.types.is_numeric_dtype(df_transformed['age_numeric'])),
    "3. test result standardization": all(
        _only_contains(name, [0, 1, 2])
        for name in ('glucose_test_encoded', 'A1Ctest_encoded')
    ),
    "4. yes/no to binary": all(
        _only_contains(name, [0, 1])
        for name in ('change_binary', 'diabetes_med_binary', 'readmitted_binary')
    ),
    "5. missing specialty handling": (
        _only_contains('specialty_is_missing', [0, 1])
        and 'Missing' not in df_transformed['medical_specialty'].values
    ),
    "6. derived features": {
        'total_previous_visits', 'has_inpatient_history',
        'high_medication_count', 'age_group',
    } <= set(df_transformed.columns),
    "8. data quality": (
        df_transformed.isnull().sum().sum() == 0
        and df_transformed.duplicated().sum() == 0
    ),
    "9. target variable": target_ok,
}
failed_checks = [label for label, passed in validation_results.items() if not passed]

print("\n" + "="*80)
if failed_checks:
    print("VALIDATION COMPLETE - FAILED CHECKS: " + "; ".join(failed_checks))
else:
    print("VALIDATION COMPLETE - ALL CHECKS PASSED ✓")
print("="*80)



9. TARGET VARIABLE CHECK
   readmitted_binary distribution:
   - 0 (Not readmitted): 13246 (52.98%)
   - 1 (Readmitted): 11754 (47.02%)
   ✓ PASS - Target variable properly encoded

VALIDATION COMPLETE - ALL CHECKS PASSED ✓


In [13]:
# Display sample
# Prints the first five rows for a hand-picked subset of columns.
print("\nSample of transformed data (first 5 rows, selected columns):")
sample_cols = (
    ['age', 'age_numeric', 'age_group']
    + ['glucose_test', 'glucose_test_encoded']
    + ['A1Ctest', 'A1Ctest_encoded']
    + ['readmitted', 'readmitted_binary']
    + ['medical_specialty', 'specialty_is_missing']
    + ['total_previous_visits', 'has_inpatient_history']
)
sample_preview = df_transformed[sample_cols].head(5)
print(sample_preview)


Sample of transformed data (first 5 rows, selected columns):
       age  age_numeric age_group glucose_test  glucose_test_encoded A1Ctest  \
0  [70-80)           75     70-80           no                     0      no   
1  [70-80)           75     70-80           no                     0      no   
2  [50-60)           55     50-60           no                     0      no   
3  [70-80)           75     70-80           no                     0      no   
4  [60-70)           65     60-70           no                     0      no   

   A1Ctest_encoded readmitted  readmitted_binary medical_specialty  \
0                0         no                  0           Unknown   
1                0         no                  0             Other   
2                0        yes                  1           Unknown   
3                0        yes                  1           Unknown   
4                0         no                  0  InternalMedicine   

   specialty_is_missing  total_previ