<a href="https://colab.research.google.com/github/GurramVishalReddy/Techsophy-EHR-Data-Quality-Auditor/blob/main/techsophy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/ChronicKidneyDisease_EHRs_from_AbuDhabi.csv')
print("✅ Dataset Loaded")
df.head()

EXPECTED_COLUMNS = [
    'Sex', 'AgeBaseline', 'HistoryDiabetes', 'HistoryCHD', 'HistoryVascular',
    'HistorySmoking', 'HistoryHTN', 'HistoryDLD', 'HistoryObesity', 'DLDmeds',
    'ACEIARB', 'CholesterolBaseline', 'CreatinineBaseline', 'eGFRBaseline',
    'sBPBaseline', 'dBPBaseline', 'BMIBaseline', 'TimeToEventMonths',
    'EventCKD35', 'TIME_YEAR'
]

COLUMN_RANGES = {
    'AgeBaseline': (0, 120),
    'CholesterolBaseline': (2, 10),
    'CreatinineBaseline': (20, 150),
    'eGFRBaseline': (0, 150),
    'sBPBaseline': (80, 200),
    'dBPBaseline': (50, 130),
    'BMIBaseline': (10, 60),
}

CATEGORICAL_COLUMNS = {
    'Sex': [0, 1],
    'HistoryDiabetes': [0, 1],
    'HistoryCHD': [0, 1],
    'HistoryVascular': [0, 1],
    'HistorySmoking': [0, 1],
    'HistoryHTN': [0, 1],
    'HistoryDLD': [0, 1],
    'HistoryObesity': [0, 1],
    'DLDmeds': [0, 1],
    'ACEIARB': [0, 1],
    'EventCKD35': [0, 1]
}

def validate_data(df):
    report = {}
    report['Missing Columns'] = [col for col in EXPECTED_COLUMNS if col not in df.columns]
    for col, valid_vals in CATEGORICAL_COLUMNS.items():
        if col in df.columns:
            invalid_rows = df[~df[col].isin(valid_vals)]
            report[f'Invalid Values in {col}'] = len(invalid_rows)
    for col, (min_val, max_val) in COLUMN_RANGES.items():
        if col in df.columns:
            outliers = df[(df[col] < min_val) | (df[col] > max_val)]
            report[f'Out of Range in {col}'] = len(outliers)

    return report

def detect_errors(df):
    errors = {
        'Missing Values per Column': df.isnull().sum().to_dict(),
        'Duplicate Rows': df.duplicated().sum()
    }
    return errors

def generate_report(validation_report, error_report):
    print("\n🔎 Data Validation Summary")
    print("-" * 40)
    for key, val in validation_report.items():
        print(f"{key}: {val}")

    print("\n⚠️ Error Detection Summary")
    print("-" * 40)
    for key, val in error_report.items():
        print(f"{key}: {val}")

validation_report = validate_data(df)
error_report = detect_errors(df)
generate_report(validation_report, error_report)


✅ Dataset Loaded

🔎 Data Validation Summary
----------------------------------------
Missing Columns: ['HistoryHTN']
Invalid Values in Sex: 0
Invalid Values in HistoryDiabetes: 0
Invalid Values in HistoryCHD: 0
Invalid Values in HistoryVascular: 0
Invalid Values in HistorySmoking: 0
Invalid Values in HistoryDLD: 0
Invalid Values in HistoryObesity: 0
Invalid Values in DLDmeds: 0
Invalid Values in ACEIARB: 0
Invalid Values in EventCKD35: 0
Out of Range in AgeBaseline: 0
Out of Range in CholesterolBaseline: 0
Out of Range in CreatinineBaseline: 1
Out of Range in eGFRBaseline: 1
Out of Range in sBPBaseline: 0
Out of Range in dBPBaseline: 3
Out of Range in BMIBaseline: 0

⚠️ Error Detection Summary
----------------------------------------
Missing Values per Column: {'Sex': 0, 'AgeBaseline': 0, 'HistoryDiabetes': 0, 'HistoryCHD': 0, 'HistoryVascular': 0, 'HistorySmoking': 0, 'HistoryHTN ': 0, 'HistoryDLD': 0, 'HistoryObesity': 0, 'DLDmeds': 0, 'DMmeds': 0, 'HTNmeds': 0, 'ACEIARB': 0, 'Choles