In [9]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import re

# Step 2: Simulated patient data
# Column-oriented sample records. Several values are deliberately bad so the
# checks below have something to flag: a None name, an unparseable DOB
# ('InvalidDate'), a non-existent calendar date ('2000-02-30'), a gender
# outside the accepted codes ('Unknown'), an e-mail without '@', a None
# contact number and a nine-digit contact number.
data = dict(
    PatientID=[101, 102, 103, 104, 105],
    Name=['Alice', 'Bob', 'Charlie', None, 'Eve'],
    DOB=['1985-05-12', '1990-08-01', 'InvalidDate', '2000-02-30', '1975-12-31'],
    Gender=['F', 'M', 'M', 'F', 'Unknown'],
    Email=['alice@mail.com', 'bob@mail.com', 'invalid_email', 'david@mail.com', 'eve@mail.com'],
    ContactNumber=['1234567890', None, '9876543210', '123456789', '1234567890'],
)
df = pd.DataFrame(data=data)

# Step 3: Define data accuracy checks

def check_missing_names(df):
    """Return the rows of ``df`` whose ``Name`` value is null.

    The result keeps the original index labels and all columns.
    """
    name_is_absent = df['Name'].isna()
    return df.loc[name_is_absent]

def check_invalid_dobs(df):
    """Return the rows of ``df`` whose ``DOB`` cannot be parsed as a date.

    Parsing uses ``pd.to_datetime`` with ``errors='coerce'``, so anything
    that fails to parse (including missing values) becomes NaT and is
    reported here.
    """
    parsed_dob = pd.to_datetime(df['DOB'], errors='coerce')
    not_a_date = parsed_dob.isna()
    return df[not_a_date]

def check_invalid_genders(df):
    """Return the rows of ``df`` whose ``Gender`` is not 'M', 'F' or 'Other'.

    The comparison is an exact membership test, so values that differ in
    case, as well as missing values, are reported as invalid.
    """
    accepted_codes = ['M', 'F', 'Other']
    is_accepted = df['Gender'].isin(accepted_codes)
    return df.loc[~is_accepted]

def check_invalid_emails(df):
    """Return the rows of ``df`` whose ``Email`` is not a well-formed address.

    An address is accepted when the *entire* value has the shape
    ``local@domain.tld``, where ``local`` and ``domain`` consist of word
    characters, dots and hyphens, and ``tld`` consists of word characters.
    Missing values are reported as invalid (``na=False``).
    """
    # No ^/$ anchors: fullmatch already requires the whole value to match.
    # The previous ``str.match`` + ``$`` form accepted values ending in a
    # newline, because ``$`` also matches just before a trailing "\n".
    pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    is_well_formed = df['Email'].str.fullmatch(pattern, na=False)
    return df[~is_well_formed]

def check_invalid_contacts(df):
    """Return the rows of ``df`` whose ``ContactNumber`` is not exactly 10 digits.

    Values are converted to strings first, so a numeric column is checked by
    its string form. Missing values never match and are reported as invalid.
    """
    contact_text = df['ContactNumber'].astype(str)
    # fullmatch requires the whole value to be ten digits. The previous
    # ``str.match`` + ``$`` form let "1234567890\n" through, because ``$``
    # also matches just before a trailing newline.
    is_ten_digits = contact_text.str.fullmatch(r'\d{10}', na=False)
    return df[~is_ten_digits]

# Step 4: Run checks
# Apply each validator to the same frame, in order; every result is the
# subset of rows that failed that particular check.
(
    missing_names,
    invalid_dobs,
    invalid_genders,
    invalid_emails,
    invalid_contacts,
) = (
    run_check(df)
    for run_check in (
        check_missing_names,
        check_invalid_dobs,
        check_invalid_genders,
        check_invalid_emails,
        check_invalid_contacts,
    )
)

# Step 5: Print results
# Each entry pairs a heading with the rows flagged by the matching check.
# Headings after the first start with a newline to leave a blank line
# between sections.
report_sections = [
    ("Missing Names:", missing_names),
    ("\nInvalid Dates of Birth:", invalid_dobs),
    ("\nInvalid Genders:", invalid_genders),
    ("\nInvalid Emails:", invalid_emails),
    ("\nInvalid Contact Numbers:", invalid_contacts),
]
for heading, flagged_rows in report_sections:
    print(heading)
    print(flagged_rows)

# Step 6: Summary Report
# A record is "flagged" when its index label appears in at least one check
# result. Flagged records are identified by index label rather than by row
# contents: de-duplicating on contents would merge two distinct source rows
# that happen to hold identical values and so overstate "Valid Records".
# This relies on df having unique index labels, which holds for the default
# index created by pd.DataFrame(data).
flagged_labels = set()
for flagged_frame in (missing_names, invalid_dobs, invalid_genders, invalid_emails, invalid_contacts):
    flagged_labels.update(flagged_frame.index)

issues = {
    "Missing Names": len(missing_names),
    "Invalid DOBs": len(invalid_dobs),
    "Invalid Genders": len(invalid_genders),
    "Invalid Emails": len(invalid_emails),
    "Invalid Contacts": len(invalid_contacts),
    # Records that passed every check.
    "Valid Records": len(df) - len(flagged_labels),
}

summary_df = pd.DataFrame(list(issues.items()), columns=['Issue', 'Count'])
print("\nData Accuracy Summary:")
print(summary_df)

Missing Names:
   PatientID  Name         DOB Gender           Email ContactNumber
3        104  None  2000-02-30      F  david@mail.com     123456789

Invalid Dates of Birth:
   PatientID     Name          DOB Gender           Email ContactNumber
2        103  Charlie  InvalidDate      M   invalid_email    9876543210
3        104     None   2000-02-30      F  david@mail.com     123456789

Invalid Genders:
   PatientID Name         DOB   Gender         Email ContactNumber
4        105  Eve  1975-12-31  Unknown  eve@mail.com    1234567890

Invalid Emails:
   PatientID     Name          DOB Gender          Email ContactNumber
2        103  Charlie  InvalidDate      M  invalid_email    9876543210

Invalid Contact Numbers:
   PatientID  Name         DOB Gender           Email ContactNumber
1        102   Bob  1990-08-01      M    bob@mail.com          None
3        104  None  2000-02-30      F  david@mail.com     123456789

Data Accuracy Summary:
              Issue  Count
0     Missing Na