In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

# Example patient data
# The sample deliberately contains dirty values (a NaN name, 'not_available'
# placeholders, malformed e-mail addresses, empty strings) for the cleaning
# steps below to work on.
data = {
    'Patient_ID': [101, 102, 103, 104, 105],
    'Name': ['John Doe', 'Jane Smith', np.nan, 'Mary Johnson', 'James Brown'],
    'Date_of_Birth': ['1985-06-15', '1990-02-28', '1987-11-21', 'not_available', '1982-12-03'],
    'Email': ['john@example.com', 'jane.smith#example.com', 'mary.j@example', 'james@brown.com', ''],
    'Phone_Number': ['123-456-7890', '234-567-8901', '345-678-9012', 'not_available', ''],
    'Diagnosis': ['Flu', 'Cold', 'Covid-19', 'Asthma', 'Diabetes'],
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 1: Handle Missing Data
# Results are assigned back to the column instead of using `inplace=True` on
# `df[...]`: an in-place call on a column selection is chained assignment,
# which warns on recent pandas and no longer updates `df` under Copy-on-Write.

# Fill missing 'Name' with 'Unknown'
df['Name'] = df['Name'].fillna('Unknown')

# Treat the 'not_available' placeholder in 'Phone_Number' as missing, then
# label every missing phone number as 'No Contact'
df['Phone_Number'] = (
    df['Phone_Number'].replace('not_available', np.nan).fillna('No Contact')
)

# Step 2: Validate Date of Birth (DOB) Format
def validate_dob(dob):
    """Parse a date of birth written as 'YYYY-MM-DD'.

    Args:
        dob: Raw cell value from the 'Date_of_Birth' column.

    Returns:
        A ``datetime`` when ``dob`` is a string holding a real calendar date
        in 'YYYY-MM-DD' form; ``np.nan`` for anything else (placeholder text,
        impossible dates such as '1990-02-30', or non-string values such as
        NaN or None).
    """
    try:
        return datetime.strptime(dob, '%Y-%m-%d')
    except (TypeError, ValueError):
        # ValueError: the text is not a valid date in this format.
        # TypeError: strptime() only accepts str, so NaN/None cells end up
        # here instead of aborting the whole column-wide apply.
        return np.nan

# Run every raw DOB entry through validate_dob and store the results back
# in the same column.
df['Date_of_Birth'] = df['Date_of_Birth'].map(validate_dob)

# Step 3: Validate Email Format
# Deliberately simple shape check: a local part, one '@', and a domain that
# contains a dot, with no whitespace and no additional '@' anywhere.
_EMAIL_PATTERN = re.compile(r"[^@\s]+@[^@\s]+\.[^@\s]+")


def validate_email(email):
    """Return ``email`` if it looks like an e-mail address, else ``np.nan``.

    Args:
        email: Raw cell value from the 'Email' column.

    Returns:
        The unchanged string when the *entire* value matches the simple
        ``local@domain.tld`` shape; ``np.nan`` for malformed or empty strings
        and for non-string values such as NaN or None.
    """
    # fullmatch() requires the whole string to match; a start-anchored match
    # would accept 'a@b.com@evil.org' on the strength of its valid prefix.
    # The isinstance guard keeps NaN/None cells from raising TypeError.
    if isinstance(email, str) and _EMAIL_PATTERN.fullmatch(email):
        return email
    return np.nan

# Replace each address with validate_email's verdict for it.
df['Email'] = df['Email'].map(validate_email)

# Step 4: Remove Duplicate Records
# Rows sharing a Patient_ID collapse to the first one encountered
# (keeping the first occurrence is drop_duplicates' default).
df.drop_duplicates(subset='Patient_ID', inplace=True)

# Step 5: Validate Phone Numbers
# Exactly three digits, dash, three digits, dash, four digits (XXX-XXX-XXXX).
_PHONE_PATTERN = re.compile(r"[0-9]{3}-[0-9]{3}-[0-9]{4}")


def validate_phone(phone):
    """Return ``phone`` if it has the form XXX-XXX-XXXX, else 'Invalid Phone'.

    Args:
        phone: Raw cell value from the 'Phone_Number' column.

    Returns:
        The unchanged string when it consists of ASCII digits grouped 3-3-4
        and separated by dashes; the marker string 'Invalid Phone' for any
        other string and for non-string values such as NaN or None.
    """
    # Checking only the length and the dash positions would let values like
    # 'abc-def-ghij' through, so the digits themselves are matched as well.
    # The isinstance guard keeps NaN/None cells from raising TypeError.
    if isinstance(phone, str) and _PHONE_PATTERN.fullmatch(phone):
        return phone
    return 'Invalid Phone'  # Mark invalid phones with 'Invalid Phone'

# Validate only real entries: rows labelled 'No Contact' in Step 1 are
# known-missing rather than malformed, so they keep that label instead of
# being overwritten with 'Invalid Phone'.
phone_numbers = df['Phone_Number']
df['Phone_Number'] = phone_numbers.where(
    phone_numbers == 'No Contact', phone_numbers.apply(validate_phone)
)

# Step 6: Handle Missing Diagnosis
# Assign the result back rather than calling fillna(inplace=True) on the
# column selection: that is chained assignment, which warns on recent pandas
# and no longer updates `df` under Copy-on-Write.
df['Diagnosis'] = df['Diagnosis'].fillna('Unknown')  # Fill missing diagnoses with 'Unknown'

# Display the cleaned data
print(df)