# In [None]:
# Import necessary libraries
import pandas as pd
import re

# Read the dataset into the DataFrame that every check below inspects.
# Replace the path with your real file; for an Excel workbook call
# pd.read_excel('your_file.xlsx') instead.
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# =======================
# UNIQUENESS CHECKS
# =======================

# 1. Fully duplicated rows: every repeat of a row after its first occurrence
#    (DataFrame.duplicated() leaves the first occurrence unmarked by default).
duplicate_rows = df.loc[df.duplicated()]
print("Duplicate Rows:", duplicate_rows, sep="\n")

# 2. Rows whose 'id' repeats the 'id' of an earlier row.
#    The check is skipped when the dataset has no 'id' column.
if 'id' in df:
    duplicate_ids = df.loc[df['id'].duplicated()]
    print("\nDuplicate IDs:", duplicate_ids, sep="\n")

# 3. Rows that repeat an earlier row's ('name', 'email') pair.
#    Both columns must be present, otherwise the check is skipped.
if all(column in df for column in ('name', 'email')):
    duplicate_combinations = df.loc[df.duplicated(['name', 'email'])]
    print("\nDuplicate Name + Email combinations:", duplicate_combinations, sep="\n")

# =======================
# VALIDITY CHECKS
# =======================

# 1. Domain check: 'gender' must be one of the allowed labels.
#    Anything else, including a missing value, is reported.
if 'gender' in df:
    valid_genders = ['Male', 'Female', 'Other']
    invalid_gender = df.loc[~df['gender'].isin(valid_genders)]
    print("\nInvalid Gender Values:", invalid_gender, sep="\n")

# 2. Format check: 'email' must look like local-part@domain.tld.
#    Values are compared as text, so entries that do not match the pattern
#    (missing ones included) are reported.
if 'email' in df:
    email_pattern = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
    invalid_email = df.loc[
        ~df['email'].astype(str).str.match(pat=email_pattern, na=False)
    ]
    print("\nInvalid Email Formats:", invalid_email, sep="\n")

# 3. Length Check (e.g., Phone number length = 10)
def _phone_as_text(value):
    """Return a phone cell as the text whose length should be checked.

    pandas stores an integer column that contains any missing value as
    float64, so 1234567890 would otherwise be rendered as '1234567890.0'
    (12 characters) and every row would fail the 10-character check.
    """
    if pd.isna(value):
        return ''  # missing numbers are still reported as invalid
    if isinstance(value, float) and value.is_integer():
        return str(int(value))  # drop the '.0' added by the float upcast
    return str(value)


if 'phone' in df.columns:
    # astype(str) keeps the .str accessor usable even for an empty column.
    phone_text = df['phone'].map(_phone_as_text).astype(str)
    invalid_phone_length = df[phone_text.str.len() != 10]
    print("\nInvalid Phone Number Lengths:")
    print(invalid_phone_length)

# 4. Missing values: number of null cells in each column.
missing_values = df.isna().sum()
print("\nMissing Values in Each Column:", missing_values, sep="\n")

# Optional summary: row count, number of fully duplicated rows, and number of
# columns that contain at least one missing value.
print(
    "\n=== Summary ===",
    f"Total Rows: {df.shape[0]}",
    f"Duplicate Rows: {duplicate_rows.shape[0]}",
    f"Missing Value Columns: {(missing_values > 0).sum()}",
    sep="\n",
)