In [1]:
# Ques_13.ipynb
# Introduction to Data Quality Metrics & Scoring

import pandas as pd
import numpy as np

# -------------------------------
# Step 1: Sample Dataset
# -------------------------------
# Toy records: every column holds exactly one missing entry, and the Email
# column repeats 'bob@mail.com', so the metrics below have something to report.
data = dict(
    ID=[1, 2, 3, 4, None],
    Name=['Alice', 'Bob', 'Charlie', None, 'Eve'],
    Email=[
        'alice@mail.com',
        'bob@mail.com',
        None,
        'bob@mail.com',
        'eve@mail.com',
    ],
    Age=[25, 30, np.nan, 22, 29],
)

df = pd.DataFrame(data=data)

print("🔹 Sample Dataset:")
display(df)

# -------------------------------
# Step 2: Completeness Metric
# -------------------------------
def completeness(df):
    """Return the percentage of non-missing entries in each column.

    Args:
        df: DataFrame to profile.

    Returns:
        Series indexed by column name; each value is the share of non-null
        cells in that column, scaled to 0-100.
    """
    present_mask = df.notna()
    # The mean of a boolean mask is the fraction of True cells per column.
    share_present = present_mask.mean()
    return share_present * 100

# -------------------------------
# Step 3: Uniqueness Metric
# -------------------------------
def uniqueness(df, ignore_nulls=False):
    """Return the percentage of distinct values in each column.

    Distinct values are counted with ``DataFrame.nunique``, which leaves
    missing values out of the count.

    Args:
        df: DataFrame to profile.
        ignore_nulls: Chooses the denominator. With the default ``False`` the
            distinct count is divided by the total number of rows, so a
            missing value lowers the score even when every present value is
            different. With ``True`` it is divided by the number of non-null
            entries in that column, so the score reflects duplicates only.

    Returns:
        Series indexed by column name, scaled to 0-100. A column whose
        denominator is zero (no rows, or no non-null entries when
        ``ignore_nulls`` is set) yields NaN.
    """
    distinct = df.nunique()
    # ``count`` is per-column non-null totals; ``len`` is the shared row count.
    denominator = df.count() if ignore_nulls else len(df)
    return distinct / denominator * 100

# -------------------------------
# Step 4: Validity Check for Age
# -------------------------------
def validity_age(df, column='Age', min_age=0, max_age=120):
    """Return the percentage of non-null ages that lie inside an accepted range.

    Args:
        df: DataFrame containing the column to check.
        column: Name of the column holding ages. Defaults to ``'Age'``.
        min_age: Smallest value accepted as valid (inclusive). Defaults to 0.
        max_age: Largest value accepted as valid (inclusive). Defaults to 120.

    Returns:
        float: Share of non-null values within ``[min_age, max_age]``, scaled
        to 0-100. Returns 0.0 when the column has no non-null values.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    ages = df[column]
    # Missing values evaluate to False in ``between``, so they are never
    # counted as valid; they are also excluded from the denominator below.
    in_range = ages.between(min_age, max_age).sum()
    non_null = ages.notna().sum()
    if non_null == 0:
        return 0.0
    return float(in_range / non_null * 100)

# -------------------------------
# Step 5: Display Results
# -------------------------------
# Per-column metrics share one layout: a header line, then the Series
# rounded to two decimals.
for header, metric in (
    ("\n✅ Completeness (%):", completeness),
    ("\n✅ Uniqueness (%):", uniqueness),
):
    print(header)
    print(metric(df).round(2))

# Age validity is a single number, so it is formatted inline.
print(f"\n✅ Validity of Age (0–120): {validity_age(df):.2f}%")


🔹 Sample Dataset:


Unnamed: 0,ID,Name,Email,Age
0,1.0,Alice,alice@mail.com,25.0
1,2.0,Bob,bob@mail.com,30.0
2,3.0,Charlie,,
3,4.0,,bob@mail.com,22.0
4,,Eve,eve@mail.com,29.0



✅ Completeness (%):
ID       80.0
Name     80.0
Email    80.0
Age      80.0
dtype: float64

✅ Uniqueness (%):
ID       80.0
Name     80.0
Email    60.0
Age      80.0
dtype: float64

✅ Validity of Age (0–120): 100.00%
