# In [None]:
# Ques_1.ipynb – Understanding and Defining Data Quality Metrics

# Import necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime

# Read the CSV into a DataFrame (point the path at your own dataset)
df = pd.read_csv("your_dataset.csv")  # placeholder path; swap in the real file

# Quick look at the data: dimensions first, then a preview of the top rows
print("Dataset Shape:", df.shape)
print("First 5 rows:")
display(df.head(n=5))

# -----------------------------
# 1. COMPLETENESS
# Share of cells in the whole table that hold a non-null value
# -----------------------------
missing_data = df.isna().sum()  # per-column count of null cells
missing_cells = missing_data.sum()  # null cells across every column
total_cells = df.size  # rows * columns
missing_ratio = missing_cells / total_cells
completeness_score = 1 - missing_ratio
print(f"Completeness Score: {completeness_score:.2f}")

# -----------------------------
# 2. UNIQUENESS
# Share of rows that are not a full-row repeat of an earlier row
# -----------------------------
# keep="first" (the pandas default) flags only the later copies as duplicates
duplicate_count = df.duplicated(keep="first").sum()
duplicate_ratio = duplicate_count / len(df)
uniqueness_score = 1 - duplicate_ratio
print(f"Uniqueness Score: {uniqueness_score:.2f}")

# -----------------------------
# 3. VALIDITY
# Example: Check if 'age' column only contains positive integers
# Score = valid entries / non-null entries; NaN when there is nothing to score.
# -----------------------------
if 'age' in df.columns:
    # Coerce to numbers first: a CSV column holding stray text is read as
    # strings, and comparing a string with 0 would raise TypeError. Anything
    # that cannot be parsed becomes NaN and is simply counted as invalid.
    age_numeric = pd.to_numeric(df['age'], errors='coerce')
    # Valid means strictly positive AND integer-valued (25.0 passes, 25.5 fails)
    is_valid_age = (age_numeric > 0) & (age_numeric % 1 == 0)
    valid_age_count = is_valid_age.sum()
    # Nulls are a completeness problem, so only non-null entries are scored
    non_null_age_count = df['age'].notnull().sum()
    if non_null_age_count > 0:
        validity_score = valid_age_count / non_null_age_count
    else:
        validity_score = np.nan
else:
    validity_score = np.nan
    print("Column 'age' not found.")
print(f"Validity Score: {validity_score:.2f}")

# -----------------------------
# 4. CONSISTENCY
# Example: Check if end_date >= start_date
# Score = rows with end_date >= start_date / rows where both dates are present.
# Rows with a missing or unparseable date cannot be compared, so they are
# excluded from the denominator instead of being counted as inconsistent.
# -----------------------------
if 'start_date' in df.columns and 'end_date' in df.columns:
    # errors='coerce' turns unparseable values into NaT instead of raising;
    # the converted columns are written back to df
    df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')
    df['end_date'] = pd.to_datetime(df['end_date'], errors='coerce')
    comparable = df['start_date'].notnull() & df['end_date'].notnull()
    comparable_rows = comparable.sum()
    consistent_rows = (comparable & (df['end_date'] >= df['start_date'])).sum()
    # Guard the division: an empty frame or all-missing dates yields NaN
    if comparable_rows > 0:
        consistency_score = consistent_rows / comparable_rows
    else:
        consistency_score = np.nan
else:
    consistency_score = np.nan
    print("Required date columns not found.")
print(f"Consistency Score: {consistency_score:.2f}")

# -----------------------------
# 5. TIMELINESS
# Example: Is the latest record within the last 30 days?
# Score is 1 when the newest parseable date is at most 30 days old, else 0;
# NaN when the column is absent or holds no parseable date.
# -----------------------------
if 'date_column' in df.columns:
    # errors='coerce' turns unparseable values into NaT instead of raising
    df['date_column'] = pd.to_datetime(df['date_column'], errors='coerce')
    latest_date = df['date_column'].max()
    if pd.notnull(latest_date):
        # Take "now" in the same timezone as the data. Subtracting a tz-aware
        # timestamp from a naive datetime raises TypeError; with tz=None this
        # is the naive local time, matching naive data.
        current_time = pd.Timestamp.now(tz=getattr(latest_date, 'tzinfo', None))
        days_diff = (current_time - latest_date).days
        timeliness_score = 1 if days_diff <= 30 else 0
    else:
        timeliness_score = np.nan
else:
    timeliness_score = np.nan
    print("Column 'date_column' not found.")
print(f"Timeliness Score: {timeliness_score}")

# -----------------------------
# OVERALL DATA QUALITY SCORE (Optional)
# Weighted average of the individual metrics that could be computed
# -----------------------------
# Relative importance of each quality dimension
weights = {
    'completeness': 0.3,
    'uniqueness': 0.2,
    'validity': 0.2,
    'consistency': 0.2,
    'timeliness': 0.1
}

# Scores gathered from the sections above, keyed like `weights`
metrics = {
    'completeness': completeness_score,
    'uniqueness': uniqueness_score,
    'validity': validity_score,
    'consistency': consistency_score,
    'timeliness': timeliness_score
}

# Metrics left as NaN are dropped, and the remaining weights are
# renormalised by dividing by their own total.
available = [name for name in metrics if not np.isnan(metrics[name])]
weighted_sum = sum(weights[name] * metrics[name] for name in available)
total_weight = sum(weights[name] for name in available)
if total_weight > 0:
    overall_score = weighted_sum / total_weight
else:
    overall_score = np.nan

print(f"\nOverall Data Quality Score: {overall_score:.2f}")