# In [17]:
# Ques1.ipynb
# Understanding and Defining Data Quality Metrics

import pandas as pd
import numpy as np

# Read the dataset that the quality metrics below are computed on.
df = pd.read_csv('data.csv')  # Replace with your file

# Print a header followed by the first rows so the loaded columns can be
# checked by eye before any metric is computed.
print("### Data Preview ###", df.head(), sep="\n")

# Define data quality metric functions
def completeness(column):
    """Return the fraction of entries in ``column`` that are not null.

    Args:
        column: A pandas Series.

    Returns:
        The number of non-null entries divided by the total number of
        entries in the column.
    """
    total_entries = len(column)
    filled_entries = column.notna().sum()
    return filled_entries / total_entries

def uniqueness(column):
    """Return the ratio of distinct non-null values to the column length.

    Null entries are not counted as distinct values (``nunique`` skips them
    by default) but they are still part of the denominator, so missing data
    lowers the score.

    Args:
        column: A pandas Series.

    Returns:
        ``column.nunique() / len(column)``, or NaN when the column has no
        entries at all.
    """
    total_entries = len(column)
    if total_entries == 0:
        # nunique() yields a plain int here, so 0 / 0 would raise
        # ZeroDivisionError; report "undefined" as NaN like the other metrics.
        return np.nan
    return column.nunique() / total_entries

def consistency(column):
    """Return the share of the most common value after text normalisation.

    Values are stripped of surrounding whitespace and lower-cased before
    counting, so ``'Yes'``, ``' yes'`` and ``'yes '`` are treated as the same
    value. Null entries are ignored by the count.

    Args:
        column: A pandas Series.

    Returns:
        The relative frequency (0-1) of the most frequent normalised value,
        or NaN when the column is not text-like or has no non-null values.
    """
    dtype = column.dtype
    # Accept both classic object columns and pandas' dedicated string dtypes;
    # comparing the dtype to 'object' alone skips the latter.
    is_text = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    if not is_text:
        return np.nan

    normalized = column.str.strip().str.lower()
    shares = normalized.value_counts(normalize=True)
    if shares.empty:
        # No non-null values: there is no dominant value to report.
        return np.nan
    return float(shares.max())

def validity(column, valid_range=None):
    """Return the fraction of entries that fall inside ``valid_range``.

    Both bounds are inclusive (the default of ``Series.between``).

    Args:
        column: A pandas Series.
        valid_range: Optional sequence whose first two items are the lower
            and upper bound. ``None`` or an empty sequence means no range is
            defined for this column.

    Returns:
        The mean of the in-range mask, or NaN when no range is given.
    """
    # Test for "no range" explicitly: truthiness is ambiguous for array-like
    # ranges such as NumPy arrays and raises instead of answering.
    if valid_range is None or len(valid_range) == 0:
        return np.nan
    lower, upper = valid_range[0], valid_range[1]
    return column.between(lower, upper).mean()

def accuracy(column, reference_column):
    """Return the fraction of entries that agree with ``reference_column``.

    Two entries agree when they compare equal, or when both are missing.
    Counting shared missing values as agreement keeps the element-wise path
    consistent with the ``equals`` shortcut, which already treats NaNs in the
    same position as equal.

    Args:
        column: A pandas Series holding the values under test.
        reference_column: The trusted values to compare against.

    Returns:
        1.0 when ``column.equals(reference_column)``, otherwise the mean of
        the element-wise agreement mask.
    """
    if column.equals(reference_column):
        return 1.0
    # NaN == NaN is False, so a plain == would mark every shared missing
    # value as a mismatch as soon as any single row differs.
    both_missing = pd.isna(column) & pd.isna(reference_column)
    agreement = (column == reference_column) | both_missing
    return agreement.mean()

def timeliness(date_column, reference_date):
    """Return the fraction of dates that are on or before ``reference_date``.

    Args:
        date_column: A pandas Series; only datetime64 columns are scored.
        reference_date: The cut-off the dates are compared against.

    Returns:
        The mean of ``date_column <= reference_date``, or NaN when the
        column does not have a datetime64 dtype.
    """
    is_datetime = pd.api.types.is_datetime64_any_dtype(date_column)
    if not is_datetime:
        return np.nan
    on_time = date_column <= reference_date
    return on_time.mean()

# Apply metrics to dataset
def _rounded_or_none(score):
    """Round ``score`` to two decimals; a missing score becomes ``None``."""
    return None if pd.isna(score) else round(score, 2)


metrics = []
for col in df.columns:
    series = df[col]
    comp = completeness(series)
    uniq = uniqueness(series)
    cons = consistency(series)
    # No range is passed here; supply one per column to score validity.
    val = validity(series)
    # Simplified Data Quality Index: mean of the scores below, skipping NaN.
    dqi = np.nanmean([comp, uniq, cons])

    # Build the report row key by key so the column order stays explicit.
    row = {'Column': col}
    row['Completeness'] = round(comp, 2)
    row['Uniqueness'] = round(uniq, 2)
    row['Consistency'] = _rounded_or_none(cons)
    row['Validity'] = _rounded_or_none(val)
    row['DQI'] = round(dqi, 2)
    metrics.append(row)

# Output Data Quality Metrics Table
dqi_df = pd.DataFrame(metrics)
print("\n### Data Quality Metrics ###", dqi_df, sep="\n")

# Save to CSV (without the DataFrame index column)
dqi_df.to_csv('defined_data_quality_metrics.csv', index=False)
print("\nMetrics saved to 'defined_data_quality_metrics.csv'")

# Output of the run above (the input file was not present):
# FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'