# In [None]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

# Step 2: Load Sample Dataset
# Note: customer_id 5 appears twice and one email is missing, so the
# uniqueness and completeness KPIs below score under 100%.
data = {
    'customer_id': [1, 2, 3, 4, 5, 5],
    'email': ['a@example.com', 'b@example.com', None, 'd@example.com', 'e@example.com', 'e@example.com'],
    'signup_date': ['2025-05-01', '2025-05-02', '2025-05-03', '2025-05-04', '2025-05-05', '2025-05-05'],
    'last_login': ['2025-05-10', '2025-05-11', '2025-05-12', '2025-05-13', '2025-05-14', '2025-05-14'],
    'region': ['North', 'South', 'East', 'West', 'North', 'North'],
    'phone': ['1234567890', '2345678901', '3456789012', '4567890123', '5678901234', '5678901234'],
}

# Build the frame and parse both date columns from ISO strings in one chain.
df = pd.DataFrame(data).assign(
    signup_date=lambda frame: pd.to_datetime(frame['signup_date']),
    last_login=lambda frame: pd.to_datetime(frame['last_login']),
)

# Step 3: Define KPI Functions

# Completeness
def completeness(df):
    """Return the percentage of non-null values for each column of ``df``.

    The result is a Series indexed by column name; each value is the share
    of rows (0-100) in which that column holds a non-null value.
    """
    present_mask = df.notna()
    # The mean of a boolean column is the fraction of True entries.
    fraction_present = present_mask.mean()
    return fraction_present * 100

# Accuracy (basic validation)
def accuracy(df):
    """Score basic format accuracy of the ``email`` and ``phone`` columns.

    Null entries are dropped before scoring; each score is the percentage
    (0-100) of the remaining values that pass the check:

    * email: the value is a string containing ``'@'``.
    * phone: the value's string form is exactly 10 ASCII digits.

    Returns:
        dict with the keys ``'email_accuracy'`` and ``'phone_accuracy'``.
    """
    def _looks_like_email(value):
        # Non-string entries count as invalid; a bare `'@' in value` would
        # raise TypeError on e.g. an integer.
        return isinstance(value, str) and '@' in value

    def _looks_like_phone(value):
        text = str(value)
        # A length check alone would accept any 10-character string
        # (e.g. 'abcdefghij'), so also require ASCII digits.
        return len(text) == 10 and text.isascii() and text.isdigit()

    email_accuracy = df['email'].dropna().apply(_looks_like_email).mean() * 100
    phone_accuracy = df['phone'].dropna().apply(_looks_like_phone).mean() * 100
    return {'email_accuracy': email_accuracy, 'phone_accuracy': phone_accuracy}

# Consistency
def consistency(df):
    """Return the percentage of rows whose signup does not follow the login.

    A row counts as consistent when ``signup_date <= last_login``.

    Returns:
        dict with the single key ``'signup_before_login'`` (0-100).
    """
    signed_up = df['signup_date']
    logged_in = df['last_login']
    in_order = signed_up.le(logged_in)
    return {'signup_before_login': in_order.mean() * 100}

# Uniqueness
def uniqueness(df):
    """Return the percentage of ``customer_id`` values that are distinct.

    Computed as the number of distinct ids divided by the number of
    non-null ids, times 100.

    Returns:
        dict with the single key ``'customer_id_uniqueness'``.
    """
    ids = df['customer_id']
    distinct_ratio = ids.nunique() / ids.count()
    return {'customer_id_uniqueness': distinct_ratio * 100}

# Validity
def validity(df):
    """Return the percentage of rows whose ``region`` is an accepted value.

    Accepted values are 'North', 'South', 'East' and 'West'; anything else
    counts as invalid.

    Returns:
        dict with the single key ``'region_validity'`` (0-100).
    """
    allowed_regions = {'North', 'South', 'East', 'West'}
    in_allowed = df['region'].isin(allowed_regions)
    return {'region_validity': in_allowed.mean() * 100}

# Timeliness (data within 7 days from current date)
def timeliness(df, current_date='2025-05-14', max_days=7):
    """Return the percentage of rows whose last login is recent enough.

    A row is timely when the whole number of days between ``current_date``
    and its ``last_login`` is at most ``max_days``.

    Args:
        df: DataFrame with a datetime ``last_login`` column. It is not
            modified.
        current_date: Reference date, in any form ``pd.to_datetime``
            accepts. Defaults to the fixed date used for the sample data.
        max_days: Largest accepted age of a login, in days.

    Returns:
        dict with the single key ``'timeliness'`` (0-100).
    """
    reference = pd.to_datetime(current_date)
    # Keep the day counts in a local Series: storing them on ``df`` would
    # add a column to the caller's frame and leak into later per-column KPIs.
    days_since_last_login = (reference - df['last_login']).dt.days
    timeliness_score = (days_since_last_login <= max_days).mean() * 100
    return {'timeliness': timeliness_score}

# Step 4: Compute and Visualize All KPIs
# Evaluate each KPI against the sample frame, in the same order as defined.
completeness_scores = completeness(df)
accuracy_scores = accuracy(df)
consistency_scores = consistency(df)
uniqueness_scores = uniqueness(df)
validity_scores = validity(df)
timeliness_scores = timeliness(df)

# Flatten every result into one {kpi name: percentage} mapping.
# completeness is a Series, the rest are already plain dicts.
all_scores = completeness_scores.to_dict()
for kpi_group in (
    accuracy_scores,
    consistency_scores,
    uniqueness_scores,
    validity_scores,
    timeliness_scores,
):
    all_scores.update(kpi_group)

# One row per KPI, with a single 'Score' column.
kpi_df = pd.DataFrame.from_dict(all_scores, orient='index', columns=['Score'])

# Horizontal bar chart on a fixed 0-100 percentage axis.
ax = kpi_df.plot.barh(figsize=(10, 6), legend=False, color='skyblue')
ax.set_title('Data Quality KPIs')
ax.set_xlabel('Percentage (%)')
ax.set_xlim(0, 100)
plt.tight_layout()
plt.show()