In [1]:
# Data Quality Metrics & Scoring Examples

# Task 1:
# Assign scores to a customer dataset based on completeness, uniqueness, and consistency.
# Analyze the overall data quality score and identify areas for improvement.
import pandas as pd

# Sample customer dataset
customer_data = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['John Doe', 'Jane Smith', 'Alice Johnson', None, 'Bob Brown'],
    'email': ['john@example.com', 'jane@example.com', None, 'bob@example.com', 'bob@example.com']
})

# Completeness: Check for missing values in 'name' and 'email' columns
completeness_score = 1 - (customer_data[['name', 'email']].isnull().mean().mean())  # Mean percentage of missing values

# Uniqueness: Check for duplicate 'email' values
uniqueness_score = 1 - (customer_data['email'].duplicated().mean())  # Proportion of duplicated emails

# Consistency: Check for inconsistent names (e.g., names with multiple parts)
consistency_score = customer_data['name'].str.contains(r'\s').mean()  # Check if there is more than one part in the name

# Calculate overall data quality score (average of completeness, uniqueness, and consistency)
overall_score = (completeness_score + uniqueness_score + (1 - consistency_score)) / 3

# Display results
print(f"Completeness Score: {completeness_score:.2f}")
print(f"Uniqueness Score: {uniqueness_score:.2f}")
print(f"Consistency Score: {consistency_score:.2f}")
print(f"Overall Data Quality Score: {overall_score:.2f}")

# Suggestions for improvement
improvement_suggestions = []
if completeness_score < 0.9:
    improvement_suggestions.append("Fill in missing 'name' or 'email' fields.")
if uniqueness_score < 0.9:
    improvement_suggestions.append("Remove duplicate emails.")
if consistency_score > 0.5:
    improvement_suggestions.append("Standardize name format to ensure consistency.")

print(f"\nImprovement Suggestions: {', '.join(improvement_suggestions)}")

# Sample dataset for an online shop
shop_data = pd.DataFrame({
    'order_id': [101, 102, 103, 104, 105],
    'product_name': ['Laptop', 'Tablet', 'Phone', 'Headset', 'Monitor'],
    'price': [1200, 500, 700, 100, 300],
    'order_date': ['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-02', '2025-01-05'],
    'shipped': [True, True, False, True, True]
})

# Accuracy: Check if all prices are positive numbers
accuracy_score = (shop_data['price'] > 0).mean()

# Timeliness: Check if orders are shipped in time (e.g., within a week)
shop_data['order_date'] = pd.to_datetime(shop_data['order_date'])
timeliness_score = ((shop_data['shipped'] & (shop_data['order_date'] < pd.Timestamp('2025-01-04'))).mean())

# Integrity: Check for missing values in essential columns (product_name, price)
integrity_score = 1 - (shop_data[['product_name', 'price']].isnull().mean().mean())  # Proportion of missing essential data

# Calculate overall data quality score (average of accuracy, timeliness, and integrity)
overall_score = (accuracy_score + timeliness_score + integrity_score) / 3

# Display results
print(f"Accuracy Score: {accuracy_score:.2f}")
print(f"Timeliness Score: {timeliness_score:.2f}")
print(f"Integrity Score: {integrity_score:.2f}")
print(f"Overall Data Quality Score: {overall_score:.2f}")

# Suggestions for improvement
improvement_suggestions = []
if accuracy_score < 0.9:
    improvement_suggestions.append("Ensure all product prices are valid positive numbers.")
if timeliness_score < 0.9:
    improvement_suggestions.append("Improve shipping timeliness for orders.")
if integrity_score < 0.9:
    improvement_suggestions.append("Fill in missing values for product_name and price.")

print(f"\nImprovement Suggestions: {', '.join(improvement_suggestions)}")

# Sample financial dataset
financial_data = pd.DataFrame({
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_amount': [5000.50, 2000.00, 15000.75, 3000.25, 12500.60],
    'transaction_date': ['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-02', '2025-01-05'],
    'account_id': [1, 2, 3, 4, 5]
})

# Validity: Check if all transaction amounts are positive
validity_score = (financial_data['transaction_amount'] > 0).mean()

# Precision: Check if the amounts are precise to two decimal places
precision_score = (financial_data['transaction_amount'] == financial_data['transaction_amount'].round(2)).mean()

# Accessibility: Check if 'transaction_date' is in valid date format and non-null
financial_data['transaction_date'] = pd.to_datetime(financial_data['transaction_date'], errors='coerce')
accessibility_score = 1 - financial_data['transaction_date'].isna().mean()

# Calculate overall data quality score (average of validity, precision, and accessibility)
overall_score = (validity_score + precision_score + accessibility_score) / 3

# Display results
print(f"Validity Score: {validity_score:.2f}")
print(f"Precision Score: {precision_score:.2f}")
print(f"Accessibility Score: {accessibility_score:.2f}")
print(f"Overall Data Quality Score: {overall_score:.2f}")

# Suggestions for improvement
improvement_suggestions = []
if validity_score < 0.9:
    improvement_suggestions.append("Ensure all transaction amounts are positive.")
if precision_score < 0.9:
    improvement_suggestions.append("Standardize transaction amount precision to two decimal places.")
if accessibility_score < 0.9:
    improvement_suggestions.append("Ensure all transaction dates are valid and accessible.")

print(f"\nImprovement Suggestions: {', '.join(improvement_suggestions)}")







# Task 2:
# Evaluate a dataset for an online shop using metrics such as accuracy, timeliness, and
# integrity. Calculate the data quality score and provide improvement suggestions.





# Task 3:
# Perform a data quality assessment on a financial dataset, scoring it based on validity,
# precision, and accessibility. Review the results and propose corrective actions.





Completeness Score: 0.80
Uniqueness Score: 0.80
Consistency Score: 1.00
Overall Data Quality Score: 0.53

Improvement Suggestions: Fill in missing 'name' or 'email' fields., Remove duplicate emails., Standardize name format to ensure consistency.
Accuracy Score: 1.00
Timeliness Score: 0.60
Integrity Score: 1.00
Overall Data Quality Score: 0.87

Improvement Suggestions: Improve shipping timeliness for orders.
Validity Score: 1.00
Precision Score: 1.00
Accessibility Score: 1.00
Overall Data Quality Score: 1.00

Improvement Suggestions: 
