In [2]:
import pandas as pd
import numpy as np
from io import StringIO

# Sample customer dataset for Task 1.
# The rows deliberately contain a missing email (ID 2), a missing phone (ID 6),
# an email without a dotted domain (ID 5) and a repeated email (IDs 6 and 7).
csv_customer = """
CustomerID,Name,Email,Phone,Country
1,John Doe,john@example.com,1234567890,USA
2,Jane Smith,,1234567890,USA
3,Bob Lee,bob@example.com,2345678901,UK
4,Alice Brown,alice@example.com,2345678901,UK
5,Tom Clark,tom@example,3456789012,Canada
6,Mary Johnson,mary@example.com,,Canada
7,Mary Johnson,mary@example.com,3456789012,Canada
"""

# Load the dataset from the CSV string.
# Phone is read as text: it is an identifier rather than a quantity, and the
# blank value in row 6 would otherwise make pandas infer float64 for the whole
# column (turning 1234567890 into 1234567890.0). Blank cells still load as
# missing values, so completeness checks on the column are unaffected.
df_customer = pd.read_csv(StringIO(csv_customer), dtype={'Phone': str})

# Task 1: Scoring Completeness, Uniqueness, Consistency

email_col = df_customer['Email']
phone_col = df_customer['Phone']

# Completeness: fraction of rows in which the contact field is present
completeness_email = email_col.notna().mean()
completeness_phone = phone_col.notna().mean()

# Uniqueness: distinct non-missing emails divided by the number of
# non-missing emails (1.0 means no email is shared between customers)
uniqueness_email = email_col.dropna().nunique() / email_col.notna().sum()

# Consistency score: check that each email has a plausible local@domain.tld shape
def valid_email(email):
    """Return True when *email* is structurally plausible as ``local@domain.tld``.

    This is a lightweight format check, not full RFC validation. A value
    passes only if it is a string with exactly one ``@``, a non-empty local
    part, no whitespace, and a domain made of at least two non-empty
    dot-separated labels (so ``tom@example`` and ``john@.com`` are rejected).

    Missing values (``None``/NaN) and any other non-string input return
    False instead of raising.
    """
    # NaN, None, pd.NA and numeric cells are all "not a valid email".
    if not isinstance(email, str):
        return False

    local, at_sign, domain = email.partition('@')
    if not (local and at_sign and domain):
        return False

    # A second '@' or embedded whitespace is never part of a valid address.
    if '@' in domain or any(ch.isspace() for ch in email):
        return False

    # The dot has to be in the domain, not merely somewhere in the string,
    # and must separate non-empty labels.
    labels = domain.split('.')
    return len(labels) >= 2 and all(labels)

# Run the format check on every Email value; the share of rows that pass
# is the consistency score.
email_validity = df_customer['Email'].map(valid_email)
consistency_email = email_validity.mean()

# Overall data quality score: unweighted mean of the four component scores
overall_score = np.mean((
    completeness_email,
    completeness_phone,
    uniqueness_email,
    consistency_email,
))

# Report every score to two decimals, one per line
print(
    f"Completeness (Email): {completeness_email:.2f}",
    f"Completeness (Phone): {completeness_phone:.2f}",
    f"Uniqueness (Email): {uniqueness_email:.2f}",
    f"Consistency (Email Format): {consistency_email:.2f}",
    f"Overall Data Quality Score: {overall_score:.2f}",
    sep="\n",
)

# Pair each score with the advice to show when that score falls short of 1.0
score_advice = (
    (completeness_email, "Improve Email completeness by filling missing emails."),
    (completeness_phone, "Improve Phone completeness by filling missing phone numbers."),
    (uniqueness_email, "Remove or merge duplicate email entries."),
    (consistency_email, "Fix invalid email formats."),
)

# Keep only the advice whose score is below a perfect 1.0, in the order above
improvements = [advice for score, advice in score_advice if score < 1.0]

print("\nSuggested Improvements:")
for imp in improvements:
    print(f"- {imp}")

Completeness (Email): 0.86
Completeness (Phone): 0.86
Uniqueness (Email): 0.83
Consistency (Email Format): 0.71
Overall Data Quality Score: 0.82

Suggested Improvements:
- Improve Email completeness by filling missing emails.
- Improve Phone completeness by filling missing phone numbers.
- Remove or merge duplicate email entries.
- Fix invalid email formats.
