In [3]:
import pandas as pd

# -------------------------------
# Part 1: Load Dataset & Check Missing Values
# -------------------------------

# Read the customer CSV. If the file is absent, report it and let the
# original FileNotFoundError propagate so execution stops here.
try:
    customer_df = pd.read_csv('customer_data.csv')
except FileNotFoundError:
    print("❌ Error: customer_data.csv not found.")
    raise
else:
    # Only reached when the read succeeded.
    print("✅ Customer dataset loaded successfully.\n")

# Preview the top of the table
print("📊 First 5 rows of customer data:")
print(customer_df.head())

# Count missing (NaN) entries per column for the two contact fields
print("\n🔍 Missing Values:")
missing_values = customer_df.loc[:, ['Email', 'Phone']].isna().sum()
print(missing_values)

# -------------------------------
# Part 2: Identify Duplicates & Inconsistencies
# -------------------------------

# Find rows whose email address is shared with at least one other row.
# keep=False flags every member of a duplicate group, not just the repeats.
# pandas treats NaN values as equal to each other in duplicated(), so rows
# with a missing Email are masked out explicitly; otherwise two or more
# missing emails would be reported as one "duplicate" address.
print("\n📋 Duplicate Emails:")
has_email = customer_df['Email'].notna()
shares_email = customer_df.duplicated('Email', keep=False)
duplicate_emails = customer_df[has_email & shares_email]

# Show each duplicated address once
print(duplicate_emails[['Email']].drop_duplicates())

# -------------------------------
# Part 3: Generate Data Quality Report
# -------------------------------

print("\n🧾 Data Quality Report Summary:")

# Count rows that exactly repeat an earlier row across all columns.
# .sum() yields a NumPy integer; cast to a built-in int so the report prints
# as a plain number regardless of NumPy version and stays JSON-serializable.
total_duplicates = int(customer_df.duplicated().sum())

# Basic descriptive statistics (computed here but not part of the printed report)
description = customer_df.describe(include='all')

# Compile quality report using only built-in Python types
quality_report = {
    'Missing Values': {column: int(count) for column, count in missing_values.items()},
    'Total Duplicate Rows': total_duplicates,
    # nunique() skips NaN, so missing emails never count as a duplicated address
    'Duplicate Emails': int(duplicate_emails['Email'].nunique()),
}

print(quality_report)

# Optionally save report.
# NOTE(review): the report mixes a nested dict with scalar values, so it does
# not map cleanly onto pd.DataFrame.from_dict(orient='index'). Because every
# value is a plain Python type, json.dump(quality_report, file) is a simpler
# way to persist it.


✅ Customer dataset loaded successfully.

📊 First 5 rows of customer data:
   CustomerID           Name                     Email     Phone       Address
0           1       John Doe       johndoe@example.com  555-1234    123 Elm St
1           2     Jane Smith     janesmith@example.com       NaN    456 Oak St
2           3  Alice Johnson  alicejohnson@example.com  555-5678   789 Pine St
3           4      Bob Brown       johndoe@example.com  555-9876  101 Maple St
4           5  Charlie Davis                       NaN  555-1111  202 Birch St

🔍 Missing Values:
Email    1
Phone    2
dtype: int64

📋 Duplicate Emails:
                 Email
0  johndoe@example.com

🧾 Data Quality Report Summary:
{'Missing Values': {'Email': 1, 'Phone': 2}, 'Total Duplicate Rows': 0, 'Duplicate Emails': 1}
