In [1]:
import pandas as pd
from io import StringIO

# Sample customer_data.csv content with missing values and duplicates.
# Row 3 has an empty Phone field, row 6 an empty Email field, and row 7 a
# Phone field that contains only a single space.
csv_data = """
CustomerID,Name,Email,Phone
1,John Doe,john@example.com,1234567890
2,Alice Smith,alice@example.com,0987654321
3,Bob Johnson,bob@example.com,
4,Diana Prince,diana@example.com,1234509876
5,John Doe,john@example.com,1234567890
6,Evan Davis,,9876543210
7,Mike Brown,mike@example.com, 
"""


def load_customers(source):
    """Read customer CSV data into a DataFrame.

    Args:
        source: A file path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A DataFrame in which ``Phone`` is always read as text.

    ``Phone`` is pinned to ``str`` instead of relying on dtype inference:
    if every phone in the file were purely numeric, pandas would parse the
    column as a number, dropping leading zeros and making the ``.str``
    accessor used below raise ``AttributeError``.
    """
    return pd.read_csv(source, dtype={'Phone': str})


# Load dataset from string (simulate reading 'customer_data.csv')
df = load_customers(StringIO(csv_data))

# Whitespace-only phone entries are not parsed as NaN, so they are detected
# separately. The mask is built once and reused by Part 1 and Part 3.
phone_is_blank = df['Phone'].str.strip().eq('')
blank_phones = phone_is_blank.sum()

# Part 1: Check missing values in 'Email' and 'Phone' columns
missing_email = df['Email'].isnull().sum()
missing_phone = df['Phone'].isnull().sum() + blank_phones

print(f"Missing values in 'Email' column: {missing_email}")
print(f"Missing or blank values in 'Phone' column: {missing_phone}")

# Part 2: Identify duplicate emails
# keep=False flags every occurrence of a repeated email; rows with a missing
# email are excluded so that NaN values are never reported as duplicates.
duplicate_emails = df[df.duplicated(subset=['Email'], keep=False) & df['Email'].notnull()]
print("\nDuplicate email entries:")
print(duplicate_emails)

# Part 3: Data Quality Report
total_records = len(df)
# NOTE: duplicated() compares every column, including CustomerID, so rows
# that repeat the same customer under a different ID are not counted here.
num_duplicates = df.duplicated().sum()
# NaN-only counts: whitespace-only phones are reported separately below.
missing_values = df.isnull().sum()

print("\nData Quality Report:")
print(f"Total records: {total_records}")
print(f"Duplicate rows: {num_duplicates}")
print("Missing values per column:")
print(missing_values)

# Identify inconsistencies in 'Phone' (e.g., blanks counted as missing)
print(f"Blank phone numbers (treated as missing): {blank_phones}")









Missing values in 'Email' column: 1
Missing or blank values in 'Phone' column: 2

Duplicate email entries:
   CustomerID      Name             Email       Phone
0           1  John Doe  john@example.com  1234567890
4           5  John Doe  john@example.com  1234567890

Data Quality Report:
Total records: 7
Duplicate rows: 0
Missing values per column:
CustomerID    0
Name          0
Email         1
Phone         1
dtype: int64
Blank phone numbers (treated as missing): 1
