# In [6]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

# Load your dataset
# Replace 'your_data.csv' with your actual data file
df = pd.read_csv("your_data.csv")

# 1. Completeness Check: Missing Values
# isnull() marks missing cells; summing the boolean frame counts them per column.
print("Missing Values per Column:")
print(df.isnull().sum())


# 2. Completeness Check: Empty Strings
# Whitespace-only strings are not NaN, so the missing-value count above does
# not include them; they are counted separately here.
def _is_blank(value):
    """Return True when *value* is a string that is empty after stripping."""
    return isinstance(value, str) and value.strip() == ''


# Column-wise Series.map gives the same element-wise result as
# DataFrame.applymap, which is deprecated since pandas 2.1, and it also works
# on older pandas versions that lack DataFrame.map.
empty_strings = df.apply(lambda column: column.map(_is_blank))
print("\nEmpty Strings per Column:")
print(empty_strings.sum())

# 3. Accuracy Check: Data Type Validation
# Printed for manual review: a column inferred as 'object' where a numeric
# type was expected usually points at malformed values.
print("\nData Types:")
print(df.dtypes)

# 4. Accuracy Check: Email Format Validation
# Simplified shape check (local@domain.tld); not a full RFC 5322 validator.
email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+$'
if 'Email' in df.columns:
    email_regex = re.compile(email_pattern)
    # fullmatch is used because `$` under re.match also matches just before a
    # trailing newline, which would let "user@example.com\n" pass.
    # str(x) renders missing values as 'nan', which fails the pattern and is
    # therefore reported as invalid.
    df['Email_Valid'] = (
        df['Email']
        .apply(lambda x: email_regex.fullmatch(str(x)) is not None)
        .astype(bool)  # keep a boolean dtype even when the frame has no rows
    )
    print("\nInvalid Emails:")
    print(df.loc[~df['Email_Valid'], ['Email']])
else:
    # Same guard style as the CustomerName section: skip instead of KeyError.
    print("\nColumn 'Email' not found; skipping email validation.")

# 5. Accuracy Check: Date Format Validation (YYYY-MM-DD)
def is_valid_date(date_str):
    """Return True if *date_str* parses as a ``%Y-%m-%d`` calendar date.

    Args:
        date_str: Value to check. It is converted with ``str()`` before
            parsing, so non-string inputs (``None``, NaN, numbers) are
            accepted and simply fail to parse.

    Returns:
        bool: True when ``datetime.strptime`` accepts the text, False when
        parsing fails.
    """
    try:
        datetime.strptime(str(date_str), '%Y-%m-%d')
    except (ValueError, TypeError):
        # Only parse failures mean "invalid". A bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        return False
    return True

# Apply the YYYY-MM-DD check to every value of the 'Date' column and list the
# rows that fail it.
if 'Date' in df.columns:
    # astype(bool) keeps a boolean dtype even when the frame has no rows, so
    # the `~` mask below is always valid.
    df['Date_Valid'] = df['Date'].apply(is_valid_date).astype(bool)
    print("\nInvalid Dates:")
    print(df.loc[~df['Date_Valid'], ['Date']])
else:
    # Same guard style as the CustomerName section: skip instead of KeyError.
    print("\nColumn 'Date' not found; skipping date validation.")

# 6. Accuracy Check: Phone Number Validation (10-digit numbers)
phone_pattern = r'^\d{10}$'


def _phone_text(value):
    """Return *value* as text, rendering integral floats without '.0'.

    A numeric column that contains missing values is loaded as float, so a
    number such as 1234567890 arrives as 1234567890.0; plain ``str()`` would
    make every such value fail the 10-digit check.
    """
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


if 'Phone' in df.columns:
    phone_regex = re.compile(phone_pattern)
    # fullmatch: `$` under re.match would also accept a trailing newline.
    # NOTE(review): if the column was parsed as numeric, leading zeros are
    # already lost at load time; read it with dtype=str if they matter.
    df['Phone_Valid'] = (
        df['Phone']
        .apply(lambda x: phone_regex.fullmatch(_phone_text(x)) is not None)
        .astype(bool)  # keep a boolean dtype even when the frame has no rows
    )
    print("\nInvalid Phone Numbers:")
    print(df.loc[~df['Phone_Valid'], ['Phone']])
else:
    # Same guard style as the CustomerName section: skip instead of KeyError.
    print("\nColumn 'Phone' not found; skipping phone validation.")

# 7. Accuracy Check: Duplicate Records
# duplicated() yields a boolean mask over the rows, compared across all
# columns; its sum is the number of flagged rows.
duplicates = df.duplicated()
duplicate_count = duplicates.sum()
print(f"\nNumber of Duplicate Rows: {duplicate_count}")
print("Duplicate Rows:")
print(df.loc[duplicates])

# 8. Accuracy Check: Outlier Detection in Numeric Columns
# 'number' selects every numeric dtype (int32, float32, nullable ints, ...),
# not only int64/float64, and still leaves out the boolean *_Valid columns.
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    # Percentile rule: values below the 5th or above the 95th percentile are
    # flagged. This marks the tails of every column, so treat the output as
    # candidates for review rather than confirmed errors.
    lower_bound = df[col].quantile(0.05)
    upper_bound = df[col].quantile(0.95)
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    if not outliers.empty:
        print(f"\nOutliers in '{col}':")
        print(outliers[[col]])

# 9. Text Normalization (e.g., CustomerName)
# Runs only when the column exists: trim surrounding whitespace, then
# title-case each name.
if 'CustomerName' in df.columns:
    trimmed_names = df['CustomerName'].str.strip()
    df['CustomerName'] = trimmed_names.str.title()
    print("\nNormalized Customer Names:")
    preview = df['CustomerName'].head()
    print(preview)

# Optional: Save the cleaned and validated dataset
# index=False keeps the row index out of the written file.
output_path = "validated_output.csv"
df.to_csv(output_path, index=False)
print("\nData quality assessment completed. Results saved to 'validated_output.csv'.")

# Captured cell output (raised when the placeholder input file does not exist):
# FileNotFoundError: [Errno 2] No such file or directory: 'your_data.csv'